├── .gitignore ├── History.md ├── Loremipsum.docx ├── Readme.md ├── component.json ├── gulpfile.js ├── index.js ├── lib ├── node-office.js ├── paragraph.js ├── parseDocument.js ├── processor.js └── run.js └── package.json /.gitignore: -------------------------------------------------------------------------------- 1 | extracts 2 | node_modules 3 | .idea -------------------------------------------------------------------------------- /History.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigmeech/nodeoffice/9753c576aaa03a63a224944b8d10975244a5f6db/History.md -------------------------------------------------------------------------------- /Loremipsum.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigmeech/nodeoffice/9753c576aaa03a63a224944b8d10975244a5f6db/Loremipsum.docx -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | 2 | # node-office 3 | 4 | Read and Extract Data from Open Office Documents 5 | 6 | ## Installation 7 | 8 | $ npm install nodeoffice 9 | 10 | ## Usage 11 | 12 | ```javascript 13 | var NodeOffice = require("./lib/node-office"); 14 | NodeOffice.readFile("Loremipsum.docx", function (err, bodyObject) { 15 | if(err) throw err 16 | var paras = bodyObject.getParagraphs(); 17 | var runs = []; 18 | var content = "" 19 | 20 | //for each paragraph 21 | for (var paraIndex in paras) { 22 | var paragraph = paras[paraIndex]; 23 | var runs = bodyObject.getRuns(paragraph); 24 | for (var runIndex in runs){ 25 | var run = runs[runIndex]; 26 | content += bodyObject.getRunContent(run)+"\n"; 27 | } 28 | } 29 | console.log(content) 30 | }) 31 | ``` 32 | 33 | ## License 34 | 35 | The MIT License (MIT) 36 | 37 | Copyright (c) 2014 38 | 39 | Permission is hereby granted, free of charge, to any person 
obtaining a copy 40 | of this software and associated documentation files (the "Software"), to deal 41 | in the Software without restriction, including without limitation the rights 42 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 43 | copies of the Software, and to permit persons to whom the Software is 44 | furnished to do so, subject to the following conditions: 45 | 46 | The above copyright notice and this permission notice shall be included in 47 | all copies or substantial portions of the Software. 48 | 49 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 50 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 51 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 52 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 53 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 54 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 55 | THE SOFTWARE. 
56 | -------------------------------------------------------------------------------- /component.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "node-office", 3 | "repo": "bigmeech/node-office", 4 | "description": "Read and Extract Data from Open Office Documents", 5 | "version": "0.0.1", 6 | "keywords": [], 7 | "dependencies": {}, 8 | "development": {}, 9 | "license": "MIT" 10 | } -------------------------------------------------------------------------------- /gulpfile.js: -------------------------------------------------------------------------------- 1 | var gulp = require("gulp"); 2 | 3 | gulp.task("default",function(){ 4 | console.log("this is my first task!!!") 5 | }); 6 | 7 | gulp.task("build-component",function(){ 8 | console.log("this should run component CLI!!!") 9 | }) -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Created by laggie on 08/05/14. 
/**
 * Author - Larry Eliemenye
 * Description - Read and extract data from Office Open XML files
 *               (Microsoft Word, PowerPoint, Spreadsheet etc).
 *
 * Useful elements of the WordprocessingML spec - http://officeopenxml.com/anatomyofOOXML.php
 * ====================================================================
 * w:p   = paragraph
 * w:r   = run
 * w:t   = text block
 * w:tbl = table
 * w:tr  = table row
 * w:tc  = table cell
 **/

var NodeOffice = (function () {

    // NOTE(review): `async` and `gm` are required but not used anywhere in this
    // module yet; they are kept because package.json declares them - confirm
    // whether they are still planned for the table/media features.
    var fs = require("fs"),
        async = require("async"),
        xml2js = require("xml2js"),
        zip = require("adm-zip"),
        path = require("path"),
        gm = require("gm"),
        Worker = require("webworker-threads").Worker;

    // Every constant is declared in ONE `var` statement. The original was
    // missing a comma after MEDIA_PATH, which silently turned the names that
    // followed it into implicit globals.
    var EXTRACT_FOLDER = "./extracts",
        ext = ['.docx', '.xlsx', '.pptx'],
        RAW_XMLPATH = "./extracts/word/document.xml",
        MEDIA_PATH = "./extracts/word/media";

    //sample worker snippet
    // NOTE(review): `var onmessage` below only declares a local variable inside
    // the worker function, so the "hi" message posted by readBuffer is
    // presumably never handled and the worker never closes itself. The
    // webworker-threads examples assign `this.onmessage` - confirm before
    // changing, since it alters the worker's lifetime.
    var worker = new Worker(function(){
        postMessage("I just started Parsing");
        var onmessage = function(event){
            console.log(event.data);
            self.close();
        }
    });

    worker.onmessage = function(event){
        console.log("Parser said:"+event.data);
    };

    /**
     * Normalises anything that was thrown into an Error instance, so that
     * callbacks always receive a real Error as their first argument.
     *
     * @param {*} thrown value caught from a `try` block
     * @returns {Error}
     */
    var toError = function (thrown) {
        return thrown instanceof Error ? thrown : new Error(String(thrown));
    };

    /**
     * Returns the paragraphs (w:p) of a parsed body as an array.
     *
     * @param {Array|null} xmlBody the `w:body` array taken from the parsed document
     * @returns {Array} the `w:p` entries of the first body element, or [] when
     *                  there is no body or it has no paragraphs
     */
    var getParagraphs = function (xmlBody) {
        var body = xmlBody && xmlBody[0];
        return (body && body["w:p"]) || [];
    };

    /**
     * Returns the runs (w:r) of a paragraph, from which textual content is read.
     *
     * @param {Object} paragraph one entry of the array returned by getParagraphs
     * @returns {Array} the paragraph's `w:r` entries, or [] when it has none
     *                  (the original returned undefined in that case)
     */
    var getRuns = function (paragraph) {
        return (paragraph && paragraph["w:r"]) || [];
    };

    /**
     * Concatenates the text of every text block (w:t) inside a run.
     *
     * @param {Object} run one entry of the array returned by getRuns
     * @returns {string} the joined text; "" when the run has no text
     */
    var getRunContent = function (run) {
        var texts = (run && run["w:t"]) || [];
        if (!Array.isArray(texts)) {
            texts = [texts];
        }
        var content = "";
        for (var i = 0; i < texts.length; i++) {
            var node = texts[i];
            if (typeof node === "string") {
                content += node;
            } else if (node && typeof node._ === "string") {
                // Text lives under the `_` key. Nodes without it (e.g. an empty
                // <w:t/>) are skipped; the original appended the literal
                // string "undefined" for them.
                content += node._;
            }
        }
        return content;
    };

    /**
     * Placeholder: currently only logs the keys of the given paragraph and
     * returns nothing.
     *
     * @param {Object} para a paragraph object
     */
    var containsRichFormatting = function (para) {
        for (var key in para) {
            console.log(key);
        }
    };

    /**
     * Reads and parses an XML file from disk.
     *
     * @param {string} rel path of the XML file
     * @param {function(Error|null, Object=)} next receives a read/parse error,
     *        or null and the object produced by xml2js
     */
    var parseDocument = function (rel, next) {
        fs.readFile(rel, function (readErr, data) {
            if (readErr) {
                next(readErr.code === "ENOENT"
                    ? new Error("File not found at specified path: " + rel)
                    : readErr);
                return;
            }
            // A fresh parser per document: a shared instance would gain another
            // set of listeners on every parseString call. `xmlns` is a boolean
            // switch; the original passed the (merely truthy) string "w".
            var parser = new xml2js.Parser({xmlns: true});
            var reported = false;
            parser.parseString(data, function (parseErr, result) {
                if (reported) {
                    return;
                }
                reported = true;
                // Deferred so the caller's callback runs outside the parser's
                // own call stack.
                setImmediate(function () {
                    next(parseErr || null, result);
                });
            });
        });
    };

    //returns tables
    // Not implemented yet: returns undefined.
    var getTableData = function () {
        //for()
    };

    // Not implemented yet: returns undefined.
    var getMedia = function(){

    };

    // Not implemented yet and not part of the returned API.
    var getContent = function () {

    };

    // Not implemented yet: always reports false.
    var hasTables = function () {
        return false
    };

    //Utility functions

    /**
     * Tells whether a file name carries one of the supported extensions.
     * The original threw as soon as the FIRST list entry did not match, so
     * only ".docx" could ever pass; it now checks the whole list, ignores
     * case, and returns a boolean instead of throwing.
     *
     * @param {string} file file name or path
     * @returns {boolean}
     */
    var HasSupportedExtension = function (file) {
        if (typeof file !== "string") {
            return false;
        }
        return ext.indexOf(path.extname(file).toLowerCase()) !== -1;
    };

    /**
     * Reports whether the media folder exists in the extraction directory.
     *
     * @param {function(boolean)} callback receives true when MEDIA_PATH exists
     */
    var hasMedia = function (callback) {
        // fs.stat instead of the deprecated fs.exists; same boolean contract.
        fs.stat(MEDIA_PATH, function (statErr) {
            callback(!statErr);
        });
    };

    /**
     * Builds the API object handed to readFile/readBuffer callbacks. Each
     * object is bound to its own parsed body, so reading a second document no
     * longer changes what an earlier callback's getParagraphs() returns.
     *
     * @param {Array|null} body the parsed `w:body` array, or null when nothing
     *        could be parsed (getParagraphs() then returns [])
     * @returns {Object} the body API
     */
    var createBodyObject = function (body) {
        return {
            getParagraphs: function () {
                return getParagraphs(body);
            },
            containRichFormatting: containsRichFormatting,
            getTableData: getTableData,
            getRuns: getRuns,
            getRunContent: getRunContent,
            hasMedia: hasMedia,
            hasTables: hasTables,
            getMedia: getMedia
        };
    };

    /**
     * Unzips an Office document into EXTRACT_FOLDER and parses
     * word/document.xml from it.
     *
     * The original referenced an undefined `next` (a ReferenceError on every
     * successful read) and returned the shared body before parsing finished.
     * The result is now delivered only through the callback.
     *
     * NOTE(review): every read extracts into the same folder, so concurrent
     * reads can overwrite each other's files, and a stale word/document.xml
     * from an earlier .docx may be picked up for a later .xlsx/.pptx - confirm
     * whether a per-document folder is wanted.
     * NOTE(review): extractAllTo writes archive-controlled paths to disk; check
     * that the installed adm-zip version guards against path traversal before
     * feeding it untrusted files.
     *
     * @param {Buffer|string} buffer zip content, or a path to the file (both
     *        are passed straight to adm-zip)
     * @param {function(Error|null, Object)} [next] receives an error (or null)
     *        and the body API object; the API object is always supplied
     */
    var readBuffer = function (buffer, next) {
        var done = typeof next === "function" ? next : function () {};
        try {
            var zipFile = new zip(buffer);
            zipFile.extractAllTo(EXTRACT_FOLDER, true);
        } catch (thrown) {
            done(toError(thrown), createBodyObject(null));
            return;
        }
        worker.postMessage("hi");
        parseDocument(RAW_XMLPATH, function (err, xml) {
            if (err) {
                done(err, createBodyObject(null));
                return;
            }
            var documentNode = xml && xml["w:document"];
            var body = documentNode && documentNode["w:body"];
            if (!body) {
                done(new Error("No w:document/w:body element found in " + RAW_XMLPATH),
                    createBodyObject(null));
                return;
            }
            done(null, createBodyObject(body));
        });
    };

    /**
     * Reads an Office document from disk and hands its body API to `next`.
     *
     * Errors (missing file, unsupported extension, unzip or parse failure) are
     * passed to `next` instead of being thrown from inside an asynchronous
     * callback, where no caller could catch them. As in the original error
     * path, a body API object is always passed as the second argument.
     *
     * @param {string} file path of the document
     * @param {function(Error|null, Object)} next node-style callback
     */
    var readFile = function (file, next) {
        fs.stat(file, function (statErr, stats) {
            if (statErr || !stats.isFile()) {
                next(new Error("cannot find file: " + file), createBodyObject(null));
                return;
            }
            if (!HasSupportedExtension(file)) {
                next(new Error("Unsupported File format, Must be an Open office XML format of either .docx,.xlsx or .pptx"),
                    createBodyObject(null));
                return;
            }
            readBuffer(file, next);
        });
    };

    //returns API
    return {
        readFile: readFile,
        readBuffer: readBuffer
    };

})();

module.exports = NodeOffice;
3 | */ 4 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "nodeoffice", 3 | "description": "Read and extract Data from Office Files, Microsoft word, Powerpoint, Spreadsheet etc", 4 | "author": { 5 | "name":"Larry Eliemenye", 6 | "email":"denachural@gmail.com" 7 | }, 8 | "version": "0.0.1", 9 | "keywords":[ 10 | "office", 11 | "msword", 12 | "docx", 13 | "pptx", 14 | "xlsx" 15 | ], 16 | "homepage":"https://github.com/bigmeech/node-office", 17 | "main":"./lib/node-office.js", 18 | "directories":{ 19 | "lib":"./lib" 20 | }, 21 | "dependencies": { 22 | "async": "*", 23 | "xml2js": "*", 24 | "adm-zip":"*", 25 | "gm":"*", 26 | "webworker-threads":"*" 27 | 28 | }, 29 | "devDependencies": { 30 | "mocha": "*" 31 | }, 32 | "repository":{ 33 | "type":"git", 34 | "url":"https://github.com/bigmeech/node-office.git" 35 | }, 36 | "licenses": [ 37 | { 38 | "type": "MIT", 39 | "url": "https://github.com/bigmeech/node-office/master/LICENSE" 40 | } 41 | ], 42 | "readme": "nodeoffice\n===========\n\nRead and extract Data from Office Files, Microsoft word, Powerpoint, Spreadsheet etc", 43 | "readmeFilename": "README.md", 44 | "bugs":{ 45 | "url":"https://github.com/bigmeech/node-office/issues" 46 | }, 47 | "_id": "nodeoffice@0.0.1", 48 | "_from": "nodeoffice@*" 49 | } 50 | --------------------------------------------------------------------------------