├── .gitignore
├── History.md
├── Loremipsum.docx
├── Readme.md
├── component.json
├── gulpfile.js
├── index.js
├── lib
    ├── node-office.js
    ├── paragraph.js
    ├── parseDocument.js
    ├── processor.js
    └── run.js
└── package.json


/.gitignore:
--------------------------------------------------------------------------------
1 | extracts
2 | node_modules
3 | .idea


--------------------------------------------------------------------------------
/History.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigmeech/nodeoffice/9753c576aaa03a63a224944b8d10975244a5f6db/History.md


--------------------------------------------------------------------------------
/Loremipsum.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bigmeech/nodeoffice/9753c576aaa03a63a224944b8d10975244a5f6db/Loremipsum.docx


--------------------------------------------------------------------------------
/Readme.md:
--------------------------------------------------------------------------------
 1 | 
 2 | # node-office
 3 | 
 4 |   Read and extract data from Office Open XML documents (.docx, .xlsx, .pptx)
 5 | 
 6 | ## Installation
 7 | 
 8 |     $ npm install nodeoffice
 9 | 
10 | ## Usage
11 | 
12 | ```javascript
13 | var NodeOffice = require("./lib/node-office");
14 | NodeOffice.readFile("Loremipsum.docx", function (err, bodyObject) {
15 |   if(err) throw err
16 |   var paras = bodyObject.getParagraphs();
17 |   var runs = [];
18 |   var content = ""
19 |     
20 |   //for each paragraph
21 |   for (var paraIndex in paras) {
22 |     var paragraph = paras[paraIndex];
23 |     var runs = bodyObject.getRuns(paragraph);
24 |     for (var runIndex in runs){
25 |       var run = runs[runIndex];
26 |       content += bodyObject.getRunContent(run)+"\n";
27 |     }
28 |   }
29 |   console.log(content)
30 | })
31 | ```
32 | 
33 | ## License
34 | 
35 |   The MIT License (MIT)
36 | 
37 |   Copyright (c) 2014 <copyright holders>
38 | 
39 |   Permission is hereby granted, free of charge, to any person obtaining a copy
40 |   of this software and associated documentation files (the "Software"), to deal
41 |   in the Software without restriction, including without limitation the rights
42 |   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
43 |   copies of the Software, and to permit persons to whom the Software is
44 |   furnished to do so, subject to the following conditions:
45 | 
46 |   The above copyright notice and this permission notice shall be included in
47 |   all copies or substantial portions of the Software.
48 | 
49 |   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
50 |   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
51 |   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
52 |   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
53 |   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
54 |   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
55 |   THE SOFTWARE.
56 | 


--------------------------------------------------------------------------------
/component.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "node-office",
 3 |   "repo": "bigmeech/node-office",
 4 |   "description": "Read and Extract Data from Open Office Documents",
 5 |   "version": "0.0.1",
 6 |   "keywords": [],
 7 |   "dependencies": {},
 8 |   "development": {},
 9 |   "license": "MIT"
10 | }


--------------------------------------------------------------------------------
/gulpfile.js:
--------------------------------------------------------------------------------
var gulp = require("gulp");

/** Placeholder default task: it only prints a message. */
function runDefaultTask() {
  console.log("this is my first task!!!");
}

/** Placeholder for the component build: it only prints a message for now. */
function runBuildComponentTask() {
  console.log("this should run component CLI!!!");
}

gulp.task("default", runDefaultTask);
gulp.task("build-component", runBuildComponentTask);


--------------------------------------------------------------------------------
/index.js:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Created by laggie on 08/05/14.
 3 |  * Example Usage
 4 |  */
 5 | 
 6 | var NodeOffice = require("./lib/node-office");
 7 | NodeOffice.readFile("Loremipsum.docx", function (err, bodyObject) {
 8 |   /*var paras = bodyObject.getParagraphs();
 9 |   var runs = [];
10 |   var content = ""
11 |   //for each paragraph
12 |   for (var paraIndex in paras) {
13 |     var paragraph = paras[paraIndex];
14 |     var runs = bodyObject.getRuns(paragraph);
15 |     for (var runIndex in runs){
16 |       var run = runs[runIndex];
17 |       content += bodyObject.getRunContent(run)+"\n";
18 |     }
19 |   }
20 |   console.log(content)*/
21 | 
22 |   var media = bodyObject.getMedia();
23 | });
24 | 
25 | 
26 | 


--------------------------------------------------------------------------------
/lib/node-office.js:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Author           -  Larry Eliemenye
  3 |  * Description      -  Read and extract Data from Office Files, Microsoft word, Powerpoint, Spreadsheet etc
  4 |  *
  5 |  *
  6 |  * Useful Elements of the WordProcessingML SPEC - http://officeopenxml.com/anatomyofOOXML.php
  7 |  * ====================================================================
  8 |  *  w:p     = paragraph
  9 |  *  w:r     = runs
 10 |  *  w:t     = textblock
 11 |  *  w:tbl   = table
 12 |  *  w:tr    = table row
 13 |  *  w:tc    = table cell
 14 |  * **/
 15 | 
 16 | var NodeOffice = (function () {
 17 | 
 18 |   var fs        = require("fs"),
 19 |       async     = require("async"),
 20 |       xml2js    = require("xml2js"),
 21 |       zip       = require("adm-zip"),
 22 |       path      = require("path"),
 23 |       gm        =  require("gm"),
 24 |       parser    = xml2js.Parser({xmlns: "w"}),
 25 |       Worker = require("webworker-threads").Worker;
 26 | 
 27 | 
 28 | 
 29 |   var xml_data = null,
 30 |       EXTRACT_FOLDER = "./extracts",
 31 |       ext = ['.docx', '.xlsx', '.pptx'],
 32 |       RAW_XMLPATH = "./extracts/word/document.xml",
 33 |       MEDIA_PATH = "./extracts/word/media"
 34 |       xml_json = null,
 35 |       xml_obj = null,
 36 |       xml_body = null,
 37 |       content = null,
 38 |       err = null;
 39 | 
 40 |   parser.addListener("end", function (result) {
 41 |     xml_data = result;
 42 |     xml_json = JSON.stringify(xml_data);
 43 |   })
 44 | 
 45 |   //sample worker snippet
  // Creates one worker when the module loads; it posts a start-up message to the parent.
  // NOTE(review): `var onmessage` below only declares a local variable inside the worker
  // function. The webworker-threads examples install the handler with `this.onmessage = ...`,
  // so as written the worker presumably never reacts to postMessage("hi") - confirm before
  // changing, because a working handler would also close the worker after its first message.
  var worker = new Worker(function(){
    postMessage("I just started Parsing");
    var onmessage = function(event){
      console.log(event.data);
      self.close();
    }
  });

  // Parent-side handler: logs whatever the worker posts back.
  worker.onmessage = function(event){
    console.log("Parser said:"+event.data);
  }
 57 | 
 58 |   var readBuffer = function(buffer){
 59 |     var zipFile = new zip(buffer);
 60 |     var entries = zipFile.getEntries()
 61 |     entries.forEach(function (e) {
 62 |       console.log(e.entryName);
 63 |     });
 64 |     zipFile.extractAllTo(EXTRACT_FOLDER, true);
 65 |     parseDocument(RAW_XMLPATH, function (data) {
 66 |       xml_obj = JSON.parse(data);
 67 |       xml_body = xml_obj["w:document"]["w:body"];
 68 |       next(err, getBodyObject);
 69 |     });
 70 |     worker.postMessage("hi");
 71 |     return xml_body
 72 |   }
 73 | 
 74 |   //checks that the file exists and has a supported extension, then hands it to readBuffer
 75 |   var readFile = function (file, next) {
 76 |     //extract content of file, first test for open office extension
 77 |     fs.exists(file, function (exist) {
 78 |       if (exist) {
 79 |         if (HasSupportedExtension(file)) {
 80 |           return readBuffer(file);
 81 |         }
 82 |       }
 83 |       else {
 84 |         var err = new Error("cannot find file: " + file);
 85 |         next(err, getBodyObject);
 86 |       }
 87 |     })
 88 |   };
 89 | 
 90 | 
 91 | 
 92 |   //returns paragraphs(w:p) as an array
 93 |   var getParagraphs = function () {
 94 |     var body = xml_body[0]
 95 |     var paragraphs = [];
 96 |     for (var element in body) {
 97 |       if (element === "w:p" && typeof(element) === "string") {
 98 |         paragraphs = body[element];
 99 |       }
100 |     }
101 |     return paragraphs;
102 |   }
103 | 
104 |   //returns runs(w:r) as an array from which to get textual content.
105 |   var getRuns = function (paragraph) {
106 |     var runs;
107 |     for (var element in paragraph) {
108 |       if (element === "w:r" && typeof(element) == "string") {
109 |         runs = paragraph[element];
110 |       }
111 |     }
112 |     return runs
113 |   }
114 |   var getRunContent = function (run) {
115 |     var content = "";
116 |     for (var key in run) {
117 |       if (key === "w:t" && typeof(key) === "string") {
118 |         var contentArray = run[key];
119 |         for (var textIndex in contentArray) {
120 |           content += contentArray[textIndex]._;
121 |         }
122 |       }
123 |     }
124 |     return content
125 |   }
126 |   var containsRichFormatting = function (para) {
127 |     for (var key in para) {
128 |       console.log(key);
129 |     }
130 |   }
131 | 
132 |   //parse content xml document
133 |   var parseDocument = function (rel, next) {
134 |     fs.exists(rel, function (exist) {
135 |       if (exist) {
136 |         fs.readFile(rel, function (err, data) {
137 |           parser.parseString(data);
138 |           next(JSON.stringify(xml_data));
139 |         })
140 |       } else throw "File not found at specified path: " + rel
141 |     })
142 |   }
143 | 
144 |   //returns tables
145 |   var getTableData = function () {
146 |     //for()
147 |   }
148 | 
149 |   var getMedia = function(){
150 | 
151 |   }
152 | 
153 |   var getContent = function () {
154 | 
155 |   }
156 | 
157 |   var hasTables = function () {
158 |     return false
159 |   }
160 | 
161 | 
162 | 
163 |   //Utility functions
164 |   var HasSupportedExtension = function (file) {
165 |     for (var key in ext) {
166 |       if (ext[key] === path.extname(file)) {
167 |         return true
168 |       } else throw "Unsupported File format, Must be an Open office XML format of either .docx,.xlsx or .pptx"
169 |     }
170 |   }
171 | 
172 |   var hasMedia = function (callback) {
173 |     fs.exists(MEDIA_PATH,callback)
174 |   }
175 | 
176 |   //API to return after a call to readfile
177 |   var getBodyObject = {
178 |     getParagraphs: getParagraphs,
179 |     containRichFormatting: containsRichFormatting,
180 |     getTableData: getTableData,
181 |     getRuns: getRuns,
182 |     getRunContent: getRunContent,
183 |     hasMedia: hasMedia,
184 |     hasTables: hasTables,
185 |     getMedia:getMedia
186 |   }
187 |   //returns API
188 |   return{
189 |     readFile: readFile,
190 |     readBuffer: readBuffer
191 |   }
192 | 
193 | })()
194 | 
195 | module.exports = NodeOffice;


--------------------------------------------------------------------------------
/lib/paragraph.js:
--------------------------------------------------------------------------------
1 | /**
2 |  * Created by Larry Eliemenye on 25/05/2014.
3 |  */
4 | 


--------------------------------------------------------------------------------
/lib/parseDocument.js:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/lib/processor.js:
--------------------------------------------------------------------------------
1 | /**
2 |  * Created by Larry Eliemenye on 25/05/2014.
3 |  */
4 | 


--------------------------------------------------------------------------------
/lib/run.js:
--------------------------------------------------------------------------------
1 | /**
2 |  * Created by Larry Eliemenye on 25/05/2014.
3 |  */
4 | 


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "nodeoffice",
 3 |   "description": "Read and extract Data from Office Files, Microsoft word, Powerpoint, Spreadsheet etc",
 4 |   "author": {
 5 |       "name":"Larry Eliemenye",
 6 |       "email":"denachural@gmail.com"
 7 |   },
 8 |   "version": "0.0.1",
 9 |   "keywords":[
10 |       "office",
11 |       "msword",
12 |       "docx",
13 |       "pptx",
14 |       "xlsx"
15 |   ],
16 |   "homepage":"https://github.com/bigmeech/node-office",
17 |   "main":"./lib/node-office.js",
18 |   "directories":{
19 |      "lib":"./lib"
20 |   },
21 |   "dependencies": {
22 |     "async": "*",
23 |     "xml2js": "*",
24 |     "adm-zip":"*",
25 |     "gm":"*",
26 |     "webworker-threads":"*"
27 | 
28 |   },
29 |   "devDependencies": {
30 | 	"mocha": "*"
31 |   },
32 |   "repository":{
33 |       "type":"git",
34 |       "url":"https://github.com/bigmeech/node-office.git"
35 |   },
36 |   "licenses": [
37 |         {
38 |             "type": "MIT",
39 |             "url": "https://github.com/bigmeech/node-office/master/LICENSE"
40 |         }
41 |   ],
42 |   "readme": "nodeoffice\n===========\n\nRead and extract Data from Office Files, Microsoft word, Powerpoint, Spreadsheet etc",
43 |   "readmeFilename": "README.md",
44 |   "bugs":{
45 |       "url":"https://github.com/bigmeech/node-office/issues"
46 |   },
47 |   "_id": "nodeoffice@0.0.1",
48 |   "_from": "nodeoffice@*"
49 | }
50 | 


--------------------------------------------------------------------------------