├── .eslintrc.yml ├── .github └── workflows │ └── main.yml ├── .gitignore ├── .npmignore ├── CHANGELOG.md ├── LICENSE ├── README.md ├── __tests__ ├── 00_base_test.js ├── 01_word_files_ole_test.js ├── 02_word_files_extract_test.js ├── 03_document_test.js ├── 04_word_files_snapshot_test.js ├── 05_word_buffers_snapshot_test.js ├── 06_openoffice_files_extract_test.js ├── 07_openoffice_buffers_extract_test.js ├── 08_bigfiles_test.js ├── 09_headers_footers_test.js ├── 10_word_invalid_file_test.js ├── 11_openoffice_invalid_xml_test.js ├── 99_file_notest.js ├── __snapshots__ │ ├── test01.doc.snapx │ ├── test02.doc.snapx │ ├── test03.doc.snapx │ ├── test04.doc.snapx │ ├── test05.doc.snapx │ ├── test06.doc.snapx │ ├── test07.doc.snapx │ ├── test08.doc.snapx │ ├── test09.doc.snapx │ ├── test10.doc.snapx │ ├── test11.doc.snapx │ ├── test12.doc.snapx │ ├── test13.doc.snapx │ ├── test14.doc.snapx │ ├── test15.doc.snapx │ ├── test16.doc.snapx │ ├── test17.doc.snapx │ ├── test18.doc.snapx │ ├── test19.doc.snapx │ └── test20.doc.snapx └── data │ ├── bad-xml.docx │ ├── badfile-01-bad-header.doc │ ├── bigfile-01.doc │ ├── bigfile-01.docx │ ├── manifest.md │ ├── test01.doc │ ├── test01.docx │ ├── test02.doc │ ├── test02.docx │ ├── test03.doc │ ├── test03.docx │ ├── test04.doc │ ├── test04.docx │ ├── test05.doc │ ├── test06.doc │ ├── test06.docx │ ├── test07.doc │ ├── test07.docx │ ├── test08.doc │ ├── test08.docx │ ├── test09.doc │ ├── test09.docx │ ├── test10.doc │ ├── test10.docx │ ├── test11.doc │ ├── test11.docx │ ├── test12.doc │ ├── test12.docx │ ├── test13.doc │ ├── test13.docx │ ├── test14.doc │ ├── test14.docx │ ├── test15.doc │ ├── test15.docx │ ├── test16.doc │ ├── test16.docx │ ├── test17.doc │ ├── test17.docx │ ├── test18.doc │ ├── test18.docx │ ├── test19.doc │ ├── test19.docx │ ├── test20.doc │ └── test20.docx ├── jsdoc.json ├── lib ├── buffer-reader.js ├── document.js ├── file-reader.js ├── filters.js ├── ole-allocation-table.js ├── ole-compound-doc.js ├── 
ole-directory-tree.js ├── ole-header.js ├── ole-storage-stream.js ├── ole-storage.js ├── open-office-extractor.js ├── word-ole-extractor.js └── word.js ├── package.json └── yarn.lock /.eslintrc.yml: -------------------------------------------------------------------------------- 1 | env: 2 | jest: true 3 | es6: true 4 | node: true 5 | extends: 'eslint:recommended' 6 | rules: 7 | indent: 8 | - error 9 | - 2 10 | - SwitchCase: 1 11 | no-control-regex: 12 | - off 13 | linebreak-style: 14 | - error 15 | - unix 16 | quotes: 17 | - off 18 | - double 19 | semi: 20 | - error 21 | - always 22 | no-commonjs: 23 | - off 24 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | # This is a basic workflow to help you get started with Actions 2 | 3 | name: CI 4 | 5 | on: 6 | push: 7 | branches: [ develop ] 8 | pull_request: 9 | branches: [ develop ] 10 | 11 | # Allows you to run this workflow manually from the Actions tab 12 | workflow_dispatch: 13 | 14 | jobs: 15 | build: 16 | # The type of runner that the job will run on 17 | runs-on: ubuntu-latest 18 | 19 | # Steps represent a sequence of tasks that will be executed as part of the job 20 | steps: 21 | # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it 22 | - uses: actions/checkout@v2 23 | 24 | - name: Use Node.js ${{ matrix.node-version }} 25 | uses: actions/setup-node@v1 26 | with: 27 | node-version: ${{ matrix.node-version }} 28 | 29 | - uses: actions/cache@v2 30 | with: 31 | path: '**/node_modules' 32 | key: ${{ runner.os }}-modules-${{ hashFiles('**/yarn.lock') }} 33 | 34 | - name: Install dependencies 35 | run: yarn --frozen-lockfile 36 | 37 | - run: yarn test 38 | 39 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 
2 | logs 3 | *.log 4 | 5 | # Runtime data 6 | pids 7 | *.pid 8 | *.seed 9 | 10 | # Directory for instrumented libs generated by jscoverage/JSCover 11 | lib-cov 12 | 13 | # Coverage directory used by tools like istanbul 14 | coverage 15 | 16 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 17 | .grunt 18 | 19 | # node-waf configuration 20 | .lock-wscript 21 | 22 | # Compiled binary addons (http://nodejs.org/api/addons.html) 23 | build/Release 24 | 25 | # Dependency directory 26 | # https://www.npmjs.org/doc/misc/npm-faq.html#should-i-check-my-node_modules-folder-into-git 27 | node_modules 28 | 29 | ~*.doc 30 | ~*.docx 31 | 32 | /jsdoc 33 | -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | __tests__ 2 | jsdoc 3 | coverage 4 | .github 5 | .eslintrc.yml 6 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Change log 2 | 3 | ### 1.0.4 / 26th July 2021 4 | 5 | * Fixed issue with missing content from LibreOffice files. See #40 6 | * Fixed order of entry reading from LibreOffice OOXML files. See #41 7 | 8 | ### 1.0.3 / 17th June 2021 9 | 10 | * Fixes issues with long attribute values (> 65k) in OO XML. See #37 11 | * Propogate errors from XML failures into promise rejections. See #38 12 | * Changed the XML parser dependency for maintenance and fixes. See #39 13 | 14 | ### 1.0.2 / 28th May 2021 15 | 16 | * Added a new method for reading textbox content. See #35 17 | 18 | ### 1.0.1 / 24th May 2021 19 | 20 | * Added separation between headers and footers. See #34 21 | 22 | ### 1.0.0 / 16th May 2021 23 | 24 | * Major refactoring of the OLE code to use promises internally 25 | * Added support for Open Office XML-based (.docx) Word files. 
See #1 26 | * Added support for reading direct from a Buffer. See #11 27 | * Removed event-stream dependency. See #19 28 | * Fixed an issue with not closing files properly. See #23 29 | * Corrected handling of extracting files with files. See #31 30 | * Corrected handling of extracting files with deleted text. See #32 31 | * Fixed issues with extracting multiple rows of table data. See #33 32 | 33 | This is a major release, and while there are no incompatible API changes, 34 | it seemed best to bump the version so as not to pick up updates automatically. 35 | However, all old applications should not require any code changes to use 36 | this version. 37 | 38 | ### 0.3.0 / 18th February 2019 39 | 40 | * Re-fixed the bad loop in the OLE code. See #15, #18 41 | * A few errors previously rejected as strings, they're now errors 42 | * Updated dependencies to safe versions. See #20 43 | 44 | 45 | ### 0.2.2 / 23rd January 2019 46 | 47 | * Fixed [the bad dependency on event-stream](https://github.com/dominictarr/event-stream/issues/116) 48 | 49 | 50 | ### 0.2.1 / 21st January 2019 51 | 52 | * Added a new getEndnotes method. See #16 53 | * Fixed a bad loop in the OLE code 54 | 55 | 56 | ### 0.2.0 / 31st October 3018 57 | 58 | * Removed coffeescript and mocha, now using jest and plain ES6 59 | * Removed partial work on .docx (for now) 60 | 61 | 62 | ### 0.1.4 / 25th March 2017 63 | 64 | * Fixed a documentation issue. `extract` returns a Promise. See #6 65 | * Corrected table cell delimiters to be tabs. See #9 66 | * Fixed an issue where replacements weren't being applied right. 
67 | 68 | 69 | ### 0.1.3 / 6th July 2016 70 | 71 | * Added the missing `lib` folder 72 | * Added a missing dependency to `package.json` 73 | 74 | 75 | ### 0.1.1 / 17th January 2016 76 | 77 | * Fixed a bug with text boundary calculations 78 | * Added endpoints `getHeaders`, `getFootnotes`, `getAnnotations` 79 | 80 | 81 | ### 0.1.0 / 14th January 2016 82 | 83 | * Initial release to npm 84 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016-2021 Stuart Watt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## word-extractor 2 | 3 | [![npm version](https://badge.fury.io/js/word-extractor.svg)](https://badge.fury.io/js/word-extractor) ![test workflow](https://github.com/morungos/node-word-extractor/actions/workflows/main.yml/badge.svg) 4 | 5 | Read data from a Word document (.doc or .docx) using Node.js 6 | 7 | ### Why use this module? 8 | 9 | There are a fair number of npm components which can extract text from Word .doc 10 | files, but they often appear to require some external helper program, and involve 11 | either spawning a process or communicating with a persistent one. That raises 12 | the installation and deployment burden as well as the runtime one. 13 | 14 | This module is intended to provide a much faster way of reading the text from a 15 | Word file, without leaving the Node.js environment. 16 | 17 | This means you do not need to install Word, Office, or anything else, and the 18 | module will work on all platforms, without any native binary code requirements. 19 | 20 | As of version 1.0, this module supports both traditional, OLE-based, Word files (usually .doc), 21 | and modern, Open Office-style, ECMA-376 Word files (usually .docx). It can be 22 | used both with files and with file contents in a Node.js Buffer. 23 | 24 | ### How do I install this module? 25 | 26 | ```bash= 27 | yarn add word-extractor 28 | 29 | # Or using npm... 30 | npm install word-extractor 31 | ``` 32 | 33 | ### How do I use this module? 
34 | 35 | ``` 36 | const WordExtractor = require("word-extractor"); 37 | const extractor = new WordExtractor(); 38 | const extracted = extractor.extract("file.doc"); 39 | 40 | extracted.then(function(doc) { console.log(doc.getBody()); }); 41 | ``` 42 | 43 | The object returned from the `extract()` method is a promise that resolves to a 44 | document object, which then provides several views onto different parts of the 45 | document contents. 46 | 47 | ### Methods 48 | 49 | `WordExtractor#extract( | )` 50 | 51 | Main method to open a Word file and retrieve the data. Returns a promise which 52 | resolves to a `Document`. If a Buffer is passed instead of a filename, then 53 | the buffer is used directly, instad of reading a disk from the file system. 54 | 55 | `Document#getBody()` 56 | 57 | Retrieves the content text from a Word document. This will handle UNICODE 58 | characters correctly, so if there are accented or non-Latin-1 characters 59 | present in the document, they'll show as is in the returned string. 60 | 61 | `Document#getFootnotes()` 62 | 63 | Retrieves the footnote text from a Word document. This will handle UNICODE 64 | characters correctly, so if there are accented or non-Latin-1 characters 65 | present in the document, they'll show as is in the returned string. 66 | 67 | `Document#getEndnotes()` 68 | 69 | Retrieves the endnote text from a Word document. This will handle UNICODE 70 | characters correctly, so if there are accented or non-Latin-1 characters 71 | present in the document, they'll show as is in the returned string. 72 | 73 | `Document#getHeaders(options?)` 74 | 75 | Retrieves the header and footer text from a Word document. This will handle 76 | UNICODE characters correctly, so if there are accented or non-Latin-1 77 | characters present in the document, they'll show as is in the returned string. 78 | 79 | Note that by default, `getHeaders()` returns one string, containing all 80 | headers and footers. 
This is compatible with previous versions. If you want 81 | to separate headers and footers, use `getHeaders({includeFooters: false})`, 82 | to return only the headers, and the new method `getFooters()` (from version 1.0.1) 83 | to return the footers separately. 84 | 85 | `Document#getFooters()` 86 | 87 | From version 1.0.1. Retrieves the footer text from a Word document. This will handle 88 | UNICODE characters correctly, so if there are accented or non-Latin-1 89 | characters present in the document, they'll show as is in the returned string. 90 | 91 | `Document#getAnnotations()` 92 | 93 | Retrieves the comment bubble text from a Word document. This will handle 94 | UNICODE characters correctly, so if there are accented or non-Latin-1 95 | characters present in the document, they'll show as is in the returned string. 96 | 97 | `Document#getTextboxes(options?)` 98 | 99 | Retrieves the textbox contenttext from a Word document. This will handle 100 | UNICODE characters correctly, so if there are accented or non-Latin-1 101 | characters present in the document, they'll show as is in the returned string. 102 | 103 | Note that by default, `getTextboxes()` returns one string, containing all 104 | textbox content from both main document and the headers and footers. You 105 | can control what gets included by using the options `includeHeadersAndFooters` 106 | (which defaults to true) and `includeBody` (also defaults to true). So, 107 | as an example, if you only want the body text box content, use: 108 | `doc.getTextboxes({includeHeadersAndFooters: false})`. 109 | 110 | ### License 111 | 112 | Copyright (c) 2016-2021. Stuart Watt. 113 | 114 | Licensed under the MIT License. 
115 | -------------------------------------------------------------------------------- /__tests__/00_base_test.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs'); 2 | const path = require('path'); 3 | const WordExtractor = require('../lib/word'); 4 | 5 | describe('Checking block from files', () => { 6 | 7 | const extractor = new WordExtractor(); 8 | 9 | it('should extract a .doc document successfully', () => { 10 | return extractor.extract(path.resolve(__dirname, "data/test01.doc")); 11 | }); 12 | 13 | it('should extract a .docx document successfully', () => { 14 | return extractor.extract(path.resolve(__dirname, "data/test01.docx")); 15 | }); 16 | 17 | it('should handle missing file error correctly', () => { 18 | const result = extractor.extract(path.resolve(__dirname, "data/missing00.docx")); 19 | return expect(result).rejects.toEqual(expect.objectContaining({ 20 | message: expect.stringMatching(/no such file or directory/) 21 | })); 22 | }); 23 | 24 | it('should properly close the file', () => { 25 | const open = jest.spyOn(fs, 'open'); 26 | const close = jest.spyOn(fs, 'close'); 27 | return extractor.extract(path.resolve(__dirname, "data/test01.doc")) 28 | .then(() => { 29 | expect(open).toHaveBeenCalledTimes(1); 30 | expect(close).toHaveBeenCalledTimes(1); 31 | }) 32 | .finally(() => { 33 | open.mockRestore(); 34 | close.mockRestore(); 35 | }); 36 | }); 37 | 38 | }); 39 | -------------------------------------------------------------------------------- /__tests__/01_word_files_ole_test.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs'); 2 | const path = require('path'); 3 | const { Buffer } = require('buffer'); 4 | 5 | const OleCompoundDoc = require('../lib/ole-compound-doc'); 6 | const FileReader = require('../lib/file-reader'); 7 | const BufferReader = require('../lib/buffer-reader'); 8 | 9 | const files = 
fs.readdirSync(path.resolve(__dirname, "data")) 10 | .filter((f) => ! /^~/.test(f)) 11 | .filter((f) => f.match(/test(\d+)\.doc$/)); 12 | 13 | describe.each(files.map((x) => [x]))( 14 | `Word file %s`, (file) => { 15 | it('can be opened correctly', () => { 16 | const filename = path.resolve(__dirname, `data/${file}`); 17 | const reader = new FileReader(filename); 18 | const doc = new OleCompoundDoc(reader); 19 | return reader.open() 20 | .then(() => doc.read()) 21 | .finally(() => reader.close()); 22 | }); 23 | 24 | it('generates a valid Word stream', () => { 25 | const filename = path.resolve(__dirname, `data/${file}`); 26 | const reader = new FileReader(filename); 27 | const doc = new OleCompoundDoc(reader); 28 | 29 | return reader.open() 30 | .then(() => doc.read()) 31 | .then(() => { 32 | return new Promise((resolve, reject) => { 33 | const chunks = []; 34 | const stream = doc.stream('WordDocument'); 35 | stream.on('data', (chunk) => chunks.push(chunk)); 36 | stream.on('error', (error) => reject(error)); 37 | stream.on('end', () => resolve(Buffer.concat(chunks))); 38 | }); 39 | }) 40 | .then((buffer) => { 41 | const magicNumber = buffer.readUInt16LE(0); 42 | expect(magicNumber.toString(16)).toBe("a5ec"); 43 | }) 44 | .finally(() => reader.close()); 45 | }); 46 | 47 | it('generates a valid Word stream from a buffer', () => { 48 | const filename = path.resolve(__dirname, `data/${file}`); 49 | const buffer = fs.readFileSync(filename); 50 | const reader = new BufferReader(buffer); 51 | const doc = new OleCompoundDoc(reader); 52 | 53 | return reader.open() 54 | .then(() => doc.read()) 55 | .then(() => { 56 | return new Promise((resolve, reject) => { 57 | const chunks = []; 58 | const stream = doc.stream('WordDocument'); 59 | stream.on('data', (chunk) => chunks.push(chunk)); 60 | stream.on('error', (error) => reject(error)); 61 | stream.on('end', () => resolve(Buffer.concat(chunks))); 62 | }); 63 | }) 64 | .then((buffer) => { 65 | const magicNumber = 
buffer.readUInt16LE(0); 66 | expect(magicNumber.toString(16)).toBe("a5ec"); 67 | }) 68 | .finally(() => reader.close()); 69 | }); 70 | 71 | } 72 | ); 73 | -------------------------------------------------------------------------------- /__tests__/02_word_files_extract_test.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs'); 2 | const path = require('path'); 3 | const WordExtractor = require('../lib/word'); 4 | const Document = require('../lib/document'); 5 | 6 | const files = fs.readdirSync(path.resolve(__dirname, "data")) 7 | .filter((f) => ! /^~/.test(f)) 8 | .filter((f) => f.match(/test(\d+)\.doc$/)); 9 | 10 | describe.each(files.map((x) => [x]))( 11 | `Word file %s`, (file) => { 12 | 13 | const extractor = new WordExtractor(); 14 | 15 | it('should extract a document successfully', function() { 16 | const extract = extractor.extract(path.resolve(__dirname, `data/${file}`)); 17 | return extract 18 | .then(function(result) { 19 | expect(result).toBeInstanceOf(Document); 20 | }); 21 | }); 22 | } 23 | ); 24 | -------------------------------------------------------------------------------- /__tests__/03_document_test.js: -------------------------------------------------------------------------------- 1 | const Document = require('../lib/document'); 2 | 3 | describe('Document', () => { 4 | 5 | it('should instantiate successfully', () => { 6 | const document = new Document(); 7 | expect(document).toBeInstanceOf(Document); 8 | }); 9 | 10 | it('should read the body', () => { 11 | const document = new Document(); 12 | document._body = "This is the body"; 13 | expect(document.getBody()).toBe("This is the body"); 14 | }); 15 | 16 | it('should read the footnotes', () => { 17 | const document = new Document(); 18 | document._footnotes = "This is the footnotes"; 19 | expect(document.getFootnotes()).toBe("This is the footnotes"); 20 | }); 21 | 22 | it('should read the endnotes', () => { 23 | const document = new 
Document(); 24 | document._endnotes = "This is the endnotes"; 25 | expect(document.getEndnotes()).toBe("This is the endnotes"); 26 | }); 27 | 28 | it('should read the annotations', () => { 29 | const document = new Document(); 30 | document._annotations = "This is the annotations"; 31 | expect(document.getAnnotations()).toBe("This is the annotations"); 32 | }); 33 | 34 | it('should read the headers', () => { 35 | const document = new Document(); 36 | document._headers = "This is the headers"; 37 | expect(document.getHeaders()).toBe("This is the headers"); 38 | }); 39 | 40 | it('should read the headers and footers', () => { 41 | const document = new Document(); 42 | document._headers = "This is the headers\n"; 43 | document._footers = "This is the footers\n"; 44 | expect(document.getHeaders()).toBe("This is the headers\nThis is the footers\n"); 45 | }); 46 | 47 | it('should selectively read the headers', () => { 48 | const document = new Document(); 49 | document._headers = "This is the headers\n"; 50 | document._footers = "This is the footers\n"; 51 | expect(document.getHeaders({includeFooters: false})).toBe("This is the headers\n"); 52 | }); 53 | 54 | it('should read the footers', () => { 55 | const document = new Document(); 56 | document._headers = "This is the headers\n"; 57 | document._footers = "This is the footers\n"; 58 | expect(document.getFooters()).toBe("This is the footers\n"); 59 | }); 60 | 61 | it('should read the body textboxes', () => { 62 | const document = new Document(); 63 | document._textboxes = "This is the textboxes\n"; 64 | document._headerTextboxes = "This is the header textboxes\n"; 65 | expect(document.getTextboxes({includeBody: true, includeHeadersAndFooters: false})).toBe("This is the textboxes\n"); 66 | }); 67 | 68 | it('should read the header textboxes', () => { 69 | const document = new Document(); 70 | document._textboxes = "This is the textboxes\n"; 71 | document._headerTextboxes = "This is the header textboxes\n"; 72 | 
expect(document.getTextboxes({includeBody: false, includeHeadersAndFooters: true})).toBe("This is the header textboxes\n"); 73 | }); 74 | 75 | it('should read all textboxes', () => { 76 | const document = new Document(); 77 | document._textboxes = "This is the textboxes\n"; 78 | document._headerTextboxes = "This is the header textboxes\n"; 79 | expect(document.getTextboxes({includeBody: true, includeHeadersAndFooters: true})).toBe("This is the textboxes\n\nThis is the header textboxes\n"); 80 | }); 81 | }); -------------------------------------------------------------------------------- /__tests__/04_word_files_snapshot_test.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @overview 3 | * Snapshot tests for all Word (.doc) files. The useful thing about 4 | * this is it detects changes, but also the snapshots include the binary 5 | * values and characters, so we see exactly what is returned, which is 6 | * extremely useful for debugging. 7 | */ 8 | 9 | const fs = require('fs'); 10 | const path = require('path'); 11 | const WordExtractor = require('../lib/word'); 12 | 13 | require('jest-specific-snapshot'); 14 | 15 | const files = fs.readdirSync(path.resolve(__dirname, "data")); 16 | const pairs = files.filter((f) => f.match(/test(\d+)\.doc$/)) 17 | .filter((f) => ! 
/^~/.test(f)); 18 | 19 | const cleanHeaderText = (text) => { 20 | return text.replace(/^\s+/, '') 21 | .replace(/\s+$/, '') 22 | .replace(/\n{2,}/g, '\n\n'); 23 | }; 24 | 25 | describe.each(pairs.map((x) => [x]))( 26 | `Word file %s`, (file) => { 27 | 28 | const extractor = new WordExtractor(); 29 | 30 | it('should match its snapshot', () => { 31 | return extractor.extract(path.resolve(__dirname, `data/${file}`)) 32 | .then((document) => { 33 | const value = { 34 | body: JSON.stringify(document.getBody()), 35 | footnotes: JSON.stringify(document.getFootnotes()), 36 | endnotes: JSON.stringify(document.getEndnotes()), 37 | headers: JSON.stringify(document.getHeaders()), 38 | annotations: JSON.stringify(document.getAnnotations()), 39 | textboxes: JSON.stringify(document.getTextboxes({includeHeadersAndFooters: false}).trim()), 40 | headerTextboxes: JSON.stringify(cleanHeaderText(document.getTextboxes({includeBody: false})).trim()) 41 | }; 42 | expect(value).toMatchSpecificSnapshot(`./__snapshots__/${file}.snapx`, { 43 | headers: expect.any(String) 44 | }); 45 | }); 46 | }); 47 | } 48 | ); 49 | -------------------------------------------------------------------------------- /__tests__/05_word_buffers_snapshot_test.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @overview 3 | * Snapshot tests for all Word (.doc) files using buffers. The useful thing about 4 | * this is it detects changes, but also the snapshots include the binary 5 | * values and characters, so we see exactly what is returned, which is 6 | * extremely useful for debugging. 7 | */ 8 | 9 | const fs = require('fs'); 10 | const path = require('path'); 11 | const WordExtractor = require('../lib/word'); 12 | 13 | require('jest-specific-snapshot'); 14 | 15 | const files = fs.readdirSync(path.resolve(__dirname, "data")); 16 | const pairs = files.filter((f) => f.match(/test(\d+)\.doc$/)) 17 | .filter((f) => ! 
/^~/.test(f)); 18 | 19 | const cleanHeaderText = (text) => { 20 | return text.replace(/^\s+/, '') 21 | .replace(/\s+$/, '') 22 | .replace(/\n{2,}/g, '\n\n'); 23 | }; 24 | 25 | describe.each(pairs.map((x) => [x]))( 26 | `Word file %s`, (file) => { 27 | 28 | const extractor = new WordExtractor(); 29 | 30 | it('should match its snapshot', () => { 31 | 32 | const filename = path.resolve(__dirname, `data/${file}`); 33 | const buffer = fs.readFileSync(filename); 34 | 35 | return extractor.extract(buffer) 36 | .then((document) => { 37 | const value = { 38 | body: JSON.stringify(document.getBody()), 39 | footnotes: JSON.stringify(document.getFootnotes()), 40 | endnotes: JSON.stringify(document.getEndnotes()), 41 | headers: JSON.stringify(document.getHeaders()), 42 | annotations: JSON.stringify(document.getAnnotations()), 43 | textboxes: JSON.stringify(document.getTextboxes({includeHeadersAndFooters: false}).trim()), 44 | headerTextboxes: JSON.stringify(cleanHeaderText(document.getTextboxes({includeBody: false})).trim()) 45 | }; 46 | expect(value).toMatchSpecificSnapshot(`./__snapshots__/${file}.snapx`, { 47 | headers: expect.any(String) 48 | }); 49 | }); 50 | }); 51 | } 52 | ); 53 | -------------------------------------------------------------------------------- /__tests__/06_openoffice_files_extract_test.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @overview 3 | * Snapshot tests for all OpenOffice (.docx) files. The useful thing about 4 | * this is it detects changes, but also the snapshots include the binary 5 | * values and characters, so we see exactly what is returned, which is 6 | * extremely useful for debugging. 
7 | */ 8 | 9 | const fs = require('fs'); 10 | const path = require('path'); 11 | const WordExtractor = require('../lib/word'); 12 | 13 | require('jest-specific-snapshot'); 14 | 15 | const files = fs.readdirSync(path.resolve(__dirname, "data")); 16 | const pairs = files.filter((f) => f.match(/test(\d+)\.doc$/)) 17 | .filter((f) => files.includes(f + "x")) 18 | .filter((f) => ! /^~/.test(f)); 19 | 20 | const cleanHeaderText = (text) => { 21 | return text.replace(/^\s+/, '') 22 | .replace(/\s+$/, '') 23 | .replace(/\n{2,}/g, '\n\n'); 24 | }; 25 | 26 | describe.each(pairs.map((x) => [x]))( 27 | `Word file %s`, (file) => { 28 | 29 | const extractor = new WordExtractor(); 30 | 31 | it('should match its snapshot', () => { 32 | return extractor.extract(path.resolve(__dirname, `data/${file}x`)) 33 | .then((document) => { 34 | const value = { 35 | body: JSON.stringify(document.getBody()), 36 | footnotes: JSON.stringify(document.getFootnotes()), 37 | endnotes: JSON.stringify(document.getEndnotes()), 38 | headers: JSON.stringify(document.getHeaders()), 39 | annotations: JSON.stringify(document.getAnnotations()), 40 | textboxes: JSON.stringify(document.getTextboxes({includeHeadersAndFooters: false}).trim()), 41 | headerTextboxes: JSON.stringify(cleanHeaderText(document.getTextboxes({includeBody: false})).trim()) 42 | }; 43 | expect(value).toMatchSpecificSnapshot(`./__snapshots__/${file}.snapx`, { 44 | headers: expect.any(String) 45 | }); 46 | }); 47 | }); 48 | } 49 | ); 50 | -------------------------------------------------------------------------------- /__tests__/07_openoffice_buffers_extract_test.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @overview 3 | * Snapshot tests for all OpenOffice (.docx) files. The useful thing about 4 | * this is it detects changes, but also the snapshots include the binary 5 | * values and characters, so we see exactly what is returned, which is 6 | * extremely useful for debugging. 
7 | */ 8 | 9 | const fs = require('fs'); 10 | const path = require('path'); 11 | const WordExtractor = require('../lib/word'); 12 | 13 | require('jest-specific-snapshot'); 14 | 15 | const files = fs.readdirSync(path.resolve(__dirname, "data")); 16 | const pairs = files.filter((f) => f.match(/test(\d+)\.doc$/)) 17 | .filter((f) => files.includes(f + "x")) 18 | .filter((f) => ! /^~/.test(f)); 19 | 20 | const cleanHeaderText = (text) => { 21 | return text.replace(/^\s+/, '') 22 | .replace(/\s+$/, '') 23 | .replace(/\n{2,}/g, '\n\n'); 24 | }; 25 | 26 | describe.each(pairs.map((x) => [x]))( 27 | `Word file %s`, (file) => { 28 | 29 | const extractor = new WordExtractor(); 30 | 31 | it('should match its snapshot', () => { 32 | 33 | const filename = path.resolve(__dirname, `data/${file}x`); 34 | const buffer = fs.readFileSync(filename); 35 | 36 | return extractor.extract(buffer) 37 | .then((document) => { 38 | const value = { 39 | body: JSON.stringify(document.getBody()), 40 | footnotes: JSON.stringify(document.getFootnotes()), 41 | endnotes: JSON.stringify(document.getEndnotes()), 42 | headers: JSON.stringify(document.getHeaders()), 43 | annotations: JSON.stringify(document.getAnnotations()), 44 | textboxes: JSON.stringify(document.getTextboxes({includeHeadersAndFooters: false}).trim()), 45 | headerTextboxes: JSON.stringify(cleanHeaderText(document.getTextboxes({includeBody: false})).trim()) 46 | }; 47 | expect(value).toMatchSpecificSnapshot(`./__snapshots__/${file}.snapx`, { 48 | headers: expect.any(String) 49 | }); 50 | }); 51 | }); 52 | } 53 | ); 54 | -------------------------------------------------------------------------------- /__tests__/08_bigfiles_test.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @overview 3 | * Snapshot tests for all Word (.doc) files. 
The useful thing about 4 | * this is it detects changes, but also the snapshots include the binary 5 | * values and characters, so we see exactly what is returned, which is 6 | * extremely useful for debugging. 7 | */ 8 | 9 | const fs = require('fs'); 10 | const path = require('path'); 11 | const WordExtractor = require('../lib/word'); 12 | 13 | const files = fs.readdirSync(path.resolve(__dirname, "data")); 14 | const pairs = files.filter((f) => f.match(/bigfile-(\d+)\.doc$/)) 15 | .filter((f) => files.includes(f + "x")) 16 | .filter((f) => ! /^~/.test(f)); 17 | 18 | describe.each(pairs.map((x) => [x]))( 19 | `Word file %s`, (file) => { 20 | 21 | const extractor = new WordExtractor(); 22 | 23 | it('should match across formats', () => { 24 | return Promise.all([ 25 | extractor.extract(path.resolve(__dirname, `data/${file}`)), 26 | extractor.extract(path.resolve(__dirname, `data/${file}x`)) 27 | ]) 28 | .then((documents) => { 29 | const [oleDocument, ooDocument] = documents; 30 | const oleBody = oleDocument.getBody().replace(/\n{2,}/g, "\n"); 31 | const ooBody = ooDocument.getBody().replace(/\n{2,}/g, "\n"); 32 | expect(oleBody).toEqual(ooBody); 33 | }); 34 | }); 35 | } 36 | ); 37 | -------------------------------------------------------------------------------- /__tests__/09_headers_footers_test.js: -------------------------------------------------------------------------------- 1 | const path = require('path'); 2 | const WordExtractor = require('../lib/word'); 3 | 4 | describe('Word file word15.docx', () => { 5 | 6 | const extractor = new WordExtractor(); 7 | 8 | it('should properly separate headers and footers', () => { 9 | return extractor.extract(path.resolve(__dirname, "data/test15.docx")) 10 | .then((document) => { 11 | expect(document.getFooters()).toMatch(/footer/); 12 | expect(document.getFooters()).not.toMatch(/header/); 13 | expect(document.getHeaders({includeFooters: false})).toMatch(/header/); 14 | expect(document.getHeaders({includeFooters: 
const path = require('path');
const WordExtractor = require('../lib/word');

describe('Word file badfile-01-bad-header.doc', () => {

  const extractor = new WordExtractor();

  // The file has a corrupted OLE header, so extraction must fail cleanly
  // with a descriptive error rather than hanging or returning garbage.
  // (The previous description, "should match the expected body", was
  // copy-pasted from the body-matching tests and did not describe what
  // this test asserts.)
  it('should reject with an invalid allocation table error', () => {
    // Returning the expect(...) promise makes Jest await the rejection;
    // toThrow replaces the deprecated toThrowError alias.
    return expect(extractor.extract(path.resolve(__dirname, "data/badfile-01-bad-header.doc")))
      .rejects
      .toThrow("Invalid Short Sector Allocation Table");
  });
});
/**
 * A handy tool we can use to debug tests, by applying to a given file.
 *
 * Usage: node __tests__/99_file_notest.js <path-to-doc-or-docx>
 *
 * Prints the extracted document on success; on a missing argument or an
 * extraction failure it reports the problem and sets a non-zero exit code
 * so the tool can be used from scripts.
 */

const WordExtractor = require('../lib/word');

const file = process.argv[2];
if (!file) {
  // Previously extract(undefined) produced a cryptic internal failure;
  // fail fast with a usage message instead.
  console.error("Usage: node __tests__/99_file_notest.js <path-to-doc-or-docx>");
  process.exitCode = 1;
} else {
  const extractor = new WordExtractor();
  extractor.extract(file)
    .then((document) => console.log(document))
    .catch((error) => {
      console.error(error);
      // Propagate failure to the shell; exitCode (not exit()) lets pending
      // I/O such as the console write flush first.
      process.exitCode = 1;
    });
}
sdas d asdaasdas asd asdas dasdasd asd asdasd asdas dasd \\\\n\\\\ns Ryanasd\\\\nasd \\\\n asd\\\\ns d\\\\nas d\\\\nas d\\\\nas d\\\\nas d\\\\nas d\\\\nas d\\\\na sd\\\\na sd\\\\na sd\\\\ns d\\\\nsd \\\\n\\"", 7 | "endnotes": "\\"\\"", 8 | "footnotes": "\\"\\"", 9 | "headerTextboxes": "\\"\\"", 10 | "headers": Any, 11 | "textboxes": "\\"\\"", 12 | } 13 | `; 14 | -------------------------------------------------------------------------------- /__tests__/__snapshots__/test03.doc.snapx: -------------------------------------------------------------------------------- 1 | // Jest Snapshot v1, https://goo.gl/fbAQLP 2 | 3 | exports[`Word file test03.doc should match its snapshot 1`] = ` 4 | Object { 5 | "annotations": "\\"\\"", 6 | "body": "\\"Each license name is hyperlinked to its location.\\\\nLicense\\\\tGPL v3.0\\\\tLGPL v3.0\\\\tBSD\\\\tMIT (X11)\\\\tApache v2.0\\\\t\\\\nCan You Release Commercial Works?\\\\tYes, but ALL source code must be distributed under GPL (viral).\\\\tYes\\\\tYes\\\\tYes\\\\tYes\\\\t\\\\nCan You Create Derivative Works?\\\\tYes, but ALL source code must be distributed under GPL(viral).\\\\tYes, but any derivative software must be released under a LGPL license and allow reverse engineering for client modifications and debugging.\\\\tYes\\\\tYes\\\\tYes\\\\t\\\\nAttribution?\\\\tMust be included in your source code and distribution.\\\\tMust be included in your source code and distribution.\\\\tMust be included in your source code and any documentation that you include with the release of your software.\\\\tMust be included with your source code.\\\\tMust be included with your source code, and you may be required to include it in your distribution if your licensor requires.\\\\t\\\\nSo What?\\\\tThe GPL dominates the free software world by significant margins. 
While it's a favorite for those committed to the open source movement, many are shying away from it because of its viral nature which can potentially scare clients.\\\\tNot viral like it's GPL counterpart. Software can be dynamically linked to other LGPL licensed libraries without having to release your source code under LGPL. This license is generally used for software libraries with exception of programs such as Mozilla and Open Office. \\\\tThe BSD license is popular because of the flexibility it allows its licensees. There are really no limitations to what the licensee can do with the software other than the attribution requirements.\\\\tThis is becoming a very popular license because of the extreme simplicity of its text. The whole license is about half a page long and is very permissive like the BSD license.\\\\tThis license is somewhat similar to the BSD license, but goes into further detail in the attribution clauses and maintenance of intellectual property rights. Choosing this license over the BSD or MIT license is a matter of how specific you want your protections to be.\\\\t\\\\n\\\\n\\"", 7 | "endnotes": "\\"\\"", 8 | "footnotes": "\\"\\"", 9 | "headerTextboxes": "\\"\\"", 10 | "headers": Any, 11 | "textboxes": "\\"\\"", 12 | } 13 | `; 14 | -------------------------------------------------------------------------------- /__tests__/__snapshots__/test04.doc.snapx: -------------------------------------------------------------------------------- 1 | // Jest Snapshot v1, https://goo.gl/fbAQLP 2 | 3 | exports[`Word file test04.doc should match its snapshot 1`] = ` 4 | Object { 5 | "annotations": "\\"\\"", 6 | "body": "\\"This is a fairly simple word document, over two pages, with headers and footers.\\\\nThe trick with this one is that it contains some Unicode based strings in it.\\\\nFirstly, some currency symbols:\\\\n\\\\tGBP - £\\\\n\\\\tEUR - €\\\\nNow, we'll have some French text, in bold and big:\\\\n\\\\tMolière\\\\nAnd some normal French 
text:\\\\n\\\\tL'Avare ou l'École du mensonge\\\\nThat's it for page one\\\\n\\\\n\\\\nThis is page two. Les Précieuses ridicules. The end.\\\\n\\"", 7 | "endnotes": "\\"\\"", 8 | "footnotes": "\\"\\"", 9 | "headerTextboxes": "\\"\\"", 10 | "headers": Any, 11 | "textboxes": "\\"\\"", 12 | } 13 | `; 14 | -------------------------------------------------------------------------------- /__tests__/__snapshots__/test05.doc.snapx: -------------------------------------------------------------------------------- 1 | // Jest Snapshot v1, https://goo.gl/fbAQLP 2 | 3 | exports[`Word file test05.doc should match its snapshot 1`] = ` 4 | Object { 5 | "annotations": "\\"\\"", 6 | "body": "\\"This is a simple file created with Word 97-SR2.\\\\n\\"", 7 | "endnotes": "\\"\\"", 8 | "footnotes": "\\"\\"", 9 | "headerTextboxes": "\\"\\"", 10 | "headers": Any, 11 | "textboxes": "\\"\\"", 12 | } 13 | `; 14 | -------------------------------------------------------------------------------- /__tests__/__snapshots__/test06.doc.snapx: -------------------------------------------------------------------------------- 1 | // Jest Snapshot v1, https://goo.gl/fbAQLP 2 | 3 | exports[`Word file test06.doc should match its snapshot 1`] = ` 4 | Object { 5 | "annotations": "\\"\\"", 6 | "body": "\\"Nom\\\\tDocument1\\\\t\\\\nAnalyste\\\\tROB (432)\\\\t\\\\nBut\\\\tInsert subject here.\\\\t\\\\nDéfinition\\\\t\\\\t\\\\nPaquetage\\\\tInsert package name here..\\\\t\\\\nAncêtre\\\\tInsert super class name here..\\\\t\\\\nInterface\\\\tInsert interface name here..\\\\t\\\\nConstructeur\\\\tnewFwClient (référence de la classe appelante du type DmfAFrameworkClient).\\\\t\\\\nMéthode(s)\\\\tpublic Boolean evaluate( ) throw Exception \\\\n(méthode où se trouve toute la logique).\\\\t\\\\nAutre(s)\\\\tInsert other definitions here. 
\\\\t\\\\nFonctions appelées\\\\tInsert called functions here.\\\\t\\\\nMéthodes appelées\\\\tInsert called methods here.\\\\t\\\\n\\\\nLogique\\\\nInsert logic description here..\\\\n\\"", 7 | "endnotes": "\\"\\"", 8 | "footnotes": "\\"\\"", 9 | "headerTextboxes": "\\"DmfA\\\\n\\\\nDmfA\\"", 10 | "headers": Any, 11 | "textboxes": "\\"\\"", 12 | } 13 | `; 14 | -------------------------------------------------------------------------------- /__tests__/__snapshots__/test07.doc.snapx: -------------------------------------------------------------------------------- 1 | // Jest Snapshot v1, https://goo.gl/fbAQLP 2 | 3 | exports[`Word file test07.doc should match its snapshot 1`] = ` 4 | Object { 5 | "annotations": "\\"\\"", 6 | "body": "\\"CREDO REFERENCE\\\\t\\\\n03/599\\\\t\\\\n\\\\nTHE ROBERT GORDON UNIVERSITY\\\\n\\\\nResearch Governance Checklist \\\\nAPPLICABLE FOR ALL INTERNALLY AND EXTERNALLY RESOURCED PROJECTS\\\\n\\\\nInform CREDO as early as possible BEFORE submission\\\\nA MINIMUM of 5 working days is required to obtain the appropriate approvals\\\\nDraft bids are usually acceptable. If final drafts differ substantially, these may need re-authorisation\\\\nProposals must not be submitted to funder without appropriate authorisation\\\\n\\\\nFailure to comply with this authorisation procedure is a breach of the Research Governance Policy and may result in any subsequent award not being accepted\\\\n1. TO BE COMPLETED BY PRINCIPAL INVESTIGATOR\\\\n\\\\n1.1 APPLICANT(S) DETAILS\\\\n\\\\nApplicant name / Other named RGU Participants\\\\tSchool/ Department/ Centre\\\\tStatus to the project (PI/ Researcher/ Support/ Other (Please state)): \\\\n\\\\t\\\\n1.Dr J Malins\\\\tGray's School of Art\\\\tPRINCIPAL INVESTIGATOR\\\\t\\\\n2.Dr S Watt\\\\tSchool of Computing\\\\t\\\\t\\\\n3. \\\\t \\\\t\\\\t\\\\n4. \\\\t \\\\t\\\\t\\\\n5. 
\\\\t \\\\t\\\\t\\\\n\\\\n1.2 PROJECT DETAILS\\\\n\\\\nProposal Type:\\\\tINTERNALLY RESOURCED EXTERNAL EOI EXTERNAL FULL PROPOSAL \\\\t\\\\nProject Title/Acronym:\\\\tCollaborative design environments to support interdisciplinary research and learning\\\\t\\\\nFunding Body (if internal state \\\\\\"RGU\\\\\\"):\\\\t\\\\nARHB/EPSRC\\\\t\\\\nRGU Status: \\\\t\\\\tIf Other, please state \\\\t\\\\nPartner/ Co-ordinator Name and Contact Details (if RGU is not the sole applicant):\\\\t \\\\t\\\\n\\\\nDeadline for Submission: \\\\t30 June 2004\\\\t\\\\n\\\\nExpected Start Date and Duration:\\\\tJanuary 2005\\\\t\\\\n\\\\n1.3 PROJECT SCOPE/KEYWORDS\\\\n\\\\nFit with University Expertise Areas \\\\n(see 2010 RGU Strategic Vision)\\\\t\\\\t\\\\nKeywords (enter up to 5)\\\\t1)Online environments, 2)Design research, 3)Pedagogy, 4)Creative problem-solving, 5)visual methods\\\\n\\\\n\\\\t\\\\n\\\\n1.4 DISPATCH OF APPLICATION (Complete ONLY if CREDO is to send application)\\\\n\\\\n Please state below how many copies (plus the original) the funder requires, the name and address to which the application should be sent, and any other relevant information. \\\\nName and Address:\\\\tEPSRC\\\\nPolaris House, North Star Avenue\\\\nSwindon SN2 1ET\\\\n\\\\n\\\\n\\\\t\\\\t\\\\n\\\\tNumber of Copies:\\\\t9\\\\t\\\\n\\\\n1.5 ETHICAL REVIEW CHECKLIST (E1) TO BE COMPLETED BY PRINCIPAL INVESTIGATOR\\\\nThis checklist should be completed by the Principal Investigator who is intending to carry out any research activity (whether internally or externally resourced). This checklist will identify whether a project requires an application for ethics approval to be submitted to the Head of School or Research Ethics Committee.\\\\n\\\\nBefore completing this section, please refer to the Research Ethics and Research Governance Policies which can be found online at http://www.rgu.ac.uk/policies. The Principal Investigator is responsible for exercising appropriate professional judgement in this review. 
\\\\n\\\\nE2 and E3 forms can be found online at http://www.rgu.ac.uk/credo/staff/page.cfm?pge=10193\\\\n\\\\nE1 Ethics Review PART 1\\\\n\\\\t\\\\tYes \\\\tNo\\\\t\\\\nIs the research solely literature-based?\\\\t\\\\t\\\\t\\\\nIf YES, please go to the E1 Ethics Review Part 2 \\\\t\\\\t\\\\t\\\\t\\\\nDoes the research involve the use of any dangerous substances?\\\\t\\\\t\\\\t\\\\nDoes the research involve ionising or other type of dangerous \\\\\\"radiation\\\\\\"? \\\\t\\\\t\\\\t\\\\nCould conflicts of interest arise between the source of funding and the potential outcomes of the research? \\\\t\\\\t\\\\t\\\\nIs it likely that the research will put any of the following at risk:\\\\t\\\\t\\\\t\\\\n\\\\t       stakeholders?\\\\t\\\\t\\\\t\\\\n\\\\t       the environment?\\\\t\\\\t\\\\t\\\\n\\\\t       the economy?\\\\t\\\\t\\\\t\\\\n\\\\t       living creatures? \\\\t\\\\t\\\\t\\\\nDoes the research involve experimentation on animals or animal/human tissues?\\\\t\\\\t\\\\t\\\\n\\\\t\\\\t\\\\t\\\\t\\\\nDoes the research involve the observation, experimenting on, interviewing or examining the records of human participants?\\\\t\\\\t\\\\t\\\\nIf the answer to 7 is NO, please go to E1 Ethics Review Part 2\\\\t\\\\t\\\\t\\\\n\\\\t\\\\t\\\\t\\\\t\\\\nCould the research induce psychological stress or anxiety, cause harm or have negative consequences for the participants (beyond the risks encountered in their normal lifestyles)?\\\\t\\\\t\\\\t\\\\nWill the research involve prolonged or repetitive testing, or the collection of audio or video materials?\\\\t\\\\t\\\\t\\\\nWill financial inducements be offered?\\\\t\\\\t\\\\t\\\\nWill deception of participants be necessary during the research?\\\\t\\\\t\\\\t\\\\nAre there problems with the participant's right to remain anonymous?\\\\t\\\\t\\\\t\\\\nWill there be a need at anytime to withhold the right to withdraw from the research?\\\\t\\\\t\\\\t\\\\nDoes the research involve participants who may be particularly vulnerable 
(such as children or adults with severe learning difficulties)? \\\\t\\\\t\\\\t\\\\n\\\\nE1 Ethics Review Part 2\\\\n\\\\nPlease give a summary of the ethical issues and any action that will be taken to address the problem \\\\n* NOTE - If you believe there to be NO Ethical issues please enter \\\\\\"NONE\\\\\\" into the box \\\\nNone\\\\t\\\\n\\\\nSupporting documentation included (please tick all that apply):\\\\n\\\\nCopy of the proposal\\\\n\\\\t\\\\tCopy of call for proposal/funding guidelines/ preliminary correspondence with funding body (as appropriate)\\\\t\\\\t\\\\nE2 form (if answered YES to any of questions 2-6)\\\\t\\\\tE3 form (if answered YES to any of questions 7, 8-14)\\\\n\\\\t\\\\t\\\\n\\\\nConfirmation by signature/e-mail affirms your acceptance of the obligations under the RGU Research Governance and Ethics Policies\\\\n\\\\nTO BE SIGNED BY PRINCIPAL INVESTIGATOR\\\\n\\\\nSignature:\\\\t\\\\n\\\\t\\\\nDate:\\\\t\\\\t\\\\nSEND SIGNED COPY (or CONFIRMATION BY E-MAIL innovation@rgu.ac.uk) TO CREDO, to complete Research Proposal Authorisation \\\\n\\\\nNB **Applications to external funding bodies must **NOT** be made, nor internally resourced projects commenced, without confirmation from CREDO of the completion of the Research Proposal Authorisation Process** \\\\n\\"", 7 | "endnotes": "\\"\\"", 8 | "footnotes": "\\" EOI - Refers to Expression of Interest or Stage 1 applications (FP6) without financial commitment\\\\n\\"", 9 | "headerTextboxes": "\\"\\"", 10 | "headers": Any, 11 | "textboxes": "\\"\\"", 12 | } 13 | `; 14 | -------------------------------------------------------------------------------- /__tests__/__snapshots__/test08.doc.snapx: -------------------------------------------------------------------------------- 1 | // Jest Snapshot v1, https://goo.gl/fbAQLP 2 | 3 | exports[`Word file test08.doc should match its snapshot 1`] = ` 4 | Object { 5 | "annotations": "\\"\\"", 6 | "body": "\\"This is a bookmark test\\\\n\\\\nThe bookmark 
\\\\\\"TestBookmark\\\\\\" should include the text \\\\\\"Morag says hello\\\\\\"\\\\n\\\\n\\\\nThis is a form, and the bookmark is Text1\\\\n\\\\nForm text\\\\n\\\\n\\\\n\\\\n\\"", 7 | "endnotes": "\\"\\"", 8 | "footnotes": "\\"\\"", 9 | "headerTextboxes": "\\"\\"", 10 | "headers": Any, 11 | "textboxes": "\\"\\"", 12 | } 13 | `; 14 | -------------------------------------------------------------------------------- /__tests__/__snapshots__/test09.doc.snapx: -------------------------------------------------------------------------------- 1 | // Jest Snapshot v1, https://goo.gl/fbAQLP 2 | 3 | exports[`Word file test09.doc should match its snapshot 1`] = ` 4 | Object { 5 | "annotations": "\\"\\"", 6 | "body": "\\"{This line gets read fine}\\\\nOoops, where are the ( opening ( brackets?\\\\n\\"", 7 | "endnotes": "\\"\\"", 8 | "footnotes": "\\"\\"", 9 | "headerTextboxes": "\\"\\"", 10 | "headers": Any, 11 | "textboxes": "\\"\\"", 12 | } 13 | `; 14 | -------------------------------------------------------------------------------- /__tests__/__snapshots__/test10.doc.snapx: -------------------------------------------------------------------------------- 1 | // Jest Snapshot v1, https://goo.gl/fbAQLP 2 | 3 | exports[`Word file test10.doc should match its snapshot 1`] = ` 4 | Object { 5 | "annotations": "\\"Second paragraph comment\\\\nThird paragraph comment - and this is all I have to say on the matter\\\\n\\"", 6 | "body": "\\"This is a simple Word file\\\\n\\\\nSecond paragraph\\\\n\\\\nThird paragraph\\\\n\\"", 7 | "endnotes": "\\"\\"", 8 | "footnotes": "\\"\\"", 9 | "headerTextboxes": "\\"\\"", 10 | "headers": Any, 11 | "textboxes": "\\"\\"", 12 | } 13 | `; 14 | -------------------------------------------------------------------------------- /__tests__/__snapshots__/test11.doc.snapx: -------------------------------------------------------------------------------- 1 | // Jest Snapshot v1, https://goo.gl/fbAQLP 2 | 3 | exports[`Word file test11.doc should match its 
snapshot 1`] = ` 4 | Object { 5 | "annotations": "\\"\\"", 6 | "body": "\\"这是一个用来测试nodejs解析Word文档.doccccc\\\\nThis is a test for parsing the Word file in node.\\\\n\\"", 7 | "endnotes": "\\"\\"", 8 | "footnotes": "\\"\\"", 9 | "headerTextboxes": "\\"\\"", 10 | "headers": Any, 11 | "textboxes": "\\"\\"", 12 | } 13 | `; 14 | -------------------------------------------------------------------------------- /__tests__/__snapshots__/test12.doc.snapx: -------------------------------------------------------------------------------- 1 | // Jest Snapshot v1, https://goo.gl/fbAQLP 2 | 3 | exports[`Word file test12.doc should match its snapshot 1`] = ` 4 | Object { 5 | "annotations": "\\"\\"", 6 | "body": "\\"This is a simple paragraph\\\\n\\\\nRow 1, cell 1\\\\tRow 1, cell 2\\\\tRow 1, cell 3\\\\t\\\\nRow 2, cell 1\\\\t\\\\tRow 2, cell 3\\\\t\\\\n\\\\nAnd a second paragraph\\\\n\\\\n\\"", 7 | "endnotes": "\\"\\"", 8 | "footnotes": "\\"\\"", 9 | "headerTextboxes": "\\"\\"", 10 | "headers": Any, 11 | "textboxes": "\\"\\"", 12 | } 13 | `; 14 | -------------------------------------------------------------------------------- /__tests__/__snapshots__/test13.doc.snapx: -------------------------------------------------------------------------------- 1 | // Jest Snapshot v1, https://goo.gl/fbAQLP 2 | 3 | exports[`Word file test13.doc should match its snapshot 1`] = ` 4 | Object { 5 | "annotations": "\\"\\"", 6 | "body": "\\"Endnotes and footnotes test\\\\n\\\\nParagraph 1\\\\n\\\\nParagraph 2\\\\n\\"", 7 | "endnotes": "\\" This is an endnote\\\\n\\"", 8 | "footnotes": "\\" This is a footnote\\\\n\\"", 9 | "headerTextboxes": "\\"\\"", 10 | "headers": Any, 11 | "textboxes": "\\"\\"", 12 | } 13 | `; 14 | -------------------------------------------------------------------------------- /__tests__/__snapshots__/test14.doc.snapx: -------------------------------------------------------------------------------- 1 | // Jest Snapshot v1, https://goo.gl/fbAQLP 2 | 3 | exports[`Word file 
test14.doc should match its snapshot 1`] = ` 4 | Object { 5 | "annotations": "\\"\\"", 6 | "body": "\\"This is a test of reviewing\\\\n\\\\nThis text has been inserted, and should be included\\\\n\\\\n\\"", 7 | "endnotes": "\\"\\"", 8 | "footnotes": "\\"\\"", 9 | "headerTextboxes": "\\"\\"", 10 | "headers": Any, 11 | "textboxes": "\\"\\"", 12 | } 13 | `; 14 | -------------------------------------------------------------------------------- /__tests__/__snapshots__/test15.doc.snapx: -------------------------------------------------------------------------------- 1 | // Jest Snapshot v1, https://goo.gl/fbAQLP 2 | 3 | exports[`Word file test15.doc should match its snapshot 1`] = ` 4 | Object { 5 | "annotations": "\\"\\"", 6 | "body": "\\"Header test file\\\\n\\\\nSome random text\\\\n\\\\nMore random text\\\\n\\\\nStill more random text\\\\n\\\\n\\\\nSection 1 body continued\\\\n\\\\n\\\\n\\\\nStill section 1\\\\n\\\\n\\\\nSecond section\\\\n\\\\n\\\\n\\\\nSection 2 body continued\\\\n\\\\n\\\\n\\\\n\\\\nSection 2 continued\\\\n\\\\n\\\\n\\\\nSection 3 text\\\\n\\\\n\\\\n\\"", 7 | "endnotes": "\\"\\"", 8 | "footnotes": "\\"\\"", 9 | "headerTextboxes": "\\"\\"", 10 | "headers": Any, 11 | "textboxes": "\\"\\"", 12 | } 13 | `; 14 | -------------------------------------------------------------------------------- /__tests__/__snapshots__/test16.doc.snapx: -------------------------------------------------------------------------------- 1 | // Jest Snapshot v1, https://goo.gl/fbAQLP 2 | 3 | exports[`Word file test16.doc should match its snapshot 1`] = ` 4 | Object { 5 | "annotations": "\\"\\"", 6 | "body": "\\"Text box document test\\\\n\\\\nThis is a document containing several text boxes.\\\\n\\\\nParagraph 1\\\\n\\\\nParagraph 2\\\\n\\\\nParagraph 3\\\\n\\\\n\\\\n\\"", 7 | "endnotes": "\\"\\"", 8 | "footnotes": "\\"\\"", 9 | "headerTextboxes": "\\"Header box 2\\\\n\\\\nHeader box 1\\"", 10 | "headers": Any, 11 | "textboxes": "\\"First text box, regular\\\\n\\\\nA shape 
with text inside\\\\n\\\\nSecond text box, copied and pasted from the first. The anchor point is the same\\\\nA second paragraph in the second text box, but it might wrap between boxes. \\\\nThis is in a third text box, but it's linked to the same text from the second.\\\\n\\\\nVertical text box added too\\"", 12 | } 13 | `; 14 | -------------------------------------------------------------------------------- /__tests__/__snapshots__/test17.doc.snapx: -------------------------------------------------------------------------------- 1 | // Jest Snapshot v1, https://goo.gl/fbAQLP 2 | 3 | exports[`Word file test17.doc should match its snapshot 1`] = ` 4 | Object { 5 | "annotations": "\\"\\"", 6 | "body": "\\"Lorem ipsum \\\\n\\\\nLorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc ac faucibus odio. \\\\n\\\\nVestibulum neque massa, scelerisque sit amet ligula eu, congue molestie mi. Praesent ut varius sem. Nullam at porttitor arcu, nec lacinia nisi. Ut ac dolor vitae odio interdum condimentum. Vivamus dapibus sodales ex, vitae malesuada ipsum cursus convallis. Maecenas sed egestas nulla, ac condimentum orci. Mauris diam felis, vulputate ac suscipit et, iaculis non est. Curabitur semper arcu ac ligula semper, nec luctus nisl blandit. Integer lacinia ante ac libero lobortis imperdiet. Nullam mollis convallis ipsum, ac accumsan nunc vehicula vitae. Nulla eget justo in felis tristique fringilla. Morbi sit amet tortor quis risus auctor condimentum. Morbi in ullamcorper elit. Nulla iaculis tellus sit amet mauris tempus fringilla.\\\\nMaecenas mauris lectus, lobortis et purus mattis, blandit dictum tellus.\\\\nMaecenas non lorem quis tellus placerat varius. \\\\nNulla facilisi. \\\\nAenean congue fringilla justo ut aliquam. \\\\nMauris id ex erat. Nunc vulputate neque vitae justo facilisis, non condimentum ante sagittis. \\\\nMorbi viverra semper lorem nec molestie. 
\\\\nMaecenas tincidunt est efficitur ligula euismod, sit amet ornare est vulputate.\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\nIn non mauris justo. Duis vehicula mi vel mi pretium, a viverra erat efficitur. Cras aliquam est ac eros varius, id iaculis dui auctor. Duis pretium neque ligula, et pulvinar mi placerat et. Nulla nec nunc sit amet nunc posuere vestibulum. Ut id neque eget tortor mattis tristique. Donec ante est, blandit sit amet tristique vel, lacinia pulvinar arcu. Pellentesque scelerisque fermentum erat, id posuere justo pulvinar ut. Cras id eros sed enim aliquam lobortis. Sed lobortis nisl ut eros efficitur tincidunt. Cras justo mi, porttitor quis mattis vel, ultricies ut purus. Ut facilisis et lacus eu cursus.\\\\nIn eleifend velit vitae libero sollicitudin euismod. Fusce vitae vestibulum velit. Pellentesque vulputate lectus quis pellentesque commodo. Aliquam erat volutpat. Vestibulum in egestas velit. Pellentesque fermentum nisl vitae fringilla venenatis. Etiam id mauris vitae orci maximus ultricies. \\\\n\\\\nCras fringilla ipsum magna, in fringilla dui commodo a.\\\\n\\\\n\\\\tLorem ipsum\\\\tLorem ipsum\\\\tLorem ipsum\\\\t\\\\n1\\\\tIn eleifend velit vitae libero sollicitudin euismod.\\\\tLorem\\\\t\\\\t\\\\n2\\\\tCras fringilla ipsum magna, in fringilla dui commodo a.\\\\tIpsum\\\\t\\\\t\\\\n3\\\\tAliquam erat volutpat. \\\\tLorem\\\\t\\\\t\\\\n4\\\\tFusce vitae vestibulum velit. \\\\tLorem\\\\t\\\\t\\\\n5\\\\tEtiam vehicula luctus fermentum.\\\\tIpsum\\\\t\\\\t\\\\n\\\\nEtiam vehicula luctus fermentum. In vel metus congue, pulvinar lectus vel, fermentum dui. Maecenas ante orci, egestas ut aliquet sit amet, sagittis a magna. Aliquam ante quam, pellentesque ut dignissim quis, laoreet eget est. Aliquam erat volutpat. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Ut ullamcorper justo sapien, in cursus libero viverra eget. Vivamus auctor imperdiet urna, at pulvinar leo posuere laoreet. 
Suspendisse neque nisl, fringilla at iaculis scelerisque, ornare vel dolor. Ut et pulvinar nunc. Pellentesque fringilla mollis efficitur. Nullam venenatis commodo imperdiet. Morbi velit neque, semper quis lorem quis, efficitur dignissim ipsum. Ut ac lorem sed turpis imperdiet eleifend sit amet id sapien.\\\\n\\\\n\\\\nLorem ipsum dolor sit amet, consectetur adipiscing elit. \\\\n\\\\nNunc ac faucibus odio. Vestibulum neque massa, scelerisque sit amet ligula eu, congue molestie mi. Praesent ut varius sem. Nullam at porttitor arcu, nec lacinia nisi. Ut ac dolor vitae odio interdum condimentum. Vivamus dapibus sodales ex, vitae malesuada ipsum cursus convallis. Maecenas sed egestas nulla, ac condimentum orci. Mauris diam felis, vulputate ac suscipit et, iaculis non est. Curabitur semper arcu ac ligula semper, nec luctus nisl blandit. Integer lacinia ante ac libero lobortis imperdiet. Nullam mollis convallis ipsum, ac accumsan nunc vehicula vitae. Nulla eget justo in felis tristique fringilla. Morbi sit amet tortor quis risus auctor condimentum. Morbi in ullamcorper elit. Nulla iaculis tellus sit amet mauris tempus fringilla.\\\\nMaecenas mauris lectus, lobortis et purus mattis, blandit dictum tellus. \\\\nMaecenas non lorem quis tellus placerat varius. Nulla facilisi. Aenean congue fringilla justo ut aliquam. Mauris id ex erat. Nunc vulputate neque vitae justo facilisis, non condimentum ante sagittis. Morbi viverra semper lorem nec molestie. Maecenas tincidunt est efficitur ligula euismod, sit amet ornare est vulputate.\\\\nIn non mauris justo. Duis vehicula mi vel mi pretium, a viverra erat efficitur. Cras aliquam est ac eros varius, id iaculis dui auctor. Duis pretium neque ligula, et pulvinar mi placerat et. Nulla nec nunc sit amet nunc posuere vestibulum. Ut id neque eget tortor mattis tristique. Donec ante est, blandit sit amet tristique vel, lacinia pulvinar arcu. Pellentesque scelerisque fermentum erat, id posuere justo pulvinar ut. 
Cras id eros sed enim aliquam lobortis. Sed lobortis nisl ut eros efficitur tincidunt. Cras justo mi, porttitor quis mattis vel, ultricies ut purus. Ut facilisis et lacus eu cursus.\\\\nIn eleifend velit vitae libero sollicitudin euismod. \\\\nFusce vitae vestibulum velit. Pellentesque vulputate lectus quis pellentesque commodo. Aliquam erat volutpat. Vestibulum in egestas velit. Pellentesque fermentum nisl vitae fringilla venenatis. Etiam id mauris vitae orci maximus ultricies. Cras fringilla ipsum magna, in fringilla dui commodo a.\\\\nEtiam vehicula luctus fermentum. In vel metus congue, pulvinar lectus vel, fermentum dui. Maecenas ante orci, egestas ut aliquet sit amet, sagittis a magna. Aliquam ante quam, pellentesque ut dignissim quis, laoreet eget est. Aliquam erat volutpat. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Ut ullamcorper justo sapien, in cursus libero viverra eget. Vivamus auctor imperdiet urna, at pulvinar leo posuere laoreet. Suspendisse neque nisl, fringilla at iaculis scelerisque, ornare vel dolor. Ut et pulvinar nunc. Pellentesque fringilla mollis efficitur. Nullam venenatis commodo imperdiet. Morbi velit neque, semper quis lorem quis, efficitur dignissim ipsum. Ut ac lorem sed turpis imperdiet eleifend sit amet id sapien.\\\\n\\\\n\\\\n\\"", 7 | "endnotes": "\\"\\"", 8 | "footnotes": "\\"\\"", 9 | "headerTextboxes": "\\"\\"", 10 | "headers": Any, 11 | "textboxes": "\\"\\"", 12 | } 13 | `; 14 | -------------------------------------------------------------------------------- /__tests__/__snapshots__/test18.doc.snapx: -------------------------------------------------------------------------------- 1 | // Jest Snapshot v1, https://goo.gl/fbAQLP 2 | 3 | exports[`Word file test18.doc should match its snapshot 1`] = ` 4 | Object { 5 | "annotations": "\\"\\"", 6 | "body": "\\"Lorem ipsum \\\\n\\\\nLorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc ac faucibus odio. 
\\\\n\\\\nVestibulum neque massa, scelerisque sit amet ligula eu, congue molestie mi. Praesent ut varius sem. Nullam at porttitor arcu, nec lacinia nisi. Ut ac dolor vitae odio interdum condimentum. Vivamus dapibus sodales ex, vitae malesuada ipsum cursus convallis. Maecenas sed egestas nulla, ac condimentum orci. Mauris diam felis, vulputate ac suscipit et, iaculis non est. Curabitur semper arcu ac ligula semper, nec luctus nisl blandit. Integer lacinia ante ac libero lobortis imperdiet. Nullam mollis convallis ipsum, ac accumsan nunc vehicula vitae. Nulla eget justo in felis tristique fringilla. Morbi sit amet tortor quis risus auctor condimentum. Morbi in ullamcorper elit. Nulla iaculis tellus sit amet mauris tempus fringilla.\\\\nMaecenas mauris lectus, lobortis et purus mattis, blandit dictum tellus.\\\\nMaecenas non lorem quis tellus placerat varius. \\\\nNulla facilisi. \\\\nAenean congue fringilla justo ut aliquam. \\\\nMauris id ex erat. Nunc vulputate neque vitae justo facilisis, non condimentum ante sagittis. \\\\nMorbi viverra semper lorem nec molestie. \\\\nMaecenas tincidunt est efficitur ligula euismod, sit amet ornare est vulputate.\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\nIn non mauris justo. Duis vehicula mi vel mi pretium, a viverra erat efficitur. Cras aliquam est ac eros varius, id iaculis dui auctor. Duis pretium neque ligula, et pulvinar mi placerat et. Nulla nec nunc sit amet nunc posuere vestibulum. Ut id neque eget tortor mattis tristique. Donec ante est, blandit sit amet tristique vel, lacinia pulvinar arcu. Pellentesque scelerisque fermentum erat, id posuere justo pulvinar ut. Cras id eros sed enim aliquam lobortis. Sed lobortis nisl ut eros efficitur tincidunt. Cras justo mi, porttitor quis mattis vel, ultricies ut purus. Ut facilisis et lacus eu cursus.\\\\nIn eleifend velit vitae libero sollicitudin euismod. Fusce vitae vestibulum velit. Pellentesque vulputate lectus quis pellentesque commodo. Aliquam erat volutpat. 
Vestibulum in egestas velit. Pellentesque fermentum nisl vitae fringilla venenatis. Etiam id mauris vitae orci maximus ultricies. \\\\n\\\\nCras fringilla ipsum magna, in fringilla dui commodo a.\\\\n\\\\n\\\\tLorem ipsum\\\\tLorem ipsum\\\\tLorem ipsum\\\\t\\\\n1\\\\tIn eleifend velit vitae libero sollicitudin euismod.\\\\tLorem\\\\t\\\\t\\\\n2\\\\tCras fringilla ipsum magna, in fringilla dui commodo a.\\\\tIpsum\\\\t\\\\t\\\\n3\\\\tAliquam erat volutpat. \\\\tLorem\\\\t\\\\t\\\\n4\\\\tFusce vitae vestibulum velit. \\\\tLorem\\\\t\\\\t\\\\n5\\\\tEtiam vehicula luctus fermentum.\\\\tIpsum\\\\t\\\\t\\\\n\\\\nEtiam vehicula luctus fermentum. In vel metus congue, pulvinar lectus vel, fermentum dui. Maecenas ante orci, egestas ut aliquet sit amet, sagittis a magna. Aliquam ante quam, pellentesque ut dignissim quis, laoreet eget est. Aliquam erat volutpat. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Ut ullamcorper justo sapien, in cursus libero viverra eget. Vivamus auctor imperdiet urna, at pulvinar leo posuere laoreet. Suspendisse neque nisl, fringilla at iaculis scelerisque, ornare vel dolor. Ut et pulvinar nunc. Pellentesque fringilla mollis efficitur. Nullam venenatis commodo imperdiet. Morbi velit neque, semper quis lorem quis, efficitur dignissim ipsum. Ut ac lorem sed turpis imperdiet eleifend sit amet id sapien.\\\\n\\\\n\\\\nLorem ipsum dolor sit amet, consectetur adipiscing elit. \\\\n\\\\nNunc ac faucibus odio. Vestibulum neque massa, scelerisque sit amet ligula eu, congue molestie mi. Praesent ut varius sem. Nullam at porttitor arcu, nec lacinia nisi. Ut ac dolor vitae odio interdum condimentum. Vivamus dapibus sodales ex, vitae malesuada ipsum cursus convallis. Maecenas sed egestas nulla, ac condimentum orci. Mauris diam felis, vulputate ac suscipit et, iaculis non est. Curabitur semper arcu ac ligula semper, nec luctus nisl blandit. Integer lacinia ante ac libero lobortis imperdiet. 
Nullam mollis convallis ipsum, ac accumsan nunc vehicula vitae. Nulla eget justo in felis tristique fringilla. Morbi sit amet tortor quis risus auctor condimentum. Morbi in ullamcorper elit. Nulla iaculis tellus sit amet mauris tempus fringilla.\\\\nMaecenas mauris lectus, lobortis et purus mattis, blandit dictum tellus. \\\\nMaecenas non lorem quis tellus placerat varius. Nulla facilisi. Aenean congue fringilla justo ut aliquam. Mauris id ex erat. Nunc vulputate neque vitae justo facilisis, non condimentum ante sagittis. Morbi viverra semper lorem nec molestie. Maecenas tincidunt est efficitur ligula euismod, sit amet ornare est vulputate.\\\\nIn non mauris justo. Duis vehicula mi vel mi pretium, a viverra erat efficitur. Cras aliquam est ac eros varius, id iaculis dui auctor. Duis pretium neque ligula, et pulvinar mi placerat et. Nulla nec nunc sit amet nunc posuere vestibulum. Ut id neque eget tortor mattis tristique. Donec ante est, blandit sit amet tristique vel, lacinia pulvinar arcu. Pellentesque scelerisque fermentum erat, id posuere justo pulvinar ut. Cras id eros sed enim aliquam lobortis. Sed lobortis nisl ut eros efficitur tincidunt. Cras justo mi, porttitor quis mattis vel, ultricies ut purus. Ut facilisis et lacus eu cursus.\\\\nIn eleifend velit vitae libero sollicitudin euismod. \\\\nFusce vitae vestibulum velit. Pellentesque vulputate lectus quis pellentesque commodo. Aliquam erat volutpat. Vestibulum in egestas velit. Pellentesque fermentum nisl vitae fringilla venenatis. Etiam id mauris vitae orci maximus ultricies. Cras fringilla ipsum magna, in fringilla dui commodo a.\\\\nEtiam vehicula luctus fermentum. In vel metus congue, pulvinar lectus vel, fermentum dui. Maecenas ante orci, egestas ut aliquet sit amet, sagittis a magna. Aliquam ante quam, pellentesque ut dignissim quis, laoreet eget est. Aliquam erat volutpat. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. 
Ut ullamcorper justo sapien, in cursus libero viverra eget. Vivamus auctor imperdiet urna, at pulvinar leo posuere laoreet. Suspendisse neque nisl, fringilla at iaculis scelerisque, ornare vel dolor. Ut et pulvinar nunc. Pellentesque fringilla mollis efficitur. Nullam venenatis commodo imperdiet. Morbi velit neque, semper quis lorem quis, efficitur dignissim ipsum. Ut ac lorem sed turpis imperdiet eleifend sit amet id sapien.\\\\n\\\\n\\\\n\\\\nMaecenas mauris lectus, lobortis et purus mattis, blandit dictum tellus. \\\\nMaecenas non lorem quis tellus placerat varius. Nulla facilisi. Aenean congue fringilla justo ut aliquam. Mauris id ex erat. Nunc vulputate neque vitae justo facilisis, non condimentum ante sagittis. Morbi viverra semper lorem nec molestie. Maecenas tincidunt est efficitur ligula euismod, sit amet ornare est vulputate.\\\\nIn non mauris justo. Duis vehicula mi vel mi pretium, a viverra erat efficitur. Cras aliquam est ac eros varius, id iaculis dui auctor. Duis pretium neque ligula, et pulvinar mi placerat et. Nulla nec nunc sit amet nunc posuere vestibulum. Ut id neque eget tortor mattis tristique. Donec ante est, blandit sit amet tristique vel, lacinia pulvinar arcu. Pellentesque scelerisque fermentum erat, id posuere justo pulvinar ut. Cras id eros sed enim aliquam lobortis. Sed lobortis nisl ut eros efficitur tincidunt. Cras justo mi, porttitor quis mattis vel, ultricies ut purus. Ut facilisis et lacus eu cursus.\\\\nIn eleifend velit vitae libero sollicitudin euismod. \\\\nFusce vitae vestibulum velit. Pellentesque vulputate lectus quis pellentesque commodo. Aliquam erat volutpat. Vestibulum in egestas velit. Pellentesque fermentum nisl vitae fringilla venenatis. Etiam id mauris vitae orci maximus ultricies. Cras fringilla ipsum magna, in fringilla dui commodo a.\\\\nEtiam vehicula luctus fermentum. In vel metus congue, pulvinar lectus vel, fermentum dui. Maecenas ante orci, egestas ut aliquet sit amet, sagittis a magna. 
Aliquam ante quam, pellentesque ut dignissim quis, laoreet eget est. Aliquam erat volutpat. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Ut ullamcorper justo sapien, in cursus libero viverra eget. Vivamus auctor imperdiet urna, at pulvinar leo posuere laoreet. Suspendisse neque nisl, fringilla at iaculis scelerisque, ornare vel dolor. Ut et pulvinar nunc. Pellentesque fringilla mollis efficitur. Nullam venenatis commodo imperdiet. Morbi velit neque, semper quis lorem quis, efficitur dignissim ipsum. Ut ac lorem sed turpis imperdiet eleifend sit amet id sapien.\\\\n\\\\n\\"", 7 | "endnotes": "\\"\\"", 8 | "footnotes": "\\"\\"", 9 | "headerTextboxes": "\\"\\"", 10 | "headers": Any, 11 | "textboxes": "\\"\\"", 12 | } 13 | `; 14 | -------------------------------------------------------------------------------- /__tests__/__snapshots__/test19.doc.snapx: -------------------------------------------------------------------------------- 1 | // Jest Snapshot v1, https://goo.gl/fbAQLP 2 | 3 | exports[`Word file test19.doc should match its snapshot 1`] = ` 4 | Object { 5 | "annotations": "\\"\\"", 6 | "body": "\\"Lorem ipsum \\\\n\\\\nLorem ipsum dolor sit amet, consectetur adipiscing elit. Nunc ac faucibus odio. \\\\n\\\\nVestibulum neque massa, scelerisque sit amet ligula eu, congue molestie mi. Praesent ut varius sem. Nullam at porttitor arcu, nec lacinia nisi. Ut ac dolor vitae odio interdum condimentum. Vivamus dapibus sodales ex, vitae malesuada ipsum cursus convallis. Maecenas sed egestas nulla, ac condimentum orci. Mauris diam felis, vulputate ac suscipit et, iaculis non est. Curabitur semper arcu ac ligula semper, nec luctus nisl blandit. Integer lacinia ante ac libero lobortis imperdiet. Nullam mollis convallis ipsum, ac accumsan nunc vehicula vitae. Nulla eget justo in felis tristique fringilla. Morbi sit amet tortor quis risus auctor condimentum. Morbi in ullamcorper elit. 
Nulla iaculis tellus sit amet mauris tempus fringilla.\\\\nMaecenas mauris lectus, lobortis et purus mattis, blandit dictum tellus.\\\\nMaecenas non lorem quis tellus placerat varius. \\\\nNulla facilisi. \\\\nAenean congue fringilla justo ut aliquam. \\\\nMauris id ex erat. Nunc vulputate neque vitae justo facilisis, non condimentum ante sagittis. \\\\nMorbi viverra semper lorem nec molestie. \\\\nMaecenas tincidunt est efficitur ligula euismod, sit amet ornare est vulputate.\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\n\\\\nIn non mauris justo. Duis vehicula mi vel mi pretium, a viverra erat efficitur. Cras aliquam est ac eros varius, id iaculis dui auctor. Duis pretium neque ligula, et pulvinar mi placerat et. Nulla nec nunc sit amet nunc posuere vestibulum. Ut id neque eget tortor mattis tristique. Donec ante est, blandit sit amet tristique vel, lacinia pulvinar arcu. Pellentesque scelerisque fermentum erat, id posuere justo pulvinar ut. Cras id eros sed enim aliquam lobortis. Sed lobortis nisl ut eros efficitur tincidunt. Cras justo mi, porttitor quis mattis vel, ultricies ut purus. Ut facilisis et lacus eu cursus.\\\\nIn eleifend velit vitae libero sollicitudin euismod. Fusce vitae vestibulum velit. Pellentesque vulputate lectus quis pellentesque commodo. Aliquam erat volutpat. Vestibulum in egestas velit. Pellentesque fermentum nisl vitae fringilla venenatis. Etiam id mauris vitae orci maximus ultricies. \\\\n\\\\nCras fringilla ipsum magna, in fringilla dui commodo a.\\\\n\\\\n\\\\tLorem ipsum\\\\tLorem ipsum\\\\tLorem ipsum\\\\t\\\\n1\\\\tIn eleifend velit vitae libero sollicitudin euismod.\\\\tLorem\\\\t\\\\t\\\\n2\\\\tCras fringilla ipsum magna, in fringilla dui commodo a.\\\\tIpsum\\\\t\\\\t\\\\n3\\\\tAliquam erat volutpat. \\\\tLorem\\\\t\\\\t\\\\n4\\\\tFusce vitae vestibulum velit. \\\\tLorem\\\\t\\\\t\\\\n5\\\\tEtiam vehicula luctus fermentum.\\\\tIpsum\\\\t\\\\t\\\\n\\\\nEtiam vehicula luctus fermentum. 
In vel metus congue, pulvinar lectus vel, fermentum dui. Maecenas ante orci, egestas ut aliquet sit amet, sagittis a magna. Aliquam ante quam, pellentesque ut dignissim quis, laoreet eget est. Aliquam erat volutpat. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Ut ullamcorper justo sapien, in cursus libero viverra eget. Vivamus auctor imperdiet urna, at pulvinar leo posuere laoreet. Suspendisse neque nisl, fringilla at iaculis scelerisque, ornare vel dolor. Ut et pulvinar nunc. Pellentesque fringilla mollis efficitur. Nullam venenatis commodo imperdiet. Morbi velit neque, semper quis lorem quis, efficitur dignissim ipsum. Ut ac lorem sed turpis imperdiet eleifend sit amet id sapien.\\\\n\\\\n\\\\nLorem ipsum dolor sit amet, consectetur adipiscing elit. \\\\n\\\\nNunc ac faucibus odio. Vestibulum neque massa, scelerisque sit amet ligula eu, congue molestie mi. Praesent ut varius sem. Nullam at porttitor arcu, nec lacinia nisi. Ut ac dolor vitae odio interdum condimentum. Vivamus dapibus sodales ex, vitae malesuada ipsum cursus convallis. Maecenas sed egestas nulla, ac condimentum orci. Mauris diam felis, vulputate ac suscipit et, iaculis non est. Curabitur semper arcu ac ligula semper, nec luctus nisl blandit. Integer lacinia ante ac libero lobortis imperdiet. Nullam mollis convallis ipsum, ac accumsan nunc vehicula vitae. Nulla eget justo in felis tristique fringilla. Morbi sit amet tortor quis risus auctor condimentum. Morbi in ullamcorper elit. Nulla iaculis tellus sit amet mauris tempus fringilla.\\\\nMaecenas mauris lectus, lobortis et purus mattis, blandit dictum tellus. \\\\nMaecenas non lorem quis tellus placerat varius. Nulla facilisi. Aenean congue fringilla justo ut aliquam. Mauris id ex erat. Nunc vulputate neque vitae justo facilisis, non condimentum ante sagittis. Morbi viverra semper lorem nec molestie. Maecenas tincidunt est efficitur ligula euismod, sit amet ornare est vulputate.\\\\nIn non mauris justo. 
Duis vehicula mi vel mi pretium, a viverra erat efficitur. Cras aliquam est ac eros varius, id iaculis dui auctor. Duis pretium neque ligula, et pulvinar mi placerat et. Nulla nec nunc sit amet nunc posuere vestibulum. Ut id neque eget tortor mattis tristique. Donec ante est, blandit sit amet tristique vel, lacinia pulvinar arcu. Pellentesque scelerisque fermentum erat, id posuere justo pulvinar ut. Cras id eros sed enim aliquam lobortis. Sed lobortis nisl ut eros efficitur tincidunt. Cras justo mi, porttitor quis mattis vel, ultricies ut purus. Ut facilisis et lacus eu cursus.\\\\nIn eleifend velit vitae libero sollicitudin euismod. \\\\nFusce vitae vestibulum velit. Pellentesque vulputate lectus quis pellentesque commodo. Aliquam erat volutpat. Vestibulum in egestas velit. Pellentesque fermentum nisl vitae fringilla venenatis. Etiam id mauris vitae orci maximus ultricies. Cras fringilla ipsum magna, in fringilla dui commodo a.\\\\nEtiam vehicula luctus fermentum. In vel metus congue, pulvinar lectus vel, fermentum dui. Maecenas ante orci, egestas ut aliquet sit amet, sagittis a magna. Aliquam ante quam, pellentesque ut dignissim quis, laoreet eget est. Aliquam erat volutpat. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Ut ullamcorper justo sapien, in cursus libero viverra eget. Vivamus auctor imperdiet urna, at pulvinar leo posuere laoreet. Suspendisse neque nisl, fringilla at iaculis scelerisque, ornare vel dolor. Ut et pulvinar nunc. Pellentesque fringilla mollis efficitur. Nullam venenatis commodo imperdiet. Morbi velit neque, semper quis lorem quis, efficitur dignissim ipsum. Ut ac lorem sed turpis imperdiet eleifend sit amet id sapien.\\\\n\\\\n\\\\n\\\\nMaecenas mauris lectus, lobortis et purus mattis, blandit dictum tellus. \\\\nMaecenas non lorem quis tellus placerat varius. Nulla facilisi. Aenean congue fringilla justo ut aliquam. Mauris id ex erat. 
Nunc vulputate neque vitae justo facilisis, non condimentum ante sagittis. Morbi viverra semper lorem nec molestie. Maecenas tincidunt est efficitur ligula euismod, sit amet ornare est vulputate.\\\\nIn non mauris justo. Duis vehicula mi vel mi pretium, a viverra erat efficitur. Cras aliquam est ac eros varius, id iaculis dui auctor. Duis pretium neque ligula, et pulvinar mi placerat et. Nulla nec nunc sit amet nunc posuere vestibulum. Ut id neque eget tortor mattis tristique. Donec ante est, blandit sit amet tristique vel, lacinia pulvinar arcu. Pellentesque scelerisque fermentum erat, id posuere justo pulvinar ut. Cras id eros sed enim aliquam lobortis. Sed lobortis nisl ut eros efficitur tincidunt. Cras justo mi, porttitor quis mattis vel, ultricies ut purus. Ut facilisis et lacus eu cursus.\\\\nIn eleifend velit vitae libero sollicitudin euismod. \\\\nFusce vitae vestibulum velit. Pellentesque vulputate lectus quis pellentesque commodo. Aliquam erat volutpat. Vestibulum in egestas velit. Pellentesque fermentum nisl vitae fringilla venenatis. Etiam id mauris vitae orci maximus ultricies. Cras fringilla ipsum magna, in fringilla dui commodo a.\\\\nEtiam vehicula luctus fermentum. In vel metus congue, pulvinar lectus vel, fermentum dui. Maecenas ante orci, egestas ut aliquet sit amet, sagittis a magna. Aliquam ante quam, pellentesque ut dignissim quis, laoreet eget est. Aliquam erat volutpat. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Ut ullamcorper justo sapien, in cursus libero viverra eget. Vivamus auctor imperdiet urna, at pulvinar leo posuere laoreet. Suspendisse neque nisl, fringilla at iaculis scelerisque, ornare vel dolor. Ut et pulvinar nunc. Pellentesque fringilla mollis efficitur. Nullam venenatis commodo imperdiet. Morbi velit neque, semper quis lorem quis, efficitur dignissim ipsum. 
Ut ac lorem sed turpis imperdiet eleifend sit amet id sapien.\\\\n\\\\n\\"", 7 | "endnotes": "\\"\\"", 8 | "footnotes": "\\"\\"", 9 | "headerTextboxes": "\\"\\"", 10 | "headers": Any, 11 | "textboxes": "\\"\\"", 12 | } 13 | `; 14 | -------------------------------------------------------------------------------- /__tests__/__snapshots__/test20.doc.snapx: -------------------------------------------------------------------------------- 1 | // Jest Snapshot v1, https://goo.gl/fbAQLP 2 | 3 | exports[`Word file test20.doc should match its snapshot 1`] = ` 4 | Object { 5 | "annotations": "\\"\\"", 6 | "body": "\\"Virsraksts: Zemesgrāmatu likuma 60.panta tvērums\\\\nTēze: Elektronisko dokumentu likuma 1.panta 2.punkta b) apakšpunkts noteic, ka elektroniskais paraksts nodrošina tikai parakstītāja identifikāciju.\\\\nTurpretim Zemesgrāmatu likuma 60.pantā ietvertā prasība parakstus apliecināt pie notāra vai bāriņtiesā nodrošina ne vien personu identifikāciju, bet arī šo personu rīcībspējas, kā arī pilnvarnieka vai pārstāvja pilnvaru apjoma pārbaudi. Šajā pantā iekļautais privātpersonas pienākums apliecināt savu parakstu uz nostiprinājuma lūguma pie notāra vai bāriņtiesā kalpo par pamatu fizisko un juridisko personu civilo tiesību un likumisko interešu aizsardzībai.\\\\nLatvijas Republikas Augstākās tiesas\\\\nCivillietu departamenta\\\\n2016. gada 2. februāra\\\\nLĒMUMS\\\\nNr. 
SKC-1406/2016\\\\n\\\\nLatvijas Republikas Augstākās tiesas Civillietu departaments šādā sastāvā:\\\\ntiesnesis referents N.Salenieks \\\\ntiesnese A.Briede\\\\ntiesnese A.Čerņavska\\\\nrakstveida procesā izskatīja AS „Zemes īpašnieku ģilde\\\\\\" blakus sūdzību par Rīgas apgabaltiesas Civillietu tiesas kolēģijas 2015.gada 12.oktobra lēmumu, ar kuru atstāts negrozīts Rīgas pilsētas Vidzemes priekšpilsētas tiesas zemesgrāmatu nodaļas tiesneša 2015.gada 1.jūlija lēmums par nostiprinājuma lūguma atstāšanu bez ievērības.\\\\nAugstākās tiesas Civillietu departaments\\\\nkonstatēja\\\\n[1] AS „Zemes īpašnieku ģilde\\\\\\" 2015.gada 18.jūnijā iesniedza Rīgas pilsētas Vidzemes priekšpilsētas tiesas Zemesgrāmatu nodaļā nostiprinājuma lūgumu hipotēkas nostiprināšanai uz nekustamo īpašumu Zentenes ielā 11, Rīgā, kas ierakstīts Rīgas pilsētas zemesgrāmatas nodalījumā Nr. 9002, kadastra Nr. 0100 117 0131.\\\\nNostiprinājuma lūgums pamatots ar aizdevuma un hipotēkas līgumiem, tam pievienotas kvītis par kancelejas nodevas un valsts nodevas samaksu. Dokuments parakstīts elektroniski ar drošu elektronisko parakstu un satur laika zīmogu.\\\\n[2] Ar Rīgas pilsētas Vidzemes priekšpilsētas tiesas Zemesgrāmatu nodaļas tiesneša 2015.gada 1.jūlija lēmumu nostiprinājuma lūgums atstāts bez ievērības, pamatojoties uz Zemesgrāmatu likuma 56.panta 1.punktu, 76.panta pirmo daļu, 77.panta 1.punktu un 79.pantu.\\\\nLēmumā norādīts, ka AS „Zemes īpašnieku ģilde\\\\\\" kā tiesību subjekts neatbilst Zemesgrāmatu likuma 56.1 pantā uzskaitīto personu lokam, kuras var iesniegt nostiprinājuma lūgumu, kas parakstīts normatīvajos aktos par elektroniskajiem dokumentiem noteiktajā kārtībā. 
Savukārt nostiprinājuma lūgums atbilstoši Zemesgrāmatu likuma 60.panta prasībām nav iesniegts.\\\\n[3] Par Rīgas pilsētas Vidzemes priekšpilsētas tiesas Zemesgrāmatu nodaļas tiesneša 2015.gada 1.jūlija lēmumu AS „Zemes īpašnieku ģilde\\\\\\" iesniegusi sūdzību, kurā lūgusi lēmumu atcelt un nodot nostiprinājuma lūgumu jaunai izskatīšanai, kā arī vērsties Satversmes tiesā ar pieteikumu par Zemesgrāmatu likuma 60.panta pirmās daļas neatbilstību Satversmes 105.pantam.\\\\nBlakus sūdzība pamatota ar šādiem argumentiem.\\\\n[3.1] Nav apstrīdams, ka Zemesgrāmatu likuma 60.pants noteic prasību parakstus uz nostiprinājuma lūguma apliecināt pie notāra. Tomēr nepieciešams pārbaudīt šīs normas atbilstību Satversmei, respektīvi, vai normā ietvertais pamattiesību ierobežojums ir pamatots un samērīgs. Atbilstoši Elektronisko dokumentu likuma mērķim un 1.pantā ietvertajām definīcijām, parakstot elektroniski sagatavotu dokumentu ar drošu elektronisko parakstu, tiek nodrošināts dokumenta autentiskums, nemainīgums un apstiprināta parakstītāja identitāte. Tādējādi, sistēmiski tulkojot, Zemesgrāmatu likuma 60.pantā ietvertā prasība parakstus apliecināt pie notāra vai bāriņtiesā, lai nodrošinātu personu identifikāciju, nav pamatota, jo to iespējams nodrošināt citos - ātrākos, lētākos un ērtākos veidos.\\\\nAttiecībā uz pilnvarnieku un pārstāvju pilnvaru apjomu apsverams, ka pilnvarojuma pārbaudi veic arī zemesgrāmatu nodaļas tiesnesis, skatot nostiprinājuma lūgumu un tam pievienotos dokumentus, tādējādi divkāršai pilnvarojumu apliecinošo dokumentu pārbaudei nav tiesiskas nepieciešamības. Savukārt prasība pārbaudīt rīcībspēju nav izpildāma. Lai gan Notariāta likums formāli paredz notāra tiesības un pienākumu pārbaudīt personu rīcībspēju, tomēr faktiski tas nav izdarāms un faktiski arī netiek darīts, jo notāram fiziski un tiesiski nav iespējas pārliecināties par to, vai personai ir pilna rīcībspēja. Savukārt juridisko personu pārstāvības tiesības pārbauda zemesgrāmatu tiesnesis. 
Turklāt, nav gandrīz nekādas nozīmes rīcībspējai, tiesībspējai un pilnvarojuma apjomam paraksta dienā, ja dokumenta integritāte netiek pārbaudīta vispār, proti, likums neaizliedz veikt papildinājumus notariāli parakstītā lūgumā, tostarp, pēc paraksta ierakstīt dokumentā tā būtiskās sastāvdaļas.\\\\nNo otras puses, vienīgās personas, kurām ir faktiska un tiesiska iespēja pārbaudīt to, vai personai ir pilna rīcībspēja, ir tiesneši, jo tiem ir pieeja visiem tiesu nolēmumiem. Secīgi, Zemesgrāmatu likuma 60.panta pirmās daļas otrā teikuma izpilde būtu atstājama zemesgrāmatu nodaļas tiesnešu kompetencē. Šobrīd likumā iekļautā prasība visus parakstus uz nostiprinājuma lūgumiem apliecināt pie notāra rada ne vien neērtības, bet arī izdevumus personām, kurām šādi lūgumi regulāri jāiesniedz zemesgrāmatu nodaļās.\\\\n[4] Ar Rīgas apgabaltiesas Civillietu tiesas kolēģijas 2015.gada 12.oktobra lēmumu Rīgas pilsētas Vidzemes priekšpilsētas tiesas Zemesgrāmatu nodaļas tiesneša 2015.gada 1.jūlija lēmums atstāts negrozīts, bet AS „Zemes īpašnieku ģilde\\\\\\" sūdzība noraidīta.\\\\nLēmums pamatots ar šādiem argumentiem.\\\\n[4.1] Atbilstoši Civilprocesa likuma 447.¹ panta otrajai daļai (ja tiesa, izskatot blakus sūdzību, atzīst, ka pārsūdzētajā lēmumā ietvertie motīvi ir pareizi un pietiekami, tā lēmumā var norādīt, ka pievienojas pārsūdzētā lēmuma motīviem. Šādā gadījumā šā likuma 230.panta pirmās daļas 5.punktā noteiktos lēmuma motīvus var nenorādīt) Civillietu tiesas kolēģija pievienojas pārsūdzētā lēmuma motivācijai.\\\\n[4.2] Zemesgrāmatu likuma 56.1 pantā uzskaitītas personas, kuras var iesniegt nostiprinājuma lūgumu, kas parakstīts normatīvajos aktos par elektroniskiem dokumentiem noteiktajā kārtībā. 
Rakstveidā izteiktam nostiprinājuma lūgumam, proti, notāra vai bāriņtiesas apliecinājums privātpersonas parakstam, ja nostiprinājumu neiesniedz personīgi, Zemesgrāmatu likuma 60.panta pirmajā daļā ir noteiktas citas prasības.\\\\nParaksta apliecinājums atbilstoši minētajai normai ietver sevī ne tikai personas identificēšanas funkciju, ko nodrošina elektroniskais paraksts, bet arī personas rīcībspējas un pilnvarnieka vai pārstāvja pilnvarojuma apjoma pārbaudi, ko elektroniskais paraksts ar laika zīmogu nenodrošina.\\\\nSavukārt saskaņā ar Elektronisko dokumenta likuma 3.panta pirmo daļu prasība pēc dokumenta rakstveida formas attiecībā uz elektronisko dokumentu ir izpildīta, ja elektroniskajam dokumentam ir elektroniskais paraksts un elektroniskais dokuments atbilst citām normatīvajos aktos noteiktajām prasībām.\\\\nZemesgrāmatu nodaļas tiesnesis pamatoti konstatējis, ka konkrētajā gadījumā nostiprinājuma lūdzējs kā tiesību subjekts neatbilst Zemesgrāmatu likuma 56.1 pantā uzskaitītām personām, kuras var iesniegt nostiprinājuma lūgumu, kas parakstīts normatīvajos aktos par elektroniskajiem dokumentiem noteiktajā kārtībā, un nostiprinājuma lūgums atbilstoši Zemesgrāmatu likuma 60.panta prasībām nav pievienots.\\\\n[4.3] Attiecībā uz blakus sūdzībā izteikto lūgumu izvērtēt Zemesgrāmatu likuma 60.panta atbilstību Satversmei, Civillietu tiesu kolēģija uzskata, ka izskatāmajā lietā nav nepieciešams vērsties Satversmes tiesā. 
Taču, ja nostiprinājuma lūdzējs uzskata, ka minētā norma neatbilst Satversmes 105.pantam, viņam ir patstāvīgas tiesības, ievērojot Satversmes tiesas likumā noteikto kārtību, vērsties Latvijas Republikas Satversmes tiesā.\\\\n[5] Par Rīgas apgabaltiesas Civillietu tiesas kolēģijas 2015.gada 12.oktobra lēmumu AS „Zemes īpašnieku ģilde\\\\\\" iesniegusi blakus sūdzību, kurā lūgusi atcelt lēmumu un vērsties Satversmes tiesā ar pieteikumu par Zemesgrāmatu likuma 60.panta pirmās daļas neatbilstību Satversmes 105.pantam.\\\\nBlakus sūdzībā atkārtoti tie paši argumenti, kas norādīti sūdzībā par Zemesgrāmatu tiesneša lēmumu, papildinot ar tālāk minēto.\\\\n[5.1] Tiesas norāde, ka pieteicējam pašam ir tiesības vērsties Satversmes tiesā ar pieteikumu par normas neatbilstību, ir pretrunā ar Satversmes tiesas likuma 19.2 panta otro daļu.\\\\nTiesai, izskatot civiltiesisku strīdu, ir jāievēro tiesību normu hierarhija un, konstatējot pretrunu starp dažāda juridiska spēka tiesību normām, jāpiemēro tā tiesību norma, kurai ir augstāks juridiskais spēks.\\\\n[5.2] Zemesgrāmatu likuma 60.panta interpretācijā jāņem vērā normas pieņemšanas mērķis un Satversmē noteiktās pamattiesības.\\\\nSatversmei atbilstoša tiesību normas piemērošana ietver:\\\\npiemērojamās tiesības atrašanu;\\\\natbilstošu iztulkošanas metožu izmantošanu;\\\\nintertemporālās un hierarhiskās piemērojamības izvērtēšanu;\\\\njudikatūras un tiesību doktrīnas izmantošanu;\\\\ntiesību tālākveidošanu. (Sk. 
Satversmes tiesas 2005.gada 4.janvāra sprieduma lietā Nr.2004-1601 17.punktu).\\\\n\\\\n[6] Iepazinies ar lietas materiāliem un apsvēris blakus sūdzības argumentus, Augstākās tiesas Civillietu departaments tālāk minēto motīvu dēļ uzskata, ka apelācijas instances tiesas lēmums ir atstājams negrozīts, bet AS „Zemes īpašnieku ģilde\\\\\\" blakus sūdzība noraidāma.\\\\n[6.1] Nav šaubu un blakus sūdzības iesniedzēja to neapstrīd, ka Zemesgrāmatu likuma 60.pants paredz, ka gadījumā, ja nostiprinājumu lūdz privātpersona, tās parakstam uz nostiprinājuma lūguma jābūt notāra vai bāriņtiesas apliecinātam. Tādējādi, kā to pareizi atzinusi gan zemesgrāmatu nodaļas tiesnese, gan apelācijas instances tiesa, AS „Zemes īpašnieku ģilde\\\\\\" iesniegtais nostiprinājuma lūgums, kas iesniegts elektroniskā formā un parakstīts ar elektronisko parakstu, iepriekš minētās tiesību normas prasībām neatbilst.\\\\n[6.2] Blakus sūdzības iesniedzēja lūgusi izvērtēt minētās normas atbilstību Satversmei - vai normā ietvertais pamattiesību ierobežojums ir pamatots un samērīgs, jo atbilstoši Elektronisko dokumentu likuma mērķim un 1.pantā ietvertajām definīcijām, parakstot elektroniski sagatavotu dokumentu ar drošu elektronisko parakstu, tiek nodrošināts dokumenta autentiskums, nemainīgums un apstiprināta parakstītāja identitāte. 
Tādējādi sūdzības iesniedzēja uzskata, ka Zemesgrāmatu likuma 60.pantā ietvertā prasība parakstus apliecināt pie notāra vai bāriņtiesā, lai nodrošinātu personu identifikāciju, nav pamatota, jo to iespējams nodrošināt citā veidā.\\\\n[6.2.1] Ar 2014.gada 29.novembri stājušies spēkā grozījumi Zemesgrāmatu likumā - ceturtās nodaļas pirmā apakšnodaļa „Nostiprinājuma lūgumi un to pielikumi\\\\\\" papildināta ar pantu, kas paredz, gadījumus, kuros atļauts iesniegt nostiprinājuma lūgumu, kas parakstīts normatīvajos aktos par elektroniskajiem dokumentiem noteiktajā kārtībā:\\\\nkredītiestāde, kurai par labu nostiprināta ķīlas tiesība zemesgrāmatā, - par hipotēkas un ar to saistīto tiesību aprobežojumu dzēšanu;\\\\nzvērināts tiesu izpildītājs - par atzīmes ierakstīšanu, pārgrozīšanu vai dzēšanu;\\\\nmaksātnespējas procesa administrators - par atzīmes ierakstīšanu, pārgrozīšanu vai dzēšanu;\\\\nvalsts vai pašvaldības iestāde, ja nostiprinājuma lūgums pamatots ar šo iestāžu izdotu dokumentu [..];\\\\nzvērināts notārs, ja nostiprinājuma pamatā ir šā zvērināta notāra taisīts notariāls akts.\\\\nLikumprojekta „Grozījumi Zemesgrāmatu likumā\\\\\\" sākotnējās ietekmes novērtējuma ziņojumā (anotācija) norādīts, ka šādi grozījumi likumā veikti ar mērķi paplašināt personu loku, kas nostiprinājuma lūgumu var iesniegt elektroniski, tomēr ņemot vērā arī nepieciešamību saglabāt un nodrošināt zemesgrāmatas publisko ticamību. 
Tādējādi atzīts, ka tiesības vai pienākums iesniegt nostiprinājuma lūgumu nosakāms publiskām personām un gadījumā, ja nostiprinājuma pamatā ir publiskas personas iesniegts dokuments, jo katras valsts iestādes un amatpersonas dokuments, kas izdots tās kompetences ietvaros, ir publisks akts un kā tāds tas kalpo kā pierādījums tajā apliecinātajiem faktiem.\\\\nPrivātpersonas pienākums savu parakstu uz nostiprinājuma lūguma apliecināt pie notāra vai bāriņtiesā saglabāts, lai nodrošinātu publisko ticamību, kā arī fizisko un juridisko personu civilo tiesību un likumisko interešu aizsardzību.\\\\n[6.2.2] Blakus sūdzības iesniedzēja nepamatoti uzskata, ka Zemesgrāmatu likuma 60.pantā ietvertā prasība parakstus apliecināt pie notāra vai bāriņtiesā nodrošina vien personu identifikāciju. Minētā panta pirmās daļas otrajā teikumā noteikts, ka apliecinot parakstu tiek pārbaudīta šo personu rīcībspēja un pilnvarnieka vai pārstāvja pilnvaru apjoms.\\\\nSavukārt Elektronisko dokumentu likuma 1.panta 2.punkta b) apakšpunkts noteic, ka elektroniskais paraksts nodrošina tikai parakstītāja identifikāciju.\\\\n[6.2.3] Ņemot vērā minēto, Civillietu departaments uzskata, ka Zemesgrāmatu likuma 60.pantā iekļautais privātpersonas pienākums apliecināt savu parakstu uz nostiprinājuma lūguma pie notāra vai bāriņtiesā kalpo par pamatu fizisko un juridisko personu civilo tiesību un likumisko interešu aizsardzībai. Tāpat Civillietu departaments nekonstatē pretrunu starp dažāda juridiska spēka tiesību normām. Līdz ar to, Civillietu departamenta ieskatā nepastāv priekšnoteikumi, lai apstrīdētu minētās normas atbilstību LR Satversmei. 
Pamatojoties uz Civilprocesa likuma 448.panta pirmās daļas 1.punktu, Augstākās tiesas Civillietu departaments\\\\nnolēma\\\\nRīgas apgabaltiesas Civillietu tiesas kolēģijas 2015.gada 12.oktobra lēmumu atstāt negrozītu, bet AS „Zemes īpašnieku ģilde\\\\\\" blakus sūdzību noraidīt.\\\\nLēmums nav pārsūdzams.\\\\n\\\\nTiesību aktu rādītājs\\\\nLR Satversme\\\\n\\\\nElektronisko dokumentu likuma 1.pants\\\\n1.panta 2.punkta b) apakšpunkts\\\\n\\\\nZemesgrāmatu likuma 60.pants\\\\n\\\\nGrozījumi Zemesgrāmatu likumā (30.10.2014., spēkā 29.11.2014) ceturtās nodaļas pirmā apakšnodaļa „Nostiprinājuma lūgumi un to pielikumi\\\\\\"\\\\n\\\\nCivilprocesa likuma 448.panta pirmās daļas 1.punkts\\\\n\\"", 7 | "endnotes": "\\"\\"", 8 | "footnotes": "\\"\\"", 9 | "headerTextboxes": "\\"1\\"", 10 | "headers": Any, 11 | "textboxes": "\\"\\"", 12 | } 13 | `; 14 | -------------------------------------------------------------------------------- /__tests__/data/bad-xml.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morungos/node-word-extractor/d971d9f69056245ae129bd2ce31436d518293854/__tests__/data/bad-xml.docx -------------------------------------------------------------------------------- /__tests__/data/badfile-01-bad-header.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morungos/node-word-extractor/d971d9f69056245ae129bd2ce31436d518293854/__tests__/data/badfile-01-bad-header.doc -------------------------------------------------------------------------------- /__tests__/data/bigfile-01.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morungos/node-word-extractor/d971d9f69056245ae129bd2ce31436d518293854/__tests__/data/bigfile-01.doc -------------------------------------------------------------------------------- /__tests__/data/bigfile-01.docx: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/morungos/node-word-extractor/d971d9f69056245ae129bd2ce31436d518293854/__tests__/data/bigfile-01.docx -------------------------------------------------------------------------------- /__tests__/data/manifest.md: -------------------------------------------------------------------------------- 1 | ## A manifest of test files 2 | 3 | Where there is a `.docx` file name with the same name, it is a `.docx` version saved 4 | from the same file, and should extract to the same document data. 5 | 6 | * `bigfile-01.doc` -- a long and complex Word file, containing insertions, deletions, 7 | hyperlinks, and mixed Unicode and plain text segments. 8 | 9 | * `badfile-01-bad-header.doc` -- a test of an invalid Word file 10 | 11 | * `test01.doc` -- a short test of insertion and deletion with Unicode. 12 | 13 | * `test02.doc` -- a long set of short random paragraphs with a wide ranging of styling. 14 | 15 | * `test03.doc` -- a page containing a table. 16 | 17 | * `test04.doc` -- a Unicode test file, with headers and footers. 18 | 19 | * `test05.doc` -- a simple file created with Word 97-SR2. 20 | 21 | * `test06.doc` -- another table test with fields, headers, and footers. 22 | 23 | * `test07.doc` -- a representative form test, with tables for layout, form controls, footers, 24 | footnotes, and so on. 25 | 26 | * `test08.doc` -- a simple test of bookmarks. 27 | 28 | * `test09.doc` -- very simple test file, regression for a parenthesis character error. 29 | 30 | * `test10.doc` -- a simple test with a couple of sentences and a couple of annotations. 31 | 32 | * `test11.doc` -- a word file containing Asian language text. 33 | 34 | * `test12.doc` -- a test of tables with empty cells, checking row termination. 35 | 36 | * `test13.doc` -- a short test of endnotes and footnotes combined. 37 | 38 | * `test14.doc` -- a short test of insertion and deletion. 
39 | -------------------------------------------------------------------------------- /__tests__/data/test01.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morungos/node-word-extractor/d971d9f69056245ae129bd2ce31436d518293854/__tests__/data/test01.doc -------------------------------------------------------------------------------- /__tests__/data/test01.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morungos/node-word-extractor/d971d9f69056245ae129bd2ce31436d518293854/__tests__/data/test01.docx -------------------------------------------------------------------------------- /__tests__/data/test02.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morungos/node-word-extractor/d971d9f69056245ae129bd2ce31436d518293854/__tests__/data/test02.doc -------------------------------------------------------------------------------- /__tests__/data/test02.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morungos/node-word-extractor/d971d9f69056245ae129bd2ce31436d518293854/__tests__/data/test02.docx -------------------------------------------------------------------------------- /__tests__/data/test03.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morungos/node-word-extractor/d971d9f69056245ae129bd2ce31436d518293854/__tests__/data/test03.doc -------------------------------------------------------------------------------- /__tests__/data/test03.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morungos/node-word-extractor/d971d9f69056245ae129bd2ce31436d518293854/__tests__/data/test03.docx 
-------------------------------------------------------------------------------- /__tests__/data/test04.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morungos/node-word-extractor/d971d9f69056245ae129bd2ce31436d518293854/__tests__/data/test04.doc -------------------------------------------------------------------------------- /__tests__/data/test04.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morungos/node-word-extractor/d971d9f69056245ae129bd2ce31436d518293854/__tests__/data/test04.docx -------------------------------------------------------------------------------- /__tests__/data/test05.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morungos/node-word-extractor/d971d9f69056245ae129bd2ce31436d518293854/__tests__/data/test05.doc -------------------------------------------------------------------------------- /__tests__/data/test06.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morungos/node-word-extractor/d971d9f69056245ae129bd2ce31436d518293854/__tests__/data/test06.doc -------------------------------------------------------------------------------- /__tests__/data/test06.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morungos/node-word-extractor/d971d9f69056245ae129bd2ce31436d518293854/__tests__/data/test06.docx -------------------------------------------------------------------------------- /__tests__/data/test07.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morungos/node-word-extractor/d971d9f69056245ae129bd2ce31436d518293854/__tests__/data/test07.doc 
-------------------------------------------------------------------------------- /__tests__/data/test07.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morungos/node-word-extractor/d971d9f69056245ae129bd2ce31436d518293854/__tests__/data/test07.docx -------------------------------------------------------------------------------- /__tests__/data/test08.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morungos/node-word-extractor/d971d9f69056245ae129bd2ce31436d518293854/__tests__/data/test08.doc -------------------------------------------------------------------------------- /__tests__/data/test08.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morungos/node-word-extractor/d971d9f69056245ae129bd2ce31436d518293854/__tests__/data/test08.docx -------------------------------------------------------------------------------- /__tests__/data/test09.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morungos/node-word-extractor/d971d9f69056245ae129bd2ce31436d518293854/__tests__/data/test09.doc -------------------------------------------------------------------------------- /__tests__/data/test09.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morungos/node-word-extractor/d971d9f69056245ae129bd2ce31436d518293854/__tests__/data/test09.docx -------------------------------------------------------------------------------- /__tests__/data/test10.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morungos/node-word-extractor/d971d9f69056245ae129bd2ce31436d518293854/__tests__/data/test10.doc 
-------------------------------------------------------------------------------- /__tests__/data/test10.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morungos/node-word-extractor/d971d9f69056245ae129bd2ce31436d518293854/__tests__/data/test10.docx -------------------------------------------------------------------------------- /__tests__/data/test11.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morungos/node-word-extractor/d971d9f69056245ae129bd2ce31436d518293854/__tests__/data/test11.doc -------------------------------------------------------------------------------- /__tests__/data/test11.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morungos/node-word-extractor/d971d9f69056245ae129bd2ce31436d518293854/__tests__/data/test11.docx -------------------------------------------------------------------------------- /__tests__/data/test12.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morungos/node-word-extractor/d971d9f69056245ae129bd2ce31436d518293854/__tests__/data/test12.doc -------------------------------------------------------------------------------- /__tests__/data/test12.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morungos/node-word-extractor/d971d9f69056245ae129bd2ce31436d518293854/__tests__/data/test12.docx -------------------------------------------------------------------------------- /__tests__/data/test13.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morungos/node-word-extractor/d971d9f69056245ae129bd2ce31436d518293854/__tests__/data/test13.doc 
-------------------------------------------------------------------------------- /__tests__/data/test13.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morungos/node-word-extractor/d971d9f69056245ae129bd2ce31436d518293854/__tests__/data/test13.docx -------------------------------------------------------------------------------- /__tests__/data/test14.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morungos/node-word-extractor/d971d9f69056245ae129bd2ce31436d518293854/__tests__/data/test14.doc -------------------------------------------------------------------------------- /__tests__/data/test14.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morungos/node-word-extractor/d971d9f69056245ae129bd2ce31436d518293854/__tests__/data/test14.docx -------------------------------------------------------------------------------- /__tests__/data/test15.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morungos/node-word-extractor/d971d9f69056245ae129bd2ce31436d518293854/__tests__/data/test15.doc -------------------------------------------------------------------------------- /__tests__/data/test15.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morungos/node-word-extractor/d971d9f69056245ae129bd2ce31436d518293854/__tests__/data/test15.docx -------------------------------------------------------------------------------- /__tests__/data/test16.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morungos/node-word-extractor/d971d9f69056245ae129bd2ce31436d518293854/__tests__/data/test16.doc 
-------------------------------------------------------------------------------- /__tests__/data/test16.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morungos/node-word-extractor/d971d9f69056245ae129bd2ce31436d518293854/__tests__/data/test16.docx -------------------------------------------------------------------------------- /__tests__/data/test17.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morungos/node-word-extractor/d971d9f69056245ae129bd2ce31436d518293854/__tests__/data/test17.doc -------------------------------------------------------------------------------- /__tests__/data/test17.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morungos/node-word-extractor/d971d9f69056245ae129bd2ce31436d518293854/__tests__/data/test17.docx -------------------------------------------------------------------------------- /__tests__/data/test18.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morungos/node-word-extractor/d971d9f69056245ae129bd2ce31436d518293854/__tests__/data/test18.doc -------------------------------------------------------------------------------- /__tests__/data/test18.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morungos/node-word-extractor/d971d9f69056245ae129bd2ce31436d518293854/__tests__/data/test18.docx -------------------------------------------------------------------------------- /__tests__/data/test19.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morungos/node-word-extractor/d971d9f69056245ae129bd2ce31436d518293854/__tests__/data/test19.doc 
-------------------------------------------------------------------------------- /__tests__/data/test19.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morungos/node-word-extractor/d971d9f69056245ae129bd2ce31436d518293854/__tests__/data/test19.docx -------------------------------------------------------------------------------- /__tests__/data/test20.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morungos/node-word-extractor/d971d9f69056245ae129bd2ce31436d518293854/__tests__/data/test20.doc -------------------------------------------------------------------------------- /__tests__/data/test20.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/morungos/node-word-extractor/d971d9f69056245ae129bd2ce31436d518293854/__tests__/data/test20.docx -------------------------------------------------------------------------------- /jsdoc.json: -------------------------------------------------------------------------------- 1 | { 2 | "source": { 3 | "include": [ "lib" ], 4 | "includePattern": ".+\\.js(doc|x)?$" 5 | }, 6 | "opts": { 7 | "destination": "./jsdoc/", 8 | "recurse": true 9 | }, 10 | "sourceType": "module", 11 | "plugins": ["plugins/markdown"], 12 | "verbose": true 13 | } -------------------------------------------------------------------------------- /lib/buffer-reader.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @module buffer-reader 3 | * 4 | * @description 5 | * Exports a class {@link BufferReader}, used internally to handle 6 | * access when an input buffer is passed. This provides a consistent 7 | * interface between reading from files and buffers, so that in-memory 8 | * files can be handled efficiently. 
/**
 * A class that lets a reader work directly from an in-memory buffer,
 * as an alternative to the
 * [FileReader]{@link module:file-reader~FileReader}, which reads from
 * an open file descriptor. Provides the same open/close/read interface.
 */
class BufferReader {

  /**
   * Creates a new buffer reader over the given buffer.
   * @param {Buffer} buffer - the buffer containing the document data
   */
  constructor(buffer) {
    this._buffer = buffer;
  }

  /**
   * Opening is a no-op for an in-memory buffer.
   * @returns a promise that resolves immediately
   */
  async open() {}

  /**
   * Closing is a no-op for an in-memory buffer.
   * @returns a promise that resolves immediately
   */
  async close() {}

  /**
   * Copies `length` bytes, starting at `position` in the source buffer,
   * into `buffer` at `offset`.
   * @param {Buffer} buffer - the destination buffer
   * @param {number} offset - where in the destination to start writing
   * @param {number} length - the number of bytes to copy
   * @param {number} position - where in the source to start reading
   * @returns a promise that resolves to the destination buffer
   */
  async read(buffer, offset, length, position) {
    const sourceEnd = position + length;
    this._buffer.copy(buffer, offset, position, sourceEnd);
    return buffer;
  }

  /**
   * Returns the underlying source buffer.
   * @returns the buffer
   */
  buffer() {
    return this._buffer;
  }

  /**
   * Returns true if the passed instance is an instance of this class.
   * @param {*} instance
   * @returns true if `instance` is a {@link BufferReader}
   */
  static isBufferReader(instance) {
    return instance instanceof BufferReader;
  }
}
/**
 * Returned from all extractors, this class provides accessors to read the
 * different parts of a Word document. Options can be passed to the accessors
 * to control character conversion and filtering, as described in the
 * methods below.
 */
class Document {

  constructor() {
    this._body = "";
    this._footnotes = "";
    this._endnotes = "";
    this._headers = "";
    this._footers = "";
    this._annotations = "";
    this._textboxes = "";
    this._headerTextboxes = "";
  }

  /**
   * Applies the standard Unicode filtering to a value, unless the options
   * explicitly disable it. The loose `== false` comparison is deliberate:
   * it matches the historical behaviour of each accessor, filtering unless
   * the caller passed an explicitly false-ish `filterUnicode`.
   * @param {string} value - the raw part content
   * @param {Object} options - accessor options
   * @returns the value, filtered unless `options.filterUnicode == false`
   */
  _filtered(value, options) {
    return (options.filterUnicode == false) ? value : filter(value);
  }

  /**
   * Accessor to read the main body part of a Word file
   * @param {Object} options - options for body data
   * @param {boolean} options.filterUnicode - if true (the default), converts common
   *   Unicode quotes to standard ASCII characters
   * @returns a string, containing the Word file body
   */
  getBody(options) {
    return this._filtered(this._body, options || {});
  }

  /**
   * Accessor to read the footnotes part of a Word file
   * @param {Object} options - options for footnote data
   * @param {boolean} options.filterUnicode - if true (the default), converts common
   *   Unicode quotes to standard ASCII characters
   * @returns a string, containing the Word file footnotes
   */
  getFootnotes(options) {
    return this._filtered(this._footnotes, options || {});
  }

  /**
   * Accessor to read the endnotes part of a Word file
   * @param {Object} options - options for endnote data
   * @param {boolean} options.filterUnicode - if true (the default), converts common
   *   Unicode quotes to standard ASCII characters
   * @returns a string, containing the Word file endnotes
   */
  getEndnotes(options) {
    return this._filtered(this._endnotes, options || {});
  }

  /**
   * Accessor to read the headers part of a Word file
   * @param {Object} options - options for header data
   * @param {boolean} options.filterUnicode - if true (the default), converts common
   *   Unicode quotes to standard ASCII characters
   * @param {boolean} options.includeFooters - if true (the default), returns headers
   *   and footers as a single string
   * @returns a string, containing the Word file headers
   */
  getHeaders(options) {
    options = options || {};
    const value = this._headers + ((options.includeFooters == false) ? "" : this._footers);
    return this._filtered(value, options);
  }

  /**
   * Accessor to read the footers part of a Word file
   * @param {Object} options - options for footer data
   * @param {boolean} options.filterUnicode - if true (the default), converts common
   *   Unicode quotes to standard ASCII characters
   * @returns a string, containing the Word file footers
   */
  getFooters(options) {
    return this._filtered(this._footers, options || {});
  }

  /**
   * Accessor to read the annotations part of a Word file
   * @param {Object} options - options for annotation data
   * @param {boolean} options.filterUnicode - if true (the default), converts common
   *   Unicode quotes to standard ASCII characters
   * @returns a string, containing the Word file annotations
   */
  getAnnotations(options) {
    return this._filtered(this._annotations, options || {});
  }

  /**
   * Accessor to read the textboxes from a Word file. The text box content is
   * aggregated as a single long string. When both the body and header content
   * exists, they will be separated by a newline.
   * @param {Object} options - options for textbox data
   * @param {boolean} options.filterUnicode - if true (the default), converts common
   *   Unicode quotes to standard ASCII characters
   * @param {boolean} options.includeHeadersAndFooters - if true (the default),
   *   includes text box content in headers and footers
   * @param {boolean} options.includeBody - if true (the default), includes text box
   *   content in the document body
   * @returns a string, containing the Word file text box content
   */
  getTextboxes(options) {
    options = options || {};
    const segments = [];
    if (options.includeBody != false)
      segments.push(this._textboxes);
    if (options.includeHeadersAndFooters != false)
      segments.push(this._headerTextboxes);
    return this._filtered(segments.join("\n"), options);
  }
}
/**
 * A class that allows a reader to access a file through the file system.
 * This can be used as an alternative to the
 * [BufferReader]{@link module:buffer-reader~BufferReader}, which reads
 * directly from an in-memory buffer.
 */
class FileReader {

  /**
   * Creates a new file reader for the named file.
   * @param {*} filename - the path of the file to read
   */
  constructor(filename) {
    this._filename = filename;
  }

  /**
   * Opens the file descriptor for a file, and returns a promise that
   * resolves when the file is open. After this, {@link FileReader#read}
   * can be called to read file content into a buffer.
   * @returns a promise
   */
  open() {
    return new Promise((resolve, reject) => {
      fs.open(this._filename, 'r', 0o666, (error, descriptor) => {
        if (error) {
          reject(error);
        } else {
          this._fd = descriptor;
          resolve();
        }
      });
    });
  }

  /**
   * Closes the file descriptor associated with an open document, if there
   * is one, and returns a promise that resolves when the file handle is
   * closed. Resolves immediately when no descriptor is open.
   * @returns a promise
   */
  close() {
    return new Promise((resolve, reject) => {
      if (!this._fd) {
        resolve();
        return;
      }
      fs.close(this._fd, (error) => {
        if (error) {
          reject(error);
        } else {
          delete this._fd;
          resolve();
        }
      });
    });
  }

  /**
   * Reads `length` bytes into `buffer`. The new data is written into the
   * buffer at `offset`, and is read from the file starting at `position`.
   * @param {*} buffer - the destination buffer
   * @param {*} offset - where in the destination to start writing
   * @param {*} length - the number of bytes to read
   * @param {*} position - where in the file to start reading
   * @returns a promise that resolves to the buffer when the data is present
   */
  read(buffer, offset, length, position) {
    return new Promise((resolve, reject) => {
      if (!this._fd) {
        reject(new Error("file not open"));
        return;
      }
      fs.read(this._fd, buffer, offset, length, position, (error, bytesRead, filled) => {
        if (error) {
          reject(error);
        } else {
          resolve(filled);
        }
      });
    });
  }

  /**
   * Returns the open file descriptor
   * @returns the file descriptor
   */
  fd() {
    return this._fd;
  }

  /**
   * Returns true if the passed instance is an instance of this class.
   * @param {*} instance
   * @returns true if `instance` is an instance of {@link FileReader}.
   */
  static isFileReader(instance) {
    return instance instanceof FileReader;
  }

}
/**
 * A replacement table, that maps Word control characters to either NULL, for
 * deletion, or to another more acceptable character in a Unicode world, such
 * as a newline.
 */
const replaceTable = [];
replaceTable[0x0002] = '\x00';
replaceTable[0x0005] = '\x00';
replaceTable[0x0007] = "\t";
replaceTable[0x0008] = '\x00';
replaceTable[0x000A] = "\n";
replaceTable[0x000B] = "\n";
replaceTable[0x000C] = "\n";
replaceTable[0x000D] = "\n";
replaceTable[0x001E] = "\u2011";
// 0x1F is Word's optional (soft) hyphen; map it to NULL so the final
// cleanup pass in clean() deletes it from the extracted text.
replaceTable[0x001F] = '\x00';

/**
 * @constant
 * Maps between Windows character codes, especially between 0x80 and 0x9f,
 * into official Unicode code points. This smooths over the differences
 * between UCS-2 and 8-bit code runs in Word, by allowing us to work
 * entirely within Unicode later on.
 */
const binaryToUnicodeTable = [];
binaryToUnicodeTable[0x0082] = "\u201a";
binaryToUnicodeTable[0x0083] = "\u0192";
binaryToUnicodeTable[0x0084] = "\u201e";
binaryToUnicodeTable[0x0085] = "\u2026";
binaryToUnicodeTable[0x0086] = "\u2020";
binaryToUnicodeTable[0x0087] = "\u2021";
binaryToUnicodeTable[0x0088] = "\u02C6";
binaryToUnicodeTable[0x0089] = "\u2030";
binaryToUnicodeTable[0x008a] = "\u0160";
binaryToUnicodeTable[0x008b] = "\u2039";
binaryToUnicodeTable[0x008c] = "\u0152";
binaryToUnicodeTable[0x008e] = "\u017D";
binaryToUnicodeTable[0x0091] = "\u2018";
binaryToUnicodeTable[0x0092] = "\u2019";
binaryToUnicodeTable[0x0093] = "\u201C";
binaryToUnicodeTable[0x0094] = "\u201D";
binaryToUnicodeTable[0x0095] = "\u2022";
binaryToUnicodeTable[0x0096] = "\u2013";
binaryToUnicodeTable[0x0097] = "\u2014";
binaryToUnicodeTable[0x0098] = "\u02DC";
binaryToUnicodeTable[0x0099] = "\u2122";
binaryToUnicodeTable[0x009a] = "\u0161";
binaryToUnicodeTable[0x009b] = "\u203A";
binaryToUnicodeTable[0x009c] = "\u0153";
binaryToUnicodeTable[0x009e] = "\u017E";
binaryToUnicodeTable[0x009f] = "\u0178";

/**
 * Converts character codes from 0x80 to 0x9f to Unicode equivalents
 * within a string
 * @param {string} string - the input string
 * @returns a converted string
 */
const binaryToUnicode = (string) => {
  return string.replace(/([\x80-\x9f])/g, (match) => binaryToUnicodeTable[match.charCodeAt(0)]);
};

/**
 * The main function for cleaning OLE-based text. It runs a few standard replacements on characters
 * that are reserved for special purposes, also removes fields, and finally strips out any weird
 * characters that are likely not to be useful for anyone.
 *
 * @param {string} string - an input string
 * @returns a cleaned up string
 */
const clean = (string) => {

  // Replace reserved control characters via replaceTable. The character class
  // must only contain codes that have a table entry: previously it matched
  // \x1f (no entry, so the literal string "undefined" was spliced into the
  // output) and omitted \x1e (whose \u2011 mapping was never used).
  string = string.replace(/([\x02\x05\x07\x08\x0a\x0b\x0c\x0d\x1e\x1f])/g, (match) => replaceTable[match.charCodeAt(0)]);

  // Fields can be nested, which makes this awkward. We use a strict non-nesting model
  // and repeat until we find no substitutions. This is because a second match might
  // start before an earlier one, due to our replacements.
  let called = true;
  while (called) {
    called = false;
    string = string.replace(/(?:\x13[^\x13\x14\x15]*\x14?([^\x13\x14\x15]*)\x15)/g, (match, p1) => { called = true; return p1; });
  }

  // Finally, delete any remaining low control characters (including the NULLs
  // introduced by the replacement table above).
  return string
    .replace(/[\x00-\x07]/g, '');
};

const filterTable = [];
filterTable[0x2002] = " ";
filterTable[0x2003] = " ";
filterTable[0x2012] = "-";
filterTable[0x2013] = "-";
filterTable[0x2014] = "-";
filterTable[0x2018] = "'";
filterTable[0x2019] = "'";
filterTable[0x201c] = "\"";
filterTable[0x201d] = "\"";

// Export in CommonJS style; the guard lets the module body also evaluate
// in a non-CommonJS context (e.g. an isolated ES-module test harness)
// without changing behaviour under Node's require().
if (typeof module !== "undefined" && module.exports) {
  module.exports.binaryToUnicode = binaryToUnicode;
  module.exports.clean = clean;
}
// Sector id sentinels used in OLE allocation tables: non-negative values
// chain to the next sector; negative values terminate or classify a chain.
const ALLOCATION_TABLE_SEC_ID_FREE = -1;
const ALLOCATION_TABLE_SEC_ID_END_OF_CHAIN = -2; // eslint-disable-line no-unused-vars
const ALLOCATION_TABLE_SEC_ID_SAT = -3; // eslint-disable-line no-unused-vars
const ALLOCATION_TABLE_SEC_ID_MSAT = -4; // eslint-disable-line no-unused-vars

/**
 * An OLE sector allocation table: a flat array where entry i holds the
 * sector id that follows sector i in its chain.
 */
class AllocationTable {

  /**
   * @param {*} doc - the owning compound document, used to read sectors
   */
  constructor(doc) {
    this._doc = doc;
  }

  /**
   * Loads the allocation table from the given sectors, decoding each
   * 4-byte little-endian entry.
   * @param {Array} secIds - the sector ids holding the table
   * @returns a promise that resolves when the table is loaded
   */
  load(secIds) {
    const doc = this._doc;
    const header = doc._header;
    this._table = new Array(secIds.length * (header.secSize / 4));
    return doc._readSectors(secIds)
      .then((buffer) => {
        for (let i = 0; i < buffer.length / 4; i++) {
          this._table[i] = buffer.readInt32LE(i * 4);
        }
      });
  }

  /**
   * Follows the chain starting at `startSecId` and returns the list of
   * sector ids in order, stopping at any negative (terminator) entry.
   * A corrupt file can contain a cycle, which would otherwise loop forever
   * and exhaust memory; track visited ids and stop on any repeat (the
   * previous self-loop check only caught cycles of length one).
   * @param {number} startSecId - the first sector id of the chain
   * @returns an array of sector ids
   */
  getSecIdChain(startSecId) {
    const visited = new Set();
    const secIds = [];
    let secId = startSecId;
    while (secId > ALLOCATION_TABLE_SEC_ID_FREE) {
      if (visited.has(secId)) {
        break;
      }
      visited.add(secId);
      secIds.push(secId);
      secId = this._table[secId];
    }

    return secIds;
  }

}
to any person obtaining a copy 8 | // of this software and associated documentation files (the "Software"), to deal 9 | // in the Software without restriction, including without limitation the rights 10 | // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 | // copies of the Software, and to permit persons to whom the Software is 12 | // furnished to do so, subject to the following conditions: 13 | // 14 | // The above copyright notice and this permission notice shall be included in 15 | // all copies or substantial portions of the Software. 16 | // 17 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 20 | // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 | // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 23 | // THE SOFTWARE. 24 | // 25 | // This component as adapted from node-ole-doc, available at: 26 | // https://github.com/atariman486/node-ole-doc. 27 | // 28 | // Modified extensively by Stuart Watt to keep the 29 | // principal logic, but replacing callbacks and some weird stream usages 30 | // with promises. 31 | 32 | const Header = require('./ole-header'); 33 | const AllocationTable = require('./ole-allocation-table'); 34 | const DirectoryTree = require('./ole-directory-tree'); 35 | const Storage = require('./ole-storage'); 36 | 37 | /** 38 | * Implements the main interface used to read from an OLE compoound file. 
/**
 * Implements the main interface used to read from an OLE compound file.
 * Wraps a reader (a FileReader or BufferReader) and exposes the document's
 * storages and streams once {@link OleCompoundDoc#read} has completed.
 */
class OleCompoundDoc {

  /**
   * Creates a new compound document over the given reader.
   * `_skipBytes` allows a custom prefix header to precede the OLE data;
   * it is zero by default.
   * @param {*} reader - the underlying reader supplying file bytes
   */
  constructor(reader) {
    this._reader = reader;
    this._skipBytes = 0;
  }

  /**
   * Loads the document structure in sequence: the header, the master sector
   * allocation table (MSAT), the sector allocation table (SAT), the
   * short-sector allocation table (SSAT), and finally the directory tree.
   * @returns a promise that resolves to this document when loading is done
   */
  read() {
    return Promise.resolve()
      .then(() => this._readHeader())
      .then(() => this._readMSAT())
      .then(() => this._readSAT())
      .then(() => this._readSSAT())
      .then(() => this._readDirectoryTree())
      .then(() => {
        // Only files configured with a custom prefix need the extra read
        if (this._skipBytes != 0) {
          return this._readCustomHeader();
        }
      })
      .then(() => this);
  }

  // Reads the custom prefix header of `_skipBytes` bytes from the very start
  // of the file. NOTE(review): `_customHeaderCallback` is not assigned
  // anywhere in this file -- presumably set externally alongside
  // `_skipBytes`; confirm against callers before relying on it.
  _readCustomHeader() {
    const buffer = Buffer.alloc(this._skipBytes);
    return this._reader.read(buffer, 0, this._skipBytes, 0)
      .then((buffer) => {
        if (!this._customHeaderCallback(buffer))
          return;
      });
  }

  // Reads and validates the 512-byte OLE header, located immediately after
  // any custom prefix bytes. Rejects when the signature check fails.
  _readHeader() {
    const buffer = Buffer.alloc(512);
    return this._reader.read(buffer, 0, 512, 0 + this._skipBytes)
      .then((buffer) => {
        const header = this._header = new Header();
        if (!header.load(buffer)) {
          throw new Error("Not a valid compound document");
        }
      });
  }

  // Builds the master sector allocation table. The first 109 entries come
  // from the header (`partialMSAT`); when the SAT is larger, further entries
  // are chained through extra sectors, each of which stores the sector id of
  // the next MSAT sector in its final 4 bytes.
  _readMSAT() {
    const header = this._header;

    this._MSAT = header.partialMSAT.slice(0);
    this._MSAT.length = header.SATSize;

    // Small documents fit entirely in the header's partial MSAT
    if(header.SATSize <= 109 || header.MSATSize == 0) {
      return Promise.resolve();
    }

    let currMSATIndex = 109;
    let i = 0;

    // Recursive promise chain: read one MSAT sector, append its entries,
    // then follow the trailing "next sector" pointer.
    const readOneMSAT = (i, currMSATIndex, secId) => {
      if (i >= header.MSATSize) {
        return Promise.resolve();
      }

      return this._readSector(secId)
        .then((sectorBuffer) => {
          let s;
          // All but the last 4 bytes of the sector are MSAT entries
          for(s = 0; s < header.secSize - 4; s += 4) {
            if(currMSATIndex >= header.SATSize)
              break;
            else
              this._MSAT[currMSATIndex] = sectorBuffer.readInt32LE(s);

            currMSATIndex++;
          }

          // The last 4 bytes chain to the next MSAT sector
          secId = sectorBuffer.readInt32LE(header.secSize - 4);
          return readOneMSAT(i + 1, currMSATIndex, secId);
        });
    };

    return readOneMSAT(i, currMSATIndex, header.MSATSecId);
  }

  // Convenience wrapper to read a single standard sector.
  _readSector(secId) {
    return this._readSectors([ secId ]);
  }

  // Reads the given standard sectors, in order, into one contiguous buffer.
  // Sectors are read sequentially via a recursive promise chain.
  _readSectors(secIds) {
    const header = this._header;
    const buffer = Buffer.alloc(secIds.length * header.secSize);

    const readOneSector = (i) => {
      if (i >= secIds.length) {
        return Promise.resolve(buffer);
      }

      const bufferOffset = i * header.secSize;
      const fileOffset = this._getFileOffsetForSec(secIds[i]);

      return this._reader.read(buffer, bufferOffset, header.secSize, fileOffset)
        .then(() => readOneSector(i + 1));
    };

    return readOneSector(0);
  }

  // Convenience wrapper to read a single short sector.
  _readShortSector(secId) {
    return this._readShortSectors([ secId ]);
  }

  // Reads the given short sectors, in order, into one contiguous buffer.
  // Mirrors _readSectors but uses the short-sector size and offsets.
  _readShortSectors(secIds) {
    const header = this._header;
    const buffer = Buffer.alloc(secIds.length * header.shortSecSize);

    const readOneShortSector = (i) => {
      if (i >= secIds.length) {
        return Promise.resolve(buffer);
      }

      const bufferOffset = i * header.shortSecSize;
      const fileOffset = this._getFileOffsetForShortSec(secIds[i]);

      return this._reader.read(buffer, bufferOffset, header.shortSecSize, fileOffset)
        .then(() => readOneShortSector(i + 1));
    };

    return readOneShortSector(0);
  }

  // Loads the sector allocation table from the sectors listed in the MSAT.
  _readSAT() {
    this._SAT = new AllocationTable(this);
    return this._SAT.load(this._MSAT);
  }

  // Loads the short-sector allocation table by following the SAT chain
  // starting at the header's SSAT sector id. Rejects if the chain length
  // disagrees with the header.
  _readSSAT() {
    const header = this._header;

    const secIds = this._SAT.getSecIdChain(header.SSATSecId);
    if (secIds.length != header.SSATSize) {
      return Promise.reject(new Error("Invalid Short Sector Allocation Table"));
    }

    this._SSAT = new AllocationTable(this);
    return this._SSAT.load(secIds);
  }

  // Loads the directory tree, then records the root storage and the chain
  // of sectors holding the short-stream container.
  _readDirectoryTree() {
    const header = this._header;

    this._directoryTree = new DirectoryTree(this);

    const secIds = this._SAT.getSecIdChain(header.dirSecId);
    return this._directoryTree.load(secIds)
      .then(() => {
        const rootEntry = this._directoryTree.root;
        this._rootStorage = new Storage(this, rootEntry);
        this._shortStreamSecIds = this._SAT.getSecIdChain(rootEntry.secId);
      });
  }

  // Maps a standard sector id to its absolute byte offset in the file.
  _getFileOffsetForSec(secId) {
    const secSize = this._header.secSize;
    return this._skipBytes + (secId + 1) * secSize; // Skip past the header sector
  }

  // Maps a short sector id to its absolute byte offset, by locating the
  // standard sector that contains it within the short-stream container.
  _getFileOffsetForShortSec(shortSecId) {
    const shortSecSize = this._header.shortSecSize;
    const shortStreamOffset = shortSecId * shortSecSize;

    const secSize = this._header.secSize;
    const secIdIndex = Math.floor(shortStreamOffset / secSize);
    const secOffset = shortStreamOffset % secSize;
    const secId = this._shortStreamSecIds[secIdIndex];

    return this._getFileOffsetForSec(secId) + secOffset;
  }

  /**
   * Returns the named child storage of the root storage.
   * @param {string} storageName - the storage name
   * @returns the storage
   */
  storage(storageName) {
    return this._rootStorage.storage(storageName);
  }

  /**
   * Returns the named stream within the root storage.
   * @param {string} streamName - the stream name
   * @returns the stream
   */
  stream(streamName) {
    return this._rootStorage.stream(streamName);
  }

}

module.exports = OleCompoundDoc;
doc._readSectors(secIds) 21 | .then((buffer) => { 22 | const count = buffer.length / 128; 23 | this._entries = new Array(count); 24 | for(let i = 0; i < count; i++) { 25 | const offset = i * 128; 26 | const nameLength = Math.max(buffer.readInt16LE(64 + offset) - 1, 0); 27 | 28 | const entry = {}; 29 | entry.name = buffer.toString('utf16le', 0 + offset, nameLength + offset); 30 | entry.type = buffer.readInt8(66 + offset); 31 | entry.nodeColor = buffer.readInt8(67 + offset); 32 | entry.left = buffer.readInt32LE(68 + offset); 33 | entry.right = buffer.readInt32LE(72 + offset); 34 | entry.storageDirId = buffer.readInt32LE(76 + offset); 35 | entry.secId = buffer.readInt32LE(116 + offset); 36 | entry.size = buffer.readInt32LE(120 + offset); 37 | 38 | this._entries[i] = entry; 39 | } 40 | 41 | this.root = this._entries.find((entry) => entry.type === DIRECTORY_TREE_ENTRY_TYPE_ROOT); 42 | this._buildHierarchy(this.root); 43 | }); 44 | } 45 | 46 | _buildHierarchy(storageEntry) { 47 | const childIds = this._getChildIds(storageEntry); 48 | 49 | storageEntry.storages = {}; 50 | storageEntry.streams = {}; 51 | 52 | for(const childId of childIds) { 53 | const childEntry = this._entries[childId]; 54 | const name = childEntry.name; 55 | if (childEntry.type === DIRECTORY_TREE_ENTRY_TYPE_STORAGE) { 56 | storageEntry.storages[name] = childEntry; 57 | } 58 | if (childEntry.type === DIRECTORY_TREE_ENTRY_TYPE_STREAM) { 59 | storageEntry.streams[name] = childEntry; 60 | } 61 | } 62 | 63 | for(const name in storageEntry.storages) { 64 | this._buildHierarchy(storageEntry.storages[name]); 65 | } 66 | } 67 | 68 | _getChildIds(storageEntry) { 69 | const childIds = []; 70 | 71 | const visit = (visitEntry) => { 72 | if (visitEntry.left !== DIRECTORY_TREE_LEAF) { 73 | childIds.push(visitEntry.left); 74 | visit(this._entries[visitEntry.left]); 75 | } 76 | if (visitEntry.right !== DIRECTORY_TREE_LEAF) { 77 | childIds.push(visitEntry.right); 78 | visit(this._entries[visitEntry.right]); 79 | } 80 | 
}; 81 | 82 | if (storageEntry.storageDirId > -1) { 83 | childIds.push(storageEntry.storageDirId); 84 | const rootChildEntry = this._entries[storageEntry.storageDirId]; 85 | visit(rootChildEntry); 86 | } 87 | 88 | return childIds; 89 | } 90 | 91 | } 92 | 93 | module.exports = DirectoryTree; 94 | -------------------------------------------------------------------------------- /lib/ole-header.js: -------------------------------------------------------------------------------- 1 | const HEADER_DATA = Buffer.from('D0CF11E0A1B11AE1', 'hex'); 2 | 3 | class Header { 4 | 5 | constructor() {} 6 | 7 | load(buffer) { 8 | for(let i = 0; i < HEADER_DATA.length; i++) { 9 | if (HEADER_DATA[i] != buffer[i]) 10 | return false; 11 | } 12 | 13 | this.secSize = 1 << buffer.readInt16LE(30); // Size of sectors 14 | this.shortSecSize = 1 << buffer.readInt16LE(32); // Size of short sectors 15 | this.SATSize = buffer.readInt32LE(44); // Number of sectors used for the Sector Allocation Table 16 | this.dirSecId = buffer.readInt32LE(48); // Starting Sec ID of the directory stream 17 | this.shortStreamMax = buffer.readInt32LE(56); // Maximum size of a short stream 18 | this.SSATSecId = buffer.readInt32LE(60); // Starting Sec ID of the Short Sector Allocation Table 19 | this.SSATSize = buffer.readInt32LE(64); // Number of sectors used for the Short Sector Allocation Table 20 | this.MSATSecId = buffer.readInt32LE(68); // Starting Sec ID of the Master Sector Allocation Table 21 | this.MSATSize = buffer.readInt32LE(72); // Number of sectors used for the Master Sector Allocation Table 22 | 23 | // The first 109 sectors of the MSAT 24 | this.partialMSAT = new Array(109); 25 | for(let i = 0; i < 109; i++) 26 | this.partialMSAT[i] = buffer.readInt32LE(76 + i * 4); 27 | 28 | return true; 29 | } 30 | 31 | } 32 | 33 | module.exports = Header; 34 | -------------------------------------------------------------------------------- /lib/ole-storage-stream.js: 
-------------------------------------------------------------------------------- 1 | const { Readable } = require('stream'); 2 | 3 | class StorageStream extends Readable { 4 | 5 | constructor(doc, streamEntry) { 6 | super(); 7 | this._doc = doc; 8 | this._streamEntry = streamEntry; 9 | this.initialize(); 10 | } 11 | 12 | initialize() { 13 | this._index = 0; 14 | this._done = true; 15 | 16 | if (!this._streamEntry) { 17 | return; 18 | } 19 | 20 | const doc = this._doc; 21 | this._bytes = this._streamEntry.size; 22 | 23 | this._allocationTable = doc._SAT; 24 | this._shortStream = false; 25 | if (this._bytes < doc._header.shortStreamMax) { 26 | this._shortStream = true; 27 | this._allocationTable = doc._SSAT; 28 | } 29 | 30 | this._secIds = this._allocationTable.getSecIdChain(this._streamEntry.secId); 31 | this._done = false; 32 | } 33 | 34 | _readSector(sector) { 35 | if (this._shortStream) { 36 | return this._doc._readShortSector(sector); 37 | } else { 38 | return this._doc._readSector(sector); 39 | } 40 | } 41 | 42 | _read() { 43 | if (this._done) { 44 | return this.push(null); 45 | } 46 | 47 | if (this._index >= this._secIds.length) { 48 | this._done = true; 49 | return this.push(null); 50 | } 51 | 52 | return this._readSector(this._secIds[this._index]) 53 | .then((buffer) => { 54 | if (this._bytes - buffer.length < 0) { 55 | buffer = buffer.slice(0, this._bytes); 56 | } 57 | 58 | this._bytes -= buffer.length; 59 | this._index ++; 60 | this.push(buffer); 61 | }); 62 | } 63 | } 64 | 65 | module.exports = StorageStream; -------------------------------------------------------------------------------- /lib/ole-storage.js: -------------------------------------------------------------------------------- 1 | const StorageStream = require('./ole-storage-stream'); 2 | 3 | class Storage { 4 | 5 | constructor(doc, dirEntry) { 6 | this._doc = doc; 7 | this._dirEntry = dirEntry; 8 | } 9 | 10 | storage(storageName) { 11 | return new Storage(this._doc, 
this._dirEntry.storages[storageName]); 12 | } 13 | 14 | stream(streamName) { 15 | return new StorageStream(this._doc, this._dirEntry.streams[streamName]); 16 | } 17 | 18 | } 19 | 20 | module.exports = Storage; -------------------------------------------------------------------------------- /lib/open-office-extractor.js: -------------------------------------------------------------------------------- 1 | 2 | /** 3 | * @module open-office-extractor 4 | * 5 | * @description 6 | * Implements the main Open Office format extractor. Open Office .docx files 7 | * are essentially zip files containing streams, and each of these streams contains 8 | * XML content in one form or another. So we need to use {@link zlib} to extract 9 | * the streams, and something like `sax-js` to parse the XML that we find 10 | * there. 11 | * 12 | * We probably don't need the whole of the Open Office data, we're only likely 13 | * to need a few streams. Sadly, the documentation for the file format is literally 14 | * 5000 pages. 15 | * Note that [WordOleExtractor]{@link module:word-ole-extractor~WordOleExtractor} is 16 | * used for older, OLE-style, compound document files. 17 | */ 18 | 19 | const path = require('path'); 20 | const SAXES = require("saxes"); 21 | const yauzl = require('yauzl'); 22 | 23 | const BufferReader = require('./buffer-reader'); 24 | const FileReader = require('./file-reader'); 25 | const Document = require('./document'); 26 | 27 | // function getEntryWeight(filename) { 28 | // return 1; 29 | // } 30 | 31 | function each(callback, array, index) { 32 | if (index === array.length) { 33 | return Promise.resolve(); 34 | } else { 35 | return Promise.resolve(callback(array[index++])) 36 | .then(() => each(callback, array, index)); 37 | } 38 | } 39 | 40 | /** 41 | * @class 42 | * The main class implementing extraction from Open Office Word files. 
43 | */ 44 | class OpenOfficeExtractor { 45 | 46 | constructor() { 47 | this._streamTypes = { 48 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml': true, 49 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.comments+xml': true, 50 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.commentsExtended+xml': true, 51 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml': true, 52 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.endnotes+xml': true, 53 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml': true, 54 | 'application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml': true, 55 | 'application/vnd.openxmlformats-package.relationships+xml': true 56 | }; 57 | this._headerTypes = { 58 | 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/header': true, 59 | 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/footer': true 60 | }; 61 | this._actions = {}; 62 | this._defaults = {}; 63 | } 64 | 65 | shouldProcess(filename) { 66 | if (this._actions[filename]) { 67 | return true; 68 | } 69 | const extension = path.posix.extname(filename).replace(/^\./, ''); 70 | if (! 
extension) { 71 | return false; 72 | } 73 | const defaultType = this._defaults[extension]; 74 | if (defaultType && this._streamTypes[defaultType]) { 75 | return true; 76 | } 77 | return false; 78 | } 79 | 80 | openArchive(reader) { 81 | if (BufferReader.isBufferReader(reader)) { 82 | return new Promise((resolve, reject) => { 83 | yauzl.fromBuffer(reader.buffer(), {lazyEntries: true}, function(err, zipfile) { 84 | if (err) { 85 | return reject(err); 86 | } 87 | resolve(zipfile); 88 | }); 89 | }); 90 | } else if (FileReader.isFileReader(reader)) { 91 | return new Promise((resolve, reject) => { 92 | yauzl.fromFd(reader.fd(), {lazyEntries: true, autoClose: false}, function(err, zipfile) { 93 | if (err) { 94 | return reject(err); 95 | } 96 | resolve(zipfile); 97 | }); 98 | }); 99 | } else { 100 | throw new Error("Unexpected reader type: " + reader.constructor.name); 101 | } 102 | } 103 | 104 | processEntries(zipfile) { 105 | let entryTable = {}; 106 | let entryNames = []; 107 | return new Promise((resolve, reject) => { 108 | zipfile.readEntry(); 109 | zipfile.on("error", reject); 110 | zipfile.on("entry", (entry) => { 111 | const filename = entry.fileName; 112 | 113 | entryTable[filename] = entry; 114 | entryNames.push(filename); 115 | zipfile.readEntry(); 116 | }); 117 | zipfile.on("end", () => resolve(this._document)); 118 | }) 119 | .then(() => { 120 | 121 | // Re-order, so the content types are always loaded first 122 | const index = entryNames.indexOf('[Content_Types].xml'); 123 | if (index === -1) { 124 | throw new Error("Invalid Open Office XML: missing content types"); 125 | } 126 | 127 | entryNames.splice(index, 1); 128 | entryNames.unshift('[Content_Types].xml'); 129 | this._actions['[Content_Types].xml'] = true; 130 | 131 | return each((name) => { 132 | if (this.shouldProcess(name)) { 133 | return this.handleEntry(zipfile, entryTable[name]); 134 | } 135 | }, entryNames, 0); 136 | }); 137 | } 138 | 139 | extract(reader) { 140 | let archive = 
this.openArchive(reader); 141 | 142 | this._document = new Document(); 143 | this._relationships = {}; 144 | this._entryTable = {}; 145 | this._entries = []; 146 | 147 | return archive 148 | .then((zipfile) => this.processEntries(zipfile)) 149 | .then(() => { 150 | let document = this._document; 151 | if (document._textboxes && document._textboxes.length > 0) { 152 | document._textboxes = document._textboxes + "\n"; 153 | } 154 | if (document._headerTextboxes && document._headerTextboxes.length > 0) { 155 | document._headerTextboxes = document._headerTextboxes + "\n"; 156 | } 157 | return document; 158 | }); 159 | 160 | } 161 | 162 | handleOpenTag(node) { 163 | if (node.name === 'Override') { 164 | const actionFunction = this._streamTypes[node.attributes['ContentType']]; 165 | if (actionFunction) { 166 | const partName = node.attributes['PartName'].replace(/^[/]+/, ''); 167 | const action = {action: actionFunction, type: node.attributes['ContentType']}; 168 | this._actions[partName] = action; 169 | } 170 | } else if (node.name === 'Default') { 171 | const extension = node.attributes['Extension']; 172 | const contentType = node.attributes['ContentType']; 173 | this._defaults[extension] = contentType; 174 | } else if (node.name === 'Relationship') { 175 | // console.log(this._source, node); 176 | this._relationships[node.attributes['Id']] = { 177 | type: node.attributes['Type'], 178 | target: node.attributes['Target'], 179 | }; 180 | } else if (node.name === 'w:document' || 181 | node.name === 'w:footnotes' || 182 | node.name === 'w:endnotes' || 183 | node.name === 'w:comments') { 184 | this._context = ['content', 'body']; 185 | this._pieces = []; 186 | } else if (node.name === 'w:hdr' || 187 | node.name === 'w:ftr') { 188 | this._context = ['content', 'header']; 189 | this._pieces = []; 190 | } else if (node.name === 'w:endnote' || node.name === 'w:footnote') { 191 | const type = (node.attributes['w:type'] || this._context[0]); 192 | this._context.unshift(type); 193 
| } else if (node.name === 'w:tab' && this._context[0] === 'content') { 194 | this._pieces.push("\t"); 195 | } else if (node.name === 'w:br' && this._context[0] === 'content') { 196 | if ((node.attributes['w:type'] || '') === 'page') { 197 | this._pieces.push("\n"); 198 | } else { 199 | this._pieces.push("\n"); 200 | } 201 | } else if (node.name === 'w:del' || node.name === 'w:instrText') { 202 | this._context.unshift('deleted'); 203 | } else if (node.name === 'w:tabs') { 204 | this._context.unshift('tabs'); 205 | } else if (node.name === 'w:tc') { 206 | this._context.unshift('cell'); 207 | } else if (node.name === 'w:drawing') { 208 | this._context.unshift('drawing'); 209 | } else if (node.name === 'w:txbxContent') { 210 | this._context.unshift(this._pieces); 211 | this._context.unshift('textbox'); 212 | this._pieces = []; 213 | } 214 | } 215 | 216 | handleCloseTag(node) { 217 | if (node.name === 'w:document') { 218 | this._context = null; 219 | this._document._body = this._pieces.join(""); 220 | } else if (node.name === 'w:footnote' || node.name === 'w:endnote') { 221 | this._context.shift(); 222 | } else if (node.name === 'w:footnotes') { 223 | this._context = null; 224 | this._document._footnotes = this._pieces.join(""); 225 | } else if (node.name === 'w:endnotes') { 226 | this._context = null; 227 | this._document._endnotes = this._pieces.join(""); 228 | } else if (node.name === 'w:comments') { 229 | this._context = null; 230 | this._document._annotations = this._pieces.join(""); 231 | } else if (node.name === 'w:hdr') { 232 | this._context = null; 233 | this._document._headers = this._document._headers + this._pieces.join(""); 234 | } else if (node.name === 'w:ftr') { 235 | this._context = null; 236 | this._document._footers = this._document._footers + this._pieces.join(""); 237 | } else if (node.name === 'w:p') { 238 | if (this._context[0] === 'content' || this._context[0] === 'cell' || this._context[0] === 'textbox') { 239 | this._pieces.push("\n"); 240 | } 
241 | } else if (node.name === 'w:del' || node.name === 'w:instrText') { 242 | this._context.shift(); 243 | } else if (node.name === 'w:tabs') { 244 | this._context.shift(); 245 | } else if (node.name === 'w:tc') { 246 | this._pieces.pop(); 247 | this._pieces.push("\t"); 248 | this._context.shift(); 249 | } else if (node.name === 'w:tr') { 250 | this._pieces.push("\n"); 251 | } else if (node.name === 'w:drawing') { 252 | this._context.shift(); 253 | } else if (node.name === 'w:txbxContent') { 254 | const textBox = this._pieces.join(""); 255 | const context = this._context.shift(); 256 | if (context !== 'textbox') { 257 | throw new Error("Invalid textbox context"); 258 | } 259 | this._pieces = this._context.shift(); 260 | 261 | // If in drawing context, discard 262 | if (this._context[0] === 'drawing') 263 | return; 264 | 265 | if (textBox.length == 0) 266 | return; 267 | 268 | const inHeader = this._context.includes('header'); 269 | const documentField = (inHeader) ? '_headerTextboxes' : '_textboxes'; 270 | if (this._document[documentField]) { 271 | this._document[documentField] = this._document[documentField] + "\n" + textBox; 272 | } else { 273 | this._document[documentField] = textBox; 274 | } 275 | } 276 | } 277 | 278 | createXmlParser() { 279 | const parser = new SAXES.SaxesParser(); 280 | 281 | parser.on("opentag", (node) => { 282 | try { 283 | this.handleOpenTag(node); 284 | } catch (e) { 285 | parser.fail(e.message); 286 | } 287 | }); 288 | 289 | parser.on('closetag', (node) => { 290 | try { 291 | this.handleCloseTag(node); 292 | } catch (e) { 293 | parser.fail(e.message); 294 | } 295 | }); 296 | 297 | parser.on('text', (string) => { 298 | try { 299 | if (! 
this._context) 300 | return; 301 | if (this._context[0] === 'content' || this._context[0] === 'cell' || this._context[0] === 'textbox') { 302 | this._pieces.push(string); 303 | } 304 | } catch (e) { 305 | parser.fail(e.message); 306 | } 307 | }); 308 | 309 | return parser; 310 | } 311 | 312 | handleEntry(zipfile, entry) { 313 | return new Promise((resolve, reject) => { 314 | zipfile.openReadStream(entry, (err, readStream) => { 315 | if (err) { 316 | return reject(err); 317 | } 318 | 319 | this._source = entry.fileName; 320 | const parser = this.createXmlParser(); 321 | parser.on("error", (e) => { 322 | readStream.destroy(e); 323 | reject(e); 324 | }); 325 | parser.on("end", () => resolve()); 326 | readStream.on("end", () => parser.close()); 327 | readStream.on("error", (e) => reject(e)); 328 | readStream.on("readable", () => { 329 | // eslint-disable-next-line no-constant-condition 330 | while (true) { 331 | const chunk = readStream.read(0x1000); 332 | if (chunk === null) { 333 | return; 334 | } 335 | 336 | parser.write(chunk); 337 | } 338 | }); 339 | }); 340 | }); 341 | } 342 | 343 | } 344 | 345 | module.exports = OpenOfficeExtractor; 346 | -------------------------------------------------------------------------------- /lib/word-ole-extractor.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @module word-ole-extractor 3 | * 4 | * @description 5 | * Implements the main logic of extracting text from "classic" OLE-based Word files. 6 | * Depends on [OleCompoundDoc]{@link module:ole-compound-doc~OleCompoundDoc} 7 | * for most of the underlying OLE logic. Note that 8 | * [OpenOfficeExtractor]{@link module:open-office-extractor~OpenOfficeExtractor} is 9 | * used for newer, Open Office-style, files. 
10 | */ 11 | 12 | const OleCompoundDoc = require('./ole-compound-doc'); 13 | const Document = require('./document'); 14 | const { binaryToUnicode, clean } = require('./filters'); 15 | 16 | /** 17 | * Constant for the deletion character SPRM. 18 | */ 19 | const sprmCFRMarkDel = 0x00; 20 | 21 | /** 22 | * Given a cp-style file offset, finds the containing piece index. 23 | * @param {*} offset the character offset 24 | * @returns the piece index 25 | * 26 | * @todo 27 | * Might be better using a binary search 28 | */ 29 | const getPieceIndexByCP = (pieces, position) => { 30 | for (let i = 0; i < pieces.length; i++) { 31 | const piece = pieces[i]; 32 | if (position <= piece.endCp) { 33 | return i; 34 | } 35 | } 36 | }; 37 | 38 | /** 39 | * Given a file-style offset, finds the containing piece index. 40 | * @param {*} offset the character offset 41 | * @returns the piece index 42 | * 43 | * @todo 44 | * Might be better using a binary search 45 | */ 46 | const getPieceIndexByFilePos = (pieces, position) => { 47 | for (let i = 0; i < pieces.length; i++) { 48 | const piece = pieces[i]; 49 | if (position <= piece.endFilePos) { 50 | return i; 51 | } 52 | } 53 | }; 54 | 55 | /** 56 | * Reads and extracts a character range from the pieces. This returns the 57 | * plain text within the pieces in the given range. 58 | * @param {*} start the start offset 59 | * @param {*} end the end offset 60 | * @returns a character string 61 | */ 62 | function getTextRangeByCP(pieces, start, end) { 63 | const startPiece = getPieceIndexByCP(pieces, start); 64 | const endPiece = getPieceIndexByCP(pieces, end); 65 | const result = []; 66 | for (let i = startPiece, end1 = endPiece; i <= end1; i++) { 67 | const piece = pieces[i]; 68 | const xstart = i === startPiece ? start - piece.startCp : 0; 69 | const xend = i === endPiece ? 
end - piece.startCp : piece.endCp; 70 | result.push(piece.text.substring(xstart, xend)); 71 | } 72 | 73 | return result.join(""); 74 | } 75 | 76 | 77 | /** 78 | * Given a piece, and a starting and ending cp-style file offset, 79 | * and a replacement character, updates the piece text to replace 80 | * between start and end with the given character. 81 | * @param {*} piece the piece 82 | * @param {*} start the starting character offset 83 | * @param {*} end the endingcharacter offset 84 | * @param {*} character the replacement character 85 | */ 86 | function fillPieceRange(piece, start, end, character) { 87 | const pieceStart = piece.startCp; 88 | const pieceEnd = pieceStart + piece.length; 89 | const original = piece.text; 90 | if (start < pieceStart) start = pieceStart; 91 | if (end > pieceEnd) end = pieceEnd; 92 | const modified = 93 | ((start == pieceStart) ? '' : original.slice(0, start - pieceStart)) + 94 | ''.padStart(end - start, character) + 95 | ((end == pieceEnd) ? '' : original.slice(end - pieceEnd)); 96 | piece.text = modified; 97 | } 98 | 99 | /** 100 | * Given a piece, and a starting and ending filePos-style file offset, 101 | * and a replacement character, updates the piece text to replace 102 | * between start and end with the given character. This is used when 103 | * applying character styles, which use filePos values rather than cp 104 | * values. 105 | * 106 | * @param {*} piece the piece 107 | * @param {*} start the starting character offset 108 | * @param {*} end the endingcharacter offset 109 | * @param {*} character the replacement character 110 | */ 111 | function fillPieceRangeByFilePos(piece, start, end, character) { 112 | const pieceStart = piece.startFilePos; 113 | const pieceEnd = pieceStart + piece.size; 114 | const original = piece.text; 115 | if (start < pieceStart) start = pieceStart; 116 | if (end > pieceEnd) end = pieceEnd; 117 | const modified = 118 | ((start == pieceStart) ? 
'' : original.slice(0, (start - pieceStart) / piece.bpc)) + 119 | ''.padStart((end - start) / piece.bpc, character) + 120 | ((end == pieceEnd) ? '' : original.slice((end - pieceEnd) / piece.bpc)); 121 | piece.text = modified; 122 | } 123 | 124 | /** 125 | * Replaces a selected range in the piece table, overwriting the selection with 126 | * the given character. The length of segments in the piece table must never be 127 | * changed. 128 | * @param {*} pieces 129 | * @param {*} start 130 | * @param {*} end 131 | * @param {*} character 132 | */ 133 | function replaceSelectedRange(pieces, start, end, character) { // eslint-disable-line no-unused-vars 134 | const startPiece = getPieceIndexByCP(pieces, start); 135 | const endPiece = getPieceIndexByCP(pieces, end); 136 | for (let i = startPiece, end1 = endPiece; i <= end1; i++) { 137 | const piece = pieces[i]; 138 | fillPieceRange(piece, start, end, character); 139 | } 140 | } 141 | 142 | /** 143 | * Replaces a selected range in the piece table, overwriting the selection with 144 | * the given character. The length of segments in the piece table must never be 145 | * changed. The start and end values are found by file position. 146 | * @param {*} pieces 147 | * @param {*} start 148 | * @param {*} end 149 | * @param {*} character 150 | */ 151 | function replaceSelectedRangeByFilePos(pieces, start, end, character) { 152 | const startPiece = getPieceIndexByFilePos(pieces, start); 153 | const endPiece = getPieceIndexByFilePos(pieces, end); 154 | for (let i = startPiece, end1 = endPiece; i <= end1; i++) { 155 | const piece = pieces[i]; 156 | fillPieceRangeByFilePos(piece, start, end, character); 157 | } 158 | } 159 | 160 | /** 161 | * Marks a range as deleted. It does this by overwriting it with null characters, 162 | * wich then get removed during the later cleaning process. 
163 | * @param {*} pieces 164 | * @param {*} start 165 | * @param {*} end 166 | */ 167 | function markDeletedRange(pieces, start, end) { 168 | replaceSelectedRangeByFilePos(pieces, start, end, '\x00'); 169 | } 170 | 171 | /** 172 | * Called to iterate over a set of SPRMs in a buffer, starting at 173 | * a gived offset. The handler is called with the arguments: 174 | * buffer, offset, sprm, ispmd, fspec, sgc, spra. 175 | * @param {*} buffer the buffer 176 | * @param {*} offset the starting offset 177 | * @param {*} handler the function to call for each SPRM 178 | */ 179 | const processSprms = (buffer, offset, handler) => { 180 | while (offset < buffer.length - 1) { 181 | const sprm = buffer.readUInt16LE(offset); 182 | const ispmd = sprm & 0x1ff; 183 | const fspec = (sprm >>> 9) & 0x01; 184 | const sgc = (sprm >>> 10) & 0x07; 185 | const spra = sprm >>> 13; 186 | 187 | offset += 2; 188 | 189 | handler(buffer, offset, sprm, ispmd, fspec, sgc, spra); 190 | 191 | if (spra === 0) { 192 | offset += 1; 193 | continue; 194 | } else if (spra === 1) { 195 | offset += 1; 196 | continue; 197 | } else if (spra === 2) { 198 | offset += 2; 199 | continue; 200 | } else if (spra === 3) { 201 | offset += 4; 202 | continue; 203 | } else if (spra === 4 || spra === 5) { 204 | offset += 2; 205 | continue; 206 | } else if (spra === 6) { 207 | offset += buffer.readUInt8(offset) + 1; 208 | continue; 209 | } else if (spra === 7) { 210 | offset += 3; 211 | continue; 212 | } else { 213 | throw new Error("Unparsed sprm"); 214 | } 215 | 216 | } 217 | }; 218 | 219 | /** 220 | * @class 221 | * The main class implementing extraction from OLE-based Word files. 222 | * This handles all the extraction and conversion logic. 223 | */ 224 | class WordOleExtractor { 225 | 226 | constructor() { 227 | this._pieces = []; 228 | this._bookmarks = {}; 229 | this._boundaries = {}; 230 | this._taggedHeaders = []; 231 | } 232 | 233 | /** 234 | * The main extraction method. 
This creates an OLE compound document 235 | * interface, then opens up a stream and extracts out the main 236 | * stream. 237 | * @param {*} reader 238 | */ 239 | extract(reader) { 240 | const document = new OleCompoundDoc(reader); 241 | return document.read() 242 | .then(() => 243 | this.documentStream(document, 'WordDocument') 244 | .then((stream) => this.streamBuffer(stream)) 245 | .then((buffer) => this.extractWordDocument(document, buffer)) 246 | ); 247 | } 248 | 249 | /** 250 | * Builds and returns a {@link Document} object corresponding to the text 251 | * in the original document. This involves reading and retrieving the text 252 | * ranges corresponding to the primary document parts. The text segments are 253 | * read from the extracted table of text pieces. 254 | * @returns a {@link Document} object 255 | */ 256 | buildDocument() { 257 | const document = new Document(); 258 | const pieces = this._pieces; 259 | 260 | let start = 0; 261 | 262 | document._body = clean(getTextRangeByCP(pieces, start, start + this._boundaries.ccpText)); 263 | start += this._boundaries.ccpText; 264 | 265 | if (this._boundaries.ccpFtn) { 266 | document._footnotes = clean(getTextRangeByCP(pieces, start, start + this._boundaries.ccpFtn - 1)); 267 | start += this._boundaries.ccpFtn; 268 | } 269 | 270 | if (this._boundaries.ccpHdd) { 271 | // Replaced old single-block data with tagged selection. 
See #34 272 | // document._headers = clean(getTextRangeByCP(pieces, start, start + this._boundaries.ccpHdd - 1)); 273 | document._headers = clean(this._taggedHeaders.filter((s) => s.type === 'headers').map((s) => s.text).join("")); 274 | document._footers = clean(this._taggedHeaders.filter((s) => s.type === 'footers').map((s) => s.text).join("")); 275 | 276 | start += this._boundaries.ccpHdd; 277 | } 278 | 279 | if (this._boundaries.ccpAtn) { 280 | document._annotations = clean(getTextRangeByCP(pieces, start, start + this._boundaries.ccpAtn - 1)); 281 | start += this._boundaries.ccpAtn; 282 | } 283 | 284 | if (this._boundaries.ccpEdn) { 285 | document._endnotes = clean(getTextRangeByCP(pieces, start, start + this._boundaries.ccpEdn - 1)); 286 | start += this._boundaries.ccpEdn; 287 | } 288 | 289 | if (this._boundaries.ccpTxbx) { 290 | document._textboxes = clean(getTextRangeByCP(pieces, start, start + this._boundaries.ccpTxbx - 1)); 291 | start += this._boundaries.ccpTxbx; 292 | } 293 | 294 | if (this._boundaries.ccpHdrTxbx) { 295 | document._headerTextboxes = clean(getTextRangeByCP(pieces, start, start + this._boundaries.ccpHdrTxbx - 1)); 296 | start += this._boundaries.ccpHdrTxbx; 297 | } 298 | 299 | return document; 300 | } 301 | 302 | /** 303 | * Main logic top level function for unpacking a Word document 304 | * @param {*} document the OLE document 305 | * @param {*} buffer a buffer 306 | * @returns a Promise which resolves to a {@link Document} 307 | */ 308 | extractWordDocument(document, buffer) { 309 | const magic = buffer.readUInt16LE(0); 310 | if (magic !== 0xa5ec) { 311 | return Promise.reject(new Error(`This does not seem to be a Word document: Invalid magic number: ${magic.toString(16)}`)); 312 | } 313 | 314 | const flags = buffer.readUInt16LE(0xA); 315 | 316 | const streamName = (flags & 0x0200) !== 0 ? 
"1Table" : "0Table"; 317 | 318 | return this.documentStream(document, streamName) 319 | .then((stream) => this.streamBuffer(stream)) 320 | .then((streamBuffer) => { 321 | this._boundaries.fcMin = buffer.readUInt32LE(0x0018); 322 | this._boundaries.ccpText = buffer.readUInt32LE(0x004c); 323 | this._boundaries.ccpFtn = buffer.readUInt32LE(0x0050); 324 | this._boundaries.ccpHdd = buffer.readUInt32LE(0x0054); 325 | this._boundaries.ccpAtn = buffer.readUInt32LE(0x005c); 326 | this._boundaries.ccpEdn = buffer.readUInt32LE(0x0060); 327 | this._boundaries.ccpTxbx = buffer.readUInt32LE(0x0064); 328 | this._boundaries.ccpHdrTxbx = buffer.readUInt32LE(0x0068); 329 | 330 | this.writeBookmarks(buffer, streamBuffer); 331 | this.writePieces(buffer, streamBuffer); 332 | this.writeCharacterProperties(buffer, streamBuffer); 333 | this.writeParagraphProperties(buffer, streamBuffer); 334 | this.normalizeHeaders(buffer, streamBuffer); 335 | 336 | return this.buildDocument(); 337 | }); 338 | } 339 | 340 | /** 341 | * Returns a promise that resolves to the named stream. 342 | * @param {*} document 343 | * @param {*} streamName 344 | * @returns a promise that resolves to the named stream 345 | */ 346 | documentStream(document, streamName) { 347 | return Promise.resolve(document.stream(streamName)); 348 | } 349 | 350 | /** 351 | * Returns a promise that resolves to a Buffer containing the contents of 352 | * the given stream. 
353 | * @param {*} stream 354 | * @returns a promise that resolves to the sream contents 355 | */ 356 | streamBuffer(stream) { 357 | return new Promise((resolve, reject) => { 358 | const chunks = []; 359 | stream.on('data', (chunk) => chunks.push(chunk)); 360 | stream.on('error', (error) => reject(error)); 361 | stream.on('end', () => resolve(Buffer.concat(chunks))); 362 | return stream; 363 | }); 364 | } 365 | 366 | writeFields(buffer, tableBuffer, result) { // eslint-disable-line no-unused-vars 367 | const fcPlcffldMom = buffer.readInt32LE(0x011a); 368 | const lcbPlcffldMom = buffer.readUInt32LE(0x011e); 369 | //console.log(fcPlcffldMom, lcbPlcffldMom, tableBuffer.length); 370 | 371 | if (lcbPlcffldMom == 0) { 372 | return; 373 | } 374 | 375 | const fieldCount = (lcbPlcffldMom - 4) / 6; 376 | //console.log("extracting", fieldCount, "fields"); 377 | 378 | const dataOffset = (fieldCount + 1) * 4; 379 | 380 | const plcffldMom = tableBuffer.slice(fcPlcffldMom, fcPlcffldMom + lcbPlcffldMom); 381 | for(let i = 0; i < fieldCount; i++) { 382 | const cp = plcffldMom.readUInt32LE(i * 4); // eslint-disable-line no-unused-vars 383 | const fld = plcffldMom.readUInt16LE(dataOffset + i * 2); 384 | const byte1 = fld & 0xff; 385 | const byte2 = fld >>> 8; // eslint-disable-line no-unused-vars 386 | if ((byte1 & 0x1f) == 19) { 387 | //console.log("A", i, cp, byte1.toString(16), byte2.toString(16)); 388 | } else { 389 | //console.log("B", i, cp, byte1.toString(16), byte2.toString(16)); 390 | } 391 | } 392 | } 393 | 394 | /** 395 | * Extracts and stores the document bookmarks into a local field. 
396 | * @param {*} buffer 397 | * @param {*} tableBuffer 398 | */ 399 | writeBookmarks(buffer, tableBuffer) { 400 | const fcSttbfBkmk = buffer.readUInt32LE(0x0142); 401 | const lcbSttbfBkmk = buffer.readUInt32LE(0x0146); 402 | const fcPlcfBkf = buffer.readUInt32LE(0x014a); 403 | const lcbPlcfBkf = buffer.readUInt32LE(0x014e); 404 | const fcPlcfBkl = buffer.readUInt32LE(0x0152); 405 | const lcbPlcfBkl = buffer.readUInt32LE(0x0156); 406 | 407 | if (lcbSttbfBkmk === 0) { 408 | return; 409 | } 410 | 411 | const sttbfBkmk = tableBuffer.slice(fcSttbfBkmk, fcSttbfBkmk + lcbSttbfBkmk); 412 | const plcfBkf = tableBuffer.slice(fcPlcfBkf, fcPlcfBkf + lcbPlcfBkf); 413 | const plcfBkl = tableBuffer.slice(fcPlcfBkl, fcPlcfBkl + lcbPlcfBkl); 414 | 415 | const fcExtend = sttbfBkmk.readUInt16LE(0); 416 | const cData = sttbfBkmk.readUInt16LE(2); // eslint-disable-line no-unused-vars 417 | const cbExtra = sttbfBkmk.readUInt16LE(4); // eslint-disable-line no-unused-vars 418 | 419 | if (fcExtend !== 0xffff) { 420 | throw new Error("Internal error: unexpected single-byte bookmark data"); 421 | } 422 | 423 | let offset = 6; 424 | const index = 0; 425 | 426 | while (offset < lcbSttbfBkmk) { 427 | let length = sttbfBkmk.readUInt16LE(offset); 428 | length = length * 2; 429 | const segment = sttbfBkmk.slice(offset + 2, offset + 2 + length); 430 | const cpStart = plcfBkf.readUInt32LE(index * 4); 431 | const cpEnd = plcfBkl.readUInt32LE(index * 4); 432 | this._bookmarks[segment] = {start: cpStart, end: cpEnd}; 433 | offset = offset + length + 2; 434 | } 435 | } 436 | 437 | /** 438 | * Extracts and stores the document text pieces into a local field. This is 439 | * probably the most crucial part of text extraction, as it is where we 440 | * get text corresponding to character positions. These may be stored in a 441 | * different order in the file compared to the order we want them. 
442 | * 443 | * @param {*} buffer 444 | * @param {*} tableBuffer 445 | */ 446 | writePieces(buffer, tableBuffer) { 447 | let flag; 448 | let pos = buffer.readUInt32LE(0x01a2); 449 | 450 | while (true) { // eslint-disable-line no-constant-condition 451 | flag = tableBuffer.readUInt8(pos); 452 | if (flag !== 1) { break; } 453 | 454 | pos = pos + 1; 455 | const skip = tableBuffer.readUInt16LE(pos); 456 | pos = pos + 2 + skip; 457 | } 458 | 459 | flag = tableBuffer.readUInt8(pos); 460 | pos = pos + 1; 461 | if (flag !== 2) { 462 | throw new Error("Internal error: ccorrupted Word file"); 463 | } 464 | 465 | const pieceTableSize = tableBuffer.readUInt32LE(pos); 466 | pos = pos + 4; 467 | 468 | const pieces = (pieceTableSize - 4) / 12; 469 | 470 | let startCp = 0; 471 | let startStream = 0; 472 | 473 | for (let x = 0, end = pieces - 1; x <= end; x++) { 474 | const offset = pos + ((pieces + 1) * 4) + (x * 8) + 2; 475 | let startFilePos = tableBuffer.readUInt32LE(offset); 476 | let unicode = false; 477 | if ((startFilePos & 0x40000000) === 0) { 478 | unicode = true; 479 | } else { 480 | startFilePos = startFilePos & ~(0x40000000); 481 | startFilePos = Math.floor(startFilePos / 2); 482 | } 483 | const lStart = tableBuffer.readUInt32LE(pos + (x * 4)); 484 | const lEnd = tableBuffer.readUInt32LE(pos + ((x + 1) * 4)); 485 | const totLength = lEnd - lStart; 486 | 487 | const piece = { 488 | startCp, 489 | startStream, 490 | totLength, 491 | startFilePos, 492 | unicode, 493 | bpc: (unicode) ? 
2 : 1 494 | }; 495 | 496 | piece.size = piece.bpc * (lEnd - lStart); 497 | 498 | const textBuffer = buffer.slice(startFilePos, startFilePos + piece.size); 499 | if (unicode) { 500 | piece.text = textBuffer.toString('ucs2'); 501 | } else { 502 | piece.text = binaryToUnicode(textBuffer.toString('binary')); 503 | } 504 | 505 | piece.length = piece.text.length; 506 | 507 | piece.endCp = piece.startCp + piece.length; 508 | piece.endStream = piece.startStream + piece.size; 509 | piece.endFilePos = piece.startFilePos + piece.size; 510 | 511 | startCp = piece.endCp; 512 | startStream = piece.endStream; 513 | 514 | this._pieces.push(piece); 515 | } 516 | } 517 | 518 | /** 519 | * Processes the headers and footers. The main logic here is that we might have a mix 520 | * of "real" and "pseudo" headers. For example, a footnote generates some footnote 521 | * separator footer elements, which, unless they contain something interesting, we 522 | * can dispense with. In fact, we want to dispense with anything which is made up of 523 | * whitespace and control characters, in general. This means locating the segments of 524 | * text in the extracted pieces, and conditionally replacing them with nulls. 
525 | * 526 | * @param {*} buffer 527 | * @param {*} tableBuffer 528 | */ 529 | normalizeHeaders(buffer, tableBuffer) { 530 | const pieces = this._pieces; 531 | 532 | const fcPlcfhdd = buffer.readUInt32LE(0x00f2); 533 | const lcbPlcfhdd = buffer.readUInt32LE(0x00f6); 534 | if (lcbPlcfhdd < 8) { 535 | return; 536 | } 537 | 538 | const offset = this._boundaries.ccpText + this._boundaries.ccpFtn; 539 | const ccpHdd = this._boundaries.ccpHdd; 540 | 541 | const plcHdd = tableBuffer.slice(fcPlcfhdd, fcPlcfhdd + lcbPlcfhdd); 542 | const plcHddCount = (lcbPlcfhdd / 4); 543 | let start = offset + plcHdd.readUInt32LE(0); 544 | for(let i = 1; i < plcHddCount; i++) { 545 | let end = offset + plcHdd.readUInt32LE(i * 4); 546 | if (end > offset + ccpHdd) { 547 | end = offset + ccpHdd; 548 | } 549 | const string = getTextRangeByCP(pieces, start, end); 550 | const story = i - 1; 551 | if ([0, 1, 2].includes(story)) { 552 | this._taggedHeaders.push({type: 'footnoteSeparators', text: string}); 553 | } else if ([3, 4, 5].includes(story)) { 554 | this._taggedHeaders.push({type: 'endSeparators', text: string}); 555 | } else if ([0, 1, 4].includes(story % 6)) { 556 | this._taggedHeaders.push({type: 'headers', text: string}); 557 | } else if ([2, 3, 5].includes(story % 6)) { 558 | this._taggedHeaders.push({type: 'footers', text: string}); 559 | } 560 | 561 | if (! /[^\r\n\u0002-\u0008]/.test(string)) { 562 | replaceSelectedRange(pieces, start, end, "\x00"); 563 | } else { 564 | replaceSelectedRange(pieces, end - 1, end, "\x00"); 565 | } 566 | 567 | start = end; // eslint-disable-line no-unused-vars 568 | } 569 | 570 | // The last character can always be dropped, but we handle that later anyways. 
571 | } 572 | 573 | writeParagraphProperties(buffer, tableBuffer) { 574 | const pieces = this._pieces; 575 | 576 | const fcPlcfbtePapx = buffer.readUInt32LE(0x0102); 577 | const lcbPlcfbtePapx = buffer.readUInt32LE(0x0106); 578 | 579 | const plcBtePapxCount = (lcbPlcfbtePapx - 4) / 8; 580 | const dataOffset = (plcBtePapxCount + 1) * 4; 581 | const plcBtePapx = tableBuffer.slice(fcPlcfbtePapx, fcPlcfbtePapx + lcbPlcfbtePapx); 582 | 583 | for(let i = 0; i < plcBtePapxCount; i++) { 584 | const cp = plcBtePapx.readUInt32LE(i * 4); // eslint-disable-line no-unused-vars 585 | const papxFkpBlock = plcBtePapx.readUInt32LE(dataOffset + i * 4); 586 | //console.log("paragraph property", cp, papxFkpBlock); 587 | 588 | const papxFkpBlockBuffer = buffer.slice(papxFkpBlock * 512, (papxFkpBlock + 1) * 512); 589 | //console.log("papxFkpBlockBuffer", papxFkpBlockBuffer); 590 | 591 | const crun = papxFkpBlockBuffer.readUInt8(511); 592 | //console.log("crun", crun); 593 | 594 | for(let j = 0; j < crun; j++) { 595 | const rgfc = papxFkpBlockBuffer.readUInt32LE(j * 4); 596 | const rgfcNext = papxFkpBlockBuffer.readUInt32LE((j + 1) * 4); 597 | 598 | const cbLocation = (crun + 1) * 4 + j * 13; 599 | const cbIndex = papxFkpBlockBuffer.readUInt8(cbLocation) * 2; 600 | 601 | const cb = papxFkpBlockBuffer.readUInt8(cbIndex); 602 | let grpPrlAndIstd = null; 603 | if (cb !== 0) { 604 | grpPrlAndIstd = papxFkpBlockBuffer.slice(cbIndex + 1, cbIndex + 1 + (2 * cb) - 1); 605 | } else { 606 | const cb2 = papxFkpBlockBuffer.readUInt8(cbIndex + 1); 607 | grpPrlAndIstd = papxFkpBlockBuffer.slice(cbIndex + 2, cbIndex + 2 + (2 * cb2)); 608 | } 609 | //console.log("para; ", j, "rgfc=", rgfc, "rgfcNext=", rgfcNext, "grpPrlAndIstd=", grpPrlAndIstd); 610 | 611 | const istd = grpPrlAndIstd.readUInt16LE(0); // eslint-disable-line no-unused-vars 612 | processSprms(grpPrlAndIstd, 2, (buffer, offset, sprm, ispmd, fspec, sgc, spra) => { // eslint-disable-line no-unused-vars 613 | //console.log("sprm x", offset, 
sprm.toString(16), ispmd, fspec, sgc, spra); 614 | if (sprm === 0x2417) { 615 | replaceSelectedRangeByFilePos(pieces, rgfc, rgfcNext, '\n'); 616 | } 617 | }); 618 | } 619 | 620 | } 621 | } 622 | 623 | writeCharacterProperties(buffer, tableBuffer) { 624 | const pieces = this._pieces; 625 | 626 | const fcPlcfbteChpx = buffer.readUInt32LE(0x00fa); 627 | const lcbPlcfbteChpx = buffer.readUInt32LE(0x00fe); 628 | 629 | const plcBteChpxCount = (lcbPlcfbteChpx - 4) / 8; 630 | //console.log("character format runs", plcBteChpxCount, fcPlcfbteChpx, lcbPlcfbteChpx); 631 | 632 | const dataOffset = (plcBteChpxCount + 1) * 4; 633 | const plcBteChpx = tableBuffer.slice(fcPlcfbteChpx, fcPlcfbteChpx + lcbPlcfbteChpx); 634 | 635 | //const cpLast = plcBteChpx.readUInt32LE(plcBteChpxCount * 4); 636 | //console.log("last cp", cpLast); 637 | 638 | let lastDeletionEnd = null; 639 | 640 | for(let i = 0; i < plcBteChpxCount; i++) { 641 | const cp = plcBteChpx.readUInt32LE(i * 4); // eslint-disable-line no-unused-vars 642 | const chpxFkpBlock = plcBteChpx.readUInt32LE(dataOffset + i * 4); 643 | //console.log("character property", cp, chpxFkpBlock); 644 | 645 | const chpxFkpBlockBuffer = buffer.slice(chpxFkpBlock * 512, (chpxFkpBlock + 1) * 512); 646 | //console.log("chpxFkpBlockBuffer", chpxFkpBlockBuffer); 647 | 648 | const crun = chpxFkpBlockBuffer.readUInt8(511); 649 | //console.log("crun", crun); 650 | 651 | for(let j = 0; j < crun; j++) { 652 | const rgfc = chpxFkpBlockBuffer.readUInt32LE(j * 4); 653 | const rgfcNext = chpxFkpBlockBuffer.readUInt32LE((j + 1) * 4); 654 | const rgb = chpxFkpBlockBuffer.readUInt8((crun + 1) * 4 + j); 655 | if (rgb == 0) { 656 | //console.log("skipping run; ", j, "rgfc=", rgfc, "rgb=", rgb); 657 | continue; 658 | } 659 | const chpxOffset = rgb * 2; 660 | const cb = chpxFkpBlockBuffer.readUInt8(chpxOffset); 661 | const grpprl = chpxFkpBlockBuffer.slice(chpxOffset + 1, chpxOffset + 1 + cb); 662 | //console.log("found run; ", j, "rgfc=", rgfc, "rgb=", rgb, 
"cb=", cb, "grpprl=", grpprl); 663 | 664 | processSprms(grpprl, 0, (buffer, offset, sprm, ispmd) => { 665 | if (ispmd === sprmCFRMarkDel) { 666 | if ((buffer[offset] & 1) != 1) { 667 | return; 668 | } 669 | 670 | // console.log("text deleted", rgfc, rgfcNext); 671 | if (lastDeletionEnd === rgfc) { 672 | markDeletedRange(pieces, lastDeletionEnd, rgfcNext); 673 | } else { 674 | markDeletedRange(pieces, rgfc, rgfcNext); 675 | } 676 | lastDeletionEnd = rgfcNext; 677 | 678 | // if (ld >= 0 && this._deletions[ld].end === rgfc) { 679 | // this._deletions[ld].end = rgfcNext; 680 | // } else { 681 | // this._deletions.push({start: rgfc, end: rgfcNext}); 682 | // } 683 | } 684 | }); 685 | } 686 | } 687 | } 688 | 689 | } 690 | 691 | module.exports = WordOleExtractor; 692 | -------------------------------------------------------------------------------- /lib/word.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @module word 3 | * 4 | * @description 5 | * The main module for the package. This exports an extractor class, which 6 | * provides a single `extract` method that can be called with either a 7 | * string (filename) or a buffer. 8 | */ 9 | 10 | const { Buffer } = require('buffer'); 11 | 12 | const WordOleExtractor = require('./word-ole-extractor'); 13 | const OpenOfficeExtractor = require('./open-office-extractor'); 14 | 15 | const BufferReader = require('./buffer-reader'); 16 | const FileReader = require('./file-reader'); 17 | 18 | /** 19 | * The main class for the word extraction package. Typically, people will make 20 | * an instance of this class, and call the {@link #extract} method to transform 21 | * a Word file into a {@link Document} instance, which provides the accessors 22 | * needed to read its body, and so on. 23 | */ 24 | class WordExtractor { 25 | 26 | constructor() {} 27 | 28 | /** 29 | * Extracts the main contents of the file. If a Buffer is passed, that 30 | * is used instead. 
Opens the file, and reads the first block, uses that 31 | * to detect whether this is a .doc file or a .docx file, and then calls 32 | * either {@link WordOleDocument#extract} or {@link OpenOfficeDocument#extract} 33 | * accordingly. 34 | * 35 | * @param {string|Buffer} source - either a string filename, or a Buffer containing the file content 36 | * @returns a {@link Document} providing accessors onto the text 37 | */ 38 | extract(source) { 39 | let reader = null; 40 | if (Buffer.isBuffer(source)) { 41 | reader = new BufferReader(source); 42 | } else if (typeof source === 'string') { 43 | reader = new FileReader(source); 44 | } 45 | const buffer = Buffer.alloc(512); 46 | return reader.open() 47 | .then(() => reader.read(buffer, 0, 512, 0)) 48 | .then((buffer) => { 49 | let extractor = null; 50 | 51 | if (buffer.readUInt16BE(0) === 0xd0cf) { 52 | extractor = WordOleExtractor; 53 | } else if (buffer.readUInt16BE(0) === 0x504b) { 54 | const next = buffer.readUInt16BE(2); 55 | if ((next === 0x0304) || (next === 0x0506) || (next === 0x0708)) { 56 | extractor = OpenOfficeExtractor; 57 | } 58 | } 59 | 60 | if (! 
extractor) { 61 | throw new Error("Unable to read this type of file"); 62 | } 63 | 64 | return (new extractor()).extract(reader); 65 | }) 66 | .finally(() => reader.close()); 67 | } 68 | 69 | } 70 | 71 | 72 | module.exports = WordExtractor; 73 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "word-extractor", 3 | "version": "1.0.4", 4 | "description": "Node.js package to read Word .doc files", 5 | "main": "lib/word.js", 6 | "scripts": { 7 | "test": "jest", 8 | "test-watch": "jest --watch", 9 | "coverage": "jest --coverage", 10 | "jsdoc": "jsdoc --configure jsdoc.json" 11 | }, 12 | "repository": { 13 | "type": "git", 14 | "url": "https://github.com/morungos/node-word-extractor.git" 15 | }, 16 | "keywords": [ 17 | "word" 18 | ], 19 | "author": "Stuart Watt ", 20 | "license": "MIT", 21 | "bugs": { 22 | "url": "https://github.com/morungos/node-word-extractor/issues" 23 | }, 24 | "homepage": "https://github.com/morungos/node-word-extractor", 25 | "devDependencies": { 26 | "eslint": "^7.25.0", 27 | "jest": "^26.6.0", 28 | "jest-specific-snapshot": "^4.0.0", 29 | "jsdoc": "^3.6.6" 30 | }, 31 | "dependencies": { 32 | "saxes": "^5.0.1", 33 | "yauzl": "^2.10.0" 34 | }, 35 | "jest": { 36 | "moduleFileExtensions": [ 37 | "js" 38 | ], 39 | "transform": {}, 40 | "transformIgnorePatterns": [], 41 | "testRegex": "(/__tests__/.*?_test)\\.jsx?$", 42 | "collectCoverageFrom": [ 43 | "lib/**/*.js" 44 | ], 45 | "coveragePathIgnorePatterns": [ 46 | "/node_modules/" 47 | ] 48 | } 49 | } 50 | --------------------------------------------------------------------------------