├── .gitignore
├── .npmignore
├── .travis.yml
├── LICENSE
├── README.md
├── bin
    └── pdf-text-extract.js
├── index.js
├── package.json
└── test
    ├── buffered-extract-test.js
    ├── data
        ├── huge.pdf
        ├── multipage.pdf
        ├── multipage.txt
        └── pdf with space in name.pdf
    ├── extract-test.js
    ├── promise-buffered-extract-test.js
    └── promise-extract-test.js


/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules
2 | npm-debug.log
3 | 


--------------------------------------------------------------------------------
/.npmignore:
--------------------------------------------------------------------------------
1 | test/
2 | .travis.yml


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | before_install:
2 |  - sudo apt-get update -qq
3 |  - sudo apt-get install -qq poppler-utils
4 | 
5 | language: node_js
6 | node_js:
7 |    - "stable"
8 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | 
 3 | Copyright (c) 2018, ftorto
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 | 
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |   and/or other materials provided with the distribution.
15 | 
16 | * Neither the name of the copyright holder nor the names of its
17 |   contributors may be used to endorse or promote products derived from
18 |   this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # PDF Text Extract
  2 | 
  3 | Extract text from PDFs that contain searchable text. The module is a wrapper that calls the `pdftotext` command to perform the actual extraction.
  4 | 
  5 | [![Build Status](https://travis-ci.org/nisaacson/pdf-text-extract.png?branch=master)](https://travis-ci.org/nisaacson/pdf-text-extract) [![Dependency Status](https://david-dm.org/nisaacson/pdf-text-extract.png)](https://david-dm.org/nisaacson/pdf-text-extract)
  6 | 
  7 | # Installation
  8 | ```bash
  9 | npm install --save pdf-text-extract
 10 | ```
 11 | 
 12 | 
 13 | You will need the `pdftotext` binary available on your path. There are packages available for many different operating systems
 14 | 
 15 | See [https://github.com/nisaacson/pdf-extract#osx](https://github.com/nisaacson/pdf-extract#osx) for how to install the `pdftotext` command
 16 | 
 17 | 
 18 | # Usage
 19 | 
 20 | ## As a module
 21 | 
 22 | `extract(filePath, [options], [pdftotextcommand], callback)`
 23 | 
 24 | Options and pdftotextcommand are not required.
 25 | 
 26 | 
 27 | ```javascript
 28 | var path = require('path')
 29 | var filePath = path.join(__dirname, 'test/data/multipage.pdf')
 30 | var extract = require('pdf-text-extract')
 31 | extract(filePath, function (err, pages) {
 32 |   if (err) {
 33 |     console.dir(err)
 34 |     return
 35 |   }
 36 |   console.dir(pages)
 37 | })
 38 | ```
 39 | The output will be an array in which each entry is the text of one page. If you want a single string containing all pages, set the option `splitPages: false`.
 40 | 
 41 | ```javascript
 42 | var filePath = path.join(__dirname, 'test/data/multipage.pdf')
 43 | var extract = require('pdf-text-extract')
 44 | extract(filePath, { splitPages: false }, function (err, text) {
 45 |   if (err) {
 46 |     console.dir(err)
 47 |     return
 48 |   }
 49 |   console.dir(text)
 50 | })
 51 | ```
 52 | 
 53 | You can set the following options:
 54 | - `firstPage`: First page to extract
 55 | - `lastPage`: Last page to extract
 56 | - `resolution`: in dpi, as is specified by pdftotext -r
 57 | - `crop`: Should be an object { x:x, y:y, w:w, h:h }
 58 | - `layout`: Should be either `layout`, `raw` or `htmlmeta`. Default: `layout`
 59 | - `encoding`: Should be either `UCS-2`, `ASCII7`, `Latin1`, `UTF-8`, `ZapfDingbats` or `Symbol`. Default: `UTF-8`
 60 | - `eol`: End of line convention. One of either: `unix`, `dos` or `mac`
 61 | - `ownerPassword`: Owner password (for encrypted files)
 62 | - `userPassword`: User password (for encrypted files)
 63 | - `splitPages`: If true, the result will be an array of pages. Default: true.
 64 | 
 65 | 
 66 | If needed, you can also include `child_process.spawn` options (such as `cwd`) in the options object. The options object is passed to the `child_process.spawn` call.
 67 | 
 68 | ```javascript
 69 | var filePath = path.join(__dirname, 'test/data/multipage.pdf')
 70 | var extract = require('pdf-text-extract')
 71 | var options = {
 72 |   cwd: "./"
 73 | }
 74 | extract(filePath, options, function (err, pages) {
 75 |   if (err) {
 76 |     console.dir(err)
 77 |     return
 78 |   }
 79 |   console.log('extracted pages', pages)
 80 | })
 81 | ```
 82 | 
 83 | You can also override the command for `pdftotext` if it is installed in a location that is not available in the `PATH` environment variable
 84 | 
 85 | 
 86 | ```javascript
 87 | var filePath = path.join(__dirname, 'test/data/multipage.pdf')
 88 | var pdfToTextCommand = '/opt/bin/pdftotext'
 89 | var extract = require('pdf-text-extract')
 90 | var options = {
 91 |   cwd: "./"
 92 | }
 93 | extract(filePath, options, pdfToTextCommand, function (err, pages) {
 94 |   if (err) {
 95 |     console.dir(err)
 96 |     return
 97 |   }
 98 |   console.log('extracted pages', pages)
 99 | })
100 | ```
101 | 
102 | 
103 | ES6 promises are supported. You can now call .then(onFulfilled[, onRejected]):
104 | 
105 | ```javascript
106 | var filePath = path.join(__dirname, 'test/data/multipage.pdf')
107 | var Extract = require('pdf-text-extract')
108 | var extract = new Extract(filePath)
109 | 
110 | extract.then(function (pages) {
111 |   console.log('extracted pages', pages)
112 | }).catch(function (err) {
113 |   console.error('error:', err)
114 | })
115 | ```
116 | 
117 | 
118 | ## As a command line tool
119 | 
120 | ```bash
121 | npm install -g pdf-text-extract
122 | ```
123 | 
124 | Execute with the file path as an argument. The output will be a JSON-formatted array of pages.
125 | 
126 | ```bash
127 | pdf-text-extract ./test/data/multipage.pdf
128 | # outputs
129 | # ['<page 1 content...>', '<page 2 content...>']
130 | ```
131 | 
132 | # Test
133 | 
134 | ```bash
135 | # install dev dependencies
136 | npm install
137 | # run tests
138 | npm test
139 | ```


--------------------------------------------------------------------------------
/bin/pdf-text-extract.js:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env node
 2 | 
 3 | var extract = require('../index')
 4 | 
 5 | var path = require('path')
 6 | var fileName = process.argv[2]
 7 | if (!fileName) {
 8 |   throw new Error('file path must be specified as the argument like "pdf-text-extract /path/to/file"')
 9 | }
10 | var filePath = path.resolve(fileName)
11 | extract(filePath, cb)
12 | 
13 | function cb (err, pages) {
14 |   if (err) {
15 |     throw err
16 |   }
17 |   console.dir(pages)
18 | }
19 | 


--------------------------------------------------------------------------------
/index.js:
--------------------------------------------------------------------------------
  1 | var path = require('path')
  2 | var spawn = require('child_process').spawn
  3 | 
  4 | function pdfTextExtract (filePath, options, pdfToTextCommand, cb) {
  5 |   if (!cb) {
  6 |     cb = pdfToTextCommand
  7 |   }
  8 |   if (!pdfToTextCommand) {
  9 |     cb = options
 10 |   }
 11 |   // options is optional
 12 |   if (typeof (options) === 'function') {
 13 |     cb = options
 14 |     options = {}
 15 |   }
 16 |   if (typeof (pdfToTextCommand) === 'function') {
 17 |     cb = pdfToTextCommand
 18 |     pdfToTextCommand = 'pdftotext'
 19 |   }
 20 |   if (!pdfToTextCommand) {
 21 |     pdfToTextCommand = 'pdftotext'
 22 |   }
 23 | 
 24 |   filePath = path.resolve(filePath)
 25 | 
 26 |   // [feat-promise] if cb is not a function, then it's probably a promise-typed call
 27 |   if (typeof (cb) !== 'function') {
 28 |     cb = null
 29 |   }
 30 | 
 31 |   // [feat-promise] options have to be not null
 32 |   if (!options) {
 33 |     options = {}
 34 |   }
 35 | 
 36 |   // default options
 37 |   options.encoding = options.encoding || 'UTF-8'
 38 |   options.layout = options.layout || 'layout'
 39 |   options.splitPages = (options.splitPages !== false)
 40 | 
 41 |   // Build args based on options
 42 |   var args = []
 43 | 
 44 |   // First and last page to convert
 45 |   if (options.firstPage) { args.push('-f'); args.push(options.firstPage) }
 46 |   if (options.lastPage) { args.push('-l'); args.push(options.lastPage) }
 47 | 
 48 |   // Resolution, in dpi. (null is pdftotext default = 72)
 49 |   if (options.resolution) { args.push('-r'); args.push(options.resolution) }
 50 | 
 51 |   // If defined, should be an object { x:x, y:y, w:w, h:h }
 52 |   if (typeof (options.crop) === 'object') {
 53 |     if (options.crop.x) { args.push('-x'); args.push(options.crop.x) }
 54 |     if (options.crop.y) { args.push('-y'); args.push(options.crop.y) }
 55 |     if (options.crop.w) { args.push('-W'); args.push(options.crop.w) }
 56 |     if (options.crop.h) { args.push('-H'); args.push(options.crop.h) }
 57 |   }
 58 | 
 59 |   // One of either 'layout', 'raw' or 'htmlmeta'
 60 |   if (options.layout === 'layout') { args.push('-layout') }
 61 |   if (options.layout === 'raw') { args.push('-raw') }
 62 |   if (options.layout === 'htmlmeta') { args.push('-htmlmeta') }
 63 | 
 64 |   // Output text encoding (UCS-2, ASCII7, Latin1, UTF-8, ZapfDingbats or Symbol)
 65 |   if (options.encoding) { args.push('-enc'); args.push(options.encoding) }
 66 | 
 67 |   // Output end of line convention (unix, dos or mac)
 68 |   if (options.eol) { args.push('-eol'); args.push(options.eol) }
 69 | 
 70 |   // Owner and User password (for encrypted files)
 71 |   if (options.ownerPassword) { args.push('-opw'); args.push(options.ownerPassword) }
 72 |   if (options.userPassword) { args.push('-upw'); args.push(options.userPassword) }
 73 | 
 74 |   // finish up arguments
 75 |   args.push(filePath)
 76 |   args.push('-')
 77 | 
 78 |   function splitPages (err, content) {
 79 |     if (err) {
 80 |       return cb(err)
 81 |     }
 82 |     var pages = content.split(/\f/)
 83 |     if (!pages) {
 84 |       return cb({
 85 |         message: 'pdf-text-extract failed',
 86 |         error: 'no text returned from the pdftotext command',
 87 |         filePath: filePath,
 88 |         stack: new Error().stack
 89 |       })
 90 |     }
 91 |     // sometimes there can be an extract blank page on the end
 92 |     var lastPage = pages[pages.length - 1]
 93 |     if (!lastPage) {
 94 |       pages.pop()
 95 |     }
 96 |     cb(null, pages)
 97 |   }
 98 |   // [feat-promise]
 99 |   // if cb is not defined, then it's probably a promise-typed call
100 |   // in order to use promise, instantiation is required
101 |   if (!cb) {
102 |     this.pdfToTextCommand = pdfToTextCommand
103 |     this.args = args
104 |     this.options = options
105 |     this.splitPages = splitPages
106 |     this.filePath = filePath
107 |   } else {
108 |     streamResults(pdfToTextCommand, args, options, options.splitPages ? splitPages : cb)
109 |   }
110 | }
111 | 
/**
 * Spawns `command` with `args` and collects everything it writes to stdout.
 *
 * @param {string} command - executable to run (normally pdftotext)
 * @param {Array} args - command line arguments for the executable
 * @param {Object} options - options handed to child_process.spawn
 * @param {Function} cb - called exactly once: `cb(err)` when the process
 *   cannot be started or ends with a non-zero exit code (the message then
 *   carries the collected stderr text), `cb(null, output)` otherwise
 */
function streamResults (command, args, options, cb) {
  var output = ''
  var stderr = ''
  var finished = false
  var child = spawn(command, args, options)
  child.stdout.setEncoding('utf8')
  child.stderr.setEncoding('utf8')
  child.stdout.on('data', stdoutHandler)
  child.stderr.on('data', stderrHandler)
  // Without an 'error' listener a failed spawn (for example a missing
  // binary) would be thrown as an uncaught exception.
  child.on('error', errorHandler)
  child.on('close', closeHandler)

  // 'error' can be followed by 'close'; only the first outcome is reported.
  function finish (err, result) {
    if (finished) {
      return
    }
    finished = true
    if (err) {
      return cb(err)
    }
    cb(null, result)
  }

  function stdoutHandler (data) {
    output += data
  }

  function stderrHandler (data) {
    stderr += data
  }

  function errorHandler (err) {
    finish(err)
  }

  function closeHandler (code) {
    if (code !== 0) {
      return finish(new Error('pdf-text-extract command failed: ' + stderr))
    }
    finish(null, output)
  }
}
140 | 
/**
 * [feat-promise]
 * Promise support.
 *
 * The extraction is started by the first call and its promise is cached on
 * the instance, so every later call attaches to the same result instead of
 * spawning the command again. The promise is fulfilled with the array of
 * pages (or with the raw output string when `options.splitPages` is false)
 * and rejected with the error reported by `streamResults`.
 *
 * @param {Function} resolve - fulfilment handler
 * @param {Function} [reject] - rejection handler
 * @return {Promise} the promise returned by attaching both handlers
 */
pdfTextExtract.prototype.then = function (resolve, reject) {
  var self = this

  if (!self._fullfilledPromise) {
    self._fullfilledPromise = new Promise(function (innerResolve, innerReject) {
      streamResults(self.pdfToTextCommand, self.args, self.options, function (err, content) {
        // Settle the promise instead of throwing inside an event handler,
        // where nobody could catch the error.
        if (err) {
          return innerReject(err)
        }
        if (!self.options.splitPages) {
          return innerResolve(content)
        }
        // pages are separated by form feed characters
        var pages = content.split(/\f/)
        // sometimes there can be an extra blank page on the end
        if (!pages[pages.length - 1]) {
          pages.pop()
        }
        innerResolve(pages)
      })
    })
  }

  return self._fullfilledPromise.then(resolve, reject)
}
211 | 
212 | module.exports = pdfTextExtract
213 | 


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "pdf-text-extract",
 3 |   "version": "1.5.0",
 4 |   "description": "Extract text from pdfs that contain searchable pdf text",
 5 |   "main": "index.js",
 6 |   "bin": "./bin/pdf-text-extract.js",
 7 |   "directories": {
 8 |     "test": "test"
 9 |   },
10 |   "scripts": {
11 |     "test": "node_modules/.bin/mocha --reporter spec",
12 |     "pretest": "standard | snazzy"
13 |   },
14 |   "repository": {
15 |     "type": "git",
16 |     "url": "git://github.com/nisaacson/pdf-text-extract.git"
17 |   },
18 |   "keywords": [
19 |     "pdf",
20 |     "extract",
21 |     "pdftotext",
22 |     "text",
23 |     "extract"
24 |   ],
25 |   "author": "Noah Isaacson",
26 |   "license": "BSD-3-Clause",
27 |   "readmeFilename": "README.md",
28 |   "devDependencies": {
29 |     "mocha": "~1.8.2",
30 |     "should": "~1.2.2",
31 |     "snazzy": "^2.0.1",
32 |     "standard": "^5.3.1"
33 |   },
34 |   "dependencies": {
35 |     "yargs": "^1.2.5"
36 |   },
37 |   "standard": {
38 |     "globals": [
39 |       "describe",
40 |       "before",
41 |       "beforeEach",
42 |       "after",
43 |       "afterEach",
44 |       "it"
45 |     ]
46 |   }
47 | }
48 | 


--------------------------------------------------------------------------------
/test/buffered-extract-test.js:
--------------------------------------------------------------------------------
 1 | var assert = require('assert')
 2 | var fs = require('fs')
 3 | var path = require('path')
 4 | var extract = require('../index.js')
 5 | var should = require('should')
 6 | describe('Buffered Extract', function () {
 7 |   it('should extract text', function (done) {
 8 |     var desiredNumPages = 8
 9 |     var filePath = path.join(__dirname, 'data', 'multipage.pdf')
10 |     assert.ok(fs.existsSync(filePath), 'pdf file not found at path: ' + filePath)
11 |     extract(filePath, function (err, pages) {
12 |       should.not.exist(err)
13 |       should.exist(pages, 'no pages extracted')
14 |       pages.length.should.eql(desiredNumPages)
15 |       pages.map(function (page) {
16 |         should.exist(page, 'page text content should exist')
17 |         page.length.should.be.above(0)
18 |       })
19 |       done()
20 |     })
21 |   })
22 | })
23 | 


--------------------------------------------------------------------------------
/test/data/huge.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nisaacson/pdf-text-extract/d21ead42859aae859d3f20b79ebd5c801b21837d/test/data/huge.pdf


--------------------------------------------------------------------------------
/test/data/multipage.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nisaacson/pdf-text-extract/d21ead42859aae859d3f20b79ebd5c801b21837d/test/data/multipage.pdf


--------------------------------------------------------------------------------
/test/data/multipage.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nisaacson/pdf-text-extract/d21ead42859aae859d3f20b79ebd5c801b21837d/test/data/multipage.txt


--------------------------------------------------------------------------------
/test/data/pdf with space in name.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nisaacson/pdf-text-extract/d21ead42859aae859d3f20b79ebd5c801b21837d/test/data/pdf with space in name.pdf


--------------------------------------------------------------------------------
/test/extract-test.js:
--------------------------------------------------------------------------------
 1 | var assert = require('assert')
 2 | var fs = require('fs')
 3 | var path = require('path')
 4 | var extract = require('../index.js')
 5 | var should = require('should')
 6 | 
 7 | describe('Pdf extract', function () {
 8 |   it('should return output and no error when everything is ok', function (done) {
 9 |     var filePath = path.join(__dirname, 'data', 'multipage.pdf')
10 | 
11 |     extract(filePath, function (err, pages) {
12 |       should.not.exist(err)
13 |       should.exists(pages)
14 |       done()
15 |     })
16 |   })
17 | 
18 |   it('should accept files with space in name', function (done) {
19 |     var filePath = path.join(__dirname, 'data', 'pdf with space in name.pdf')
20 |     assert.ok(fs.existsSync(filePath), 'pdf file not found at path: ' + filePath)
21 | 
22 |     extract(filePath, function (err, pages) {
23 |       should.not.exist(err)
24 |       should.exist(pages)
25 | 
26 |       done()
27 |     })
28 |   })
29 | 
30 |   it('should work with parallel data streams', function (done) {
31 |     var filePath = path.join(__dirname, 'data', 'pdf with space in name.pdf')
32 | 
33 |     var streams = 10
34 |     var complete = 0
35 |     for (var i = 0; i < streams; i++) {
36 |       extract(filePath, function (err, pages) {
37 |         should.not.exist(err)
38 |         should.exists(pages[0])
39 |         complete++
40 |         if (complete === streams) {
41 |           done()
42 |         }
43 |       })
44 |     }
45 |   })
46 | 
47 |   it('should allow large files', function (done) {
48 |     this.timeout(5000)
49 |     this.slow('4s')
50 |     var filePath = path.join(__dirname, 'data', 'huge.pdf')
51 | 
52 |     var options = {
53 |       cwd: null
54 |     }
55 |     extract(filePath, options, function (err, pages) {
56 |       should.not.exists(err)
57 |       should.exists(pages)
58 |       done()
59 |     })
60 |   })
61 | 
62 |   it('should support custom pdftotext command undefined err when everything is ok', function (done) {
63 |     var filePath = path.join(__dirname, 'data', 'multipage.pdf')
64 |     var options = {}
65 |     var pdfToTextCommand = 'pdftotext'
66 | 
67 |     extract(filePath, options, pdfToTextCommand, function (err, pages) {
68 |       should.not.exist(err)
69 |       should.exists(pages)
70 |       done()
71 |     })
72 |   })
73 | })
74 | 


--------------------------------------------------------------------------------
/test/promise-buffered-extract-test.js:
--------------------------------------------------------------------------------
 1 | var assert = require('assert')
 2 | var fs = require('fs')
 3 | var path = require('path')
 4 | var Extract = require('../index.js')
 5 | var should = require('should')
 6 | 
 7 | describe('Buffered Extract Promise', function () {
 8 |   it('should extract text', function (done) {
 9 |     var desiredNumPages = 8
10 |     var filePath = path.join(__dirname, 'data', 'multipage.pdf')
11 |     assert.ok(fs.existsSync(filePath), 'pdf file not found at path: ' + filePath)
12 |     var extractor = new Extract(filePath)
13 |     extractor.then(function (pages) {
14 |       should.exist(pages, 'no pages extracted')
15 |       pages.length.should.eql(desiredNumPages)
16 |       pages.map(function (page) {
17 |         should.exist(page, 'page text content should exist')
18 |         page.length.should.be.above(0)
19 |       })
20 |       done()
21 |     }).catch(function (err) {
22 |       console.error('error:', err)
23 |     })
24 |   })
25 | })
26 | 


--------------------------------------------------------------------------------
/test/promise-extract-test.js:
--------------------------------------------------------------------------------
 1 | var assert = require('assert')
 2 | var fs = require('fs')
 3 | var path = require('path')
 4 | var Extract = require('../index.js')
 5 | var should = require('should')
 6 | 
 7 | describe('Pdf extract', function () {
 8 |   it('should return output and no error when everything is ok', function (done) {
 9 |     var filePath = path.join(__dirname, 'data', 'multipage.pdf')
10 |     var extractor = new Extract(filePath)
11 |     extractor.then(function (pages) {
12 |       should.exists(pages)
13 |       done()
14 |     }).catch(function (err) {
15 |       console.error('error:', err)
16 |     })
17 |   })
18 | 
19 |   it('should accept files with space in name', function (done) {
20 |     var filePath = path.join(__dirname, 'data', 'pdf with space in name.pdf')
21 |     assert.ok(fs.existsSync(filePath), 'pdf file not found at path: ' + filePath)
22 | 
23 |     var extractor = new Extract(filePath)
24 |     extractor.then(function (pages) {
25 |       should.exist(pages)
26 |       done()
27 |     }).catch(function (err) {
28 |       console.error('error:', err)
29 |     })
30 |   })
31 | 
32 |   it('should work with parallel data streams', function (done) {
33 |     var filePath = path.join(__dirname, 'data', 'pdf with space in name.pdf')
34 | 
35 |     var streams = 10
36 |     var complete = 0
37 |     for (var i = 0; i < streams; i++) {
38 |       var extractor = new Extract(filePath)
39 |       extractor.then(function (pages) {
40 |         should.exists(pages[0])
41 |         complete++
42 |         if (complete === streams) {
43 |           done()
44 |         }
45 |       }).catch(function (err) {
46 |         console.error('error:', err)
47 |       })
48 |     }
49 |   })
50 | 
51 |   it('should allow large files', function (done) {
52 |     this.timeout(5000)
53 |     this.slow('4s')
54 |     var filePath = path.join(__dirname, 'data', 'huge.pdf')
55 | 
56 |     var options = {
57 |       cwd: null
58 |     }
59 |     var extractor = new Extract(filePath, options)
60 |     extractor.then(function (pages) {
61 |       should.exists(pages)
62 |       done()
63 |     }).catch(function (err) {
64 |       console.error('error:', err)
65 |     })
66 |   })
67 | 
68 |   it('should support custom pdftotext command undefined err when everything is ok', function (done) {
69 |     var filePath = path.join(__dirname, 'data', 'multipage.pdf')
70 |     var options = {}
71 |     var pdfToTextCommand = 'pdftotext'
72 | 
73 |     var extractor = new Extract(filePath, options, pdfToTextCommand)
74 |     extractor.then(function (pages) {
75 |       should.exists(pages)
76 |       done()
77 |     }).catch(function (err) {
78 |       console.error('error:', err)
79 |     })
80 |   })
81 | })
82 | 


--------------------------------------------------------------------------------