├── index.js ├── temp └── Readme.md ├── example ├── trace.pdf └── simple.js ├── package.json ├── Readme.md └── lib └── reader.js /index.js: -------------------------------------------------------------------------------- 1 | module.exports = require('./lib/reader'); 2 | -------------------------------------------------------------------------------- /temp/Readme.md: -------------------------------------------------------------------------------- 1 | This directory is for temporary font files created while rendering. -------------------------------------------------------------------------------- /example/trace.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jviereck/node-pdfreader/HEAD/example/trace.pdf -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "node-pdfreader", 3 | "description": "Node wrapper around PDF.JS library to read and render PDFs", 4 | "version": "0.0.1", 5 | "author": "Julian Viereck ", 6 | "keywords": ["canvas", "graphic", "graphics", "pdf"], 7 | "homepage": "https://github.com/jviereck/node-pdfreader", 8 | "repository": { 9 | "type": "git", 10 | "url": "git://github.com/jviereck/node-pdfreader" 11 | }, 12 | "dependencies": { 13 | "canvas": ">= 1.0.0" 14 | }, 15 | "engines": { 16 | "node": ">= 0.8.0" 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /example/simple.js: -------------------------------------------------------------------------------- 1 | var PDFReader = require('../index').PDFReader; 2 | 3 | function errorDumper(err) { 4 | if (err) { 5 | console.log('something went wrong :/'); 6 | throw err; 7 | } 8 | } 9 | 10 | var pdf = new PDFReader(__dirname + '/trace.pdf'); 11 | pdf.on('error', errorDumper); 12 | pdf.on('ready', function(pdf) { 13 | // Render a single page. 
14 | pdf.render(1 /* First page */, { 15 | bg: true, /* Enable white background */ 16 | output: __dirname + '/page-single.png' 17 | }, errorDumper); 18 | 19 | // Render all pages. 20 | pdf.renderAll({ 21 | output: function(pageNum) { 22 | return __dirname + '/page' + pageNum + '.png'; 23 | } 24 | }, errorDumper); 25 | 26 | // Get the text content of single pages (similar to pdf2txt). 27 | pdf.getContent(1 /* First page */, function(err, content) { 28 | console.log(content); 29 | }, errorDumper); 30 | }); 31 | -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | # Node-PDFReader 2 | 3 | A PDF reader for Node. Based on [PDF.JS](https://github.com/mozilla/pdf.js). 4 | 5 | # WARNING 6 | 7 | This is super experimental. It's more of a proof of concept. Some terrible things: 8 | 9 | * no test coverage 10 | * hacked up code 11 | * sync file operations to first store font files on disk and later read them again (yeah, it's really that awful) 12 | * no windows support (due to lack of freetype support in node-canvas) 13 | 14 | # Overview 15 | 16 | Right now you can: 17 | 18 | * Render single or all pages to PNG files 19 | * Get the text content of single pages 20 | 21 | # Installation 22 | 23 | You need to have node and build tools installed. 24 | 25 | If you haven't installed the cairo library, or installed it without freetype support, you can install it by running this script (make sure to change into a directory where you can store some temporary files created during the build process of the libraries): 26 | 27 | ```bash 28 | $ cd 29 | $ bash <(curl -fsSk https://raw.github.com/jviereck/node-canvas/font/install) 30 | ``` 31 | 32 | Once that is done, install the dependencies: 33 | 34 | ```bash 35 | $ npm install 36 | ``` 37 | 38 | This will fetch node-canvas and build it. 39 | 40 | # Usage 41 | 42 | See the example directory. 
You can run the example from the root directory using 43 | 44 | ```bash 45 | $ node example/simple.js 46 | ``` 47 | 48 | This loads the trace-monkey PDF, extracts the text of the first page and dumps it to the console, renders the first page using a white background and all pages without a background. The resulting PNG files are stored in the `example/` directory. 49 | 50 | The code of the `simple.js` file looks like this: 51 | 52 | ```javascript 53 | var PDFReader = require('../index').PDFReader; 54 | 55 | function errorDumper(err) { 56 | if (err) { 57 | console.log('something went wrong :/'); 58 | throw err; 59 | } 60 | } 61 | 62 | var pdf = new PDFReader(__dirname + '/trace.pdf'); 63 | pdf.on('error', errorDumper); 64 | pdf.on('ready', function(pdf) { 65 | // Render a single page. 66 | pdf.render(1 /* First page */, { 67 | bg: true, /* Enable white background */ 68 | output: __dirname + '/page-single.png' 69 | }, errorDumper); 70 | 71 | // Render all pages. 72 | pdf.renderAll({ 73 | output: function(pageNum) { 74 | return __dirname + '/page' + pageNum + '.png'; 75 | } 76 | }, errorDumper); 77 | 78 | // Get the text content of single pages (similar to pdf2txt). 79 | pdf.getContent(1 /* First page */, function(err, content) { 80 | console.log(content); 81 | }, errorDumper); 82 | }); 83 | ``` 84 | 85 | # FAQ 86 | 87 | ## I get the error "Need to compile node-canvas/cairo with font support." 88 | 89 | You need to have a version of cairo with freetype2 font support. Best is to first compile and install the freetype2 library and then compile cairo. At the end of running `./configure` when building cairo, you should see the freetype listed "yes" as one of the font backends. 90 | 91 | ## Is there windows support? 92 | 93 | Not for rendering. I just haven't tested the special node-canvas build on windows, so I've disabled windows support. 94 | 95 | ## Can you please implement X? 96 | 97 | No. I don't want to invest too much time in this project. 
/*
 * NOTE: the text in this comment is the tail of the preceding Readme.md
 * section and the dump's file header. It shared a physical line with the
 * start of lib/reader.js and is preserved here unchanged.
 *
 * It's a proof of concept for me. However, I'm happy to help others to
 * implement missing features and accept PR :)
 *
 * ## Is that necessary to compile cairo/freetype/node-canvas just to extract text?
 *
 * No. This was just the easiest way for me to get something out the door.
 *
 * --------------------------------------------------------------------------------
 * /lib/reader.js:
 * --------------------------------------------------------------------------------
 */

/**
 * Module dependencies.
 */

var Canvas = require('canvas')
  , Font = Canvas.Font
  , fs = require('fs')
  , path = require('path')
  , EventEmitter = require('events').EventEmitter
  , util = require('util')
  ;


// -----------------------------------------------------------------------------
// HACKING :P
// ---

var PDFJS = require('./pdf.js');

// Directory that receives the temporary font files. It is resolved relative
// to this module (lib/../temp) rather than to the process' working directory,
// so rendering also works when the script is not started from the package
// root.
var TEMP_DIR = path.join(__dirname, '..', 'temp');

// === Some NODE specific stuff.
// Turn off worker support for now.
PDFJS.disableWorker = true;

/**
 * Scratch-canvas factory handed to PDF.JS: returns a node-canvas instance
 * of the requested size.
 *
 * @param {number} width
 * @param {number} height
 * @returns {Canvas}
 */
PDFJS.createScratchCanvas = function nodeCreateScratchCanvas(width, height) {
  return new Canvas(width, height);
};

/**
 * Replacement for the PDF.JS font loader - THERE IS NO DOM HERE.
 *
 * Every font that carries data and was not handled before is written
 * (synchronously) to a `.ttf` file in the temp directory, wrapped in a
 * node-canvas `Font` object and announced through `pdf.useFont`.
 * `callback` is invoked synchronously once all fonts were looked at.
 *
 * @param {Object} pdf - document object; `_idx` and `useFont` are read from it.
 * @param {Array} fonts - font objects (`attached`, `loading`, `data`,
 *     `loadedName` are used).
 * @param {Function} callback - called without arguments when done.
 * @throws {Error} if node-canvas exposes no `Font` constructor.
 */
PDFJS.FontLoader.bind = function nodeFontLoaderBind(pdf, fonts, callback) {
  if (!Font) {
    throw new Error("Need to compile node-canvas/cairo with font support.");
  }

  for (var i = 0, ii = fonts.length; i < ii; i++) {
    var font = fonts[i];

    // Handle every font only once and skip fonts flagged as not loading.
    if (font.attached || font.loading === false) {
      continue;
    }
    font.attached = true;

    var data = font.data;

    // Some fonts don't come with data.
    if (!data) {
      continue;
    }

    var fontName = font.loadedName;
    var fontFile = path.join(TEMP_DIR, pdf._idx + '_' + fontName + '.ttf');

    // Temporary hack for loading the font. Write it to file such that a font
    // object can get created from it and use it on the context.
    fs.writeFileSync(fontFile, new Buffer(data));

    pdf.useFont(new Font(fontName, fontFile));
  }

  callback();
};

// === Let's get started

// Running number handed to each reader; it prefixes the temp font file names.
var idxCounter = 0;

/**
 * Reads the PDF file at `path` and parses it with PDF.JS.
 *
 * The file itself is read synchronously inside the constructor, so an
 * unreadable path throws right here. Parsing is deferred to the next tick so
 * that listeners can be attached first. Emits `ready` (with the reader as
 * argument) once the document is parsed, or `error` if parsing fails.
 *
 * @constructor
 * @param {string} path - location of the PDF file.
 */
function PDFReader(path) {
  EventEmitter.call(this);

  var self = this;

  this.fontList = [];
  this.busyContextList = [];
  this._useFont = this._useFont.bind(this);
  this._idx = idxCounter++;

  var buf = this._loadPDF(path);

  // PDFJS.getDocument might return right away, but then the listeners
  // for the `ready` event are not bound yet.
  // Delay the function until the next tick.
  process.nextTick(function() {
    // Basic parsing of the PDF document.
    PDFJS.getDocument(buf).then(function(pdf) {
      pdf.useFont = self._useFont;
      pdf._idx = self._idx;

      self.pdf = pdf;
      self.emit('ready', self);
    }, function(err) {
      self.emit('error', err);
    });
  });
}

util.inherits(PDFReader, EventEmitter);

/**
 * Remembers a newly loaded font and registers it on every context that is
 * currently rendering.
 *
 * @param {Font} font
 */
PDFReader.prototype._useFont = function(font) {
  this.fontList.push(font);
  this.busyContextList.forEach(function(ctx) {
    ctx.addFont(font);
  });
};

/**
 * Marks `context` as rendering and registers all fonts loaded so far on it.
 *
 * @param {Object} context - 2d context of a canvas.
 */
PDFReader.prototype._addBusyContext = function(context) {
  this.busyContextList.push(context);

  // Make context know all already loaded fonts.
  this.fontList.forEach(function(font) {
    context.addFont(font);
  });
};

/**
 * Removes `context` from the list of rendering contexts. Unknown contexts
 * are ignored.
 *
 * @param {Object} context
 */
PDFReader.prototype._removeBusyContext = function(context) {
  var list = this.busyContextList;
  var idx = list.indexOf(context);

  // `splice(-1, 1)` would drop the LAST entry, so only splice on a real hit.
  if (idx !== -1) {
    list.splice(idx, 1);
  }
};

/**
 * Synchronously reads the whole file into a buffer.
 *
 * @param {string} path
 * @returns {Buffer} file content, with `byteLength` set.
 * @throws {Error} whatever `fs.readFileSync` throws (e.g. missing file).
 */
PDFReader.prototype._loadPDF = function(path) {
  // `readFileSync` opens, reads and closes the file itself, so no file
  // descriptor is left open.
  var buf = fs.readFileSync(path);

  // Set the buffer length, such that the PDF.JS `isArrayBuffer` thinks it's
  // a real typed-array buffer ;)
  buf.byteLength = buf.length;
  return buf;
};

/**
 * Reports through `callback` that the document is not parsed yet.
 *
 * @param {Function} callback - receives an `Error`.
 */
PDFReader.prototype._pdfNotReady = function(callback) {
  callback(new Error('PDF not ready yet'));
};

/**
 * Renders one page to a PNG file.
 *
 * @param {number} pageNum - page number as passed on to `pdf.getPage`.
 * @param {Object} opt
 * @param {number} [opt.scale=1.0] - viewport scale.
 * @param {boolean} [opt.bg] - paint a white background first.
 * @param {string|Function} opt.output - file name, or a function that maps
 *     the page number to a file name.
 * @param {Function} callback - called exactly once: without arguments after
 *     the file was written and closed, or with the error otherwise.
 */
PDFReader.prototype.render = function(pageNum, opt, callback) {
  var pdf = this.pdf;
  var self = this;

  if (!pdf) {
    return this._pdfNotReady(callback);
  }

  // Read the default into a local; the caller's options object stays as is.
  var scale = opt.scale || 1.0;

  pdf.getPage(pageNum).then(function(page) {
    var viewport = page.getViewport(scale);

    var canvas = new Canvas(viewport.width, viewport.height);
    var context = canvas.getContext('2d');

    // Store reference to the context, such that new loaded fonts can be
    // registered. Also adds in all already loaded fonts in the PDF on the
    // context.
    self._addBusyContext(context);

    // Single exit point: releases the context and guarantees that the
    // callback fires only once, even if several stream events arrive.
    var finished = false;
    function done(err) {
      if (finished) {
        return;
      }
      finished = true;
      self._removeBusyContext(context);
      if (err) {
        callback(err);
      } else {
        callback();
      }
    }

    if (opt.bg) {
      context.save();
      context.fillStyle = 'white';
      context.fillRect(0, 0, viewport.width, viewport.height);
      context.restore();
    }

    var renderContext = {
      canvasContext: context,
      viewport: viewport
    };

    page.render(renderContext).then(function() {
      var file;
      if (typeof opt.output === 'string') {
        file = opt.output;
      } else if (typeof opt.output === 'function') {
        file = opt.output(pageNum);
      } else {
        done(new Error('`output` option must be a string or a function'));
        return;
      }

      console.log('finished page: %d - write to file: %s', pageNum, file);

      var out = fs.createWriteStream(file);
      var stream = canvas.createPNGStream();

      out.on('error', done);
      // Report success only after the file was flushed and closed.
      out.on('close', function() {
        done();
      });
      stream.on('error', function(err) {
        done(err);
        out.destroy();
      });

      // `pipe` ends the file stream once the PNG stream is finished.
      stream.pipe(out);
    }, done);
  }, function(err) {
    // The page could not be fetched.
    callback(err);
  });
};

/**
 * Renders all pages one after another, using `render` with the same options
 * for every page.
 *
 * @param {Object} opt - see `render`.
 * @param {Function} callback - called without arguments after the last page,
 *     or with the first error (remaining pages are then skipped).
 */
PDFReader.prototype.renderAll = function(opt, callback) {
  if (!this.pdf) {
    return this._pdfNotReady(callback);
  }

  var self = this;
  var numPages = this.pdf.numPages;
  var pageNum = 1;

  function next() {
    if (pageNum > numPages) {
      callback();
      return;
    }

    self.render(pageNum, opt, function(err) {
      if (err) {
        callback(err);
        return;
      }
      pageNum++;
      next();
    });
  }

  next();
};

/**
 * Extracts the text of one page.
 *
 * @param {number} pageNum - page number as passed on to `pdf.getPage`.
 * @param {Function} callback - `callback(err)` on failure, otherwise
 *     `callback(null, content)` where `content` is the `str` of all text
 *     items joined with single spaces.
 */
PDFReader.prototype.getContent = function(pageNum, callback) {
  var pdf = this.pdf;
  if (!pdf) {
    return this._pdfNotReady(callback);
  }

  function fail(err) {
    callback(err);
  }

  pdf.getPage(pageNum).then(function(page) {
    page.getTextContent().then(function(arr) {
      // TODO: Handle RTL properly here.
      var content = arr.bidiTexts.map(function(bit) {
        return bit.str;
      }).join(' ');
      callback(null, content);
    }, fail);
  }, fail);
};

exports.PDFReader = PDFReader;