├── .nvmrc ├── .prettierignore ├── .gitattributes ├── .gitignore ├── .npmignore ├── .github ├── FUNDING.yml ├── ISSUE_TEMPLATE │ └── bug-report.md └── workflows │ └── nodejs.yml ├── test ├── sample.pdf ├── sample-table.pdf ├── sample-with-password.pdf ├── snapshots │ ├── test.js.snap │ └── test.js.md ├── sample.html ├── sample.rtf └── test.js ├── .eslintrc.json ├── .vscode ├── settings.json └── extensions.json ├── .editorconfig ├── index.js ├── lib ├── LOG.js ├── SequentialParser.js ├── ColumnsParser.js ├── parseColumns.js ├── TableParser.js └── parseTable.js ├── parse.js ├── LICENSE ├── parseAsBuffer.js ├── package.json ├── index.d.ts ├── PdfReader.js ├── CHANGELOG.md ├── README.md └── Rule.js /.nvmrc: -------------------------------------------------------------------------------- 1 | v16 2 | -------------------------------------------------------------------------------- /.prettierignore: -------------------------------------------------------------------------------- 1 | CHANGELOG.md 2 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto eol=lf 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /node_modules 2 | /dist 3 | -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | .* 2 | node_modules 3 | test 4 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: [adrienjoly] 2 | custom: ['https://adrienjoly.com/donate/'] 3 | -------------------------------------------------------------------------------- /test/sample.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/adrienjoly/npm-pdfreader/HEAD/test/sample.pdf -------------------------------------------------------------------------------- /test/sample-table.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adrienjoly/npm-pdfreader/HEAD/test/sample-table.pdf -------------------------------------------------------------------------------- /test/sample-with-password.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adrienjoly/npm-pdfreader/HEAD/test/sample-with-password.pdf -------------------------------------------------------------------------------- /test/snapshots/test.js.snap: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adrienjoly/npm-pdfreader/HEAD/test/snapshots/test.js.snap -------------------------------------------------------------------------------- /.eslintrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": ["plugin:prettier/recommended"], 3 | "parserOptions": { "ecmaVersion": 2020, "sourceType": "module" } 4 | } 5 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "editor.formatOnSave": true, 3 | "editor.defaultFormatter": "esbenp.prettier-vscode", 4 | "prettier.singleQuote": false 5 | } 6 | -------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "editorconfig.editorconfig", 4 | "dbaeumer.vscode-eslint", 5 | "esbenp.prettier-vscode" 6 | ] 7 | } 8 | 
-------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- 1 | ; http://editorconfig.org 2 | 3 | root = true 4 | 5 | [*] 6 | indent_style = space 7 | indent_size = 2 8 | end_of_line = lf 9 | charset = utf-8 10 | trim_trailing_whitespace = true 11 | insert_final_newline = true 12 | -------------------------------------------------------------------------------- /test/sample.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 13 | 14 | 15 |

Hello "world"

16 |

Value:

17 |

4

18 | 19 | 20 | 21 | 22 |
c1c2c3
12.3
helloworld
23 |

Values:

24 |

1

25 |

2

26 |

3

27 | 28 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | export { PdfReader } from "./PdfReader.js"; 2 | export { Rule } from "./Rule.js"; 3 | export * as LOG from "./lib/LOG.js"; 4 | import * as parseTableExports from "./lib/parseTable.js"; 5 | export const parseTable = Object.assign( 6 | parseTableExports.parseTable, 7 | parseTableExports 8 | ); 9 | import * as parseColumnsExports from "./lib/parseColumns.js"; 10 | export const parseColumns = Object.assign( 11 | parseColumnsExports.parseColumns, 12 | parseColumnsExports 13 | ); 14 | export { SequentialParser } from "./lib/SequentialParser.js"; // experimental 15 | export { TableParser } from "./lib/TableParser.js"; 16 | export { ColumnsParser } from "./lib/ColumnsParser.js"; 17 | -------------------------------------------------------------------------------- /lib/LOG.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Minimal logger 3 | * @author Adrien Joly, http://github.com/adrienjoly 4 | * This content is released under the MIT License. 5 | **/ 6 | 7 | import util from "util"; 8 | 9 | var nullLog = function LOG() {}; 10 | 11 | var realLog = function LOG() { 12 | for (var i in arguments) 13 | if (arguments[i] instanceof Object || arguments[i] instanceof Array) 14 | arguments[i] = util.inspect(arguments[i]); 15 | console.log("[DEBUG] " + Array.prototype.join.call(arguments, " ")); 16 | }; 17 | 18 | var LOG = nullLog; 19 | 20 | export function log() { 21 | LOG.apply(null, arguments); 22 | } 23 | 24 | export function toggle(enabled) { 25 | LOG = !enabled ? 
nullLog : realLog; 26 | } 27 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug-report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve this npm package 4 | --- 5 | 6 | **Describe the bug** 7 | A clear and concise description of what the bug is. 8 | 9 | **To Reproduce** 10 | List the steps you followed and/or share your code to help us reproduce the bug 11 | 12 | **Expected behavior** 13 | A clear and concise description of what you expected to happen. 14 | 15 | **Screenshots, outputs or logs** 16 | If applicable, add screenshots, outputs or logs to help explain your problem. 17 | 18 | **Desktop (please complete the following information):** 19 | 20 | - OS: (e.g. iOS) 21 | - Browser: (e.g. chrome, safari) 22 | - Version: (e.g. 22) 23 | 24 | **Additional context** 25 | Add any other context about the problem here. 26 | -------------------------------------------------------------------------------- /parse.js: -------------------------------------------------------------------------------- 1 | import { toggle } from "./lib/LOG.js"; 2 | import { PdfReader } from "./index.js"; 3 | 4 | toggle(false); 5 | 6 | function printRawItems(filename, callback) { 7 | new PdfReader().parseFileItems(filename, function (err, item) { 8 | if (err) callback(err); 9 | else if (!item) callback(); 10 | else if (item.file) console.log("file =", item.file.path); 11 | else if (item.page) console.log("page =", item.page); 12 | else if (item.x) 13 | console.log( 14 | [item.x, item.y, item.oc, item.A, Math.floor(item.w), item.text].join( 15 | "\t" 16 | ) 17 | ); 18 | else console.warn(item); 19 | }); 20 | } 21 | 22 | var filename = process.argv[2]; 23 | if (!filename) { 24 | console.error("please provide the name of a PDF file"); 25 | } else { 26 | console.warn("printing raw items from file:", filename, "..."); 27 | 
printRawItems(filename, function (err) { 28 | if (err) { 29 | console.error(err); 30 | process.exit(1); 31 | } 32 | console.warn("done."); 33 | }); 34 | } 35 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Adrien Joly 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | -------------------------------------------------------------------------------- /parseAsBuffer.js: -------------------------------------------------------------------------------- 1 | import fs from "fs"; 2 | import { toggle } from "./lib/LOG.js"; 3 | import { PdfReader } from "./index.js"; 4 | 5 | toggle(false); 6 | 7 | function printRawItems(pdfBuffer, callback) { 8 | new PdfReader().parseBuffer(pdfBuffer, function (err, item) { 9 | if (err) callback(err); 10 | else if (!item) callback(); 11 | else if (item.file) console.log("file =", item.file.path); 12 | else if (item.page) console.log("page =", item.page); 13 | else if (item.x) 14 | console.log( 15 | [item.x, item.y, item.oc, item.A, Math.floor(item.w), item.text].join( 16 | "\t" 17 | ) 18 | ); 19 | else console.warn(item); 20 | }); 21 | } 22 | 23 | var filename = process.argv[2]; 24 | if (!filename) { 25 | console.error("please provide the name of a PDF file"); 26 | } else { 27 | console.warn("printing raw items from file:", filename, "..."); 28 | fs.readFile(filename, (err, pdfBuffer) => { 29 | if (err) console.error(err); 30 | printRawItems(pdfBuffer, function (err) { 31 | if (err) { 32 | console.error(err); 33 | process.exit(1); 34 | } 35 | console.warn("done."); 36 | }); 37 | }); 38 | } 39 | -------------------------------------------------------------------------------- /lib/SequentialParser.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Applies a list of simple actions to apply to each provided item, in order to accumulate field values. 3 | * Provides a list of parsed `fields`. 4 | * Calls `callback(error, this)` when all accumulators were processed, or when processing a null item. 
/**
 * SequentialParser (experimental)
 * Consumes items one by one, following an ordered list of `accumulators`.
 * Each entry of that list is handled according to its shape:
 * - `{ field: name }`: the current item is stored under `fields[name]`,
 *   then the parser moves on to the next entry;
 * - `{ accumulator: fn }`: `fn(item, parser)` is called for each item, until
 *   it returns a truthy value;
 * - anything else: the current item is skipped.
 * `callback(null, parser)` is called when the last entry was processed, or
 * when a falsy item (e.g. `null`) was given to `parseItem()`.
 **/
export function SequentialParser(accumulators, callback) {
  let position = 0; // index of the entry that will handle the next item

  return {
    fields: {},

    addField(key, value) {
      this.fields[key] = value;
    },

    parseItem(item) {
      if (position >= accumulators.length) {
        console.warn(
          "warning: skipping item, because SequentialParser is done."
        );
        return;
      }

      const entry = accumulators[position];
      let moveOn = true; // entries without any action just skip the item
      if (entry.field) {
        this.addField(entry.field, item);
      } else if (entry.accumulator) {
        // called as a method of the entry, which stays on duty until it says it's done
        moveOn = Boolean(entry.accumulator(item, this));
      }
      if (moveOn) position += 1;

      const finished = !item || position >= accumulators.length;
      if (finished && callback) callback(null, this);
    },
  };
}
/**
 * ColumnsParser
 * Classifies items into columns, nearest to the left position of their corresponding header.
 * @author Adrien Joly, http://github.com/adrienjoly
 * This content is released under the MIT License.
 **/

import { log as LOG } from "./LOG.js";

/**
 * Finds the column whose header is horizontally nearest to `x`.
 * The scan goes from the first column onwards and stops as soon as the
 * distance starts growing. On equal distances, the later column wins.
 * NOTE(review): the early stop presumably expects `cols` to be ordered by
 * increasing x -- confirm against callers.
 * @param {{x: number}[]} cols - columns, each with the x-position of its header
 * @param {number} x - left position of the item to classify
 * @returns {number} index of the selected column, or -1 if `cols` is empty
 */
function getColumnIndex(cols, x) {
  let bestIndex = -1;
  let bestDist = null;
  for (let i = 0; i < cols.length; ++i) {
    const dist = Math.abs(x - cols[i].x);
    if (bestDist !== null && dist > bestDist) break;
    bestDist = dist;
    bestIndex = i;
  }
  return bestIndex;
}

/**
 * Builds a parser that first locates the header of each column (by matching
 * the text of incoming items against `colNames`), then appends every
 * following item to the `items` of the nearest column.
 * Results are exposed in `this.cols`, as `{ name, x, items }` objects, in the
 * same order as `colNames`.
 * @param {string[]} colNames - header text of each column (not mutated)
 */
export function ColumnsParser(colNames) {
  this.cols = [];
  const cols = this.cols;
  // Private copy (for parameter immutability): a name is replaced by `null`
  // once its header was found, so that a same name can be associated to more
  // than one column, and so that no item text can match a consumed entry.
  const pendingNames = colNames.slice();
  let headerDone = false;

  this.processItem = function (item) {
    if (!headerDone) {
      // parse x-position of column headers
      const i = pendingNames.indexOf(item.text);
      if (i > -1) {
        LOG("ColumnsParser header", i, item.text, "=> x:", item.x);
        cols[i] = {
          name: item.text,
          x: item.x,
          items: [],
        };
        pendingNames[i] = null;
      }
      // `cols` is filled by index, so its length can reach the expected count
      // while earlier columns are still missing (e.g. last header met first).
      // The header is complete only when every name was consumed.
      headerDone = pendingNames.every((name) => name === null);
    } else {
      cols[getColumnIndex(cols, item.x)].items.push(item);
    }
  };
}
/**
 * parseColumns, for pdfreader, used by the Rule class.
 * accumulates values below each column header (on 1st row, given their name), without detecting empty rows.
 * TODO: use ColumnsParser
 * @author Adrien Joly, http://github.com/adrienjoly
 * This content is released under the MIT License.
 **/

import { log as LOG } from "./LOG.js";

/**
 * Accumulator builder. Must be called with a `this` that provides the first
 * header item as `this.currentItem`. Sets `this.cols` (column names) and
 * `this.output` (values keyed by the y-position of their row, then by column index).
 * @param {...string} columns - header text of each column
 * @returns {function} the item processor, to be called on each following item
 */
export const parseColumns = function (...columns) {
  this.output = [];
  this.cols = columns;
  const colNames = this.cols;
  const colX = []; // x-position of each header, at the index of its column
  const rows = this.output;
  let line = -1; // -1 = still parsing the header
  let lineY = null;

  // `colX` is filled by index: its length can equal the number of columns
  // while earlier headers are still missing (e.g. last header met first).
  // So we check that the position of each expected name was actually set.
  // (a duplicated name resolves to its first index, like in the lookup below)
  function isHeaderComplete() {
    return colNames.every((name) => colNames.indexOf(name) in colX);
  }

  function processItem(item) {
    if (line == -1) {
      // parse x-position of column headers
      const headerIndex = colNames.indexOf(item.text);
      if (headerIndex > -1) colX[headerIndex] = item.x;
      if (isHeaderComplete()) {
        LOG("table header:", colNames, colX);
        line++;
      }
    } else {
      if (lineY === null) {
        lineY = item.y;
      } else if (lineY != item.y) {
        lineY = item.y;
        line++;
      }
      // parsing values for each column: pick the right-most column whose
      // header starts on the left of the item (defaults to the first column)
      let col = 0;
      for (let i = colX.length - 1; i >= 0; --i) {
        if (item.x > colX[i]) {
          col = i;
          break;
        }
      }
      rows[lineY] = rows[lineY] || {};
      rows[lineY][col] = item.text;
    }
  }

  processItem(this.currentItem); // apply on header's first item
  return processItem; // then the same function will be run on all following items, until another rule is triggered
};
formatting: 29 | runs-on: ubuntu-latest 30 | steps: 31 | - uses: actions/checkout@v1 32 | - uses: actions/setup-node@v1 33 | with: 34 | node-version: 16.x 35 | - run: npm ci # install dependencies 36 | - run: npm run prettier:check 37 | - run: npm run lint 38 | 39 | release: 40 | needs: 41 | - tests 42 | - formatting 43 | runs-on: ubuntu-latest 44 | steps: 45 | - name: Checkout 46 | uses: actions/checkout@v2 47 | with: 48 | fetch-depth: 0 49 | - name: Setup Node.js 50 | uses: actions/setup-node@v1 51 | with: 52 | node-version: 16 53 | - name: Install dependencies 54 | run: npm ci 55 | - name: Build commonjs 56 | run: npm run build:cjs 57 | - name: Release 58 | env: 59 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 60 | NPM_TOKEN: ${{ secrets.NPM_TOKEN }} 61 | run: npm run semantic-release 62 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "pdfreader", 3 | "type": "module", 4 | "version": "0.0.0-development", 5 | "description": "Read text and parse tables from PDF files. 
Supports tabular data with automatic column detection, and rule-based parsing.", 6 | "main": "dist/index.cjs", 7 | "module": "./index.js", 8 | "typings": "./index.d.ts", 9 | "scripts": { 10 | "prettier:print": "prettier --list-different \"./**/*.js\" \"./**/*.md\" \"./**/*.d.ts\"", 11 | "prettier:check": "npm run -s prettier:print 1>&2; exit $(npm run -s prettier:print | wc -l)", 12 | "prettier:fix": "prettier \"./**/*.js\" \"./**/*.md\" \"./**/*.d.ts\" --write --end-of-line lf", 13 | "test:samples": "node parse.js test/sample.pdf && node parseAsBuffer.js test/sample.pdf", 14 | "test:update": "ava --update-snapshots", 15 | "test": "ava", 16 | "lint": "eslint .", 17 | "semantic-release": "semantic-release", 18 | "build:cjs": "rollup index.js --file dist/index.cjs --format cjs" 19 | }, 20 | "repository": { 21 | "type": "git", 22 | "url": "https://github.com/adrienjoly/npm-pdfreader" 23 | }, 24 | "keywords": [ 25 | "pdf", 26 | "reader", 27 | "parser", 28 | "parse", 29 | "parsing", 30 | "convert", 31 | "CLI", 32 | "table", 33 | "data", 34 | "csv", 35 | "json", 36 | "rules" 37 | ], 38 | "author": "Adrien Joly", 39 | "license": "MIT", 40 | "bugs": { 41 | "url": "https://github.com/adrienjoly/npm-pdfreader/issues" 42 | }, 43 | "homepage": "https://github.com/adrienjoly/npm-pdfreader", 44 | "dependencies": { 45 | "pdf2json": "3.1.4" 46 | }, 47 | "devDependencies": { 48 | "@semantic-release/changelog": "^6.0.1", 49 | "@semantic-release/git": "^10.0.1", 50 | "@semantic-release/npm": "^9.0.1", 51 | "ava": "^4.1.0", 52 | "eslint": "^8.11.0", 53 | "eslint-config-prettier": "^8.5.0", 54 | "eslint-plugin-prettier": "^4.0.0", 55 | "execa": "^6.1.0", 56 | "prettier": "2.6.1", 57 | "semantic-release": "^19.0.2", 58 | "rollup": "^4.19.1" 59 | }, 60 | "engines": { 61 | "node": ">=14" 62 | } 63 | } -------------------------------------------------------------------------------- /index.d.ts: -------------------------------------------------------------------------------- 1 | export type 
InitOptions = { 2 | password?: string; 3 | debug?: boolean; 4 | signal?: AbortSignal; 5 | }; 6 | export type Error = null | string; 7 | 8 | export type DataEntry = { 9 | page?: number; 10 | width?: number; 11 | height?: number; 12 | text?: string; 13 | file?: { 14 | path?: string; 15 | buffer?: string; 16 | }; 17 | } | null; 18 | 19 | export type ItemHandler = (err: Error, data: DataEntry & Item) => void; 20 | 21 | export declare class PdfReader { 22 | constructor(opts?: InitOptions | null); 23 | parseFileItems(pdfFilePath: string, itemHandler: ItemHandler): void; 24 | parseBuffer(buffer: Buffer, itemHandler: ItemHandler): void; 25 | } 26 | 27 | export type Item = { 28 | x: number; 29 | y: number; 30 | sw: number; 31 | w: number; 32 | A: string; 33 | clr: number; 34 | R: { 35 | T: string; 36 | S: number; 37 | TS: any[]; 38 | }[]; 39 | text: string; 40 | }; 41 | 42 | export type RuleAccumulator = (item: Item) => boolean | void; 43 | export type RuleHandler = (value: T) => void; 44 | 45 | export interface TableResult { 46 | matrix: string[][]; 47 | items: Item[]; 48 | } 49 | 50 | export class TableParser { 51 | private rows: { [key: string]: Item[] }; 52 | constructor(); 53 | processItem(item: Item, col: number): void; 54 | processHeadingItem(item: Item, col: number): void; 55 | getRows(): Item[][]; 56 | renderRows(): string; 57 | /** row-> column-> items_collisionning_in_column-> item:Item */ 58 | getMatrix(): Item[][][]; 59 | getCleanMatrix(options?: { collisionSeparator: string }): string[][]; 60 | renderMatrix(): string; 61 | } 62 | 63 | export class Rule { 64 | static on(regexp: RegExp): Rule; 65 | static after(regexp: RegExp): Rule; 66 | static makeItemProcessor(rules: Rule[]): (item: DataEntry) => void; 67 | static addAccumulator(methodName: string, methodBuilder: Function): void; 68 | 69 | constructor(regexp: RegExp); 70 | 71 | // Accumulator methods 72 | extractRegexpValues(): Rule; 73 | parseNextItemValue(): Rule; 74 | accumulateAfterHeading(): Rule; 75 | 
accumulateFromSameX(): Rule; 76 | parseColumns(...args: any[]): Rule; 77 | parseTable(columnCount: number): Rule & { 78 | then(handler: (result: TableResult) => void): Rule; 79 | }; 80 | 81 | then(handler: RuleHandler): Rule; 82 | 83 | private test(item: Item): RuleAccumulator | undefined; 84 | private whenDone(callback: () => void): void; 85 | } 86 | -------------------------------------------------------------------------------- /PdfReader.js: -------------------------------------------------------------------------------- 1 | /** 2 | * PdfReader: class that reads a PDF file, and calls a function on each item found while parsing that file. 3 | * @author Adrien Joly, http://github.com/adrienjoly 4 | * This content is released under the MIT License. 5 | * 6 | * An item object can match one of the following objects: 7 | * - null, when the parsing is over, or an error occured. 8 | * - {file:{path:string}}, when a PDF file is being opened. 9 | * - {page:integer}, when a new page is being parsed, provides the page number, starting at 1. 10 | * - {text:string, x:float, y:float, w:float, h:float...}, represents each text with its position. 11 | * 12 | **/ 13 | 14 | import { log as LOG } from "./lib/LOG.js"; 15 | import PDFParser from "pdf2json"; // doc: https://github.com/modesty/pdf2json 16 | 17 | function forEachItem(pdf, handler) { 18 | var pageNumber = 0; 19 | // pdf.formImage was removed in pdf2json@2, but we keep backward compatibility too 20 | var Pages = pdf.Pages || pdf.formImage.Pages; 21 | for (var p in Pages) { 22 | var page = Pages[p]; 23 | var number = ++pageNumber; 24 | handler(null, { 25 | page: number, 26 | width: page.Width || (pdf.formImage ? pdf.formImage.Width : 0), 27 | height: 28 | page.Height || 29 | (pdf.formImage ? 
pdf.formImage.Pages[number - 1].Height : 0), 30 | }); 31 | for (var t in page.Texts) { 32 | var item = page.Texts[t]; 33 | item.text = decodeURIComponent(item.R[0].T); 34 | handler(null, item); 35 | } 36 | } 37 | handler(); 38 | } 39 | 40 | export function PdfReader(options) { 41 | LOG("PdfReader"); // only displayed if LOG.js was first loaded with `true` as init parameter 42 | this.options = options || {}; 43 | } 44 | 45 | /** 46 | * parseFileItems: calls itemHandler(error, item) on each item parsed from the pdf file 47 | **/ 48 | PdfReader.prototype.parseFileItems = function (pdfFilePath, itemHandler) { 49 | itemHandler(null, { file: { path: pdfFilePath } }); 50 | var pdfParser; 51 | if (this.options.password) { 52 | pdfParser = new PDFParser(null, null, this.options.password); 53 | } else { 54 | pdfParser = new PDFParser(); 55 | } 56 | 57 | pdfParser.on("pdfParser_dataError", itemHandler); 58 | pdfParser.on("pdfParser_dataReady", function (pdfData) { 59 | forEachItem(pdfData, itemHandler); 60 | }); 61 | var verbosity = this.options.debug ? 1 : 0; 62 | pdfParser.loadPDF(pdfFilePath, verbosity); 63 | 64 | this.options.signal?.addEventListener("abort", function () { 65 | pdfParser.destroy(); 66 | }); 67 | }; 68 | 69 | /** 70 | * parseBuffer: calls itemHandler(error, item) on each item parsed from the pdf file received as a buffer 71 | */ 72 | PdfReader.prototype.parseBuffer = function (pdfBuffer, itemHandler) { 73 | itemHandler(null, { file: { buffer: pdfBuffer } }); 74 | var pdfParser; 75 | if (this.options.password) { 76 | pdfParser = new PDFParser(null, null, this.options.password); 77 | } else { 78 | pdfParser = new PDFParser(); 79 | } 80 | 81 | pdfParser.on("pdfParser_dataError", itemHandler); 82 | pdfParser.on("pdfParser_dataReady", function (pdfData) { 83 | forEachItem(pdfData, itemHandler); 84 | }); 85 | var verbosity = this.options.debug ? 
1 : 0; 86 | pdfParser.parseBuffer(pdfBuffer, verbosity); 87 | 88 | this.options.signal?.addEventListener("abort", function () { 89 | pdfParser.destroy(); 90 | }); 91 | }; 92 | -------------------------------------------------------------------------------- /lib/TableParser.js: -------------------------------------------------------------------------------- 1 | /** 2 | * TableParser 3 | * Classifies items into columns and rows, based on their left and top coordinates, 4 | * and left position of column headers. 5 | * @author Adrien Joly, http://github.com/adrienjoly 6 | * This content is released under the MIT License. 7 | **/ 8 | 9 | export function TableParser() { 10 | this.rows = {}; 11 | } 12 | 13 | TableParser.prototype.processItem = function (item, col) { 14 | var row = (this.rows["" + item.y] = this.rows["" + item.y] || {}); 15 | (row[col] = row[col] || []).push(item); 16 | }; 17 | 18 | TableParser.prototype.processHeadingItem = function (item, col) { 19 | this.processItem( 20 | { 21 | y: 0, 22 | x: item.x, 23 | text: item.text, 24 | }, 25 | col 26 | ); 27 | }; 28 | 29 | // Rows 30 | 31 | function sortAsFloatValues(values) { 32 | return values.slice().sort(function (a, b) { 33 | return parseFloat(a) - parseFloat(b); 34 | }); 35 | } 36 | 37 | TableParser.prototype.getRows = function () { 38 | var rows = this.rows; 39 | var yValues = sortAsFloatValues(Object.keys(rows)); 40 | return yValues.map(function (y) { 41 | return rows["" + y]; 42 | }); 43 | }; 44 | 45 | function renderRows(rows) { 46 | return (rows || []) 47 | .map(function (row, rowId) { 48 | var cells = []; 49 | for (var i in row) 50 | for (var j in row[i]) cells.push(row[i][j].x + ": " + row[i][j].text); 51 | return rowId + ":\t" + cells.join(", "); 52 | }) 53 | .join("\n"); 54 | } 55 | 56 | TableParser.prototype.renderRows = function () { 57 | return renderRows(this.getRows()); 58 | }; 59 | 60 | // Matrix 61 | 62 | function getSortedXValues(rows) { 63 | var xSet = {}; 64 | for (var y in rows) for 
(var x in rows[y]) xSet[x] = true; 65 | return sortAsFloatValues(Object.keys(xSet)); 66 | } 67 | 68 | /** @returns an 3-dimension matrix: row -> column -> items_collisionning_in_column -> item */ 69 | TableParser.prototype.getMatrix = function () { 70 | var rows = this.getRows(); 71 | var xValues = getSortedXValues(rows); 72 | return rows.map(function (row, y) { 73 | var rowNew = []; 74 | for (var x in row) { 75 | var items = row[x]; 76 | var colN = xValues.indexOf(x); 77 | rowNew[colN] = (rowNew[colN] || []).concat(items); 78 | } 79 | return rowNew; 80 | }); 81 | }; 82 | 83 | /** 84 | * For use with console.table(). 85 | * @param {String} collisionSeparator separator to use when there are multiple values to join for a given column 86 | * @returns a 2-dimension matrix: row -> column -> value 87 | */ 88 | TableParser.prototype.getCleanMatrix = function ({ collisionSeparator } = {}) { 89 | return this.getMatrix().map((rowColumns) => 90 | rowColumns.map((items) => 91 | items.map((item) => item.text).join(collisionSeparator || "") 92 | ) 93 | ); 94 | }; 95 | 96 | function getText(item) { 97 | return item.text; 98 | } 99 | 100 | function joinCellCollisions(separ) { 101 | return function (cell) { 102 | return (cell || []).map(getText).join(separ).substr(0, 7); 103 | }; 104 | } 105 | 106 | function renderMatrix(matrix) { 107 | return (matrix || []) 108 | .map(function (row) { 109 | return (row || []).map(joinCellCollisions("+")).join("\t"); 110 | }) 111 | .join("\n"); 112 | } 113 | 114 | TableParser.prototype.renderMatrix = function () { 115 | return renderMatrix(this.getMatrix()); 116 | }; 117 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## [1.2.1](https://github.com/adrienjoly/npm-pdfreader/compare/v1.2.0...v1.2.1) (2020-09-25) 2 | 3 | 4 | ### Bug Fixes 5 | 6 | * **deps:** Update dependencies 
([#63](https://github.com/adrienjoly/npm-pdfreader/issues/63)) ([308f322](https://github.com/adrienjoly/npm-pdfreader/commit/308f322ea670ab2ec11f77e3588667674709b453)) 7 | 8 | # [1.2.0](https://github.com/adrienjoly/npm-pdfreader/compare/v1.1.4...v1.2.0) (2020-09-25) 9 | 10 | 11 | ### Features 12 | 13 | * Support password-protected PDF files ([#61](https://github.com/adrienjoly/npm-pdfreader/issues/61)) ([248af89](https://github.com/adrienjoly/npm-pdfreader/commit/248af89d79304dfa64b5785614b496e4e5d36e69)), closes [#15](https://github.com/adrienjoly/npm-pdfreader/issues/15) 14 | 15 | ## [1.1.4](https://github.com/adrienjoly/npm-pdfreader/compare/v1.1.3...v1.1.4) (2020-09-25) 16 | 17 | 18 | ### Bug Fixes 19 | 20 | * Ease contributions ([#62](https://github.com/adrienjoly/npm-pdfreader/issues/62)) ([4a1fe66](https://github.com/adrienjoly/npm-pdfreader/commit/4a1fe6677d5a829049aa0c3c28dccb2f96e8e2f6)) 21 | 22 | ## [1.1.3](https://github.com/adrienjoly/npm-pdfreader/compare/v1.1.2...v1.1.3) (2020-04-26) 23 | 24 | 25 | ### Bug Fixes 26 | 27 | * **node:** use latest node 10 version ([#52](https://github.com/adrienjoly/npm-pdfreader/issues/52)) ([eb34ea9](https://github.com/adrienjoly/npm-pdfreader/commit/eb34ea92fea924d3d1e28b13a2e730b62a996b51)) 28 | 29 | ## [1.1.2](https://github.com/adrienjoly/npm-pdfreader/compare/v1.1.1...v1.1.2) (2020-04-26) 30 | 31 | 32 | ### Bug Fixes 33 | 34 | * **deps:** with npm audit fix ([#51](https://github.com/adrienjoly/npm-pdfreader/issues/51)) ([16502fc](https://github.com/adrienjoly/npm-pdfreader/commit/16502fce29af76ebf8216e17aafb388a54326b6c)) 35 | 36 | ## [1.1.1](https://github.com/adrienjoly/npm-pdfreader/compare/v1.1.0...v1.1.1) (2020-04-26) 37 | 38 | 39 | ### Bug Fixes 40 | 41 | * **deps:** bump acorn from 6.3.0 to 6.4.1 ([#46](https://github.com/adrienjoly/npm-pdfreader/issues/46)) ([af61802](https://github.com/adrienjoly/npm-pdfreader/commit/af61802d1430adab8c9c56588d8a5b565910bd3a)) 42 | 43 | # 
[1.1.0](https://github.com/adrienjoly/npm-pdfreader/compare/v1.0.10...v1.1.0) (2020-04-26) 44 | 45 | 46 | ### Features 47 | 48 | * **deps:** upgrade pdf2json to version 1.2.0 ([#50](https://github.com/adrienjoly/npm-pdfreader/issues/50)) ([0877162](https://github.com/adrienjoly/npm-pdfreader/commit/08771623aa7bf228b4a39e763e38614e79dca10c)), closes [#40](https://github.com/adrienjoly/npm-pdfreader/issues/40) 49 | 50 | ## [1.0.10](https://github.com/adrienjoly/npm-pdfreader/compare/v1.0.9...v1.0.10) (2020-04-26) 51 | 52 | 53 | ### Bug Fixes 54 | 55 | * **ci:** check formatting in separate step, after tests ([#49](https://github.com/adrienjoly/npm-pdfreader/issues/49)) ([9129b8a](https://github.com/adrienjoly/npm-pdfreader/commit/9129b8a4f860fbc674fd7485c7c0661c0344a71d)) 56 | 57 | ## [1.0.9](https://github.com/adrienjoly/npm-pdfreader/compare/v1.0.8...v1.0.9) (2020-04-26) 58 | 59 | 60 | ### Bug Fixes 61 | 62 | * **ci:** prettier to ignore CHANGELOG (generated) ([8bcf776](https://github.com/adrienjoly/npm-pdfreader/commit/8bcf77674a6e472c791accca4d8385e8462679b6)) 63 | * **ci:** skip github actions workflow on release commits ([c970cda](https://github.com/adrienjoly/npm-pdfreader/commit/c970cda451a3a3b53c9d42c721524b22a7714544)) 64 | 65 | ## [1.0.8](https://github.com/adrienjoly/npm-pdfreader/compare/v1.0.7...v1.0.8) (2020-04-26) 66 | 67 | 68 | ### Bug Fixes 69 | 70 | * **release:** automatic update of version in package.json ([#48](https://github.com/adrienjoly/npm-pdfreader/issues/48)) ([bad1d5b](https://github.com/adrienjoly/npm-pdfreader/commit/bad1d5bfce1c55b503cca3380c3187fb071b6056)) 71 | -------------------------------------------------------------------------------- /test/sample.rtf: -------------------------------------------------------------------------------- 1 | {\rtf1\ansi\ansicpg1252\cocoartf1344\cocoasubrtf720 2 | {\fonttbl\f0\froman\fcharset0 Times-Roman;} 3 | {\colortbl;\red255\green255\blue255;\red0\green0\blue0;} 4 | 
\paperw11900\paperh16840\margl1440\margr1440\vieww10800\viewh8400\viewkind0 5 | \deftab720 6 | \pard\pardeftab720\sa321 7 | 8 | \f0\b\fs48 \cf2 \expnd0\expndtw0\kerning0 9 | \outl0\strokewidth0 \strokec2 Hello "world"\ 10 | \pard\pardeftab720\sa240 11 | 12 | \b0\fs24 \cf2 \expnd0\expndtw0\kerning0 13 | \outl0\strokewidth0 Value:\ 14 | 4\ 15 | 16 | \itap1\trowd \taflags1 \trgaph108\trleft-108 \trbrdrt\brdrs\brdrw20\brdrcf2 \trbrdrl\brdrs\brdrw20\brdrcf2 \trbrdrr\brdrs\brdrw20\brdrcf2 17 | \clvertalc \clshdrawnil \clwWidth480\clftsWidth3 \clbrdrt\brdrnil \clbrdrl\brdrnil \clbrdrb\brdrnil \clbrdrr\brdrnil \clpadt20 \clpadl20 \clpadb20 \clpadr20 \gaph\cellx2880 18 | \clvertalc \clshdrawnil \clwWidth860\clftsWidth3 \clbrdrt\brdrnil \clbrdrl\brdrnil \clbrdrb\brdrnil \clbrdrr\brdrnil \clpadt20 \clpadl20 \clpadb20 \clpadr20 \gaph\cellx5760 19 | \clvertalc \clshdrawnil \clwWidth940\clftsWidth3 \clbrdrt\brdrnil \clbrdrl\brdrnil \clbrdrb\brdrnil \clbrdrr\brdrnil \clpadt20 \clpadl20 \clpadb20 \clpadr20 \gaph\cellx8640 20 | \pard\intbl\itap1\pardeftab720\qc 21 | 22 | \b \cf2 \expnd0\expndtw0\kerning0 23 | \outl0\strokewidth0 c1\cell 24 | \pard\intbl\itap1\pardeftab720\qc 25 | \cf2 \expnd0\expndtw0\kerning0 26 | \outl0\strokewidth0 c2\cell 27 | \pard\intbl\itap1\pardeftab720\qc 28 | \cf2 \expnd0\expndtw0\kerning0 29 | \outl0\strokewidth0 c3\cell \row 30 | 31 | \itap1\trowd \taflags1 \trgaph108\trleft-108 \trbrdrl\brdrs\brdrw20\brdrcf2 \trbrdrr\brdrs\brdrw20\brdrcf2 32 | \clvertalc \clshdrawnil \clwWidth120\clftsWidth3 \clbrdrt\brdrs\brdrw20\brdrcf2 \clbrdrl\brdrs\brdrw20\brdrcf2 \clbrdrb\brdrs\brdrw20\brdrcf2 \clbrdrr\brdrs\brdrw20\brdrcf2 \clpadl200 \clpadr200 \gaph\cellx2880 33 | \clvertalc \clshdrawnil \clwWidth480\clftsWidth3 \clbrdrt\brdrs\brdrw20\brdrcf2 \clbrdrl\brdrs\brdrw20\brdrcf2 \clbrdrb\brdrs\brdrw20\brdrcf2 \clbrdrr\brdrs\brdrw20\brdrcf2 \clpadl200 \clpadr200 \gaph\cellx5760 34 | \clvertalc \clshdrawnil \clwWidth560\clftsWidth3 \clbrdrt\brdrs\brdrw20\brdrcf2 
\clbrdrl\brdrs\brdrw20\brdrcf2 \clbrdrb\brdrs\brdrw20\brdrcf2 \clbrdrr\brdrs\brdrw20\brdrcf2 \clpadl200 \clpadr200 \gaph\cellx8640 35 | \pard\intbl\itap1\pardeftab720\qr 36 | 37 | \b0 \cf2 \expnd0\expndtw0\kerning0 38 | \outl0\strokewidth0 1\cell 39 | \pard\intbl\itap1\pardeftab720\qr 40 | \cf2 \expnd0\expndtw0\kerning0 41 | \outl0\strokewidth0 \cell 42 | \pard\intbl\itap1\pardeftab720\qr 43 | \cf2 \expnd0\expndtw0\kerning0 44 | \outl0\strokewidth0 2.3\cell \row 45 | 46 | \itap1\trowd \taflags1 \trgaph108\trleft-108 \trbrdrl\brdrs\brdrw20\brdrcf2 \trbrdrb\brdrs\brdrw20\brdrcf2 \trbrdrr\brdrs\brdrw20\brdrcf2 47 | \clvertalc \clshdrawnil \clwWidth120\clftsWidth3 \clbrdrt\brdrs\brdrw20\brdrcf2 \clbrdrl\brdrs\brdrw20\brdrcf2 \clbrdrb\brdrs\brdrw20\brdrcf2 \clbrdrr\brdrs\brdrw20\brdrcf2 \clpadl200 \clpadr200 \gaph\cellx2880 48 | \clvertalc \clshdrawnil \clwWidth480\clftsWidth3 \clbrdrt\brdrs\brdrw20\brdrcf2 \clbrdrl\brdrs\brdrw20\brdrcf2 \clbrdrb\brdrs\brdrw20\brdrcf2 \clbrdrr\brdrs\brdrw20\brdrcf2 \clpadl200 \clpadr200 \gaph\cellx5760 49 | \clvertalc \clshdrawnil \clwWidth560\clftsWidth3 \clbrdrt\brdrs\brdrw20\brdrcf2 \clbrdrl\brdrs\brdrw20\brdrcf2 \clbrdrb\brdrs\brdrw20\brdrcf2 \clbrdrr\brdrs\brdrw20\brdrcf2 \clpadl200 \clpadr200 \gaph\cellx8640 50 | \pard\intbl\itap1\pardeftab720\qr 51 | \cf2 \expnd0\expndtw0\kerning0 52 | \outl0\strokewidth0 \cell 53 | \pard\intbl\itap1\pardeftab720\qr 54 | \cf2 \expnd0\expndtw0\kerning0 55 | \outl0\strokewidth0 hello\cell 56 | \pard\intbl\itap1\pardeftab720\qr 57 | \cf2 \expnd0\expndtw0\kerning0 58 | \outl0\strokewidth0 world\cell \lastrow\row 59 | \pard\pardeftab720\sa240 60 | \cf2 \expnd0\expndtw0\kerning0 61 | \outl0\strokewidth0 Values:\ 62 | 1\ 63 | 2\ 64 | 3\ 65 | } -------------------------------------------------------------------------------- /test/test.js: -------------------------------------------------------------------------------- 1 | import assert from "assert"; 2 | import test from "ava"; 3 | import { toggle } from 
"../lib/LOG.js"; 4 | import * as lib from "../index.js"; 5 | 6 | toggle(false); /* presumably turns the library's LOG output off for the test run — confirm in lib/LOG.js */ 7 | 8 | const PdfReader = lib.PdfReader; 9 | const Rule = lib.Rule; 10 | 11 | const TESTFILE = "./test/sample.pdf"; 12 | const TESTFILE_WITH_PASSWORD = "./test/sample-with-password.pdf"; 13 | 14 | test("parse raw items from pdf file", async (t) => { 15 | const res = new Promise((resolve, reject) => { 16 | const items = []; 17 | new PdfReader().parseFileItems(TESTFILE, (err, item) => { 18 | if (err) reject(err); 19 | else if (!item) resolve(items); /* a falsy item marks the end of the file */ 20 | else items.push(item); 21 | }); 22 | }); 23 | t.snapshot(await res); 24 | }); 25 | 26 | test("parse structured content from pdf file, using rules", async (t) => { 27 | const res = new Promise((resolve, reject) => { 28 | const content = []; 29 | const rules = [ 30 | Rule.on(/^Hello \"(.*)\"$/) 31 | .extractRegexpValues() 32 | .then((value) => content.push({ extractRegexpValues: value })), 33 | Rule.on(/^Value\:/) 34 | .parseNextItemValue() 35 | .then((value) => content.push({ parseNextItemValue: value })), 36 | Rule.on(/^c1$/) 37 | .parseTable(3) 38 | .then((table) => 39 | content.push({ 40 | "parseTable.renderMatrix": lib.parseTable.renderMatrix( 41 | table.matrix 42 | ), 43 | "parseTable.renderItems": lib.parseTable.renderItems(table.items), 44 | }) 45 | ), 46 | Rule.on(/^Values\:/) 47 | .accumulateAfterHeading() 48 | .then((value) => content.push({ accumulateAfterHeading: value })), 49 | ]; 50 | const processItem = Rule.makeItemProcessor(rules); 51 | new PdfReader().parseFileItems(TESTFILE, (err, item) => { 52 | if (err) reject(err); 53 | else { 54 | processItem(item); /* the final falsy item is forwarded too, so that the last accumulator gets terminated */ 55 | if (!item) resolve(content); 56 | } 57 | }); 58 | }); 59 | t.snapshot(await res); 60 | }); 61 | 62 | test("parse Table from PDF file, using TableParser", async (t) => { 63 | const matrix = await new Promise((resolve, reject) => { 64 | // the thresholds were determined manually, based on the horizontal position (x) for column headers 65 | const colThresholds = [6.8, 
9.5, 13.3, 16.7, 18.4, 28, 32, 36, Infinity]; 66 | 67 | const columnQuantitizer = (item) => { /* column = index of the first threshold greater than the item's x; the trailing Infinity matches any numeric x */ 68 | const col = colThresholds.findIndex( 69 | (colThreshold) => parseFloat(item.x) < colThreshold 70 | ); 71 | assert(col >= 0, col); 72 | assert(col < colThresholds.length, col); 73 | // console.log(`COL ${col}\t${parseFloat(item.x)}\t${item.text}`); 74 | return col; 75 | }; 76 | 77 | const table = new lib.TableParser(); 78 | new PdfReader().parseFileItems("./test/sample-table.pdf", (err, item) => { 79 | if (err) reject(err); 80 | else if (!item) { 81 | resolve(table.getCleanMatrix({ collisionSeparator: "" })); 82 | } else if (item.text) { 83 | table.processItem(item, columnQuantitizer(item)); 84 | } 85 | }); 86 | }); 87 | // console.table(matrix); 88 | t.snapshot(matrix); 89 | }); 90 | 91 | test("support pdf file with password", async (t) => { 92 | const promise = new Promise((resolve, reject) => 93 | new PdfReader({ password: "password" }).parseFileItems( 94 | TESTFILE_WITH_PASSWORD, 95 | (err, item) => { 96 | if (err) reject(err); 97 | else if (!item) resolve(); 98 | } 99 | ) 100 | ); 101 | await t.notThrowsAsync(promise); 102 | }); 103 | 104 | test("sample scripts should print raw items from pdf file", async (t) => { 105 | const { execa } = await import("execa"); 106 | const { stdout, stderr } = await execa("npm run test:samples", { 107 | shell: true, // needed in order to run npm commands with execa 108 | }); 109 | t.snapshot({ stdout, stderr }); 110 | }); 111 | -------------------------------------------------------------------------------- /lib/parseTable.js: -------------------------------------------------------------------------------- 1 | /** 2 | * parseTable accumulator, for pdfreader, used by the Rule class. 3 | * items are classified into columns and rows, based on their left and top coordinates, 4 | * and left position of column headers. 
5 | * TODO: use TableParser 6 | * @author Adrien Joly, http://github.com/adrienjoly 7 | * This content is released under the MIT License. 8 | **/ 9 | 10 | function getTopPos(item) { 11 | return item.y; 12 | } 13 | 14 | function getLeftPos(item) { 15 | return item.x; 16 | } 17 | 18 | function getText(item) { 19 | return item.text; 20 | } 21 | 22 | /** 23 | * makeFloorClassifier(): makes a classifier, based on an array of numbers and an expected number of clusters. 24 | * nbClusters: expected number of clusters 25 | * arr: array of numbers 26 | * => returns a function that takes a number, and returns the index of its corresponding cluster: the smallest number of arr maps to 0 and the largest one to nbClusters. NOTE(review): if all numbers of arr are equal, delta is 0 and classify() returns NaN — confirm whether single-row tables must be supported. 27 | **/ 28 | function makeFloorClassifier(nbClusters, arr) { 29 | var min = Math.min.apply(Math, arr); 30 | var delta = Math.max.apply(Math, arr) - min; 31 | min -= delta / nbClusters / 2; 32 | return function classify(value) { 33 | return Math.floor((nbClusters * (value - min)) / delta); 34 | }; 35 | } 36 | 37 | function makeColumnClassifier(header) { /* column boundaries = 0 plus the left position of every header item, sorted ascending; an item belongs to the right-most boundary that is not greater than its own left position */ 38 | var colX = [0].concat(header.map(getLeftPos)).sort(function (a, b) { 39 | return a - b; 40 | }); 41 | return function classify(item) { 42 | for (var i = colX.length - 1; i > -1; --i) 43 | if (getLeftPos(item) >= colX[i]) return i; 44 | }; 45 | } 46 | 47 | function buildRowList(items, classifyRow) { /* groups items by the row index computed from their top position; the resulting array may contain holes */ 48 | var rows = []; 49 | for (var i in items) { 50 | var item = items[i]; 51 | var row = classifyRow(getTopPos(item)); 52 | (rows[row] = rows[row] || []).push(item); 53 | } 54 | return rows; 55 | } 56 | 57 | function joinCellCollisions(separ) { /* returns a function that joins the texts of a cell's items with separ, truncated to 7 characters */ 58 | return function (cell) { 59 | return (cell || []).map(getText).join(separ).substr(0, 7); 60 | }; 61 | } 62 | 63 | function fillTab(str) { /* despite its name, this only truncates str to 7 characters */ 64 | return str.substr(0, 7); 65 | } 66 | 67 | export function renderTable(table) { /* one line per row; cells are truncated by fillTab() and separated by tabs */ 68 | return (table || []) 69 | .map(function (row) { 70 | return (row || []).map(fillTab).join("\t"); 71 | }) 72 | .join("\n"); 73 | } 74 | 75 | export function renderMatrix(matrix) { 76 | return (matrix || []) 77 | 
.map(function (row) { 78 | return (row || []).map(joinCellCollisions("+")).join("\t"); 79 | }) 80 | .join("\n"); 81 | } 82 | 83 | export function renderRows(rows) { /* one line per row: the row index, then each item rendered as "floor(x):text" truncated to 7 characters, separated by tabs */ 84 | return (rows || []) 85 | .map(function (row, rowId) { 86 | var cells = [rowId + ":"]; 87 | for (var i in row) 88 | cells.push((Math.floor(row[i].x) + ":" + row[i].text).substr(0, 7)); 89 | return cells.join("\t"); 90 | }) 91 | .join("\n"); 92 | } 93 | 94 | export function renderItems(items) { /* one line per item: y, x and text, separated by tabs (items must be an array: there is no fallback here) */ 95 | return items 96 | .map(function (i) { 97 | return [i.y, i.x, i.text].join("\t"); 98 | }) 99 | .join("\n"); 100 | } 101 | 102 | function buildMatrix(rows, classifyColumn) { /* turns a list of rows into a matrix of cells, each cell being the array of items classified into that column; for...in skips any hole of rows, so only existing rows are pushed */ 103 | var matrix = []; 104 | for (var y in rows) { 105 | var row = []; 106 | for (var x in rows[y]) { 107 | var item = rows[y][x]; 108 | var colN = classifyColumn(item); 109 | (row[colN] = row[colN] || []).push(item); 110 | } 111 | matrix.push(row); 112 | } 113 | return matrix; 114 | } 115 | 116 | export function detectCollisions(matrix) { /* lists the cells that hold more than one item, with their row and column indices */ 117 | var collisions = []; 118 | (matrix || []).map(function (row, rowN) { 119 | (row || []).map(function (cellItems, colN) { 120 | if (cellItems.length > 1) 121 | collisions.push({ 122 | row: rowN, 123 | col: colN, 124 | items: cellItems, 125 | }); 126 | }); 127 | }); 128 | return collisions; 129 | } 130 | 131 | export const parseTable = function makeAccumulator(nbRows, headerRow) { /* accumulator builder: expects to be called with a Rule instance as `this` (it uses its whenDone() method and sets its nbRows and output fields) */ 132 | var rule = this, 133 | items = []; 134 | 135 | rule.nbRows = nbRows || 0; 136 | rule.output = { 137 | items: items, 138 | rows: null, 139 | matrix: null, 140 | }; 141 | 142 | function accumulate(item) { 143 | items.push(item); 144 | } 145 | 146 | // when parsing is done: generate a clean table, from items. 
147 | rule.whenDone(function () { /* whenDone() invokes this callback with the rule as `this` */ 148 | // classify items into rows 149 | var classifyRow = makeFloorClassifier(rule.nbRows, items.map(getTopPos)); 150 | //LOG(items.map(function(i){ return [getTopPos(i), classifyRow(getTopPos(i)), i.text].join("\t"); }).join("\n")); 151 | this.output.rows = buildRowList(items, classifyRow); 152 | // classify row items into columns, using the items of the header row (first row by default) as column positions 153 | var classifyColumn = makeColumnClassifier(this.output.rows[headerRow || 0]); 154 | this.output.matrix = buildMatrix(this.output.rows, classifyColumn); 155 | }); 156 | 157 | return accumulate; // then the same function will be run on all following items, until another rule is triggered 158 | }; 159 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pdfreader ![Node CI](https://github.com/adrienjoly/npm-pdfreader/workflows/Node%20CI/badge.svg) [![Code Quality](https://api.codacy.com/project/badge/Grade/73d37dbb0ff84795acf65a55c5936d83)](https://app.codacy.com/gh/adrienjoly/npm-pdfreader?utm_source=github.com&utm_medium=referral&utm_content=adrienjoly/npm-pdfreader&utm_campaign=Badge_Grade) 2 | 3 | Read text and parse tables from PDF files. 4 | 5 | Supports **tabular data** with automatic column detection, and **rule-based parsing**. 6 | 7 | Dependencies: it is based on [pdf2json](https://www.npmjs.com/package/pdf2json), which itself relies on Mozilla's [pdf.js](https://github.com/mozilla/pdf.js/). 8 | 9 | 🆕 Now includes TypeScript type definitions! 10 | 11 | ℹ️ Important notes: 12 | 13 | - This module is meant to be run using Node.js only. **It does not work from a web browser.** 14 | - This module extracts text entries from PDF files. It does not support photographed text. If you cannot select text from the PDF file, **you may need to use OCR software first**. 
15 | 16 | Summary: 17 | 18 | - [Installation, tests and CLI usage](#installation-tests-and-cli-usage) 19 | - [Raw PDF reading](#raw-pdf-reading) (incl. examples) 20 | - [Rule-based data extraction](#rule-based-data-extraction) 21 | - [Troubleshooting & FAQ](#troubleshooting--faq) 22 | 23 | ## Installation, tests and CLI usage 24 | 25 | After installing [Node.js](https://nodejs.org/): 26 | 27 | ```sh 28 | git clone https://github.com/adrienjoly/npm-pdfreader.git 29 | cd npm-pdfreader 30 | npm install 31 | npm test 32 | node parse.js test/sample.pdf 33 | ``` 34 | 35 | ## Installation into an existing project 36 | 37 | To install `pdfreader` as a dependency of your Node.js project: 38 | 39 | ```sh 40 | npm install pdfreader 41 | ``` 42 | 43 | Then, see below for examples of use. 44 | 45 | ## Raw PDF reading 46 | 47 | This module exposes the `PdfReader` class, to be instantiated. You can pass `{ debug: true }` to the constructor, in order to log debugging information (useful for troubleshooting). 48 | 49 | Your instance has two methods for parsing a PDF. They return the same output and differ only in input: `PdfReader.parseFileItems` (as below) for a filename, and `PdfReader.parseBuffer` (see: "Raw PDF reading from a PDF buffer") from data that you don't want to reference from the filesystem. 50 | 51 | Whichever method you choose, it asks for a callback, which gets called each time the instance finds what it denotes as a PDF item. 52 | 53 | An item object can match one of the following objects: 54 | 55 | - `null`, when the parsing is over, or an error occurred. 56 | - File metadata, `{file:{path:string}}`, when a PDF file is being opened, and is always the first item. 57 | - Page metadata, `{page:integer, width:float, height:float}`, when a new page is being parsed, provides the page number, starting at 1. This basically acts as a carriage return for the coordinates of text items to be processed. 
58 | - Text items, `{text:string, x:float, y:float, w:float, ...}`, which you can think of as simple objects with a text property, and floating 2D AABB coordinates on the page. 59 | 60 | It's up to your callback to process these items into a data structure of your choice, and also to handle any errors thrown to it. 61 | 62 | For example: 63 | 64 | ```javascript 65 | import { PdfReader } from "pdfreader"; 66 | 67 | new PdfReader().parseFileItems("test/sample.pdf", (err, item) => { 68 | if (err) console.error("error:", err); 69 | else if (!item) console.warn("end of file"); 70 | else if (item.text) console.log(item.text); 71 | }); 72 | ``` 73 | 74 | ### Parsing a password-protected PDF file 75 | 76 | ```javascript 77 | new PdfReader({ password: "YOUR_PASSWORD" }).parseFileItems( 78 | "test/sample-with-password.pdf", 79 | function (err, item) { 80 | if (err) console.error(err); 81 | else if (!item) console.warn("end of file"); 82 | else if (item.text) console.log(item.text); 83 | } 84 | ); 85 | ``` 86 | 87 | ### Raw PDF reading from a PDF buffer 88 | 89 | As above, but reading from a buffer in memory rather than from a file referenced by path. 
For example: 90 | 91 | ```javascript 92 | import fs from "fs"; 93 | import { PdfReader } from "pdfreader"; 94 | 95 | fs.readFile("test/sample.pdf", (err, pdfBuffer) => { 96 | // pdfBuffer contains the file content 97 | new PdfReader().parseBuffer(pdfBuffer, (err, item) => { 98 | if (err) console.error("error:", err); 99 | else if (!item) console.warn("end of buffer"); 100 | else if (item.text) console.log(item.text); 101 | }); 102 | }); 103 | ``` 104 | 105 | ### Other examples of use 106 | 107 | ![example cv resume parse convert pdf to text](https://github.com/adrienjoly/npm-pdfreader-example/raw/master/parseRows.png) 108 | 109 | ![example cv resume parse convert pdf table to text](https://github.com/adrienjoly/npm-pdfreader-example/raw/master/parseTable.png) 110 | 111 | Source code of the examples above: [parsing a CV/résumé](https://github.com/adrienjoly/npm-pdfreader-example). 112 | 113 | For more, see [Examples of use](https://github.com/adrienjoly/npm-pdfreader/discussions/categories/examples-of-use). 114 | 115 | ## Rule-based data extraction 116 | 117 | The `Rule` class can be used to define and process data extraction rules, while parsing a PDF document. 118 | 119 | `Rule` instances expose "accumulators": methods that define the data extraction strategy to be used for each rule. 
120 | 121 | Example: 122 | 123 | ```javascript 124 | const processItem = Rule.makeItemProcessor([ 125 | Rule.on(/^Hello \"(.*)\"$/) 126 | .extractRegexpValues() 127 | .then(displayValue), 128 | Rule.on(/^Value\:/) 129 | .parseNextItemValue() 130 | .then(displayValue), 131 | Rule.on(/^c1$/).parseTable(3).then(displayTable), 132 | Rule.on(/^Values\:/) 133 | .accumulateAfterHeading() 134 | .then(displayValue), 135 | ]); 136 | new PdfReader().parseFileItems("test/sample.pdf", (err, item) => { 137 | if (err) console.error(err); 138 | else processItem(item); 139 | }); 140 | ``` 141 | 142 | ## Troubleshooting & FAQ 143 | 144 | ### Is it possible to parse a PDF document from a web application? 145 | 146 | Solutions exist, but this module cannot be run directly by a web browser. If you really want to use this module, you will have to integrate it into your back-end so that PDF files can be read from your server. 147 | 148 | ### `Cannot read property 'userAgent' of undefined` error from an express-based node.js app 149 | 150 | Dmitry found out that you may need to run these instructions before including the `pdfreader` module: 151 | 152 | ```js 153 | global.navigator = { 154 | userAgent: "node", 155 | }; 156 | 157 | window.navigator = { 158 | userAgent: "node", 159 | }; 160 | ``` 161 | 162 | Source: [express - TypeError: Cannot read property 'userAgent' of undefined error on node.js app run - Stack Overflow](https://stackoverflow.com/questions/49208414/typeerror-cannot-read-property-useragent-of-undefined-error-on-node-js-app-ru) 163 | -------------------------------------------------------------------------------- /Rule.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Rule: class that can be used to define and process data extraction rules, while parsing a PDF document. 3 | * @author Adrien Joly, http://github.com/adrienjoly 4 | * This content is released under the MIT License. 
5 | **/ 6 | 7 | import { log as LOG } from "./lib/LOG.js"; 8 | import { parseColumns } from "./lib/parseColumns.js"; 9 | import { parseTable } from "./lib/parseTable.js"; 10 | 11 | /** 12 | * regexp: a regular expression which a PDF item's text must match in order to execute that rule. 13 | * => a Rule object exposes "accumulators": methods that define the data extraction strategy of a rule. 14 | **/ 15 | export function Rule(regexp) { 16 | this.regexp = regexp; 17 | var self = this; 18 | // proxy accumulator methods: each one records which accumulator to build (and with which arguments), then returns the rule for chaining. Only the accumulators registered at construction time are proxied. 19 | Object.keys(Rule.accumulators).forEach(function (name) { 20 | self[name] = function () { 21 | LOG("building rule:", regexp, "->", name); 22 | self.methodName = name; 23 | self.accumulatorParams = arguments; 24 | self.accumulatorBuilder = Rule.accumulators[name]; 25 | return self; 26 | }; 27 | }); 28 | } 29 | 30 | // shortcut for defining Rule objects in a more concise manner 31 | Rule.on = function (regexp) { 32 | return new Rule(regexp); 33 | }; 34 | 35 | Rule.after = function (regexp) { /* same as Rule.on(), but flags the rule with skipCurrentItem, so that the item processor does not test rules against the item that follows the match */ 36 | var rule = new Rule(regexp); 37 | rule.skipCurrentItem = true; 38 | return rule; 39 | }; 40 | 41 | /** 42 | * then(): defines a function to be called after a Rule's accumulator has finished processing items. 43 | * fct: the function to be called after a Rule's accumulator has finished processing items. 44 | * the output of the accumulator will be passed as the first parameter of that function, and the rule as `this`. 45 | * => returns the rule, for chaining. 46 | **/ Rule.prototype.then = function (fct) { 47 | var self = this; 48 | this.terminate = function () { 49 | fct.call(self, self.output); 50 | }; 51 | return this; 52 | }; 53 | 54 | // private function that checks a PDF item against the Rule's regexp, and returns the corresponding accumulator (or undefined, if the item's text does not match). 
55 | Rule.prototype.test = function (item) { 56 | if (this.regexp.test(item.text)) { 57 | // lazy init of accumulators: build and init the accumulator on first match 58 | this.currentItem = item; 59 | if (!this.accumulatorImpl && this.accumulatorBuilder) { 60 | this.accumulatorImpl = this.accumulatorBuilder.apply( 61 | this, 62 | this.accumulatorParams 63 | ); 64 | this.accumulatorImpl.methodName = this.methodName; 65 | this.accumulatorImpl.terminate = this.terminate; 66 | } 67 | return this.accumulatorImpl; 68 | } 69 | }; 70 | 71 | // intended to be run from accumulator, in order to process output before calling termination then() handler. 72 | Rule.prototype.whenDone = function (fct) { 73 | var self = this; 74 | var then = this.terminate; 75 | this.terminate = function () { 76 | fct.call(self); 77 | then(); 78 | }; 79 | }; 80 | 81 | /** 82 | * rules: array of Rule objects that will be executed one-by-one, whenever a PDF item matches a rule. 83 | * each rule can only be executed once. 84 | * => returns a function to be called for each item by the PdfReader. 
85 | **/ 86 | Rule.makeItemProcessor = function (rules) { 87 | var currentAccumulator = null; 88 | function terminateAccumulator() { /* calls the terminate handler of the current accumulator, if it has one */ 89 | var terminatePreviousAcc = (currentAccumulator || {}).terminate; 90 | if (terminatePreviousAcc) { 91 | LOG("terminating accumulator:", currentAccumulator.methodName); 92 | terminatePreviousAcc(currentAccumulator); // TODO: remove currentAccumulator parameter 93 | } 94 | } 95 | var applyRulesOnNextItem = true; 96 | return function (item) { 97 | if (!item) 98 | // last item of the file => flush buffers 99 | return terminateAccumulator(); 100 | else if (!item.text) return; /* items without text (e.g. file and page metadata) are ignored */ 101 | //LOG("ITEM:", item.text, "=> apply rules:", applyRulesOnNextItem); 102 | if (applyRulesOnNextItem) 103 | for (var r in rules) { 104 | var accumulator = rules[r].test(item); 105 | if (accumulator) { 106 | terminateAccumulator(); 107 | LOG("current accumulator:", accumulator.methodName); 108 | if (rules[r].skipCurrentItem) applyRulesOnNextItem = false; 109 | currentAccumulator = accumulator; 110 | delete rules[r]; /* a rule fires only once: it is removed from the rules array passed by the caller */ 111 | return; 112 | } 113 | } 114 | else applyRulesOnNextItem = true; 115 | // if reaching this point, the current item matches none of the rules => accumulating data on current accumulator. If the accumulator returns a truthy value, rules will not be tested against the next item. 116 | if (currentAccumulator) applyRulesOnNextItem = !currentAccumulator(item); 117 | }; 118 | }; 119 | 120 | /** 121 | * Rule.accumulators: object (keyed by method name) of the accumulators that can be used for defining Rule objects. 122 | * An accumulator is a function that may (or may not) accept parameters, to be provided by the developer of a parser. 123 | * It returns another function that will be run on every following PDF item, in order to accumulate data. 124 | * The output of an accumulator is stored in this.output (field of its parent Rule object). 
125 | **/ 126 | Rule.accumulators = { 127 | stopAccumulating: function () { 128 | return function () {}; 129 | }, 130 | }; 131 | 132 | // method for adding accumulators 133 | Rule.addAccumulator = function (methodName, methodBuilder) { 134 | Rule.accumulators[methodName] = methodBuilder; 135 | }; 136 | 137 | /** 138 | * This accumulator will store the group values extracted by the regexp of the Rule object, 139 | * on the current matching PDF item, into an array. 140 | * 141 | * E.g. with regex: /hello ([a-z]+)/, the text "hello world" will yield "world". 142 | **/ 143 | Rule.addAccumulator("extractRegexpValues", function () { 144 | var matches = this.regexp.exec(this.currentItem.text); 145 | this.output = matches.slice(1); 146 | return function () {}; // following lines are not to be processed by this accumulator 147 | }); 148 | 149 | /** 150 | * This accumulator will store the value of the next PDF item. 151 | **/ 152 | Rule.addAccumulator("parseNextItemValue", function () { 153 | var self = this, 154 | done = false; 155 | return function (item) { 156 | if (done) return; 157 | done = true; 158 | self.output = item.text; 159 | }; 160 | }); 161 | 162 | /** 163 | * This accumulator will store the text of all following PDF items into an array. 164 | **/ 165 | Rule.addAccumulator("accumulateAfterHeading", function () { 166 | var output = (this.output = []); 167 | return function accumulate(item) { 168 | output.push(item.text); 169 | }; 170 | }); 171 | 172 | /** 173 | * This accumulator will store the text of all following PDF items with equal x-coordinates. 174 | **/ 175 | Rule.addAccumulator("accumulateFromSameX", function () { 176 | var output = (this.output = []), 177 | x = null; 178 | return function accumulate(item) { 179 | if (x === null) x = item.x; 180 | if (x == item.x) output.push(item.text); 181 | }; 182 | }); 183 | 184 | /** 185 | * This accumulator will store a table by detecting its columns, given their names. 
186 | **/ 187 | Rule.addAccumulator("parseColumns", parseColumns); 188 | 189 | /** 190 | * This accumulator will store a table by detecting its columns, given their count. 191 | **/ 192 | Rule.addAccumulator("parseTable", parseTable); 193 | -------------------------------------------------------------------------------- /test/snapshots/test.js.md: -------------------------------------------------------------------------------- 1 | # Snapshot report for `test/test.js` 2 | 3 | The actual snapshot is saved in `test.js.snap`. 4 | 5 | Generated by [AVA](https://avajs.dev). 6 | 7 | ## parse raw items from pdf file 8 | 9 | > Snapshot 1 10 | 11 | [ 12 | { 13 | file: { 14 | path: './test/sample.pdf', 15 | }, 16 | }, 17 | { 18 | height: 52.618, 19 | page: 1, 20 | width: 37.205, 21 | }, 22 | { 23 | A: 'left', 24 | R: [ 25 | { 26 | S: -1, 27 | T: 'Hello%20%22world%22', 28 | TS: [ 29 | 0, 30 | 28, 31 | 1, 32 | 0, 33 | ], 34 | }, 35 | ], 36 | clr: 0, 37 | sw: 0.32553125, 38 | text: 'Hello "world"', 39 | w: 6.138, 40 | x: 4.555, 41 | y: 5.154, 42 | }, 43 | { 44 | A: 'left', 45 | R: [ 46 | { 47 | S: -1, 48 | T: 'Value%3A', 49 | TS: [ 50 | 0, 51 | 15, 52 | 0, 53 | 0, 54 | ], 55 | }, 56 | ], 57 | clr: 0, 58 | sw: NaN, 59 | text: 'Value:', 60 | w: 2.666, 61 | x: 4.555, 62 | y: 7.174, 63 | }, 64 | { 65 | A: 'left', 66 | R: [ 67 | { 68 | S: -1, 69 | T: '4', 70 | TS: [ 71 | 0, 72 | 15, 73 | 0, 74 | 0, 75 | ], 76 | }, 77 | ], 78 | clr: 0, 79 | sw: NaN, 80 | text: '4', 81 | w: 0.5, 82 | x: 4.555, 83 | y: 8.761, 84 | }, 85 | { 86 | A: 'left', 87 | R: [ 88 | { 89 | S: -1, 90 | T: 'c1', 91 | TS: [ 92 | 0, 93 | 16, 94 | 1, 95 | 0, 96 | ], 97 | }, 98 | ], 99 | clr: 0, 100 | sw: 0.32553125, 101 | text: 'c1', 102 | w: 0.944, 103 | x: 5.095, 104 | y: 10.501, 105 | }, 106 | { 107 | A: 'left', 108 | R: [ 109 | { 110 | S: -1, 111 | T: 'c2', 112 | TS: [ 113 | 0, 114 | 16, 115 | 1, 116 | 0, 117 | ], 118 | }, 119 | ], 120 | clr: 0, 121 | sw: 0.32553125, 122 | text: 'c2', 123 | w: 0.944, 124 | x: 7.262, 
125 | y: 10.501, 126 | }, 127 | { 128 | A: 'left', 129 | R: [ 130 | { 131 | S: -1, 132 | T: 'c3', 133 | TS: [ 134 | 0, 135 | 16, 136 | 1, 137 | 0, 138 | ], 139 | }, 140 | ], 141 | clr: 0, 142 | sw: 0.32553125, 143 | text: 'c3', 144 | w: 0.944, 145 | x: 10.131, 146 | y: 10.501, 147 | }, 148 | { 149 | A: 'left', 150 | R: [ 151 | { 152 | S: -1, 153 | T: '1', 154 | TS: [ 155 | 0, 156 | 15, 157 | 0, 158 | 0, 159 | ], 160 | }, 161 | ], 162 | clr: 0, 163 | sw: NaN, 164 | text: '1', 165 | w: 0.5, 166 | x: 5.288, 167 | y: 11.447, 168 | }, 169 | { 170 | A: 'left', 171 | R: [ 172 | { 173 | S: -1, 174 | T: '2.3', 175 | TS: [ 176 | 0, 177 | 15, 178 | 0, 179 | 0, 180 | ], 181 | }, 182 | ], 183 | clr: 0, 184 | sw: NaN, 185 | text: '2.3', 186 | w: 1.25, 187 | x: 10.477, 188 | y: 11.447, 189 | }, 190 | { 191 | A: 'left', 192 | R: [ 193 | { 194 | S: -1, 195 | T: 'hello', 196 | TS: [ 197 | 0, 198 | 15, 199 | 0, 200 | 0, 201 | ], 202 | }, 203 | ], 204 | clr: 0, 205 | sw: NaN, 206 | text: 'hello', 207 | w: 2, 208 | x: 6.937, 209 | y: 12.363, 210 | }, 211 | { 212 | A: 'left', 213 | R: [ 214 | { 215 | S: -1, 216 | T: 'world', 217 | TS: [ 218 | 0, 219 | 15, 220 | 0, 221 | 0, 222 | ], 223 | }, 224 | ], 225 | clr: 0, 226 | sw: NaN, 227 | text: 'world', 228 | w: 2.333, 229 | x: 9.684, 230 | y: 12.363, 231 | }, 232 | { 233 | A: 'left', 234 | R: [ 235 | { 236 | S: -1, 237 | T: 'Values%3A', 238 | TS: [ 239 | 0, 240 | 15, 241 | 0, 242 | 0, 243 | ], 244 | }, 245 | ], 246 | clr: 0, 247 | sw: NaN, 248 | text: 'Values:', 249 | w: 3.055, 250 | x: 4.555, 251 | y: 13.248, 252 | }, 253 | { 254 | A: 'left', 255 | R: [ 256 | { 257 | S: -1, 258 | T: '1', 259 | TS: [ 260 | 0, 261 | 15, 262 | 0, 263 | 0, 264 | ], 265 | }, 266 | ], 267 | clr: 0, 268 | sw: NaN, 269 | text: '1', 270 | w: 0.5, 271 | x: 4.555, 272 | y: 14.835, 273 | }, 274 | { 275 | A: 'left', 276 | R: [ 277 | { 278 | S: -1, 279 | T: '2', 280 | TS: [ 281 | 0, 282 | 15, 283 | 0, 284 | 0, 285 | ], 286 | }, 287 | ], 288 | clr: 0, 289 | sw: NaN, 290 
| text: '2', 291 | w: 0.5, 292 | x: 4.555, 293 | y: 16.423, 294 | }, 295 | { 296 | A: 'left', 297 | R: [ 298 | { 299 | S: -1, 300 | T: '3', 301 | TS: [ 302 | 0, 303 | 15, 304 | 0, 305 | 0, 306 | ], 307 | }, 308 | ], 309 | clr: 0, 310 | sw: NaN, 311 | text: '3', 312 | w: 0.5, 313 | x: 4.555, 314 | y: 18.01, 315 | }, 316 | ] 317 | 318 | ## parse structured content from pdf file, using rules 319 | 320 | > Snapshot 1 321 | 322 | [ 323 | { 324 | extractRegexpValues: [ 325 | 'world', 326 | ], 327 | }, 328 | { 329 | parseNextItemValue: '4', 330 | }, 331 | { 332 | 'parseTable.renderItems': `10.501 7.262 c2␊ 333 | 10.501 10.131 c3␊ 334 | 11.447 5.288 1␊ 335 | 11.447 10.477 2.3␊ 336 | 12.363 6.937 hello␊ 337 | 12.363 9.684 world`, 338 | 'parseTable.renderMatrix': ` c2 c3␊ 339 | 1 2.3␊ 340 | hello world`, 341 | }, 342 | { 343 | accumulateAfterHeading: [ 344 | '1', 345 | '2', 346 | '3', 347 | ], 348 | }, 349 | ] 350 | 351 | ## parse Table from PDF file, using TableParser 352 | 353 | > Snapshot 1 354 | 355 | [ 356 | [ 357 | 'Version', 358 | 'LTS', 359 | 'Date', 360 | 'V8', 361 | 'npm', 362 | 'NODE_MODULE_VERSION [1]', 363 | ], 364 | [ 365 | 'Node.js 17.1.0', 366 | undefined, 367 | '2021-11-09', 368 | '9.5.172.25', 369 | '8.1.2', 370 | '102', 371 | 'Downloads', 372 | ' Changelog ', 373 | 'Docs', 374 | ], 375 | [ 376 | 'Node.js 17.0.1', 377 | undefined, 378 | '2021-10-20', 379 | '9.5.172.21', 380 | '8.1.0', 381 | '102', 382 | 'Downloads', 383 | ' Changelog ', 384 | 'Docs', 385 | ], 386 | [ 387 | 'Node.js 17.0.0', 388 | undefined, 389 | '2021-10-19', 390 | '9.5.172.21', 391 | '8.1.0', 392 | '102', 393 | 'Downloads', 394 | ' Changelog ', 395 | 'Docs', 396 | ], 397 | [ 398 | 'Node.js 16.14.2', 399 | 'Gallium', 400 | '2022-03-17', 401 | '9.4.146.24', 402 | '8.5.0', 403 | '93', 404 | 'Downloads', 405 | ' Changelog ', 406 | 'Docs', 407 | ], 408 | [ 409 | 'Node.js 16.14.1', 410 | 'Gallium', 411 | '2022-03-16', 412 | '9.4.146.24', 413 | '8.5.0', 414 | '93', 415 | 'Downloads', 416 | ' 
Changelog ', 417 | 'Docs', 418 | ], 419 | [ 420 | 'Node.js 16.14.0', 421 | 'Gallium', 422 | '2022-02-08', 423 | '9.4.146.24', 424 | '8.3.1', 425 | '93', 426 | 'Downloads', 427 | ' Changelog ', 428 | 'Docs', 429 | ], 430 | [ 431 | 'Node.js 16.13.2', 432 | 'Gallium', 433 | '2022-01-10', 434 | '9.4.146.24', 435 | '8.1.2', 436 | '93', 437 | 'Downloads', 438 | ' Changelog ', 439 | 'Docs', 440 | ], 441 | [ 442 | 'Node.js 16.13.1', 443 | 'Gallium', 444 | '2021-12-01', 445 | '9.4.146.24', 446 | '8.1.2', 447 | '93', 448 | 'Downloads', 449 | ' Changelog ', 450 | 'Docs', 451 | ], 452 | [ 453 | 'Node.js 16.13.0', 454 | 'Gallium', 455 | '2021-10-26', 456 | '9.4.146.19', 457 | '8.1.0', 458 | '93', 459 | 'Downloads', 460 | ' Changelog ', 461 | 'Docs', 462 | ], 463 | [ 464 | 'Node.js 16.12.0', 465 | undefined, 466 | '2021-10-20', 467 | '9.4.146.19', 468 | '8.1.0', 469 | '93', 470 | 'Downloads', 471 | ' Changelog ', 472 | 'Docs', 473 | ], 474 | ] 475 | 476 | ## sample scripts should print raw items from pdf file 477 | 478 | > Snapshot 1 479 | 480 | { 481 | stderr: `printing raw items from file: test/sample.pdf ...␊ 482 | done.␊ 483 | printing raw items from file: test/sample.pdf ...␊ 484 | done.`, 485 | stdout: `␊ 486 | > pdfreader@0.0.0-development test:samples␊ 487 | > node parse.js test/sample.pdf && node parseAsBuffer.js test/sample.pdf␊ 488 | ␊ 489 | file = test/sample.pdf␊ 490 | page = 1␊ 491 | 4.555 5.154 left 6 Hello "world"␊ 492 | 4.555 7.174 left 2 Value:␊ 493 | 4.555 8.761 left 0 4␊ 494 | 5.095 10.501 left 0 c1␊ 495 | 7.262 10.501 left 0 c2␊ 496 | 10.131 10.501 left 0 c3␊ 497 | 5.288 11.447 left 0 1␊ 498 | 10.477 11.447 left 1 2.3␊ 499 | 6.937 12.363 left 2 hello␊ 500 | 9.684 12.363 left 2 world␊ 501 | 4.555 13.248 left 3 Values:␊ 502 | 4.555 14.835 left 0 1␊ 503 | 4.555 16.423 left 0 2␊ 504 | 4.555 18.01 left 0 3␊ 505 | file = undefined␊ 506 | page = 1␊ 507 | 4.555 5.154 left 6 Hello "world"␊ 508 | 4.555 7.174 left 2 Value:␊ 509 | 4.555 8.761 left 0 4␊ 510 | 5.095 
10.501 left 0 c1␊ 511 | 7.262 10.501 left 0 c2␊ 512 | 10.131 10.501 left 0 c3␊ 513 | 5.288 11.447 left 0 1␊ 514 | 10.477 11.447 left 1 2.3␊ 515 | 6.937 12.363 left 2 hello␊ 516 | 9.684 12.363 left 2 world␊ 517 | 4.555 13.248 left 3 Values:␊ 518 | 4.555 14.835 left 0 1␊ 519 | 4.555 16.423 left 0 2␊ 520 | 4.555 18.01 left 0 3`, 521 | } 522 | --------------------------------------------------------------------------------