├── .nvmrc
├── .prettierignore
├── .gitattributes
├── .gitignore
├── .npmignore
├── .github
    ├── FUNDING.yml
    ├── ISSUE_TEMPLATE
    │   └── bug-report.md
    └── workflows
    │   └── nodejs.yml
├── test
    ├── sample.pdf
    ├── sample-table.pdf
    ├── sample-with-password.pdf
    ├── snapshots
    │   ├── test.js.snap
    │   └── test.js.md
    ├── sample.html
    ├── sample.rtf
    └── test.js
├── .eslintrc.json
├── .vscode
    ├── settings.json
    └── extensions.json
├── .editorconfig
├── index.js
├── lib
    ├── LOG.js
    ├── SequentialParser.js
    ├── ColumnsParser.js
    ├── parseColumns.js
    ├── TableParser.js
    └── parseTable.js
├── parse.js
├── LICENSE
├── parseAsBuffer.js
├── package.json
├── index.d.ts
├── PdfReader.js
├── CHANGELOG.md
├── README.md
└── Rule.js


/.nvmrc:
--------------------------------------------------------------------------------
1 | v16
2 | 


--------------------------------------------------------------------------------
/.prettierignore:
--------------------------------------------------------------------------------
1 | CHANGELOG.md
2 | 


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | * text=auto eol=lf
2 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /node_modules
2 | /dist
3 | 


--------------------------------------------------------------------------------
/.npmignore:
--------------------------------------------------------------------------------
1 | .*
2 | node_modules
3 | test
4 | 


--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: [adrienjoly]
2 | custom: ['https://adrienjoly.com/donate/']
3 | 


--------------------------------------------------------------------------------
/test/sample.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/adrienjoly/npm-pdfreader/HEAD/test/sample.pdf


--------------------------------------------------------------------------------
/test/sample-table.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/adrienjoly/npm-pdfreader/HEAD/test/sample-table.pdf


--------------------------------------------------------------------------------
/test/sample-with-password.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/adrienjoly/npm-pdfreader/HEAD/test/sample-with-password.pdf


--------------------------------------------------------------------------------
/test/snapshots/test.js.snap:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/adrienjoly/npm-pdfreader/HEAD/test/snapshots/test.js.snap


--------------------------------------------------------------------------------
/.eslintrc.json:
--------------------------------------------------------------------------------
1 | {
2 |   "extends": ["plugin:prettier/recommended"],
3 |   "parserOptions": { "ecmaVersion": 2020, "sourceType": "module" }
4 | }
5 | 


--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 |   "editor.formatOnSave": true,
3 |   "editor.defaultFormatter": "esbenp.prettier-vscode",
4 |   "prettier.singleQuote": false
5 | }
6 | 


--------------------------------------------------------------------------------
/.vscode/extensions.json:
--------------------------------------------------------------------------------
1 | {
2 |   "recommendations": [
3 |     "editorconfig.editorconfig",
4 |     "dbaeumer.vscode-eslint",
5 |     "esbenp.prettier-vscode"
6 |   ]
7 | }
8 | 


--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
 1 | ; http://editorconfig.org
 2 | 
 3 | root = true
 4 | 
 5 | [*]
 6 | indent_style = space
 7 | indent_size = 2
 8 | end_of_line = lf
 9 | charset = utf-8
10 | trim_trailing_whitespace = true
11 | insert_final_newline = true
12 | 


--------------------------------------------------------------------------------
/test/sample.html:
--------------------------------------------------------------------------------
 1 | <html>
 2 |   <head>
 3 |     <style>
 4 |       table, tr, td {
 5 |         border: 1px solid black;
 6 |         border-collapse: collapse;
 7 |       }
 8 |       td {
 9 |         text-align: right;
10 |         padding: 0 10px;
11 |       }
12 |     </style>
13 |   </head>
14 |   <body>
15 |     <h1>Hello "world"</h1>
16 |     <p>Value:</p>
17 |     <p>4</p>
18 |     <table>
19 |       <tr><th>c1</th><th>c2</th><th>c3</th></tr>
20 |       <tr><td>1</td><td></td><td>2.3</td></tr>
21 |       <tr><td></td><td>hello</td><td>world</td></tr>
22 |     </table>
23 |     <p>Values:</p>
24 |     <p>1</p>
25 |     <p>2</p>
26 |     <p>3</p>
27 |   </body>
28 | </html>


--------------------------------------------------------------------------------
/index.js:
--------------------------------------------------------------------------------
// Package entry point: re-exports the public API of the library.
export { PdfReader } from "./PdfReader.js";
export { Rule } from "./Rule.js";
export * as LOG from "./lib/LOG.js";
import * as parseTableExports from "./lib/parseTable.js";
// `parseTable` is exported as the module's `parseTable` function, onto which
// Object.assign() copies every named export of ./lib/parseTable.js as a property.
export const parseTable = Object.assign(
  parseTableExports.parseTable,
  parseTableExports
);
import * as parseColumnsExports from "./lib/parseColumns.js";
// Same pattern: the `parseColumns` function also carries every named export
// of ./lib/parseColumns.js as a property.
export const parseColumns = Object.assign(
  parseColumnsExports.parseColumns,
  parseColumnsExports
);
export { SequentialParser } from "./lib/SequentialParser.js"; // experimental
export { TableParser } from "./lib/TableParser.js";
export { ColumnsParser } from "./lib/ColumnsParser.js";
17 | 


--------------------------------------------------------------------------------
/lib/LOG.js:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Minimal logger
 3 |  * @author Adrien Joly, http://github.com/adrienjoly
 4 |  * This content is released under the MIT License.
 5 |  **/
 6 | 
 7 | import util from "util";
 8 | 
 9 | var nullLog = function LOG() {};
10 | 
11 | var realLog = function LOG() {
12 |   for (var i in arguments)
13 |     if (arguments[i] instanceof Object || arguments[i] instanceof Array)
14 |       arguments[i] = util.inspect(arguments[i]);
15 |   console.log("[DEBUG] " + Array.prototype.join.call(arguments, " "));
16 | };
17 | 
18 | var LOG = nullLog;
19 | 
20 | export function log() {
21 |   LOG.apply(null, arguments);
22 | }
23 | 
24 | export function toggle(enabled) {
25 |   LOG = !enabled ? nullLog : realLog;
26 | }
27 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug-report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report
 3 | about: Create a report to help us improve this npm package
 4 | ---
 5 | 
 6 | **Describe the bug**
 7 | A clear and concise description of what the bug is.
 8 | 
 9 | **To Reproduce**
10 | List the steps you followed and/or share your code to help us reproduce the bug
11 | 
12 | **Expected behavior**
13 | A clear and concise description of what you expected to happen.
14 | 
15 | **Screenshots, outputs or logs**
16 | If applicable, add screenshots, outputs or logs to help explain your problem.
17 | 
18 | **Desktop (please complete the following information):**
19 | 
20 | - OS: (e.g. iOS)
21 | - Browser: (e.g. chrome, safari)
22 | - Version: (e.g. 22)
23 | 
24 | **Additional context**
25 | Add any other context about the problem here.
26 | 


--------------------------------------------------------------------------------
/parse.js:
--------------------------------------------------------------------------------
 1 | import { toggle } from "./lib/LOG.js";
 2 | import { PdfReader } from "./index.js";
 3 | 
 4 | toggle(false);
 5 | 
 6 | function printRawItems(filename, callback) {
 7 |   new PdfReader().parseFileItems(filename, function (err, item) {
 8 |     if (err) callback(err);
 9 |     else if (!item) callback();
10 |     else if (item.file) console.log("file =", item.file.path);
11 |     else if (item.page) console.log("page =", item.page);
12 |     else if (item.x)
13 |       console.log(
14 |         [item.x, item.y, item.oc, item.A, Math.floor(item.w), item.text].join(
15 |           "\t"
16 |         )
17 |       );
18 |     else console.warn(item);
19 |   });
20 | }
21 | 
// CLI entry point: expects the path of a PDF file as first command-line argument.
const filename = process.argv[2];
if (!filename) {
  console.error("please provide the name of a PDF file");
  process.exitCode = 1; // report the usage error to the calling shell
} else {
  console.warn("printing raw items from file:", filename, "...");
  printRawItems(filename, function (err) {
    if (err) {
      console.error(err);
      process.exit(1);
    }
    console.warn("done.");
  });
}
35 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2015 Adrien Joly
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/parseAsBuffer.js:
--------------------------------------------------------------------------------
 1 | import fs from "fs";
 2 | import { toggle } from "./lib/LOG.js";
 3 | import { PdfReader } from "./index.js";
 4 | 
 5 | toggle(false);
 6 | 
 7 | function printRawItems(pdfBuffer, callback) {
 8 |   new PdfReader().parseBuffer(pdfBuffer, function (err, item) {
 9 |     if (err) callback(err);
10 |     else if (!item) callback();
11 |     else if (item.file) console.log("file =", item.file.path);
12 |     else if (item.page) console.log("page =", item.page);
13 |     else if (item.x)
14 |       console.log(
15 |         [item.x, item.y, item.oc, item.A, Math.floor(item.w), item.text].join(
16 |           "\t"
17 |         )
18 |       );
19 |     else console.warn(item);
20 |   });
21 | }
22 | 
// CLI entry point: expects the path of a PDF file as first command-line argument.
const filename = process.argv[2];
if (!filename) {
  console.error("please provide the name of a PDF file");
  process.exitCode = 1; // report the usage error to the calling shell
} else {
  console.warn("printing raw items from file:", filename, "...");
  fs.readFile(filename, (err, pdfBuffer) => {
    if (err) {
      // the file could not be read: there is no buffer to parse, so stop here
      console.error(err);
      process.exit(1);
    }
    printRawItems(pdfBuffer, function (err) {
      if (err) {
        console.error(err);
        process.exit(1);
      }
      console.warn("done.");
    });
  });
}
39 | 


--------------------------------------------------------------------------------
/lib/SequentialParser.js:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Applies a list of simple actions to apply to each provided item, in order to accumulate field values.
 3 |  * Provides a list of parsed `fields`.
 4 |  * Calls `callback(error, this)` when all accumulators were processed, or when processing a null item.
 5 |  **/
 6 | export function SequentialParser(accumulators, callback) {
 7 |   var step = 0;
 8 |   var fields = {};
 9 |   return {
10 |     fields: fields,
11 |     addField: function (key, value) {
12 |       this.fields[key] = value;
13 |     },
14 |     parseItem: function (item) {
15 |       if (step >= accumulators.length) {
16 |         return console.warn(
17 |           "warning: skipping item, because SequentialParser is done."
18 |         );
19 |       }
20 |       var current = accumulators[step];
21 |       if (current.field) {
22 |         this.addField(current.field, item);
23 |         ++step;
24 |       } else if (current.accumulator) {
25 |         var doneAccumulating = current.accumulator(item, this);
26 |         if (doneAccumulating) ++step;
27 |       } // no action => skip item
28 |       else ++step;
29 |       if (!item || step >= accumulators.length) {
30 |         callback && callback(null, this);
31 |       }
32 |     },
33 |   };
34 | }
35 | 


--------------------------------------------------------------------------------
/lib/ColumnsParser.js:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * ColumnsParser
 3 |  * Classifies items into columns, nearest to the left position of their corresponding header.
 4 |  * @author Adrien Joly, http://github.com/adrienjoly
 5 |  * This content is released under the MIT License.
 6 |  **/
 7 | 
 8 | import { log as LOG } from "./LOG.js";
 9 | 
/**
 * Finds the column whose x position is the nearest to `x`.
 * Columns do not need to be sorted by x position, and missing (sparse) entries are skipped.
 * When several columns are equally near, the last one wins.
 * @param {Array<{x: number}>} cols - columns to search.
 * @param {number} x - position to classify.
 * @returns {number} index of the nearest column in `cols`, or -1 if there is none.
 */
function getColumnIndex(cols, x) {
  let bestIndex = -1;
  let bestDist = Infinity;
  cols.forEach((col, index) => {
    const dist = Math.abs(x - col.x);
    // written as a negation so that a NaN distance (e.g. item without x)
    // still selects the last column, as it always did
    if (!(dist > bestDist)) {
      bestDist = dist;
      bestIndex = index;
    }
  });
  return bestIndex;
}
22 | 
/**
 * Classifies items into columns, based on the x position of column headers.
 * Header phase: each item whose text matches a not-yet-matched name of `colNames`
 * defines the x position of the corresponding column. This phase ends when every
 * name was matched, whatever the order in which headers are received.
 * Data phase: each item is appended to the `items` of the nearest column.
 * @param {string[]} colNames - header text of each column; a name can appear more than once.
 * Populates `this.cols` with `{ name, x, items }` objects, in the order of `colNames`.
 */
export function ColumnsParser(colNames) {
  this.cols = [];
  const cols = this.cols;
  const names = colNames.slice(); // clone (for parameter immutability)
  const matched = names.map(() => false); // needed so that a column name can be associated to more than 1 index
  let matchedCount = 0;
  let headerDone = false;

  this.processItem = function (item) {
    if (headerDone) {
      cols[getColumnIndex(cols, item.x)].items.push(item);
      return;
    }
    // parse x-position of column headers
    const i = names.findIndex(
      (name, index) => !matched[index] && name === item.text
    );
    if (i > -1) {
      LOG("ColumnsParser header", i, item.text, "=> x:", item.x);
      cols[i] = {
        name: item.text,
        x: item.x,
        items: [],
      };
      matched[i] = true;
      ++matchedCount;
    }
    // count matched headers instead of relying on cols.length, which would
    // already equal names.length as soon as the last column's header is seen
    if (matchedCount === names.length) {
      headerDone = true;
    }
  };
}
51 | 


--------------------------------------------------------------------------------
/lib/parseColumns.js:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * parseColumns, for pdfreader, used by the Rule class.
 3 |  * accumulates values below each column header (on 1st row, given their name), without detecting empty rows.
 4 |  * TODO: use ColumnsParser
 5 |  * @author Adrien Joly, http://github.com/adrienjoly
 6 |  * This content is released under the MIT License.
 7 |  **/
 8 | 
 9 | import { log as LOG } from "./LOG.js";
10 | 
/**
 * Accumulator builder: collects the values found below each column header.
 * Must be called with a `this` that provides `currentItem` (the first header item);
 * it sets `this.cols` (the column names) and `this.output` (the collected values:
 * `output[y][columnIndex] = text`, for each y position found below the header).
 * @param {...string} columns - header text of each column.
 * @returns {function} processItem(item), to be called on each following item.
 */
export const parseColumns = function (...columns) {
  this.output = [];
  this.cols = columns;
  const colNames = this.cols;
  const colX = []; // x position of each column header, by column index
  const rows = this.output;
  let headerDone = false;

  function processItem(item) {
    if (!headerDone) {
      // parse x-position of column headers
      const i = colNames.indexOf(item.text);
      if (i > -1) colX[i] = item.x;
      // check every slot instead of colX.length, which would already equal
      // colNames.length as soon as the last column's header is seen
      if (colNames.every((name, index) => index in colX)) {
        LOG("table header:", colNames, colX);
        headerDone = true;
      }
      return;
    }
    // parsing values for each column: pick the rightmost column starting left of the item
    let col = 0;
    for (let i = colX.length - 1; i >= 0; --i) {
      if (item.x > colX[i]) {
        col = i;
        break;
      }
    }
    rows[item.y] = rows[item.y] || {};
    rows[item.y][col] = item.text;
  }
  processItem(this.currentItem); // apply on header's first item
  return processItem; // then the same function will be run on all following items, until another rule is triggered
};
49 | 


--------------------------------------------------------------------------------
/.github/workflows/nodejs.yml:
--------------------------------------------------------------------------------
 1 | name: Node CI
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - master
 7 |   pull_request:
 8 |     branches:
 9 |       - master
10 | 
11 | jobs:
12 |   # Prevent functional regressions on supported Node.js versions
13 |   tests:
14 |     runs-on: ubuntu-latest
15 |     strategy:
16 |       matrix:
17 |         node-version: [16.x, 18.x]
18 |     steps:
19 |       - uses: actions/checkout@v1
20 |       - name: Use Node.js ${{ matrix.node-version }}
21 |         uses: actions/setup-node@v1
22 |         with:
23 |           node-version: ${{ matrix.node-version }}
24 |       - run: npm ci # install dependencies
25 |       - run: npm test
26 | 
27 |   # Checks that files are formatted consistently
28 |   formatting:
29 |     runs-on: ubuntu-latest
30 |     steps:
31 |       - uses: actions/checkout@v1
32 |       - uses: actions/setup-node@v1
33 |         with:
34 |           node-version: 16.x
35 |       - run: npm ci # install dependencies
36 |       - run: npm run prettier:check
37 |       - run: npm run lint
38 | 
39 |   release:
40 |     needs:
41 |       - tests
42 |       - formatting
43 |     runs-on: ubuntu-latest
44 |     steps:
45 |       - name: Checkout
46 |         uses: actions/checkout@v2
47 |         with:
48 |           fetch-depth: 0
49 |       - name: Setup Node.js
50 |         uses: actions/setup-node@v1
51 |         with:
52 |           node-version: 16
53 |       - name: Install dependencies
54 |         run: npm ci
55 |       - name: Build commonjs
56 |         run: npm run build:cjs
57 |       - name: Release
58 |         env:
59 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
60 |           NPM_TOKEN: ${{ secrets.NPM_TOKEN }}
61 |         run: npm run semantic-release
62 | 


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "pdfreader",
 3 |   "type": "module",
 4 |   "version": "0.0.0-development",
 5 |   "description": "Read text and parse tables from PDF files. Supports tabular data with automatic column detection, and rule-based parsing.",
 6 |   "main": "dist/index.cjs",
 7 |   "module": "./index.js",
 8 |   "typings": "./index.d.ts",
 9 |   "scripts": {
10 |     "prettier:print": "prettier --list-different \"./**/*.js\" \"./**/*.md\" \"./**/*.d.ts\"",
11 |     "prettier:check": "npm run -s prettier:print 1>&2; exit $(npm run -s prettier:print | wc -l)",
12 |     "prettier:fix": "prettier \"./**/*.js\" \"./**/*.md\" \"./**/*.d.ts\" --write --end-of-line lf",
13 |     "test:samples": "node parse.js test/sample.pdf && node parseAsBuffer.js test/sample.pdf",
14 |     "test:update": "ava --update-snapshots",
15 |     "test": "ava",
16 |     "lint": "eslint .",
17 |     "semantic-release": "semantic-release",
18 |     "build:cjs": "rollup index.js --file dist/index.cjs --format cjs"
19 |   },
20 |   "repository": {
21 |     "type": "git",
22 |     "url": "https://github.com/adrienjoly/npm-pdfreader"
23 |   },
24 |   "keywords": [
25 |     "pdf",
26 |     "reader",
27 |     "parser",
28 |     "parse",
29 |     "parsing",
30 |     "convert",
31 |     "CLI",
32 |     "table",
33 |     "data",
34 |     "csv",
35 |     "json",
36 |     "rules"
37 |   ],
38 |   "author": "Adrien Joly",
39 |   "license": "MIT",
40 |   "bugs": {
41 |     "url": "https://github.com/adrienjoly/npm-pdfreader/issues"
42 |   },
43 |   "homepage": "https://github.com/adrienjoly/npm-pdfreader",
44 |   "dependencies": {
45 |     "pdf2json": "3.1.4"
46 |   },
47 |   "devDependencies": {
48 |     "@semantic-release/changelog": "^6.0.1",
49 |     "@semantic-release/git": "^10.0.1",
50 |     "@semantic-release/npm": "^9.0.1",
51 |     "ava": "^4.1.0",
52 |     "eslint": "^8.11.0",
53 |     "eslint-config-prettier": "^8.5.0",
54 |     "eslint-plugin-prettier": "^4.0.0",
55 |     "execa": "^6.1.0",
56 |     "prettier": "2.6.1",
57 |     "semantic-release": "^19.0.2",
58 |     "rollup": "^4.19.1"
59 |   },
60 |   "engines": {
61 |     "node": ">=14"
62 |   }
63 | }


--------------------------------------------------------------------------------
/index.d.ts:
--------------------------------------------------------------------------------
/** Options accepted by the PdfReader constructor. */
export type InitOptions = {
  /** Password passed to the underlying pdf2json parser, to open protected files. */
  password?: string;
  /** When truthy, the pdf2json parser is run with verbosity 1 instead of 0. */
  debug?: boolean;
  /** When this signal aborts, the underlying pdf2json parser is destroyed. */
  signal?: AbortSignal;
};
 6 | export type Error = null | string;
 7 | 
/**
 * Non-text payloads passed to an ItemHandler:
 * - `file`: emitted first, when a file or buffer starts being parsed;
 * - `page`, `width`, `height`: emitted at the start of each page (pages start at 1).
 */
export type DataEntry = {
  page?: number;
  width?: number;
  height?: number;
  text?: string;
  file?: {
    path?: string;
    // NOTE(review): PdfReader.parseBuffer() emits the buffer it received as is,
    // so this is presumably a Buffer rather than a string; confirm before changing.
    buffer?: string;
  };
} | null;
18 | 
19 | export type ItemHandler = (err: Error, data: DataEntry & Item) => void;
20 | 
/** Reads a PDF document and calls a handler on each item found while parsing it. */
export declare class PdfReader {
  constructor(opts?: InitOptions | null);
  /** Calls itemHandler(error, item) on each item parsed from the PDF file at `pdfFilePath`. */
  parseFileItems(pdfFilePath: string, itemHandler: ItemHandler): void;
  /** Calls itemHandler(error, item) on each item parsed from the PDF document held in `buffer`. */
  parseBuffer(buffer: Buffer, itemHandler: ItemHandler): void;
}
26 | 
27 | export type Item = {
28 |   x: number;
29 |   y: number;
30 |   sw: number;
31 |   w: number;
32 |   A: string;
33 |   clr: number;
34 |   R: {
35 |     T: string;
36 |     S: number;
37 |     TS: any[];
38 |   }[];
39 |   text: string;
40 | };
41 | 
42 | export type RuleAccumulator = (item: Item) => boolean | void;
43 | export type RuleHandler<T = any> = (value: T) => void;
44 | 
45 | export interface TableResult {
46 |   matrix: string[][];
47 |   items: Item[];
48 | }
49 | 
/** Classifies items into rows (by y position) and columns (by the `col` provided with each item). */
export class TableParser {
  private rows: { [key: string]: Item[] };
  constructor();
  /** Stores `item` in the row matching its y position, under column `col`. */
  processItem(item: Item, col: number): void;
  /** Stores the x and text of `item` in the row of y position 0, under column `col`. */
  processHeadingItem(item: Item, col: number): void;
  // NOTE(review): the implementation returns, for each row, an object that maps
  // each column to an array of items, not a flat Item[]; confirm before relying on this type.
  /** Rows sorted by ascending y position. */
  getRows(): Item[][];
  /** Text rendering of getRows(): one line per row, listing "x: text" for each item. */
  renderRows(): string;
  /** row-> column-> items_collisionning_in_column-> item:Item */
  getMatrix(): Item[][][];
  /** Same as getMatrix(), with the texts of each cell joined using `collisionSeparator` (default: ""). */
  getCleanMatrix(options?: { collisionSeparator: string }): string[][];
  /** Text rendering of getMatrix(): tab-separated cells, each truncated to 7 characters. */
  renderMatrix(): string;
}
62 | 
63 | export class Rule {
64 |   static on(regexp: RegExp): Rule;
65 |   static after(regexp: RegExp): Rule;
66 |   static makeItemProcessor(rules: Rule[]): (item: DataEntry) => void;
67 |   static addAccumulator(methodName: string, methodBuilder: Function): void;
68 | 
69 |   constructor(regexp: RegExp);
70 | 
71 |   // Accumulator methods
72 |   extractRegexpValues(): Rule;
73 |   parseNextItemValue(): Rule;
74 |   accumulateAfterHeading(): Rule;
75 |   accumulateFromSameX(): Rule;
76 |   parseColumns(...args: any[]): Rule;
77 |   parseTable(columnCount: number): Rule & {
78 |     then(handler: (result: TableResult) => void): Rule;
79 |   };
80 | 
81 |   then<T>(handler: RuleHandler<T>): Rule;
82 | 
83 |   private test(item: Item): RuleAccumulator | undefined;
84 |   private whenDone(callback: () => void): void;
85 | }
86 | 


--------------------------------------------------------------------------------
/PdfReader.js:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * PdfReader: class that reads a PDF file, and calls a function on each item found while parsing that file.
 3 |  * @author Adrien Joly, http://github.com/adrienjoly
 4 |  * This content is released under the MIT License.
 5 |  *
 6 |  * An item object can match one of the following objects:
 7 |  * - null, when the parsing is over, or an error occurred.
 8 |  * - {file:{path:string}}, when a PDF file is being opened.
 9 |  * - {page:integer}, when a new page is being parsed, provides the page number, starting at 1.
10 |  * - {text:string, x:float, y:float, w:float, h:float...}, represents each text with its position.
11 |  *
12 |  **/
13 | 
14 | import { log as LOG } from "./lib/LOG.js";
15 | import PDFParser from "pdf2json"; // doc: https://github.com/modesty/pdf2json
16 | 
/**
 * Walks through the data produced by pdf2json, and calls `handler(null, item)`:
 * - once per page, with `{ page, width, height }` (page numbers start at 1);
 * - then once per text of that page, after decoding its first run into `item.text`.
 * Finally calls `handler()` without arguments, to signal the end of the document.
 */
function forEachItem(pdf, handler) {
  // pdf.formImage was removed in pdf2json@2, but we keep backward compatibility too
  const legacy = pdf.formImage;
  const pages = pdf.Pages || legacy.Pages;
  let pageNumber = 0;
  for (const page of Object.values(pages || {})) {
    pageNumber += 1;
    const width = page.Width || (legacy ? legacy.Width : 0);
    const height =
      page.Height || (legacy ? legacy.Pages[pageNumber - 1].Height : 0);
    handler(null, { page: pageNumber, width, height });
    for (const item of Object.values(page.Texts || {})) {
      item.text = decodeURIComponent(item.R[0].T);
      handler(null, item);
    }
  }
  handler();
}
39 | 
/**
 * PdfReader constructor.
 * @param {object} [options]
 * @param {string} [options.password] - password to open protected PDF files.
 * @param {boolean} [options.debug] - if truthy, pdf2json is run with verbosity 1 instead of 0.
 * @param {AbortSignal} [options.signal] - destroys the parser when aborted.
 */
export function PdfReader(options) {
  LOG("PdfReader"); // only displayed if logging was enabled using toggle(true) from LOG.js
  this.options = options || {};
}

/**
 * Creates a pdf2json parser configured from `options`, forwards its results to
 * `itemHandler`, then starts it by calling `load(pdfParser, verbosity)`.
 * Does nothing if `options.signal` is already aborted.
 */
function startParsing(options, itemHandler, load) {
  const { signal } = options;
  // listeners of an already-aborted signal are never called, so check it upfront
  if (signal?.aborted) return;

  const pdfParser = options.password
    ? new PDFParser(null, null, options.password)
    : new PDFParser();

  const onAbort = function () {
    pdfParser.destroy();
  };
  // release the listener when parsing ends, so that a long-lived signal
  // does not keep a reference to every parser it was used with
  const detach = function () {
    signal?.removeEventListener("abort", onAbort);
  };

  pdfParser.on("pdfParser_dataError", function (err) {
    detach();
    itemHandler(err);
  });
  pdfParser.on("pdfParser_dataReady", function (pdfData) {
    detach();
    forEachItem(pdfData, itemHandler);
  });
  signal?.addEventListener("abort", onAbort, { once: true });

  load(pdfParser, options.debug ? 1 : 0);
}

/**
 * parseFileItems: calls itemHandler(error, item) on each item parsed from the pdf file
 **/
PdfReader.prototype.parseFileItems = function (pdfFilePath, itemHandler) {
  itemHandler(null, { file: { path: pdfFilePath } });
  startParsing(this.options, itemHandler, function (pdfParser, verbosity) {
    pdfParser.loadPDF(pdfFilePath, verbosity);
  });
};

/**
 * parseBuffer: calls itemHandler(error, item) on each item parsed from the pdf file received as a buffer
 */
PdfReader.prototype.parseBuffer = function (pdfBuffer, itemHandler) {
  itemHandler(null, { file: { buffer: pdfBuffer } });
  startParsing(this.options, itemHandler, function (pdfParser, verbosity) {
    pdfParser.parseBuffer(pdfBuffer, verbosity);
  });
};
92 | 


--------------------------------------------------------------------------------
/lib/TableParser.js:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * TableParser
  3 |  * Classifies items into columns and rows, based on their left and top coordinates,
  4 |  * and left position of column headers.
  5 |  * @author Adrien Joly, http://github.com/adrienjoly
  6 |  * This content is released under the MIT License.
  7 |  **/
  8 | 
  9 | export function TableParser() {
 10 |   this.rows = {};
 11 | }
 12 | 
 13 | TableParser.prototype.processItem = function (item, col) {
 14 |   var row = (this.rows["" + item.y] = this.rows["" + item.y] || {});
 15 |   (row[col] = row[col] || []).push(item);
 16 | };
 17 | 
 18 | TableParser.prototype.processHeadingItem = function (item, col) {
 19 |   this.processItem(
 20 |     {
 21 |       y: 0,
 22 |       x: item.x,
 23 |       text: item.text,
 24 |     },
 25 |     col
 26 |   );
 27 | };
 28 | 
 29 | // Rows
 30 | 
 31 | function sortAsFloatValues(values) {
 32 |   return values.slice().sort(function (a, b) {
 33 |     return parseFloat(a) - parseFloat(b);
 34 |   });
 35 | }
 36 | 
 37 | TableParser.prototype.getRows = function () {
 38 |   var rows = this.rows;
 39 |   var yValues = sortAsFloatValues(Object.keys(rows));
 40 |   return yValues.map(function (y) {
 41 |     return rows["" + y];
 42 |   });
 43 | };
 44 | 
 45 | function renderRows(rows) {
 46 |   return (rows || [])
 47 |     .map(function (row, rowId) {
 48 |       var cells = [];
 49 |       for (var i in row)
 50 |         for (var j in row[i]) cells.push(row[i][j].x + ": " + row[i][j].text);
 51 |       return rowId + ":\t" + cells.join(", ");
 52 |     })
 53 |     .join("\n");
 54 | }
 55 | 
 56 | TableParser.prototype.renderRows = function () {
 57 |   return renderRows(this.getRows());
 58 | };
 59 | 
 60 | // Matrix
 61 | 
 62 | function getSortedXValues(rows) {
 63 |   var xSet = {};
 64 |   for (var y in rows) for (var x in rows[y]) xSet[x] = true;
 65 |   return sortAsFloatValues(Object.keys(xSet));
 66 | }
 67 | 
 68 | /** @returns an 3-dimension matrix: row -> column -> items_collisionning_in_column -> item */
 69 | TableParser.prototype.getMatrix = function () {
 70 |   var rows = this.getRows();
 71 |   var xValues = getSortedXValues(rows);
 72 |   return rows.map(function (row, y) {
 73 |     var rowNew = [];
 74 |     for (var x in row) {
 75 |       var items = row[x];
 76 |       var colN = xValues.indexOf(x);
 77 |       rowNew[colN] = (rowNew[colN] || []).concat(items);
 78 |     }
 79 |     return rowNew;
 80 |   });
 81 | };
 82 | 
 83 | /**
 84 |  * For use with console.table().
 85 |  * @param {String} collisionSeparator separator to use when there are multiple values to join for a given column
 86 |  * @returns a 2-dimension matrix: row -> column -> value
 87 |  */
 88 | TableParser.prototype.getCleanMatrix = function ({ collisionSeparator } = {}) {
 89 |   return this.getMatrix().map((rowColumns) =>
 90 |     rowColumns.map((items) =>
 91 |       items.map((item) => item.text).join(collisionSeparator || "")
 92 |     )
 93 |   );
 94 | };
 95 | 
 96 | function getText(item) {
 97 |   return item.text;
 98 | }
 99 | 
100 | function joinCellCollisions(separ) {
101 |   return function (cell) {
102 |     return (cell || []).map(getText).join(separ).substr(0, 7);
103 |   };
104 | }
105 | 
106 | function renderMatrix(matrix) {
107 |   return (matrix || [])
108 |     .map(function (row) {
109 |       return (row || []).map(joinCellCollisions("+")).join("\t");
110 |     })
111 |     .join("\n");
112 | }
113 | 
114 | TableParser.prototype.renderMatrix = function () {
115 |   return renderMatrix(this.getMatrix());
116 | };
117 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | ## [1.2.1](https://github.com/adrienjoly/npm-pdfreader/compare/v1.2.0...v1.2.1) (2020-09-25)
 2 | 
 3 | 
 4 | ### Bug Fixes
 5 | 
 6 | * **deps:** Update dependencies ([#63](https://github.com/adrienjoly/npm-pdfreader/issues/63)) ([308f322](https://github.com/adrienjoly/npm-pdfreader/commit/308f322ea670ab2ec11f77e3588667674709b453))
 7 | 
 8 | # [1.2.0](https://github.com/adrienjoly/npm-pdfreader/compare/v1.1.4...v1.2.0) (2020-09-25)
 9 | 
10 | 
11 | ### Features
12 | 
13 | * Support password-protected PDF files ([#61](https://github.com/adrienjoly/npm-pdfreader/issues/61)) ([248af89](https://github.com/adrienjoly/npm-pdfreader/commit/248af89d79304dfa64b5785614b496e4e5d36e69)), closes [#15](https://github.com/adrienjoly/npm-pdfreader/issues/15)
14 | 
15 | ## [1.1.4](https://github.com/adrienjoly/npm-pdfreader/compare/v1.1.3...v1.1.4) (2020-09-25)
16 | 
17 | 
18 | ### Bug Fixes
19 | 
20 | * Ease contributions ([#62](https://github.com/adrienjoly/npm-pdfreader/issues/62)) ([4a1fe66](https://github.com/adrienjoly/npm-pdfreader/commit/4a1fe6677d5a829049aa0c3c28dccb2f96e8e2f6))
21 | 
22 | ## [1.1.3](https://github.com/adrienjoly/npm-pdfreader/compare/v1.1.2...v1.1.3) (2020-04-26)
23 | 
24 | 
25 | ### Bug Fixes
26 | 
27 | * **node:** use latest node 10 version ([#52](https://github.com/adrienjoly/npm-pdfreader/issues/52)) ([eb34ea9](https://github.com/adrienjoly/npm-pdfreader/commit/eb34ea92fea924d3d1e28b13a2e730b62a996b51))
28 | 
29 | ## [1.1.2](https://github.com/adrienjoly/npm-pdfreader/compare/v1.1.1...v1.1.2) (2020-04-26)
30 | 
31 | 
32 | ### Bug Fixes
33 | 
34 | * **deps:** with npm audit fix ([#51](https://github.com/adrienjoly/npm-pdfreader/issues/51)) ([16502fc](https://github.com/adrienjoly/npm-pdfreader/commit/16502fce29af76ebf8216e17aafb388a54326b6c))
35 | 
36 | ## [1.1.1](https://github.com/adrienjoly/npm-pdfreader/compare/v1.1.0...v1.1.1) (2020-04-26)
37 | 
38 | 
39 | ### Bug Fixes
40 | 
41 | * **deps:** bump acorn from 6.3.0 to 6.4.1 ([#46](https://github.com/adrienjoly/npm-pdfreader/issues/46)) ([af61802](https://github.com/adrienjoly/npm-pdfreader/commit/af61802d1430adab8c9c56588d8a5b565910bd3a))
42 | 
43 | # [1.1.0](https://github.com/adrienjoly/npm-pdfreader/compare/v1.0.10...v1.1.0) (2020-04-26)
44 | 
45 | 
46 | ### Features
47 | 
48 | * **deps:** upgrade pdf2json to version 1.2.0 ([#50](https://github.com/adrienjoly/npm-pdfreader/issues/50)) ([0877162](https://github.com/adrienjoly/npm-pdfreader/commit/08771623aa7bf228b4a39e763e38614e79dca10c)), closes [#40](https://github.com/adrienjoly/npm-pdfreader/issues/40)
49 | 
50 | ## [1.0.10](https://github.com/adrienjoly/npm-pdfreader/compare/v1.0.9...v1.0.10) (2020-04-26)
51 | 
52 | 
53 | ### Bug Fixes
54 | 
55 | * **ci:** check formatting in separate step, after tests ([#49](https://github.com/adrienjoly/npm-pdfreader/issues/49)) ([9129b8a](https://github.com/adrienjoly/npm-pdfreader/commit/9129b8a4f860fbc674fd7485c7c0661c0344a71d))
56 | 
57 | ## [1.0.9](https://github.com/adrienjoly/npm-pdfreader/compare/v1.0.8...v1.0.9) (2020-04-26)
58 | 
59 | 
60 | ### Bug Fixes
61 | 
62 | * **ci:** prettier to ignore CHANGELOG (generated) ([8bcf776](https://github.com/adrienjoly/npm-pdfreader/commit/8bcf77674a6e472c791accca4d8385e8462679b6))
63 | * **ci:** skip github actions workflow on release commits ([c970cda](https://github.com/adrienjoly/npm-pdfreader/commit/c970cda451a3a3b53c9d42c721524b22a7714544))
64 | 
65 | ## [1.0.8](https://github.com/adrienjoly/npm-pdfreader/compare/v1.0.7...v1.0.8) (2020-04-26)
66 | 
67 | 
68 | ### Bug Fixes
69 | 
70 | * **release:** automatic update of version in package.json ([#48](https://github.com/adrienjoly/npm-pdfreader/issues/48)) ([bad1d5b](https://github.com/adrienjoly/npm-pdfreader/commit/bad1d5bfce1c55b503cca3380c3187fb071b6056))
71 | 


--------------------------------------------------------------------------------
/test/sample.rtf:
--------------------------------------------------------------------------------
 1 | {\rtf1\ansi\ansicpg1252\cocoartf1344\cocoasubrtf720
 2 | {\fonttbl\f0\froman\fcharset0 Times-Roman;}
 3 | {\colortbl;\red255\green255\blue255;\red0\green0\blue0;}
 4 | \paperw11900\paperh16840\margl1440\margr1440\vieww10800\viewh8400\viewkind0
 5 | \deftab720
 6 | \pard\pardeftab720\sa321
 7 | 
 8 | \f0\b\fs48 \cf2 \expnd0\expndtw0\kerning0
 9 | \outl0\strokewidth0 \strokec2 Hello "world"\
10 | \pard\pardeftab720\sa240
11 | 
12 | \b0\fs24 \cf2 \expnd0\expndtw0\kerning0
13 | \outl0\strokewidth0 Value:\
14 | 4\
15 | 
16 | \itap1\trowd \taflags1 \trgaph108\trleft-108 \trbrdrt\brdrs\brdrw20\brdrcf2 \trbrdrl\brdrs\brdrw20\brdrcf2 \trbrdrr\brdrs\brdrw20\brdrcf2 
17 | \clvertalc \clshdrawnil \clwWidth480\clftsWidth3 \clbrdrt\brdrnil \clbrdrl\brdrnil \clbrdrb\brdrnil \clbrdrr\brdrnil \clpadt20 \clpadl20 \clpadb20 \clpadr20 \gaph\cellx2880
18 | \clvertalc \clshdrawnil \clwWidth860\clftsWidth3 \clbrdrt\brdrnil \clbrdrl\brdrnil \clbrdrb\brdrnil \clbrdrr\brdrnil \clpadt20 \clpadl20 \clpadb20 \clpadr20 \gaph\cellx5760
19 | \clvertalc \clshdrawnil \clwWidth940\clftsWidth3 \clbrdrt\brdrnil \clbrdrl\brdrnil \clbrdrb\brdrnil \clbrdrr\brdrnil \clpadt20 \clpadl20 \clpadb20 \clpadr20 \gaph\cellx8640
20 | \pard\intbl\itap1\pardeftab720\qc
21 | 
22 | \b \cf2 \expnd0\expndtw0\kerning0
23 | \outl0\strokewidth0 c1\cell 
24 | \pard\intbl\itap1\pardeftab720\qc
25 | \cf2 \expnd0\expndtw0\kerning0
26 | \outl0\strokewidth0 c2\cell 
27 | \pard\intbl\itap1\pardeftab720\qc
28 | \cf2 \expnd0\expndtw0\kerning0
29 | \outl0\strokewidth0 c3\cell \row
30 | 
31 | \itap1\trowd \taflags1 \trgaph108\trleft-108 \trbrdrl\brdrs\brdrw20\brdrcf2 \trbrdrr\brdrs\brdrw20\brdrcf2 
32 | \clvertalc \clshdrawnil \clwWidth120\clftsWidth3 \clbrdrt\brdrs\brdrw20\brdrcf2 \clbrdrl\brdrs\brdrw20\brdrcf2 \clbrdrb\brdrs\brdrw20\brdrcf2 \clbrdrr\brdrs\brdrw20\brdrcf2 \clpadl200 \clpadr200 \gaph\cellx2880
33 | \clvertalc \clshdrawnil \clwWidth480\clftsWidth3 \clbrdrt\brdrs\brdrw20\brdrcf2 \clbrdrl\brdrs\brdrw20\brdrcf2 \clbrdrb\brdrs\brdrw20\brdrcf2 \clbrdrr\brdrs\brdrw20\brdrcf2 \clpadl200 \clpadr200 \gaph\cellx5760
34 | \clvertalc \clshdrawnil \clwWidth560\clftsWidth3 \clbrdrt\brdrs\brdrw20\brdrcf2 \clbrdrl\brdrs\brdrw20\brdrcf2 \clbrdrb\brdrs\brdrw20\brdrcf2 \clbrdrr\brdrs\brdrw20\brdrcf2 \clpadl200 \clpadr200 \gaph\cellx8640
35 | \pard\intbl\itap1\pardeftab720\qr
36 | 
37 | \b0 \cf2 \expnd0\expndtw0\kerning0
38 | \outl0\strokewidth0 1\cell 
39 | \pard\intbl\itap1\pardeftab720\qr
40 | \cf2 \expnd0\expndtw0\kerning0
41 | \outl0\strokewidth0 \cell 
42 | \pard\intbl\itap1\pardeftab720\qr
43 | \cf2 \expnd0\expndtw0\kerning0
44 | \outl0\strokewidth0 2.3\cell \row
45 | 
46 | \itap1\trowd \taflags1 \trgaph108\trleft-108 \trbrdrl\brdrs\brdrw20\brdrcf2 \trbrdrb\brdrs\brdrw20\brdrcf2 \trbrdrr\brdrs\brdrw20\brdrcf2 
47 | \clvertalc \clshdrawnil \clwWidth120\clftsWidth3 \clbrdrt\brdrs\brdrw20\brdrcf2 \clbrdrl\brdrs\brdrw20\brdrcf2 \clbrdrb\brdrs\brdrw20\brdrcf2 \clbrdrr\brdrs\brdrw20\brdrcf2 \clpadl200 \clpadr200 \gaph\cellx2880
48 | \clvertalc \clshdrawnil \clwWidth480\clftsWidth3 \clbrdrt\brdrs\brdrw20\brdrcf2 \clbrdrl\brdrs\brdrw20\brdrcf2 \clbrdrb\brdrs\brdrw20\brdrcf2 \clbrdrr\brdrs\brdrw20\brdrcf2 \clpadl200 \clpadr200 \gaph\cellx5760
49 | \clvertalc \clshdrawnil \clwWidth560\clftsWidth3 \clbrdrt\brdrs\brdrw20\brdrcf2 \clbrdrl\brdrs\brdrw20\brdrcf2 \clbrdrb\brdrs\brdrw20\brdrcf2 \clbrdrr\brdrs\brdrw20\brdrcf2 \clpadl200 \clpadr200 \gaph\cellx8640
50 | \pard\intbl\itap1\pardeftab720\qr
51 | \cf2 \expnd0\expndtw0\kerning0
52 | \outl0\strokewidth0 \cell 
53 | \pard\intbl\itap1\pardeftab720\qr
54 | \cf2 \expnd0\expndtw0\kerning0
55 | \outl0\strokewidth0 hello\cell 
56 | \pard\intbl\itap1\pardeftab720\qr
57 | \cf2 \expnd0\expndtw0\kerning0
58 | \outl0\strokewidth0 world\cell \lastrow\row
59 | \pard\pardeftab720\sa240
60 | \cf2 \expnd0\expndtw0\kerning0
61 | \outl0\strokewidth0 Values:\
62 | 1\
63 | 2\
64 | 3\
65 | }


--------------------------------------------------------------------------------
/test/test.js:
--------------------------------------------------------------------------------
  1 | import assert from "assert";
  2 | import test from "ava";
  3 | import { toggle } from "../lib/LOG.js";
  4 | import * as lib from "../index.js";
  5 | 
  6 | toggle(false);
  7 | 
  8 | const PdfReader = lib.PdfReader;
  9 | const Rule = lib.Rule;
 10 | 
 11 | const TESTFILE = "./test/sample.pdf";
 12 | const TESTFILE_WITH_PASSWORD = "./test/sample-with-password.pdf";
 13 | 
 14 | test("parse raw items from pdf file", async (t) => {
 15 |   const res = new Promise((resolve, reject) => {
 16 |     const items = [];
 17 |     new PdfReader().parseFileItems(TESTFILE, (err, item) => {
 18 |       if (err) reject(err);
 19 |       else if (!item) resolve(items);
 20 |       else items.push(item);
 21 |     });
 22 |   });
 23 |   t.snapshot(await res);
 24 | });
 25 | 
 26 | test("parse structured content from pdf file, using rules", async (t) => {
 27 |   const res = new Promise((resolve, reject) => {
 28 |     const content = [];
 29 |     const rules = [
 30 |       Rule.on(/^Hello \"(.*)\"$/)
 31 |         .extractRegexpValues()
 32 |         .then((value) => content.push({ extractRegexpValues: value })),
 33 |       Rule.on(/^Value\:/)
 34 |         .parseNextItemValue()
 35 |         .then((value) => content.push({ parseNextItemValue: value })),
 36 |       Rule.on(/^c1$/)
 37 |         .parseTable(3)
 38 |         .then((table) =>
 39 |           content.push({
 40 |             "parseTable.renderMatrix": lib.parseTable.renderMatrix(
 41 |               table.matrix
 42 |             ),
 43 |             "parseTable.renderItems": lib.parseTable.renderItems(table.items),
 44 |           })
 45 |         ),
 46 |       Rule.on(/^Values\:/)
 47 |         .accumulateAfterHeading()
 48 |         .then((value) => content.push({ accumulateAfterHeading: value })),
 49 |     ];
 50 |     const processItem = Rule.makeItemProcessor(rules);
 51 |     new PdfReader().parseFileItems(TESTFILE, (err, item) => {
 52 |       if (err) reject(err);
 53 |       else {
 54 |         processItem(item);
 55 |         if (!item) resolve(content);
 56 |       }
 57 |     });
 58 |   });
 59 |   t.snapshot(await res);
 60 | });
 61 | 
 62 | test("parse Table from PDF file, using TableParser", async (t) => {
 63 |   const matrix = await new Promise((resolve, reject) => {
 64 |     // the thresholds were determined manually, based on the horizontal position (x) for column headers
 65 |     const colThresholds = [6.8, 9.5, 13.3, 16.7, 18.4, 28, 32, 36, Infinity];
 66 | 
 67 |     const columnQuantitizer = (item) => {
 68 |       const col = colThresholds.findIndex(
 69 |         (colThreshold) => parseFloat(item.x) < colThreshold
 70 |       );
 71 |       assert(col >= 0, col);
 72 |       assert(col < colThresholds.length, col);
 73 |       // console.log(`COL ${col}\t${parseFloat(item.x)}\t${item.text}`);
 74 |       return col;
 75 |     };
 76 | 
 77 |     const table = new lib.TableParser();
 78 |     new PdfReader().parseFileItems("./test/sample-table.pdf", (err, item) => {
 79 |       if (err) reject(err);
 80 |       else if (!item) {
 81 |         resolve(table.getCleanMatrix({ collisionSeparator: "" }));
 82 |       } else if (item.text) {
 83 |         table.processItem(item, columnQuantitizer(item));
 84 |       }
 85 |     });
 86 |   });
 87 |   // console.table(matrix);
 88 |   t.snapshot(matrix);
 89 | });
 90 | 
 91 | test("support pdf file with password", async (t) => {
 92 |   const promise = new Promise((resolve, reject) =>
 93 |     new PdfReader({ password: "password" }).parseFileItems(
 94 |       TESTFILE_WITH_PASSWORD,
 95 |       (err, item) => {
 96 |         if (err) reject(err);
 97 |         else if (!item) resolve();
 98 |       }
 99 |     )
100 |   );
101 |   await t.notThrowsAsync(promise);
102 | });
103 | 
test("sample scripts should print raw items from pdf file", async (t) => {
  const { execa } = await import("execa");
  // `shell: true` is needed in order to run npm commands with execa
  const result = await execa("npm run test:samples", { shell: true });
  t.snapshot({ stdout: result.stdout, stderr: result.stderr });
});
111 | 


--------------------------------------------------------------------------------
/lib/parseTable.js:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * parseTable accumulator, for pdfreader, used by the Rule class.
  3 |  * items are classified into columns and rows, based on their left and top coordinates,
  4 |  * and left position of column headers.
  5 |  * TODO: use TableParser
  6 |  * @author Adrien Joly, http://github.com/adrienjoly
  7 |  * This content is released under the MIT License.
  8 |  **/
  9 | 
 10 | function getTopPos(item) {
 11 |   return item.y;
 12 | }
 13 | 
 14 | function getLeftPos(item) {
 15 |   return item.x;
 16 | }
 17 | 
 18 | function getText(item) {
 19 |   return item.text;
 20 | }
 21 | 
 22 | /**
 23 |  * makeClassifier(): makes a classifier, based on an array of numbers and an expected number of clusters.
 24 |  * nbClusters: expected number of clusters
 25 |  * arr: array of numbers
 26 |  * => returns a function that takes a number, and returns the number of its corresponding column.
 27 |  **/
 28 | function makeFloorClassifier(nbClusters, arr) {
 29 |   var min = Math.min.apply(Math, arr);
 30 |   var delta = Math.max.apply(Math, arr) - min;
 31 |   min -= delta / nbClusters / 2;
 32 |   return function classify(value) {
 33 |     return Math.floor((nbClusters * (value - min)) / delta);
 34 |   };
 35 | }
 36 | 
 37 | function makeColumnClassifier(header) {
 38 |   var colX = [0].concat(header.map(getLeftPos)).sort(function (a, b) {
 39 |     return a - b;
 40 |   });
 41 |   return function classify(item) {
 42 |     for (var i = colX.length - 1; i > -1; --i)
 43 |       if (getLeftPos(item) >= colX[i]) return i;
 44 |   };
 45 | }
 46 | 
 47 | function buildRowList(items, classifyRow) {
 48 |   var rows = [];
 49 |   for (var i in items) {
 50 |     var item = items[i];
 51 |     var row = classifyRow(getTopPos(item));
 52 |     (rows[row] = rows[row] || []).push(item);
 53 |   }
 54 |   return rows;
 55 | }
 56 | 
 57 | function joinCellCollisions(separ) {
 58 |   return function (cell) {
 59 |     return (cell || []).map(getText).join(separ).substr(0, 7);
 60 |   };
 61 | }
 62 | 
 63 | function fillTab(str) {
 64 |   return str.substr(0, 7);
 65 | }
 66 | 
 67 | export function renderTable(table) {
 68 |   return (table || [])
 69 |     .map(function (row) {
 70 |       return (row || []).map(fillTab).join("\t");
 71 |     })
 72 |     .join("\n");
 73 | }
 74 | 
 75 | export function renderMatrix(matrix) {
 76 |   return (matrix || [])
 77 |     .map(function (row) {
 78 |       return (row || []).map(joinCellCollisions("+")).join("\t");
 79 |     })
 80 |     .join("\n");
 81 | }
 82 | 
 83 | export function renderRows(rows) {
 84 |   return (rows || [])
 85 |     .map(function (row, rowId) {
 86 |       var cells = [rowId + ":"];
 87 |       for (var i in row)
 88 |         cells.push((Math.floor(row[i].x) + ":" + row[i].text).substr(0, 7));
 89 |       return cells.join("\t");
 90 |     })
 91 |     .join("\n");
 92 | }
 93 | 
 94 | export function renderItems(items) {
 95 |   return items
 96 |     .map(function (i) {
 97 |       return [i.y, i.x, i.text].join("\t");
 98 |     })
 99 |     .join("\n");
100 | }
101 | 
/**
 * buildMatrix(): spreads the items of each row into columns.
 * rows: array of rows, each row being an array of items (unset rows are skipped)
 * classifyColumn: function that takes an item, and returns the index of its column
 * => returns a matrix: row -> column -> array of the items that ended up in that cell.
 **/
function buildMatrix(rows, classifyColumn) {
  var matrix = [];
  for (var rowKey in rows) {
    var rowItems = rows[rowKey];
    var cells = [];
    for (var itemKey in rowItems) {
      var current = rowItems[itemKey];
      var colIndex = classifyColumn(current);
      if (!cells[colIndex]) cells[colIndex] = [];
      cells[colIndex].push(current);
    }
    matrix.push(cells);
  }
  return matrix;
}
115 | 
/**
 * detectCollisions(): lists the cells of a matrix that hold more than one item.
 * matrix: row -> column -> array of items
 * => returns an array of { row, col, items } objects, one per colliding cell.
 **/
export function detectCollisions(matrix) {
  const collisions = [];
  (matrix || []).forEach((row, rowN) => {
    (row || []).forEach((cellItems, colN) => {
      if (cellItems.length > 1) {
        collisions.push({ row: rowN, col: colN, items: cellItems });
      }
    });
  });
  return collisions;
}
130 | 
/**
 * parseTable accumulator builder, to be called with a Rule object as `this`.
 * nbRows: expected number of rows, passed to the row classifier (default: 0)
 * headerRow: index of the row whose items define the columns (default: 0)
 * => returns the function to be run on all following items, until another rule is triggered.
 *    The rule's output holds the accumulated items; its rows and matrix are filled when parsing is done.
 **/
export const parseTable = function makeAccumulator(nbRows, headerRow) {
  var rule = this;
  var items = [];

  rule.nbRows = nbRows || 0;
  rule.output = { items: items, rows: null, matrix: null };

  // when parsing is done: generate a clean table, from items.
  rule.whenDone(function () {
    var output = this.output;
    // classify items into rows
    var classifyRow = makeFloorClassifier(rule.nbRows, items.map(getTopPos));
    output.rows = buildRowList(items, classifyRow);
    // classify row items into columns, based on the items of the header row
    var header = output.rows[headerRow || 0];
    output.matrix = buildMatrix(output.rows, makeColumnClassifier(header));
  });

  return function accumulate(item) {
    items.push(item);
  };
};
159 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # pdfreader ![Node CI](https://github.com/adrienjoly/npm-pdfreader/workflows/Node%20CI/badge.svg) [![Code Quality](https://api.codacy.com/project/badge/Grade/73d37dbb0ff84795acf65a55c5936d83)](https://app.codacy.com/gh/adrienjoly/npm-pdfreader?utm_source=github.com&utm_medium=referral&utm_content=adrienjoly/npm-pdfreader&utm_campaign=Badge_Grade)
  2 | 
  3 | Read text and parse tables from PDF files.
  4 | 
  5 | Supports **tabular data** with automatic column detection, and **rule-based parsing**.
  6 | 
  7 | Dependencies: it is based on [pdf2json](https://www.npmjs.com/package/pdf2json), which itself relies on Mozilla's [pdf.js](https://github.com/mozilla/pdf.js/).
  8 | 
  9 | 🆕 Now includes TypeScript type definitions!
 10 | 
 11 | ℹ️ Important notes:
 12 | 
 13 | - This module is meant to be run using Node.js only. **It does not work from a web browser.**
 14 | - This module extracts text entries from PDF files. It does not support photographed text. If you cannot select text from the PDF file, **you may need to use OCR software first**.
 15 | 
 16 | Summary:
 17 | 
 18 | - [Installation, tests and CLI usage](#installation-tests-and-cli-usage)
 19 | - [Raw PDF reading](#raw-pdf-reading) (incl. examples)
 20 | - [Rule-based data extraction](#rule-based-data-extraction)
 21 | - [Troubleshooting & FAQ](#troubleshooting--faq)
 22 | 
 23 | ## Installation, tests and CLI usage
 24 | 
 25 | After installing [Node.js](https://nodejs.org/):
 26 | 
 27 | ```sh
 28 | git clone https://github.com/adrienjoly/npm-pdfreader.git
 29 | cd npm-pdfreader
 30 | npm install
 31 | npm test
 32 | node parse.js test/sample.pdf
 33 | ```
 34 | 
 35 | ## Installation into an existing project
 36 | 
 37 | To install `pdfreader` as a dependency of your Node.js project:
 38 | 
 39 | ```sh
 40 | npm install pdfreader
 41 | ```
 42 | 
 43 | Then, see below for examples of use.
 44 | 
 45 | ## Raw PDF reading
 46 | 
 47 | This module exposes the `PdfReader` class, to be instantiated. You can pass `{ debug: true }` to the constructor, in order to log debugging information. (useful for troubleshooting)
 48 | 
 49 | Your instance has two methods for parsing a PDF. They return the same output and differ only in input: `PdfReader.parseFileItems` (as below) for a filename, and `PdfReader.parseBuffer` (see: "Raw PDF reading from a PDF buffer") for data that you don't want to reference from the filesystem.
 50 | 
 51 | Whichever method you choose, it asks for a callback, which gets called each time the instance finds what it denotes as a PDF item.
 52 | 
 53 | An item object can match one of the following objects:
 54 | 
 55 | - `null`, when the parsing is over, or an error occurred.
 56 | - File metadata, `{file:{path:string}}`, when a PDF file is being opened, and is always the first item.
 57 | - Page metadata, `{page:integer, width:float, height:float}`, when a new page is being parsed, provides the page number, starting at 1. This basically acts as a carriage return for the coordinates of text items to be processed.
 58 | - Text items, `{text:string, x:float, y:float, w:float, ...}`, which you can think of as simple objects with a text property, and floating 2D AABB coordinates on the page.
 59 | 
 60 | It's up to your callback to process these items into a data structure of your choice, and also to handle any errors thrown to it.
 61 | 
 62 | For example:
 63 | 
 64 | ```javascript
 65 | import { PdfReader } from "pdfreader";
 66 | 
 67 | new PdfReader().parseFileItems("test/sample.pdf", (err, item) => {
 68 |   if (err) console.error("error:", err);
 69 |   else if (!item) console.warn("end of file");
 70 |   else if (item.text) console.log(item.text);
 71 | });
 72 | ```
 73 | 
 74 | ### Parsing a password-protected PDF file
 75 | 
 76 | ```javascript
 77 | new PdfReader({ password: "YOUR_PASSWORD" }).parseFileItems(
 78 |   "test/sample-with-password.pdf",
 79 |   function (err, item) {
 80 |     if (err) console.error(err);
 81 |     else if (!item) console.warn("end of file");
 82 |     else if (item.text) console.log(item.text);
 83 |   }
 84 | );
 85 | ```
 86 | 
 87 | ### Raw PDF reading from a PDF buffer
 88 | 
 89 | As above, but reading from a buffer in memory rather than from a file referenced by path. For example:
 90 | 
 91 | ```javascript
 92 | import fs from "fs";
 93 | import { PdfReader } from "pdfreader";
 94 | 
 95 | fs.readFile("test/sample.pdf", (err, pdfBuffer) => {
 96 |   // pdfBuffer contains the file content
 97 |   new PdfReader().parseBuffer(pdfBuffer, (err, item) => {
 98 |     if (err) console.error("error:", err);
 99 |     else if (!item) console.warn("end of buffer");
100 |     else if (item.text) console.log(item.text);
101 |   });
102 | });
103 | ```
104 | 
105 | ### Other examples of use
106 | 
107 | ![example cv resume parse convert pdf to text](https://github.com/adrienjoly/npm-pdfreader-example/raw/master/parseRows.png)
108 | 
109 | ![example cv resume parse convert pdf table to text](https://github.com/adrienjoly/npm-pdfreader-example/raw/master/parseTable.png)
110 | 
111 | Source code of the examples above: [parsing a CV/résumé](https://github.com/adrienjoly/npm-pdfreader-example).
112 | 
113 | For more, see [Examples of use](https://github.com/adrienjoly/npm-pdfreader/discussions/categories/examples-of-use).
114 | 
115 | ## Rule-based data extraction
116 | 
117 | The `Rule` class can be used to define and process data extraction rules, while parsing a PDF document.
118 | 
119 | `Rule` instances expose "accumulators": methods that define the data extraction strategy to be used for each rule.
120 | 
121 | Example:
122 | 
123 | ```javascript
124 | const processItem = Rule.makeItemProcessor([
125 |   Rule.on(/^Hello \"(.*)\"$/)
126 |     .extractRegexpValues()
127 |     .then(displayValue),
128 |   Rule.on(/^Value\:/)
129 |     .parseNextItemValue()
130 |     .then(displayValue),
131 |   Rule.on(/^c1$/).parseTable(3).then(displayTable),
132 |   Rule.on(/^Values\:/)
133 |     .accumulateAfterHeading()
134 |     .then(displayValue),
135 | ]);
136 | new PdfReader().parseFileItems("test/sample.pdf", (err, item) => {
137 |   if (err) console.error(err);
138 |   else processItem(item);
139 | });
140 | ```
141 | 
142 | ## Troubleshooting & FAQ
143 | 
144 | ### Is it possible to parse a PDF document from a web application?
145 | 
146 | Solutions exist, but this module cannot be run directly by a web browser. If you really want to use this module, you will have to integrate it into your back-end so that PDF files can be read from your server.
147 | 
148 | ### `Cannot read property 'userAgent' of undefined` error from an express-based node.js app
149 | 
150 | Dmitry found out that you may need to run these instructions before including the `pdfreader` module:
151 | 
152 | ```js
153 | global.navigator = {
154 |   userAgent: "node",
155 | };
156 | 
157 | window.navigator = {
158 |   userAgent: "node",
159 | };
160 | ```
161 | 
162 | Source: [express - TypeError: Cannot read property 'userAgent' of undefined error on node.js app run - Stack Overflow](https://stackoverflow.com/questions/49208414/typeerror-cannot-read-property-useragent-of-undefined-error-on-node-js-app-ru)
163 | 


--------------------------------------------------------------------------------
/Rule.js:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Rule: class that can be used to define and process data extraction rules, while parsing a PDF document.
  3 |  * @author Adrien Joly, http://github.com/adrienjoly
  4 |  * This content is released under the MIT License.
  5 |  **/
  6 | 
  7 | import { log as LOG } from "./lib/LOG.js";
  8 | import { parseColumns } from "./lib/parseColumns.js";
  9 | import { parseTable } from "./lib/parseTable.js";
 10 | 
 11 | /**
 12 |  * regexp: a regular expression which a PDF item's text must match in order to execute that rule.
 13 |  * => a Rule object exposes "accumulators": methods that defines the data extraction strategy of a rule.
 14 |  **/
 15 | export function Rule(regexp) {
 16 |   this.regexp = regexp;
 17 |   var self = this;
 18 |   // proxy accumulators methods
 19 |   Object.keys(Rule.accumulators).forEach(function (name) {
 20 |     self[name] = function () {
 21 |       LOG("building rule:", regexp, "->", name);
 22 |       self.methodName = name;
 23 |       self.accumulatorParams = arguments;
 24 |       self.accumulatorBuilder = Rule.accumulators[name];
 25 |       return self;
 26 |     };
 27 |   });
 28 | }
 29 | 
 30 | // shortcut for defining Rule objects in a more concise manner
 31 | Rule.on = function (regexp) {
 32 |   return new Rule(regexp);
 33 | };
 34 | 
 35 | Rule.after = function (regexp) {
 36 |   var rule = new Rule(regexp);
 37 |   rule.skipCurrentItem = true;
 38 |   return rule;
 39 | };
 40 | 
 41 | /**
 42 |  * then(): defines a function to be called after a Rule's accumulator has finished processing items.
 43 |  * fct: the function to be called after a Rule's accumulator has finished processing items.
 44 |  *      the output of the accumulator will be passed as the first parameter of that function.
 45 |  **/
 46 | Rule.prototype.then = function (fct) {
 47 |   var self = this;
 48 |   this.terminate = function () {
 49 |     fct.call(self, self.output);
 50 |   };
 51 |   return this;
 52 | };
 53 | 
 54 | // private function that checks a PDF item against the Rule's regexp, and returns the corresponding accumulator.
 55 | Rule.prototype.test = function (item) {
 56 |   if (this.regexp.test(item.text)) {
 57 |     // lazy init of accumulators: build and init the accumulator on first match
 58 |     this.currentItem = item;
 59 |     if (!this.accumulatorImpl && this.accumulatorBuilder) {
 60 |       this.accumulatorImpl = this.accumulatorBuilder.apply(
 61 |         this,
 62 |         this.accumulatorParams
 63 |       );
 64 |       this.accumulatorImpl.methodName = this.methodName;
 65 |       this.accumulatorImpl.terminate = this.terminate;
 66 |     }
 67 |     return this.accumulatorImpl;
 68 |   }
 69 | };
 70 | 
 71 | // intended to be run from accumulator, in order to process output before calling termination then() handler.
 72 | Rule.prototype.whenDone = function (fct) {
 73 |   var self = this;
 74 |   var then = this.terminate;
 75 |   this.terminate = function () {
 76 |     fct.call(self);
 77 |     then();
 78 |   };
 79 | };
 80 | 
 81 | /**
 82 |  * rules: array of Rule objects that will be executed one-by-one, whenever a PDF item matches a rule.
 83 |  *        each rule can only be executed once.
 84 |  * => returns a function to be called for each item by the PdfReader.
 85 |  **/
 86 | Rule.makeItemProcessor = function (rules) {
 87 |   var currentAccumulator = null;
 88 |   function terminateAccumulator() {
 89 |     var terminatePreviousAcc = (currentAccumulator || {}).terminate;
 90 |     if (terminatePreviousAcc) {
 91 |       LOG("terminating accumulator:", currentAccumulator.methodName);
 92 |       terminatePreviousAcc(currentAccumulator); // TODO: remove currentAccumulator parameter
 93 |     }
 94 |   }
 95 |   var applyRulesOnNextItem = true;
 96 |   return function (item) {
 97 |     if (!item)
 98 |       // last item of the file => flush buffers
 99 |       return terminateAccumulator();
100 |     else if (!item.text) return;
101 |     //LOG("ITEM:", item.text, "=> apply rules:", applyRulesOnNextItem);
102 |     if (applyRulesOnNextItem)
103 |       for (var r in rules) {
104 |         var accumulator = rules[r].test(item);
105 |         if (accumulator) {
106 |           terminateAccumulator();
107 |           LOG("current accumulator:", accumulator.methodName);
108 |           if (rules[r].skipCurrentItem) applyRulesOnNextItem = false;
109 |           currentAccumulator = accumulator;
110 |           delete rules[r];
111 |           return;
112 |         }
113 |       }
114 |     else applyRulesOnNextItem = true;
115 |     // if reaching this point, the current item matches none of the rules => accumulating data on current accumulator
116 |     if (currentAccumulator) applyRulesOnNextItem = !currentAccumulator(item);
117 |   };
118 | };
119 | 
/**
 * Rule.accumulators: accumulators that can be used for defining Rule objects, indexed by name.
 * An accumulator is a function that may (or may not) accept parameters, to be provided by the developer of a parser.
 * It is called with the Rule object as `this`, and returns another function
 * that will be run on every following PDF item, in order to accumulate data.
 * The output of an accumulator is stored in this.output (field of its parent Rule object).
 **/
Rule.accumulators = {
  // accumulator that ignores all the items it receives
  stopAccumulating: function () {
    return function () {};
  },
};
131 | 
// method for adding accumulators.
// methodName: name of the method to be exposed by Rule objects
// methodBuilder: accumulator, as described on Rule.accumulators
// note: Rule objects only expose the accumulators that were added before their construction.
Rule.addAccumulator = function (methodName, methodBuilder) {
  var registry = Rule.accumulators;
  registry[methodName] = methodBuilder;
};
136 | 
/**
 * This accumulator will store the group values extracted by the regexp of the Rule object,
 * on the current matching PDF item, into an array.
 * The array is empty if the regexp has no group, or does not match anymore.
 *
 * E.g. with regex: /hello ([a-z]+)/, the text "hello world" will yield "world".
 **/
Rule.addAccumulator("extractRegexpValues", function () {
  // the regexp has already been run on this text by Rule.prototype.test():
  // rewind it, so that a regexp with the global or sticky flag matches from the start again.
  this.regexp.lastIndex = 0;
  var matches = this.regexp.exec(this.currentItem.text);
  this.output = matches ? matches.slice(1) : [];
  return function () {}; // following lines are not to be processed by this accumulator
});
148 | 
/**
 * This accumulator will store the text of the next PDF item, and ignore the items that follow.
 **/
Rule.addAccumulator("parseNextItemValue", function () {
  var rule = this;
  var consumed = false;
  return function (item) {
    if (!consumed) {
      consumed = true;
      rule.output = item.text;
    }
  };
});
161 | 
/**
 * This accumulator will store the text of all following PDF items into an array.
 **/
Rule.addAccumulator("accumulateAfterHeading", function () {
  var texts = [];
  this.output = texts;
  return function accumulate(item) {
    texts.push(item.text);
  };
});
171 | 
/**
 * This accumulator will store the text of all following PDF items with equal x-coordinates.
 * The reference x-coordinate is the one of the first item it receives.
 **/
Rule.addAccumulator("accumulateFromSameX", function () {
  var texts = [];
  var referenceX = null;
  this.output = texts;
  return function accumulate(item) {
    if (referenceX === null) referenceX = item.x;
    // loose comparison, kept on purpose: coordinates are compared regardless of their type
    if (referenceX == item.x) texts.push(item.text);
  };
});
183 | 
/**
 * Registers the "parseColumns" accumulator: stores a table by detecting its columns,
 * given their names.
 **/
Rule.addAccumulator("parseColumns", parseColumns);

/**
 * Registers the "parseTable" accumulator: stores a table by detecting its columns,
 * given their count.
 **/
Rule.addAccumulator("parseTable", parseTable);
193 | 


--------------------------------------------------------------------------------
/test/snapshots/test.js.md:
--------------------------------------------------------------------------------
  1 | # Snapshot report for `test/test.js`
  2 | 
  3 | The actual snapshot is saved in `test.js.snap`.
  4 | 
  5 | Generated by [AVA](https://avajs.dev).
  6 | 
  7 | ## parse raw items from pdf file
  8 | 
  9 | > Snapshot 1
 10 | 
 11 |     [
 12 |       {
 13 |         file: {
 14 |           path: './test/sample.pdf',
 15 |         },
 16 |       },
 17 |       {
 18 |         height: 52.618,
 19 |         page: 1,
 20 |         width: 37.205,
 21 |       },
 22 |       {
 23 |         A: 'left',
 24 |         R: [
 25 |           {
 26 |             S: -1,
 27 |             T: 'Hello%20%22world%22',
 28 |             TS: [
 29 |               0,
 30 |               28,
 31 |               1,
 32 |               0,
 33 |             ],
 34 |           },
 35 |         ],
 36 |         clr: 0,
 37 |         sw: 0.32553125,
 38 |         text: 'Hello "world"',
 39 |         w: 6.138,
 40 |         x: 4.555,
 41 |         y: 5.154,
 42 |       },
 43 |       {
 44 |         A: 'left',
 45 |         R: [
 46 |           {
 47 |             S: -1,
 48 |             T: 'Value%3A',
 49 |             TS: [
 50 |               0,
 51 |               15,
 52 |               0,
 53 |               0,
 54 |             ],
 55 |           },
 56 |         ],
 57 |         clr: 0,
 58 |         sw: NaN,
 59 |         text: 'Value:',
 60 |         w: 2.666,
 61 |         x: 4.555,
 62 |         y: 7.174,
 63 |       },
 64 |       {
 65 |         A: 'left',
 66 |         R: [
 67 |           {
 68 |             S: -1,
 69 |             T: '4',
 70 |             TS: [
 71 |               0,
 72 |               15,
 73 |               0,
 74 |               0,
 75 |             ],
 76 |           },
 77 |         ],
 78 |         clr: 0,
 79 |         sw: NaN,
 80 |         text: '4',
 81 |         w: 0.5,
 82 |         x: 4.555,
 83 |         y: 8.761,
 84 |       },
 85 |       {
 86 |         A: 'left',
 87 |         R: [
 88 |           {
 89 |             S: -1,
 90 |             T: 'c1',
 91 |             TS: [
 92 |               0,
 93 |               16,
 94 |               1,
 95 |               0,
 96 |             ],
 97 |           },
 98 |         ],
 99 |         clr: 0,
100 |         sw: 0.32553125,
101 |         text: 'c1',
102 |         w: 0.944,
103 |         x: 5.095,
104 |         y: 10.501,
105 |       },
106 |       {
107 |         A: 'left',
108 |         R: [
109 |           {
110 |             S: -1,
111 |             T: 'c2',
112 |             TS: [
113 |               0,
114 |               16,
115 |               1,
116 |               0,
117 |             ],
118 |           },
119 |         ],
120 |         clr: 0,
121 |         sw: 0.32553125,
122 |         text: 'c2',
123 |         w: 0.944,
124 |         x: 7.262,
125 |         y: 10.501,
126 |       },
127 |       {
128 |         A: 'left',
129 |         R: [
130 |           {
131 |             S: -1,
132 |             T: 'c3',
133 |             TS: [
134 |               0,
135 |               16,
136 |               1,
137 |               0,
138 |             ],
139 |           },
140 |         ],
141 |         clr: 0,
142 |         sw: 0.32553125,
143 |         text: 'c3',
144 |         w: 0.944,
145 |         x: 10.131,
146 |         y: 10.501,
147 |       },
148 |       {
149 |         A: 'left',
150 |         R: [
151 |           {
152 |             S: -1,
153 |             T: '1',
154 |             TS: [
155 |               0,
156 |               15,
157 |               0,
158 |               0,
159 |             ],
160 |           },
161 |         ],
162 |         clr: 0,
163 |         sw: NaN,
164 |         text: '1',
165 |         w: 0.5,
166 |         x: 5.288,
167 |         y: 11.447,
168 |       },
169 |       {
170 |         A: 'left',
171 |         R: [
172 |           {
173 |             S: -1,
174 |             T: '2.3',
175 |             TS: [
176 |               0,
177 |               15,
178 |               0,
179 |               0,
180 |             ],
181 |           },
182 |         ],
183 |         clr: 0,
184 |         sw: NaN,
185 |         text: '2.3',
186 |         w: 1.25,
187 |         x: 10.477,
188 |         y: 11.447,
189 |       },
190 |       {
191 |         A: 'left',
192 |         R: [
193 |           {
194 |             S: -1,
195 |             T: 'hello',
196 |             TS: [
197 |               0,
198 |               15,
199 |               0,
200 |               0,
201 |             ],
202 |           },
203 |         ],
204 |         clr: 0,
205 |         sw: NaN,
206 |         text: 'hello',
207 |         w: 2,
208 |         x: 6.937,
209 |         y: 12.363,
210 |       },
211 |       {
212 |         A: 'left',
213 |         R: [
214 |           {
215 |             S: -1,
216 |             T: 'world',
217 |             TS: [
218 |               0,
219 |               15,
220 |               0,
221 |               0,
222 |             ],
223 |           },
224 |         ],
225 |         clr: 0,
226 |         sw: NaN,
227 |         text: 'world',
228 |         w: 2.333,
229 |         x: 9.684,
230 |         y: 12.363,
231 |       },
232 |       {
233 |         A: 'left',
234 |         R: [
235 |           {
236 |             S: -1,
237 |             T: 'Values%3A',
238 |             TS: [
239 |               0,
240 |               15,
241 |               0,
242 |               0,
243 |             ],
244 |           },
245 |         ],
246 |         clr: 0,
247 |         sw: NaN,
248 |         text: 'Values:',
249 |         w: 3.055,
250 |         x: 4.555,
251 |         y: 13.248,
252 |       },
253 |       {
254 |         A: 'left',
255 |         R: [
256 |           {
257 |             S: -1,
258 |             T: '1',
259 |             TS: [
260 |               0,
261 |               15,
262 |               0,
263 |               0,
264 |             ],
265 |           },
266 |         ],
267 |         clr: 0,
268 |         sw: NaN,
269 |         text: '1',
270 |         w: 0.5,
271 |         x: 4.555,
272 |         y: 14.835,
273 |       },
274 |       {
275 |         A: 'left',
276 |         R: [
277 |           {
278 |             S: -1,
279 |             T: '2',
280 |             TS: [
281 |               0,
282 |               15,
283 |               0,
284 |               0,
285 |             ],
286 |           },
287 |         ],
288 |         clr: 0,
289 |         sw: NaN,
290 |         text: '2',
291 |         w: 0.5,
292 |         x: 4.555,
293 |         y: 16.423,
294 |       },
295 |       {
296 |         A: 'left',
297 |         R: [
298 |           {
299 |             S: -1,
300 |             T: '3',
301 |             TS: [
302 |               0,
303 |               15,
304 |               0,
305 |               0,
306 |             ],
307 |           },
308 |         ],
309 |         clr: 0,
310 |         sw: NaN,
311 |         text: '3',
312 |         w: 0.5,
313 |         x: 4.555,
314 |         y: 18.01,
315 |       },
316 |     ]
317 | 
318 | ## parse structured content from pdf file, using rules
319 | 
320 | > Snapshot 1
321 | 
322 |     [
323 |       {
324 |         extractRegexpValues: [
325 |           'world',
326 |         ],
327 |       },
328 |       {
329 |         parseNextItemValue: '4',
330 |       },
331 |       {
332 |         'parseTable.renderItems': `10.501	7.262	c2␊
333 |         10.501	10.131	c3␊
334 |         11.447	5.288	1␊
335 |         11.447	10.477	2.3␊
336 |         12.363	6.937	hello␊
337 |         12.363	9.684	world`,
338 |         'parseTable.renderMatrix': `	c2	c3␊
339 |         1		2.3␊
340 |         hello	world`,
341 |       },
342 |       {
343 |         accumulateAfterHeading: [
344 |           '1',
345 |           '2',
346 |           '3',
347 |         ],
348 |       },
349 |     ]
350 | 
351 | ## parse Table from PDF file, using TableParser
352 | 
353 | > Snapshot 1
354 | 
355 |     [
356 |       [
357 |         'Version',
358 |         'LTS',
359 |         'Date',
360 |         'V8',
361 |         'npm',
362 |         'NODE_MODULE_VERSION [1]',
363 |       ],
364 |       [
365 |         'Node.js 17.1.0',
366 |         undefined,
367 |         '2021-11-09',
368 |         '9.5.172.25',
369 |         '8.1.2',
370 |         '102',
371 |         'Downloads',
372 |         ' Changelog ',
373 |         'Docs',
374 |       ],
375 |       [
376 |         'Node.js 17.0.1',
377 |         undefined,
378 |         '2021-10-20',
379 |         '9.5.172.21',
380 |         '8.1.0',
381 |         '102',
382 |         'Downloads',
383 |         ' Changelog ',
384 |         'Docs',
385 |       ],
386 |       [
387 |         'Node.js 17.0.0',
388 |         undefined,
389 |         '2021-10-19',
390 |         '9.5.172.21',
391 |         '8.1.0',
392 |         '102',
393 |         'Downloads',
394 |         ' Changelog ',
395 |         'Docs',
396 |       ],
397 |       [
398 |         'Node.js 16.14.2',
399 |         'Gallium',
400 |         '2022-03-17',
401 |         '9.4.146.24',
402 |         '8.5.0',
403 |         '93',
404 |         'Downloads',
405 |         ' Changelog ',
406 |         'Docs',
407 |       ],
408 |       [
409 |         'Node.js 16.14.1',
410 |         'Gallium',
411 |         '2022-03-16',
412 |         '9.4.146.24',
413 |         '8.5.0',
414 |         '93',
415 |         'Downloads',
416 |         ' Changelog ',
417 |         'Docs',
418 |       ],
419 |       [
420 |         'Node.js 16.14.0',
421 |         'Gallium',
422 |         '2022-02-08',
423 |         '9.4.146.24',
424 |         '8.3.1',
425 |         '93',
426 |         'Downloads',
427 |         ' Changelog ',
428 |         'Docs',
429 |       ],
430 |       [
431 |         'Node.js 16.13.2',
432 |         'Gallium',
433 |         '2022-01-10',
434 |         '9.4.146.24',
435 |         '8.1.2',
436 |         '93',
437 |         'Downloads',
438 |         ' Changelog ',
439 |         'Docs',
440 |       ],
441 |       [
442 |         'Node.js 16.13.1',
443 |         'Gallium',
444 |         '2021-12-01',
445 |         '9.4.146.24',
446 |         '8.1.2',
447 |         '93',
448 |         'Downloads',
449 |         ' Changelog ',
450 |         'Docs',
451 |       ],
452 |       [
453 |         'Node.js 16.13.0',
454 |         'Gallium',
455 |         '2021-10-26',
456 |         '9.4.146.19',
457 |         '8.1.0',
458 |         '93',
459 |         'Downloads',
460 |         ' Changelog ',
461 |         'Docs',
462 |       ],
463 |       [
464 |         'Node.js 16.12.0',
465 |         undefined,
466 |         '2021-10-20',
467 |         '9.4.146.19',
468 |         '8.1.0',
469 |         '93',
470 |         'Downloads',
471 |         ' Changelog ',
472 |         'Docs',
473 |       ],
474 |     ]
475 | 
476 | ## sample scripts should print raw items from pdf file
477 | 
478 | > Snapshot 1
479 | 
480 |     {
481 |       stderr: `printing raw items from file: test/sample.pdf ...␊
482 |       done.␊
483 |       printing raw items from file: test/sample.pdf ...␊
484 |       done.`,
485 |       stdout: `␊
486 |       > pdfreader@0.0.0-development test:samples␊
487 |       > node parse.js test/sample.pdf && node parseAsBuffer.js test/sample.pdf␊
488 |       ␊
489 |       file = test/sample.pdf␊
490 |       page = 1␊
491 |       4.555	5.154		left	6	Hello "world"␊
492 |       4.555	7.174		left	2	Value:␊
493 |       4.555	8.761		left	0	4␊
494 |       5.095	10.501		left	0	c1␊
495 |       7.262	10.501		left	0	c2␊
496 |       10.131	10.501		left	0	c3␊
497 |       5.288	11.447		left	0	1␊
498 |       10.477	11.447		left	1	2.3␊
499 |       6.937	12.363		left	2	hello␊
500 |       9.684	12.363		left	2	world␊
501 |       4.555	13.248		left	3	Values:␊
502 |       4.555	14.835		left	0	1␊
503 |       4.555	16.423		left	0	2␊
504 |       4.555	18.01		left	0	3␊
505 |       file = undefined␊
506 |       page = 1␊
507 |       4.555	5.154		left	6	Hello "world"␊
508 |       4.555	7.174		left	2	Value:␊
509 |       4.555	8.761		left	0	4␊
510 |       5.095	10.501		left	0	c1␊
511 |       7.262	10.501		left	0	c2␊
512 |       10.131	10.501		left	0	c3␊
513 |       5.288	11.447		left	0	1␊
514 |       10.477	11.447		left	1	2.3␊
515 |       6.937	12.363		left	2	hello␊
516 |       9.684	12.363		left	2	world␊
517 |       4.555	13.248		left	3	Values:␊
518 |       4.555	14.835		left	0	1␊
519 |       4.555	16.423		left	0	2␊
520 |       4.555	18.01		left	0	3`,
521 |     }
522 | 


--------------------------------------------------------------------------------