├── .nvmrc
├── .prettierignore
├── .gitattributes
├── .gitignore
├── .npmignore
├── .github
├── FUNDING.yml
├── ISSUE_TEMPLATE
│ └── bug-report.md
└── workflows
│ └── nodejs.yml
├── test
├── sample.pdf
├── sample-table.pdf
├── sample-with-password.pdf
├── snapshots
│ ├── test.js.snap
│ └── test.js.md
├── sample.html
├── sample.rtf
└── test.js
├── .eslintrc.json
├── .vscode
├── settings.json
└── extensions.json
├── .editorconfig
├── index.js
├── lib
├── LOG.js
├── SequentialParser.js
├── ColumnsParser.js
├── parseColumns.js
├── TableParser.js
└── parseTable.js
├── parse.js
├── LICENSE
├── parseAsBuffer.js
├── package.json
├── index.d.ts
├── PdfReader.js
├── CHANGELOG.md
├── README.md
└── Rule.js
/.nvmrc:
--------------------------------------------------------------------------------
1 | v16
2 |
--------------------------------------------------------------------------------
/.prettierignore:
--------------------------------------------------------------------------------
1 | CHANGELOG.md
2 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | * text=auto eol=lf
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /node_modules
2 | /dist
3 |
--------------------------------------------------------------------------------
/.npmignore:
--------------------------------------------------------------------------------
1 | .*
2 | node_modules
3 | test
4 |
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: [adrienjoly]
2 | custom: ['https://adrienjoly.com/donate/']
3 |
--------------------------------------------------------------------------------
/test/sample.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/adrienjoly/npm-pdfreader/HEAD/test/sample.pdf
--------------------------------------------------------------------------------
/test/sample-table.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/adrienjoly/npm-pdfreader/HEAD/test/sample-table.pdf
--------------------------------------------------------------------------------
/test/sample-with-password.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/adrienjoly/npm-pdfreader/HEAD/test/sample-with-password.pdf
--------------------------------------------------------------------------------
/test/snapshots/test.js.snap:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/adrienjoly/npm-pdfreader/HEAD/test/snapshots/test.js.snap
--------------------------------------------------------------------------------
/.eslintrc.json:
--------------------------------------------------------------------------------
1 | {
2 | "extends": ["plugin:prettier/recommended"],
3 | "parserOptions": { "ecmaVersion": 2020, "sourceType": "module" }
4 | }
5 |
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "editor.formatOnSave": true,
3 | "editor.defaultFormatter": "esbenp.prettier-vscode",
4 | "prettier.singleQuote": false
5 | }
6 |
--------------------------------------------------------------------------------
/.vscode/extensions.json:
--------------------------------------------------------------------------------
1 | {
2 | "recommendations": [
3 | "editorconfig.editorconfig",
4 | "dbaeumer.vscode-eslint",
5 | "esbenp.prettier-vscode"
6 | ]
7 | }
8 |
--------------------------------------------------------------------------------
/.editorconfig:
--------------------------------------------------------------------------------
1 | ; http://editorconfig.org
2 |
3 | root = true
4 |
5 | [*]
6 | indent_style = space
7 | indent_size = 2
8 | end_of_line = lf
9 | charset = utf-8
10 | trim_trailing_whitespace = true
11 | insert_final_newline = true
12 |
--------------------------------------------------------------------------------
/test/sample.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
13 |
14 |
15 | Hello "world"
16 | Value:
17 | 4
18 |
19 | | c1 | c2 | c3 |
20 | | 1 | | 2.3 |
21 | | hello | world |
22 |
23 | Values:
24 | 1
25 | 2
26 | 3
27 |
28 |
--------------------------------------------------------------------------------
/index.js:
--------------------------------------------------------------------------------
1 | export { PdfReader } from "./PdfReader.js";
2 | export { Rule } from "./Rule.js";
3 | export * as LOG from "./lib/LOG.js";
4 | import * as parseTableExports from "./lib/parseTable.js";
5 | export const parseTable = Object.assign(
6 | parseTableExports.parseTable,
7 | parseTableExports
8 | );
9 | import * as parseColumnsExports from "./lib/parseColumns.js";
10 | export const parseColumns = Object.assign(
11 | parseColumnsExports.parseColumns,
12 | parseColumnsExports
13 | );
14 | export { SequentialParser } from "./lib/SequentialParser.js"; // experimental
15 | export { TableParser } from "./lib/TableParser.js";
16 | export { ColumnsParser } from "./lib/ColumnsParser.js";
17 |
--------------------------------------------------------------------------------
/lib/LOG.js:
--------------------------------------------------------------------------------
1 | /**
2 | * Minimal logger
3 | * @author Adrien Joly, http://github.com/adrienjoly
4 | * This content is released under the MIT License.
5 | **/
6 |
7 | import util from "util";
8 |
/** No-op logger, used while debug logging is disabled. */
const nullLog = function LOG() {};

/**
 * Prints all its arguments on a single line prefixed with "[DEBUG] ".
 * Objects, functions and symbols are rendered with util.inspect(), so that values
 * which cannot be converted to a string implicitly (e.g. objects without a prototype)
 * are logged instead of throwing.
 */
const realLog = function LOG(...args) {
  const rendered = args.map((arg) => {
    const needsInspect =
      (typeof arg === "object" && arg !== null) ||
      typeof arg === "function" ||
      typeof arg === "symbol";
    return needsInspect ? util.inspect(arg) : arg;
  });
  // Array.prototype.join renders null and undefined as empty strings
  console.log("[DEBUG] " + rendered.join(" "));
};

// currently active logger; swapped by toggle()
let LOG = nullLog;

/**
 * Forwards its arguments to the active logger (does nothing until toggle(true) was called).
 */
export function log(...args) {
  LOG(...args);
}

/**
 * Enables debug logging when `enabled` is truthy, disables it otherwise.
 * @param {boolean} enabled
 */
export function toggle(enabled) {
  LOG = enabled ? realLog : nullLog;
}
27 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug-report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve this npm package
4 | ---
5 |
6 | **Describe the bug**
7 | A clear and concise description of what the bug is.
8 |
9 | **To Reproduce**
10 | List the steps you followed and/or share your code to help us reproduce the bug
11 |
12 | **Expected behavior**
13 | A clear and concise description of what you expected to happen.
14 |
15 | **Screenshots, outputs or logs**
16 | If applicable, add screenshots, outputs or logs to help explain your problem.
17 |
18 | **Desktop (please complete the following information):**
19 |
20 | - OS: (e.g. iOS)
21 | - Browser: (e.g. chrome, safari)
22 | - Version: (e.g. 22)
23 |
24 | **Additional context**
25 | Add any other context about the problem here.
26 |
--------------------------------------------------------------------------------
/parse.js:
--------------------------------------------------------------------------------
1 | import { toggle } from "./lib/LOG.js";
2 | import { PdfReader } from "./index.js";
3 |
4 | toggle(false);
5 |
/**
 * Prints every item parsed from the given PDF file:
 * - "file = ..." and "page = ..." lines for file and page items,
 * - one tab-separated line (x, y, oc, A, floored w, text) per positioned item,
 * - any other item is passed to console.warn().
 * Calls `callback(err)` on error, and `callback()` once the end of the items was reached.
 */
function printRawItems(filename, callback) {
  new PdfReader().parseFileItems(filename, function (err, item) {
    if (err) callback(err);
    else if (!item) callback();
    else if (item.file) console.log("file =", item.file.path);
    else if (item.page) console.log("page =", item.page);
    // explicit check, so that items positioned at x === 0 are printed as well
    else if (item.x !== undefined && item.x !== null)
      console.log(
        [item.x, item.y, item.oc, item.A, Math.floor(item.w), item.text].join(
          "\t"
        )
      );
    else console.warn(item);
  });
}
21 |
// Command-line entry point: prints the raw items of the PDF file given as first argument.
const pdfPath = process.argv[2];
if (pdfPath) {
  console.warn("printing raw items from file:", pdfPath, "...");
  printRawItems(pdfPath, (error) => {
    if (error) {
      console.error(error);
      process.exit(1);
    }
    console.warn("done.");
  });
} else {
  console.error("please provide the name of a PDF file");
}
35 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License (MIT)
2 |
3 | Copyright (c) 2015 Adrien Joly
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 |
--------------------------------------------------------------------------------
/parseAsBuffer.js:
--------------------------------------------------------------------------------
1 | import fs from "fs";
2 | import { toggle } from "./lib/LOG.js";
3 | import { PdfReader } from "./index.js";
4 |
5 | toggle(false);
6 |
/**
 * Prints every item parsed from the given PDF buffer:
 * - "file = ..." and "page = ..." lines for file and page items,
 * - one tab-separated line (x, y, oc, A, floored w, text) per positioned item,
 * - any other item is passed to console.warn().
 * Calls `callback(err)` on error, and `callback()` once the end of the items was reached.
 */
function printRawItems(pdfBuffer, callback) {
  new PdfReader().parseBuffer(pdfBuffer, function (err, item) {
    if (err) callback(err);
    else if (!item) callback();
    // NOTE(review): parseBuffer() emits {file: {buffer}}, so `path` is undefined here;
    // the output is kept as is because it may be covered by output snapshots - confirm before changing.
    else if (item.file) console.log("file =", item.file.path);
    else if (item.page) console.log("page =", item.page);
    // explicit check, so that items positioned at x === 0 are printed as well
    else if (item.x !== undefined && item.x !== null)
      console.log(
        [item.x, item.y, item.oc, item.A, Math.floor(item.w), item.text].join(
          "\t"
        )
      );
    else console.warn(item);
  });
}
22 |
// Command-line entry point: reads the PDF file given as first argument into a buffer,
// then prints its raw items.
const filename = process.argv[2];
if (!filename) {
  console.error("please provide the name of a PDF file");
} else {
  console.warn("printing raw items from file:", filename, "...");
  fs.readFile(filename, (err, pdfBuffer) => {
    if (err) {
      // the file could not be read: there is no buffer to parse, so stop here
      console.error(err);
      process.exit(1);
    }
    printRawItems(pdfBuffer, function (err) {
      if (err) {
        console.error(err);
        process.exit(1);
      }
      console.warn("done.");
    });
  });
}
39 |
--------------------------------------------------------------------------------
/lib/SequentialParser.js:
--------------------------------------------------------------------------------
1 | /**
2 | * Applies a list of simple actions to apply to each provided item, in order to accumulate field values.
3 | * Provides a list of parsed `fields`.
4 | * Calls `callback(error, this)` when all accumulators were processed, or when processing a null item.
5 | **/
6 | export function SequentialParser(accumulators, callback) {
7 | var step = 0;
8 | var fields = {};
9 | return {
10 | fields: fields,
11 | addField: function (key, value) {
12 | this.fields[key] = value;
13 | },
14 | parseItem: function (item) {
15 | if (step >= accumulators.length) {
16 | return console.warn(
17 | "warning: skipping item, because SequentialParser is done."
18 | );
19 | }
20 | var current = accumulators[step];
21 | if (current.field) {
22 | this.addField(current.field, item);
23 | ++step;
24 | } else if (current.accumulator) {
25 | var doneAccumulating = current.accumulator(item, this);
26 | if (doneAccumulating) ++step;
27 | } // no action => skip item
28 | else ++step;
29 | if (!item || step >= accumulators.length) {
30 | callback && callback(null, this);
31 | }
32 | },
33 | };
34 | }
35 |
--------------------------------------------------------------------------------
/lib/ColumnsParser.js:
--------------------------------------------------------------------------------
1 | /**
2 | * ColumnsParser
3 | * Classifies items into columns, nearest to the left position of their corresponding header.
4 | * @author Adrien Joly, http://github.com/adrienjoly
5 | * This content is released under the MIT License.
6 | **/
7 |
8 | import { log as LOG } from "./LOG.js";
9 |
/**
 * Returns the index of the column that an item located at `x` belongs to.
 * Columns are scanned in order, and the scan stops at the first column that is
 * farther from `x` than the previous one; the previous column is returned.
 * Returns -1 when `cols` is empty.
 */
function getColumnIndex(cols, x) {
  let index = 0;
  let bestDist = null;
  for (; index < cols.length; ++index) {
    const dist = Math.abs(x - cols[index].x);
    if (bestDist !== null && dist > bestDist) break;
    bestDist = dist;
  }
  return index - 1;
}

/**
 * Constructor. Items passed to `processItem()` are first matched against `colNames`,
 * to record the x-position of each column header; once every header was found,
 * the following items are pushed into the `items` of the nearest column of `this.cols`.
 * @param {string[]} colNames header text of each column (left untouched)
 */
export function ColumnsParser(colNames) {
  this.cols = [];
  const cols = this.cols;
  const names = colNames.slice(); // clone (for parameter immutability)
  let headersFound = 0;
  let line = -1; // -1 = header

  this.processItem = function (item) {
    if (line === -1) {
      // parse x-position of column headers; columns that were already found are skipped,
      // so that a column name can be associated to more than 1 index
      const i = names.findIndex(
        (name, index) => !(index in cols) && name === item.text
      );
      if (i > -1) {
        LOG("ColumnsParser header", i, item.text, "=> x:", item.x);
        cols[i] = {
          name: item.text,
          x: item.x,
          items: [],
        };
        ++headersFound;
      }
      // count the headers instead of relying on cols.length, which would already
      // match as soon as the last column is found, whatever the other ones
      if (headersFound === names.length) {
        // done parsing header
        line++;
      }
    } else {
      cols[getColumnIndex(cols, item.x)].items.push(item);
    }
  };
}
51 |
--------------------------------------------------------------------------------
/lib/parseColumns.js:
--------------------------------------------------------------------------------
1 | /**
2 | * parseColumns, for pdfreader, used by the Rule class.
3 | * accumulates values below each column header (on 1st row, given their name), without detecting empty rows.
4 | * TODO: use ColumnsParser
5 | * @author Adrien Joly, http://github.com/adrienjoly
6 | * This content is released under the MIT License.
7 | **/
8 |
9 | import { log as LOG } from "./LOG.js";
10 |
/**
 * Accumulator builder, called with the rule as `this` and the column names as arguments.
 * Stores the column names in `this.cols` and the parsed rows in `this.output`
 * (indexed by the y-position of each row: `output[y][columnIndex] = text`).
 * Items are first matched against the column names, to record the x-position of each header;
 * once every header was found, each following item is assigned to the right-most column
 * whose header starts on the left of the item (or to the first column otherwise).
 * @returns the function that processes each following item
 */
export const parseColumns = function (/* columns */) {
  this.output = [];
  this.cols = Array.prototype.slice.apply(arguments);
  const colNames = this.cols;
  const colX = [];
  const rows = this.output;
  let headersFound = 0;
  let line = -1; // header
  let lineY = null;
  function processItem(item) {
    if (line === -1) {
      // parse x-position of column headers
      const i = colNames.indexOf(item.text);
      if (i > -1) {
        if (!(i in colX)) ++headersFound;
        colX[i] = item.x;
      }
      // count the headers instead of relying on colX.length, which would already
      // match as soon as the last column is found, whatever the other ones
      if (headersFound === colNames.length) {
        LOG("table header:", colNames, colX);
        line++;
      }
    } else {
      if (lineY === null) {
        lineY = item.y;
      } else if (lineY != item.y) {
        lineY = item.y;
        line++;
      }
      // parsing values for each column
      let col = 0;
      for (let i = colX.length - 1; i >= 0; --i) {
        if (item.x > colX[i]) {
          col = i;
          break;
        }
      }
      rows[lineY] = rows[lineY] || {};
      rows[lineY][col] = item.text;
    }
  }
  processItem(this.currentItem); // apply on header's first item
  return processItem; // then the same function will be run on all following items, until another rule is triggered
};
49 |
--------------------------------------------------------------------------------
/.github/workflows/nodejs.yml:
--------------------------------------------------------------------------------
1 | name: Node CI
2 |
3 | on:
4 | push:
5 | branches:
6 | - master
7 | pull_request:
8 | branches:
9 | - master
10 |
11 | jobs:
12 | # Prevent functional regressions on supported Node.js versions
13 | tests:
14 | runs-on: ubuntu-latest
15 | strategy:
16 | matrix:
17 | node-version: [16.x, 18.x]
18 | steps:
19 | - uses: actions/checkout@v1
20 | - name: Use Node.js ${{ matrix.node-version }}
21 | uses: actions/setup-node@v1
22 | with:
23 | node-version: ${{ matrix.node-version }}
24 | - run: npm ci # install dependencies
25 | - run: npm test
26 |
27 | # Checks that files are formatted consistently
28 | formatting:
29 | runs-on: ubuntu-latest
30 | steps:
31 | - uses: actions/checkout@v1
32 | - uses: actions/setup-node@v1
33 | with:
34 | node-version: 16.x
35 | - run: npm ci # install dependencies
36 | - run: npm run prettier:check
37 | - run: npm run lint
38 |
39 | release:
40 | needs:
41 | - tests
42 | - formatting
43 | runs-on: ubuntu-latest
44 | steps:
45 | - name: Checkout
46 | uses: actions/checkout@v2
47 | with:
48 | fetch-depth: 0
49 | - name: Setup Node.js
50 | uses: actions/setup-node@v1
51 | with:
52 | node-version: 16
53 | - name: Install dependencies
54 | run: npm ci
55 | - name: Build commonjs
56 | run: npm run build:cjs
57 | - name: Release
58 | env:
59 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
60 | NPM_TOKEN: ${{ secrets.NPM_TOKEN }}
61 | run: npm run semantic-release
62 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "pdfreader",
3 | "type": "module",
4 | "version": "0.0.0-development",
5 | "description": "Read text and parse tables from PDF files. Supports tabular data with automatic column detection, and rule-based parsing.",
6 | "main": "dist/index.cjs",
7 | "module": "./index.js",
8 | "typings": "./index.d.ts",
9 | "scripts": {
10 | "prettier:print": "prettier --list-different \"./**/*.js\" \"./**/*.md\" \"./**/*.d.ts\"",
11 | "prettier:check": "npm run -s prettier:print 1>&2; exit $(npm run -s prettier:print | wc -l)",
12 | "prettier:fix": "prettier \"./**/*.js\" \"./**/*.md\" \"./**/*.d.ts\" --write --end-of-line lf",
13 | "test:samples": "node parse.js test/sample.pdf && node parseAsBuffer.js test/sample.pdf",
14 | "test:update": "ava --update-snapshots",
15 | "test": "ava",
16 | "lint": "eslint .",
17 | "semantic-release": "semantic-release",
18 | "build:cjs": "rollup index.js --file dist/index.cjs --format cjs"
19 | },
20 | "repository": {
21 | "type": "git",
22 | "url": "https://github.com/adrienjoly/npm-pdfreader"
23 | },
24 | "keywords": [
25 | "pdf",
26 | "reader",
27 | "parser",
28 | "parse",
29 | "parsing",
30 | "convert",
31 | "CLI",
32 | "table",
33 | "data",
34 | "csv",
35 | "json",
36 | "rules"
37 | ],
38 | "author": "Adrien Joly",
39 | "license": "MIT",
40 | "bugs": {
41 | "url": "https://github.com/adrienjoly/npm-pdfreader/issues"
42 | },
43 | "homepage": "https://github.com/adrienjoly/npm-pdfreader",
44 | "dependencies": {
45 | "pdf2json": "3.1.4"
46 | },
47 | "devDependencies": {
48 | "@semantic-release/changelog": "^6.0.1",
49 | "@semantic-release/git": "^10.0.1",
50 | "@semantic-release/npm": "^9.0.1",
51 | "ava": "^4.1.0",
52 | "eslint": "^8.11.0",
53 | "eslint-config-prettier": "^8.5.0",
54 | "eslint-plugin-prettier": "^4.0.0",
55 | "execa": "^6.1.0",
56 | "prettier": "2.6.1",
57 | "semantic-release": "^19.0.2",
58 | "rollup": "^4.19.1"
59 | },
60 | "engines": {
61 | "node": ">=14"
62 | }
63 | }
--------------------------------------------------------------------------------
/index.d.ts:
--------------------------------------------------------------------------------
1 | export type InitOptions = {
2 | password?: string;
3 | debug?: boolean;
4 | signal?: AbortSignal;
5 | };
6 | export type Error = null | string;
7 |
8 | export type DataEntry = {
9 | page?: number;
10 | width?: number;
11 | height?: number;
12 | text?: string;
13 | file?: {
14 | path?: string;
15 | buffer?: string;
16 | };
17 | } | null;
18 |
19 | export type ItemHandler = (err: Error, data: DataEntry & Item) => void;
20 |
21 | export declare class PdfReader {
22 | constructor(opts?: InitOptions | null);
23 | parseFileItems(pdfFilePath: string, itemHandler: ItemHandler): void;
24 | parseBuffer(buffer: Buffer, itemHandler: ItemHandler): void;
25 | }
26 |
27 | export type Item = {
28 | x: number;
29 | y: number;
30 | sw: number;
31 | w: number;
32 | A: string;
33 | clr: number;
34 | R: {
35 | T: string;
36 | S: number;
37 | TS: any[];
38 | }[];
39 | text: string;
40 | };
41 |
42 | export type RuleAccumulator = (item: Item) => boolean | void;
/** Handler passed to `Rule.then()`; `T` is the type of the value it receives (defaults to `any`). */
export type RuleHandler<T = any> = (value: T) => void;
44 |
45 | export interface TableResult {
46 | matrix: string[][];
47 | items: Item[];
48 | }
49 |
50 | export class TableParser {
51 | private rows: { [key: string]: Item[] };
52 | constructor();
53 | processItem(item: Item, col: number): void;
54 | processHeadingItem(item: Item, col: number): void;
55 | getRows(): Item[][];
56 | renderRows(): string;
57 | /** row-> column-> items_collisionning_in_column-> item:Item */
58 | getMatrix(): Item[][][];
59 | getCleanMatrix(options?: { collisionSeparator: string }): string[][];
60 | renderMatrix(): string;
61 | }
62 |
63 | export class Rule {
64 | static on(regexp: RegExp): Rule;
65 | static after(regexp: RegExp): Rule;
66 | static makeItemProcessor(rules: Rule[]): (item: DataEntry) => void;
67 | static addAccumulator(methodName: string, methodBuilder: Function): void;
68 |
69 | constructor(regexp: RegExp);
70 |
71 | // Accumulator methods
72 | extractRegexpValues(): Rule;
73 | parseNextItemValue(): Rule;
74 | accumulateAfterHeading(): Rule;
75 | accumulateFromSameX(): Rule;
76 | parseColumns(...args: any[]): Rule;
77 | parseTable(columnCount: number): Rule & {
78 | then(handler: (result: TableResult) => void): Rule;
79 | };
80 |
81 | then(handler: RuleHandler): Rule;
82 |
83 | private test(item: Item): RuleAccumulator | undefined;
84 | private whenDone(callback: () => void): void;
85 | }
86 |
--------------------------------------------------------------------------------
/PdfReader.js:
--------------------------------------------------------------------------------
1 | /**
2 | * PdfReader: class that reads a PDF file, and calls a function on each item found while parsing that file.
3 | * @author Adrien Joly, http://github.com/adrienjoly
4 | * This content is released under the MIT License.
5 | *
6 | * An item object can match one of the following objects:
7 | * - null, when the parsing is over, or an error occurred.
8 | * - {file:{path:string}}, when a PDF file is being opened.
9 | * - {page:integer}, when a new page is being parsed, provides the page number, starting at 1.
10 | * - {text:string, x:float, y:float, w:float, h:float...}, represents each text with its position.
11 | *
12 | **/
13 |
14 | import { log as LOG } from "./lib/LOG.js";
15 | import PDFParser from "pdf2json"; // doc: https://github.com/modesty/pdf2json
16 |
/**
 * Calls `handler(null, item)` for each page of the parsed `pdf` (as `{page, width, height}`,
 * page numbers starting at 1) and for each text of that page (after decoding its first
 * text run into `item.text`), then calls `handler()` without arguments to signal the end.
 * Only own entries of `Pages` and `Texts` are visited, so that enumerable properties
 * added to `Array.prototype` are never mistaken for pages or texts.
 */
function forEachItem(pdf, handler) {
  // pdf.formImage was removed in pdf2json@2, but we keep backward compatibility too
  const pages = pdf.Pages || pdf.formImage.Pages;
  Object.values(pages ?? {}).forEach(function (page, index) {
    handler(null, {
      page: index + 1,
      width: page.Width || (pdf.formImage ? pdf.formImage.Width : 0),
      height:
        page.Height ||
        (pdf.formImage ? pdf.formImage.Pages[index].Height : 0),
    });
    for (const item of Object.values(page.Texts ?? {})) {
      item.text = decodeURIComponent(item.R[0].T);
      handler(null, item);
    }
  });
  handler();
}
39 |
/**
 * PdfReader constructor.
 * @param {Object} [options] optional `password`, `debug` and `signal` (AbortSignal) properties
 */
export function PdfReader(options) {
  LOG("PdfReader"); // only displayed after debug logging was enabled, using toggle(true) from lib/LOG.js
  this.options = options || {};
}

/**
 * Creates a pdf2json parser that reports its errors and items to `itemHandler`,
 * starts it by calling `start(pdfParser, verbosity)`, then destroys it if `options.signal` aborts.
 * Nothing is parsed if the signal was already aborted, because its "abort" event would never fire.
 */
function runPdfParser(options, itemHandler, start) {
  if (options.signal?.aborted) return;
  const pdfParser = options.password
    ? new PDFParser(null, null, options.password)
    : new PDFParser();

  pdfParser.on("pdfParser_dataError", itemHandler);
  pdfParser.on("pdfParser_dataReady", function (pdfData) {
    forEachItem(pdfData, itemHandler);
  });
  start(pdfParser, options.debug ? 1 : 0);

  options.signal?.addEventListener(
    "abort",
    function () {
      pdfParser.destroy();
    },
    { once: true }
  );
}

/**
 * parseFileItems: calls itemHandler(error, item) on each item parsed from the pdf file
 **/
PdfReader.prototype.parseFileItems = function (pdfFilePath, itemHandler) {
  itemHandler(null, { file: { path: pdfFilePath } });
  runPdfParser(this.options, itemHandler, function (pdfParser, verbosity) {
    pdfParser.loadPDF(pdfFilePath, verbosity);
  });
};

/**
 * parseBuffer: calls itemHandler(error, item) on each item parsed from the pdf file received as a buffer
 */
PdfReader.prototype.parseBuffer = function (pdfBuffer, itemHandler) {
  itemHandler(null, { file: { buffer: pdfBuffer } });
  runPdfParser(this.options, itemHandler, function (pdfParser, verbosity) {
    pdfParser.parseBuffer(pdfBuffer, verbosity);
  });
};
92 |
--------------------------------------------------------------------------------
/lib/TableParser.js:
--------------------------------------------------------------------------------
1 | /**
2 | * TableParser
3 | * Classifies items into columns and rows, based on their left and top coordinates,
4 | * and left position of column headers.
5 | * @author Adrien Joly, http://github.com/adrienjoly
6 | * This content is released under the MIT License.
7 | **/
8 |
/** Constructor: `rows` maps each y-position (as a string) to an object mapping columns to arrays of items. */
export function TableParser() {
  this.rows = {};
}

/** Stores `item` in column `col` of the row located at `item.y`. */
TableParser.prototype.processItem = function (item, col) {
  const rowKey = "" + item.y;
  if (!this.rows[rowKey]) this.rows[rowKey] = {};
  const row = this.rows[rowKey];
  if (!row[col]) row[col] = [];
  row[col].push(item);
};

/** Stores a copy of `item` (limited to its `x` and `text`) in column `col` of the row located at y = 0. */
TableParser.prototype.processHeadingItem = function (item, col) {
  const { x, text } = item;
  this.processItem({ y: 0, x, text }, col);
};

// Rows

/** Returns a copy of `values`, sorted by ascending numeric value. */
function sortAsFloatValues(values) {
  const sorted = [...values];
  sorted.sort((a, b) => parseFloat(a) - parseFloat(b));
  return sorted;
}

/** Returns the rows as an array, sorted by ascending y-position. */
TableParser.prototype.getRows = function () {
  const rowsByY = this.rows;
  const sortedYValues = sortAsFloatValues(Object.keys(rowsByY));
  return sortedYValues.map((y) => rowsByY["" + y]);
};

/** Renders one line per row: the row index, followed by the "x: text" of each of its items. */
function renderRows(rows) {
  const lines = (rows || []).map((row, rowId) => {
    const cells = [];
    for (const items of Object.values(row)) {
      for (const item of Object.values(items)) {
        cells.push(`${item.x}: ${item.text}`);
      }
    }
    return `${rowId}:\t${cells.join(", ")}`;
  });
  return lines.join("\n");
}

/** Returns a textual rendering of the rows, one line per row. */
TableParser.prototype.renderRows = function () {
  return renderRows(this.getRows());
};

// Matrix

/** Returns the column keys used by at least one of the `rows`, sorted by ascending numeric value. */
function getSortedXValues(rows) {
  const columnKeys = {};
  rows.forEach((row) => {
    Object.keys(row).forEach((x) => {
      columnKeys[x] = true;
    });
  });
  return sortAsFloatValues(Object.keys(columnKeys));
}

/** @returns a 3-dimension matrix: row -> column -> items colliding in that column -> item */
TableParser.prototype.getMatrix = function () {
  const rows = this.getRows();
  const xValues = getSortedXValues(rows);
  return rows.map((row) =>
    Object.entries(row).reduce((cells, [x, items]) => {
      const colN = xValues.indexOf(x);
      cells[colN] = (cells[colN] || []).concat(items);
      return cells;
    }, [])
  );
};

/**
 * For use with console.table().
 * @param {String} collisionSeparator separator to use when there are multiple values to join for a given column
 * @returns a 2-dimension matrix: row -> column -> value
 */
TableParser.prototype.getCleanMatrix = function ({ collisionSeparator } = {}) {
  const separator = collisionSeparator || "";
  return this.getMatrix().map((rowColumns) =>
    rowColumns.map((items) => items.map(getText).join(separator))
  );
};

/** Returns the text of `item`. */
function getText(item) {
  return item.text;
}

/** Returns a function that joins the texts of a cell with `separ`, truncated to 7 characters. */
function joinCellCollisions(separ) {
  return (cell) => {
    const joined = (cell || []).map(getText).join(separ);
    return joined.slice(0, 7);
  };
}

/** Renders one line per row of the `matrix`, with tab-separated (and truncated) cells. */
function renderMatrix(matrix) {
  const renderCell = joinCellCollisions("+");
  const lines = (matrix || []).map((row) =>
    (row || []).map((cell) => renderCell(cell)).join("\t")
  );
  return lines.join("\n");
}

/** Returns a textual rendering of the matrix, one line per row. */
TableParser.prototype.renderMatrix = function () {
  return renderMatrix(this.getMatrix());
};
117 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | ## [1.2.1](https://github.com/adrienjoly/npm-pdfreader/compare/v1.2.0...v1.2.1) (2020-09-25)
2 |
3 |
4 | ### Bug Fixes
5 |
6 | * **deps:** Update dependencies ([#63](https://github.com/adrienjoly/npm-pdfreader/issues/63)) ([308f322](https://github.com/adrienjoly/npm-pdfreader/commit/308f322ea670ab2ec11f77e3588667674709b453))
7 |
8 | # [1.2.0](https://github.com/adrienjoly/npm-pdfreader/compare/v1.1.4...v1.2.0) (2020-09-25)
9 |
10 |
11 | ### Features
12 |
13 | * Support password-protected PDF files ([#61](https://github.com/adrienjoly/npm-pdfreader/issues/61)) ([248af89](https://github.com/adrienjoly/npm-pdfreader/commit/248af89d79304dfa64b5785614b496e4e5d36e69)), closes [#15](https://github.com/adrienjoly/npm-pdfreader/issues/15)
14 |
15 | ## [1.1.4](https://github.com/adrienjoly/npm-pdfreader/compare/v1.1.3...v1.1.4) (2020-09-25)
16 |
17 |
18 | ### Bug Fixes
19 |
20 | * Ease contributions ([#62](https://github.com/adrienjoly/npm-pdfreader/issues/62)) ([4a1fe66](https://github.com/adrienjoly/npm-pdfreader/commit/4a1fe6677d5a829049aa0c3c28dccb2f96e8e2f6))
21 |
22 | ## [1.1.3](https://github.com/adrienjoly/npm-pdfreader/compare/v1.1.2...v1.1.3) (2020-04-26)
23 |
24 |
25 | ### Bug Fixes
26 |
27 | * **node:** use latest node 10 version ([#52](https://github.com/adrienjoly/npm-pdfreader/issues/52)) ([eb34ea9](https://github.com/adrienjoly/npm-pdfreader/commit/eb34ea92fea924d3d1e28b13a2e730b62a996b51))
28 |
29 | ## [1.1.2](https://github.com/adrienjoly/npm-pdfreader/compare/v1.1.1...v1.1.2) (2020-04-26)
30 |
31 |
32 | ### Bug Fixes
33 |
34 | * **deps:** with npm audit fix ([#51](https://github.com/adrienjoly/npm-pdfreader/issues/51)) ([16502fc](https://github.com/adrienjoly/npm-pdfreader/commit/16502fce29af76ebf8216e17aafb388a54326b6c))
35 |
36 | ## [1.1.1](https://github.com/adrienjoly/npm-pdfreader/compare/v1.1.0...v1.1.1) (2020-04-26)
37 |
38 |
39 | ### Bug Fixes
40 |
41 | * **deps:** bump acorn from 6.3.0 to 6.4.1 ([#46](https://github.com/adrienjoly/npm-pdfreader/issues/46)) ([af61802](https://github.com/adrienjoly/npm-pdfreader/commit/af61802d1430adab8c9c56588d8a5b565910bd3a))
42 |
43 | # [1.1.0](https://github.com/adrienjoly/npm-pdfreader/compare/v1.0.10...v1.1.0) (2020-04-26)
44 |
45 |
46 | ### Features
47 |
48 | * **deps:** upgrade pdf2json to version 1.2.0 ([#50](https://github.com/adrienjoly/npm-pdfreader/issues/50)) ([0877162](https://github.com/adrienjoly/npm-pdfreader/commit/08771623aa7bf228b4a39e763e38614e79dca10c)), closes [#40](https://github.com/adrienjoly/npm-pdfreader/issues/40)
49 |
50 | ## [1.0.10](https://github.com/adrienjoly/npm-pdfreader/compare/v1.0.9...v1.0.10) (2020-04-26)
51 |
52 |
53 | ### Bug Fixes
54 |
55 | * **ci:** check formatting in separate step, after tests ([#49](https://github.com/adrienjoly/npm-pdfreader/issues/49)) ([9129b8a](https://github.com/adrienjoly/npm-pdfreader/commit/9129b8a4f860fbc674fd7485c7c0661c0344a71d))
56 |
57 | ## [1.0.9](https://github.com/adrienjoly/npm-pdfreader/compare/v1.0.8...v1.0.9) (2020-04-26)
58 |
59 |
60 | ### Bug Fixes
61 |
62 | * **ci:** prettier to ignore CHANGELOG (generated) ([8bcf776](https://github.com/adrienjoly/npm-pdfreader/commit/8bcf77674a6e472c791accca4d8385e8462679b6))
63 | * **ci:** skip github actions workflow on release commits ([c970cda](https://github.com/adrienjoly/npm-pdfreader/commit/c970cda451a3a3b53c9d42c721524b22a7714544))
64 |
65 | ## [1.0.8](https://github.com/adrienjoly/npm-pdfreader/compare/v1.0.7...v1.0.8) (2020-04-26)
66 |
67 |
68 | ### Bug Fixes
69 |
70 | * **release:** automatic update of version in package.json ([#48](https://github.com/adrienjoly/npm-pdfreader/issues/48)) ([bad1d5b](https://github.com/adrienjoly/npm-pdfreader/commit/bad1d5bfce1c55b503cca3380c3187fb071b6056))
71 |
--------------------------------------------------------------------------------
/test/sample.rtf:
--------------------------------------------------------------------------------
1 | {\rtf1\ansi\ansicpg1252\cocoartf1344\cocoasubrtf720
2 | {\fonttbl\f0\froman\fcharset0 Times-Roman;}
3 | {\colortbl;\red255\green255\blue255;\red0\green0\blue0;}
4 | \paperw11900\paperh16840\margl1440\margr1440\vieww10800\viewh8400\viewkind0
5 | \deftab720
6 | \pard\pardeftab720\sa321
7 |
8 | \f0\b\fs48 \cf2 \expnd0\expndtw0\kerning0
9 | \outl0\strokewidth0 \strokec2 Hello "world"\
10 | \pard\pardeftab720\sa240
11 |
12 | \b0\fs24 \cf2 \expnd0\expndtw0\kerning0
13 | \outl0\strokewidth0 Value:\
14 | 4\
15 |
16 | \itap1\trowd \taflags1 \trgaph108\trleft-108 \trbrdrt\brdrs\brdrw20\brdrcf2 \trbrdrl\brdrs\brdrw20\brdrcf2 \trbrdrr\brdrs\brdrw20\brdrcf2
17 | \clvertalc \clshdrawnil \clwWidth480\clftsWidth3 \clbrdrt\brdrnil \clbrdrl\brdrnil \clbrdrb\brdrnil \clbrdrr\brdrnil \clpadt20 \clpadl20 \clpadb20 \clpadr20 \gaph\cellx2880
18 | \clvertalc \clshdrawnil \clwWidth860\clftsWidth3 \clbrdrt\brdrnil \clbrdrl\brdrnil \clbrdrb\brdrnil \clbrdrr\brdrnil \clpadt20 \clpadl20 \clpadb20 \clpadr20 \gaph\cellx5760
19 | \clvertalc \clshdrawnil \clwWidth940\clftsWidth3 \clbrdrt\brdrnil \clbrdrl\brdrnil \clbrdrb\brdrnil \clbrdrr\brdrnil \clpadt20 \clpadl20 \clpadb20 \clpadr20 \gaph\cellx8640
20 | \pard\intbl\itap1\pardeftab720\qc
21 |
22 | \b \cf2 \expnd0\expndtw0\kerning0
23 | \outl0\strokewidth0 c1\cell
24 | \pard\intbl\itap1\pardeftab720\qc
25 | \cf2 \expnd0\expndtw0\kerning0
26 | \outl0\strokewidth0 c2\cell
27 | \pard\intbl\itap1\pardeftab720\qc
28 | \cf2 \expnd0\expndtw0\kerning0
29 | \outl0\strokewidth0 c3\cell \row
30 |
31 | \itap1\trowd \taflags1 \trgaph108\trleft-108 \trbrdrl\brdrs\brdrw20\brdrcf2 \trbrdrr\brdrs\brdrw20\brdrcf2
32 | \clvertalc \clshdrawnil \clwWidth120\clftsWidth3 \clbrdrt\brdrs\brdrw20\brdrcf2 \clbrdrl\brdrs\brdrw20\brdrcf2 \clbrdrb\brdrs\brdrw20\brdrcf2 \clbrdrr\brdrs\brdrw20\brdrcf2 \clpadl200 \clpadr200 \gaph\cellx2880
33 | \clvertalc \clshdrawnil \clwWidth480\clftsWidth3 \clbrdrt\brdrs\brdrw20\brdrcf2 \clbrdrl\brdrs\brdrw20\brdrcf2 \clbrdrb\brdrs\brdrw20\brdrcf2 \clbrdrr\brdrs\brdrw20\brdrcf2 \clpadl200 \clpadr200 \gaph\cellx5760
34 | \clvertalc \clshdrawnil \clwWidth560\clftsWidth3 \clbrdrt\brdrs\brdrw20\brdrcf2 \clbrdrl\brdrs\brdrw20\brdrcf2 \clbrdrb\brdrs\brdrw20\brdrcf2 \clbrdrr\brdrs\brdrw20\brdrcf2 \clpadl200 \clpadr200 \gaph\cellx8640
35 | \pard\intbl\itap1\pardeftab720\qr
36 |
37 | \b0 \cf2 \expnd0\expndtw0\kerning0
38 | \outl0\strokewidth0 1\cell
39 | \pard\intbl\itap1\pardeftab720\qr
40 | \cf2 \expnd0\expndtw0\kerning0
41 | \outl0\strokewidth0 \cell
42 | \pard\intbl\itap1\pardeftab720\qr
43 | \cf2 \expnd0\expndtw0\kerning0
44 | \outl0\strokewidth0 2.3\cell \row
45 |
46 | \itap1\trowd \taflags1 \trgaph108\trleft-108 \trbrdrl\brdrs\brdrw20\brdrcf2 \trbrdrb\brdrs\brdrw20\brdrcf2 \trbrdrr\brdrs\brdrw20\brdrcf2
47 | \clvertalc \clshdrawnil \clwWidth120\clftsWidth3 \clbrdrt\brdrs\brdrw20\brdrcf2 \clbrdrl\brdrs\brdrw20\brdrcf2 \clbrdrb\brdrs\brdrw20\brdrcf2 \clbrdrr\brdrs\brdrw20\brdrcf2 \clpadl200 \clpadr200 \gaph\cellx2880
48 | \clvertalc \clshdrawnil \clwWidth480\clftsWidth3 \clbrdrt\brdrs\brdrw20\brdrcf2 \clbrdrl\brdrs\brdrw20\brdrcf2 \clbrdrb\brdrs\brdrw20\brdrcf2 \clbrdrr\brdrs\brdrw20\brdrcf2 \clpadl200 \clpadr200 \gaph\cellx5760
49 | \clvertalc \clshdrawnil \clwWidth560\clftsWidth3 \clbrdrt\brdrs\brdrw20\brdrcf2 \clbrdrl\brdrs\brdrw20\brdrcf2 \clbrdrb\brdrs\brdrw20\brdrcf2 \clbrdrr\brdrs\brdrw20\brdrcf2 \clpadl200 \clpadr200 \gaph\cellx8640
50 | \pard\intbl\itap1\pardeftab720\qr
51 | \cf2 \expnd0\expndtw0\kerning0
52 | \outl0\strokewidth0 \cell
53 | \pard\intbl\itap1\pardeftab720\qr
54 | \cf2 \expnd0\expndtw0\kerning0
55 | \outl0\strokewidth0 hello\cell
56 | \pard\intbl\itap1\pardeftab720\qr
57 | \cf2 \expnd0\expndtw0\kerning0
58 | \outl0\strokewidth0 world\cell \lastrow\row
59 | \pard\pardeftab720\sa240
60 | \cf2 \expnd0\expndtw0\kerning0
61 | \outl0\strokewidth0 Values:\
62 | 1\
63 | 2\
64 | 3\
65 | }
--------------------------------------------------------------------------------
/test/test.js:
--------------------------------------------------------------------------------
1 | import assert from "assert";
2 | import test from "ava";
3 | import { toggle } from "../lib/LOG.js";
4 | import * as lib from "../index.js";
5 |
6 | toggle(false); // passes false to the toggle() export of lib/LOG.js — presumably silences the library's logs during tests; confirm in LOG.js
7 |
8 | const PdfReader = lib.PdfReader;
9 | const Rule = lib.Rule;
10 |
11 | const TESTFILE = "./test/sample.pdf"; // fixture parsed by the raw-items and rule-based tests
12 | const TESTFILE_WITH_PASSWORD = "./test/sample-with-password.pdf"; // fixture opened with the "password" option
13 |
14 | test("parse raw items from pdf file", async (t) => { // collects every item emitted for TESTFILE and compares the list with the stored snapshot
15 | const res = new Promise((resolve, reject) => { // adapts the callback-based parseFileItems() to a promise
16 | const items = [];
17 | new PdfReader().parseFileItems(TESTFILE, (err, item) => {
18 | if (err) reject(err);
19 | else if (!item) resolve(items); // a falsy item resolves the promise with everything collected so far
20 | else items.push(item);
21 | });
22 | });
23 | t.snapshot(await res);
24 | });
25 |
26 | test("parse structured content from pdf file, using rules", async (t) => { // runs four rules on TESTFILE and snapshots the values they push into `content`
27 | const res = new Promise((resolve, reject) => {
28 | const content = []; // filled by the then() handlers below, in the order the rules terminate
29 | const rules = [
30 | Rule.on(/^Hello \"(.*)\"$/)
31 | .extractRegexpValues()
32 | .then((value) => content.push({ extractRegexpValues: value })),
33 | Rule.on(/^Value\:/)
34 | .parseNextItemValue()
35 | .then((value) => content.push({ parseNextItemValue: value })),
36 | Rule.on(/^c1$/)
37 | .parseTable(3) // 3 is received by parseTable as its nbRows parameter
38 | .then((table) =>
39 | content.push({
40 | "parseTable.renderMatrix": lib.parseTable.renderMatrix(
41 | table.matrix
42 | ),
43 | "parseTable.renderItems": lib.parseTable.renderItems(table.items),
44 | })
45 | ),
46 | Rule.on(/^Values\:/)
47 | .accumulateAfterHeading()
48 | .then((value) => content.push({ accumulateAfterHeading: value })),
49 | ];
50 | const processItem = Rule.makeItemProcessor(rules);
51 | new PdfReader().parseFileItems(TESTFILE, (err, item) => {
52 | if (err) reject(err);
53 | else {
54 | processItem(item); // also called with the final falsy item, which makes the processor terminate the current accumulator
55 | if (!item) resolve(content);
56 | }
57 | });
58 | });
59 | t.snapshot(await res);
60 | });
61 |
62 | test("parse Table from PDF file, using TableParser", async (t) => { // feeds the text items of sample-table.pdf to a TableParser and snapshots the resulting matrix
63 | const matrix = await new Promise((resolve, reject) => {
64 | // the thresholds were determined manually, based on the horizontal position (x) for column headers
65 | const colThresholds = [6.8, 9.5, 13.3, 16.7, 18.4, 28, 32, 36, Infinity];
66 |
67 | const columnQuantitizer = (item) => { // returns the index of the first threshold that is greater than the item's x
68 | const col = colThresholds.findIndex(
69 | (colThreshold) => parseFloat(item.x) < colThreshold
70 | );
71 | assert(col >= 0, col); // findIndex() returns -1 when no threshold matches
72 | assert(col < colThresholds.length, col);
73 | // console.log(`COL ${col}\t${parseFloat(item.x)}\t${item.text}`);
74 | return col;
75 | };
76 |
77 | const table = new lib.TableParser();
78 | new PdfReader().parseFileItems("./test/sample-table.pdf", (err, item) => {
79 | if (err) reject(err);
80 | else if (!item) {
81 | resolve(table.getCleanMatrix({ collisionSeparator: "" })); // end of file: resolve with the matrix built from the accumulated items
82 | } else if (item.text) {
83 | table.processItem(item, columnQuantitizer(item)); // items without text (file and page metadata) are ignored
84 | }
85 | });
86 | });
87 | // console.table(matrix);
88 | t.snapshot(matrix);
89 | });
90 |
91 | test("support pdf file with password", async (t) => { // passes as long as the protected fixture is parsed to the end without error
92 | const promise = new Promise((resolve, reject) =>
93 | new PdfReader({ password: "password" }).parseFileItems(
94 | TESTFILE_WITH_PASSWORD,
95 | (err, item) => {
96 | if (err) reject(err);
97 | else if (!item) resolve(); // the items themselves are not inspected, only the end of parsing
98 | }
99 | )
100 | );
101 | await t.notThrowsAsync(promise);
102 | });
103 |
104 | test("sample scripts should print raw items from pdf file", async (t) => { // runs the test:samples npm script and snapshots what it prints
105 | const { execa } = await import("execa"); // execa is loaded with a dynamic import, only when this test runs
106 | const { stdout, stderr } = await execa("npm run test:samples", {
107 | shell: true, // needed in order to run npm commands with execa
108 | });
109 | t.snapshot({ stdout, stderr });
110 | });
111 |
--------------------------------------------------------------------------------
/lib/parseTable.js:
--------------------------------------------------------------------------------
1 | /**
2 | * parseTable accumulator, for pdfreader, used by the Rule class.
3 | * items are classified into columns and rows, based on their left and top coordinates,
4 | * and left position of column headers.
5 | * TODO: use TableParser
6 | * @author Adrien Joly, http://github.com/adrienjoly
7 | * This content is released under the MIT License.
8 | **/
9 |
10 | function getTopPos(item) { // top position of an item: its y property
11 | return item.y;
12 | }
13 |
14 | function getLeftPos(item) { // left position of an item: its x property
15 | return item.x;
16 | }
17 |
18 | function getText(item) { // text of an item: its text property
19 | return item.text;
20 | }
21 |
22 | /**
23 | * makeClassifier(): makes a classifier, based on an array of numbers and an expected number of clusters.
24 | * nbClusters: expected number of clusters
25 | * arr: array of numbers
26 | * => returns a function that takes a number, and returns the number of its corresponding column.
27 | **/
28 | function makeFloorClassifier(nbClusters, arr) {
29 | var min = Math.min.apply(Math, arr);
30 | var delta = Math.max.apply(Math, arr) - min;
31 | min -= delta / nbClusters / 2;
32 | return function classify(value) {
33 | return Math.floor((nbClusters * (value - min)) / delta);
34 | };
35 | }
36 |
37 | function makeColumnClassifier(header) { // header: array of items of the header row; returns a function that maps an item to a column index
38 | var colX = [0].concat(header.map(getLeftPos)).sort(function (a, b) { // column boundaries: 0, then the left position of each header item, in ascending order
39 | return a - b;
40 | });
41 | return function classify(item) {
42 | for (var i = colX.length - 1; i > -1; --i) // scan the boundaries from the last one to the first one
43 | if (getLeftPos(item) >= colX[i]) return i; // index of the last boundary that is not greater than the item's left position; undefined if there is none
44 | };
45 | }
46 |
47 | function buildRowList(items, classifyRow) { // groups items into an array of rows, indexed by classifyRow(top position of the item)
48 | var rows = [];
49 | for (var i in items) {
50 | var item = items[i];
51 | var row = classifyRow(getTopPos(item));
52 | (rows[row] = rows[row] || []).push(item); // the row is created on first use; indices that receive no item are left unset
53 | }
54 | return rows;
55 | }
56 |
57 | function joinCellCollisions(separ) { // returns a function that renders a cell (array of items, or a falsy value) as text
58 | return function (cell) {
59 | return (cell || []).map(getText).join(separ).substr(0, 7); // item texts joined with separ, then truncated to 7 characters
60 | };
61 | }
62 |
63 | function fillTab(str) { // truncates a string to its first 7 characters
64 | return str.substr(0, 7);
65 | }
66 |
67 | export function renderTable(table) { // renders an array of rows of strings as text; a falsy table or row is rendered as empty
68 | return (table || [])
69 | .map(function (row) {
70 | return (row || []).map(fillTab).join("\t"); // cells are truncated by fillTab and separated by tabs
71 | })
72 | .join("\n"); // one line per row
73 | }
74 |
75 | export function renderMatrix(matrix) { // renders a matrix (rows of cells, each cell being an array of items) as text; a falsy matrix or row is rendered as empty
76 | return (matrix || [])
77 | .map(function (row) {
78 | return (row || []).map(joinCellCollisions("+")).join("\t"); // items sharing a cell are joined with "+"; cells are separated by tabs
79 | })
80 | .join("\n"); // one line per row
81 | }
82 |
83 | export function renderRows(rows) { // renders rows of items as text, one line per row; a falsy rows value is rendered as empty
84 | return (rows || [])
85 | .map(function (row, rowId) {
86 | var cells = [rowId + ":"]; // each line starts with the row index
87 | for (var i in row)
88 | cells.push((Math.floor(row[i].x) + ":" + row[i].text).substr(0, 7)); // "<floored x>:<text>", truncated to 7 characters
89 | return cells.join("\t");
90 | })
91 | .join("\n");
92 | }
93 |
94 | export function renderItems(items) {
95 | return items
96 | .map(function (i) {
97 | return [i.y, i.x, i.text].join("\t");
98 | })
99 | .join("\n");
100 | }
101 |
102 | function buildMatrix(rows, classifyColumn) { // turns rows of items into a matrix: for each row, an array of cells indexed by column, each cell being an array of items
103 | var matrix = [];
104 | for (var y in rows) {
105 | var row = [];
106 | for (var x in rows[y]) {
107 | var item = rows[y][x];
108 | var colN = classifyColumn(item);
109 | (row[colN] = row[colN] || []).push(item); // items classified into the same column share a cell
110 | }
111 | matrix.push(row); // rows are appended in iteration order, so matrix indices can differ from the indices of `rows`
112 | }
113 | return matrix;
114 | }
115 |
116 | export function detectCollisions(matrix) {
117 | var collisions = [];
118 | (matrix || []).map(function (row, rowN) {
119 | (row || []).map(function (cellItems, colN) {
120 | if (cellItems.length > 1)
121 | collisions.push({
122 | row: rowN,
123 | col: colN,
124 | items: cellItems,
125 | });
126 | });
127 | });
128 | return collisions;
129 | }
130 |
131 | export const parseTable = function makeAccumulator(nbRows, headerRow) { // accumulator builder, called with the Rule as `this`; nbRows: expected number of rows; headerRow: index of the row used to detect columns (row 0 if falsy)
132 | var rule = this,
133 | items = [];
134 |
135 | rule.nbRows = nbRows || 0; // NOTE(review): with 0 clusters the row classifier divides by zero — confirm that callers always pass nbRows
136 | rule.output = {
137 | items: items, // same array as the one filled by accumulate()
138 | rows: null, // set when the rule terminates
139 | matrix: null, // set when the rule terminates
140 | };
141 |
142 | function accumulate(item) {
143 | items.push(item);
144 | }
145 |
146 | // when parsing is done: generate a clean table, from items.
147 | rule.whenDone(function () {
148 | // classify items into rows
149 | var classifyRow = makeFloorClassifier(rule.nbRows, items.map(getTopPos));
150 | //LOG(items.map(function(i){ return [getTopPos(i), classifyRow(getTopPos(i)), i.text].join("\t"); }).join("\n"));
151 | this.output.rows = buildRowList(items, classifyRow);
152 | // classify row items into columns
153 | var classifyColumn = makeColumnClassifier(this.output.rows[headerRow || 0]); // NOTE(review): throws if no item was classified into that row, because makeColumnClassifier() calls header.map()
154 | this.output.matrix = buildMatrix(this.output.rows, classifyColumn);
155 | });
156 |
157 | return accumulate; // then the same function will be run on all following items, until another rule is triggered
158 | };
159 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # pdfreader  [](https://app.codacy.com/gh/adrienjoly/npm-pdfreader?utm_source=github.com&utm_medium=referral&utm_content=adrienjoly/npm-pdfreader&utm_campaign=Badge_Grade)
2 |
3 | Read text and parse tables from PDF files.
4 |
5 | Supports **tabular data** with automatic column detection, and **rule-based parsing**.
6 |
7 | Dependencies: it is based on [pdf2json](https://www.npmjs.com/package/pdf2json), which itself relies on Mozilla's [pdf.js](https://github.com/mozilla/pdf.js/).
8 |
9 | 🆕 Now includes TypeScript type definitions!
10 |
11 | ℹ️ Important notes:
12 |
13 | - This module is meant to be run using Node.js only. **It does not work from a web browser.**
14 | - This module extracts text entries from PDF files. It does not support photographed text. If you cannot select text from the PDF file, **you may need to use OCR software first**.
15 |
16 | Summary:
17 |
18 | - [Installation, tests and CLI usage](#installation-tests-and-cli-usage)
19 | - [Raw PDF reading](#raw-pdf-reading) (incl. examples)
20 | - [Rule-based data extraction](#rule-based-data-extraction)
21 | - [Troubleshooting & FAQ](#troubleshooting--faq)
22 |
23 | ## Installation, tests and CLI usage
24 |
25 | After installing [Node.js](https://nodejs.org/):
26 |
27 | ```sh
28 | git clone https://github.com/adrienjoly/npm-pdfreader.git
29 | cd npm-pdfreader
30 | npm install
31 | npm test
32 | node parse.js test/sample.pdf
33 | ```
34 |
35 | ## Installation into an existing project
36 |
37 | To install `pdfreader` as a dependency of your Node.js project:
38 |
39 | ```sh
40 | npm install pdfreader
41 | ```
42 |
43 | Then, see below for examples of use.
44 |
45 | ## Raw PDF reading
46 |
47 | This module exposes the `PdfReader` class, to be instantiated. You can pass `{ debug: true }` to the constructor, in order to log debugging information. (useful for troubleshooting)
48 |
49 | Your instance has two methods for parsing a PDF. They return the same output and differ only in input: `PdfReader.parseFileItems` (as below) for a filename, and `PdfReader.parseBuffer` (see: "Raw PDF reading from a PDF already in memory (buffer)") from data that you don't want to reference from the filesystem.
50 |
51 | Whichever method you choose, it asks for a callback, which gets called each time the instance finds what it denotes as a PDF item.
52 |
53 | An item object can match one of the following objects:
54 |
55 | - `null`, when the parsing is over, or an error occurred.
56 | - File metadata, `{file:{path:string}}`, when a PDF file is being opened, and is always the first item.
57 | - Page metadata, `{page:integer, width:float, height:float}`, when a new page is being parsed, provides the page number, starting at 1. This basically acts as a carriage return for the coordinates of text items to be processed.
58 | - Text items, `{text:string, x:float, y:float, w:float, ...}`, which you can think of as simple objects with a text property, and floating 2D AABB coordinates on the page.
59 |
60 | It's up to your callback to process these items into a data structure of your choice, and also to handle any errors thrown to it.
61 |
62 | For example:
63 |
64 | ```javascript
65 | import { PdfReader } from "pdfreader";
66 |
67 | new PdfReader().parseFileItems("test/sample.pdf", (err, item) => {
68 | if (err) console.error("error:", err);
69 | else if (!item) console.warn("end of file");
70 | else if (item.text) console.log(item.text);
71 | });
72 | ```
73 |
74 | ### Parsing a password-protected PDF file
75 |
76 | ```javascript
77 | new PdfReader({ password: "YOUR_PASSWORD" }).parseFileItems(
78 | "test/sample-with-password.pdf",
79 | function (err, item) {
80 | if (err) console.error(err);
81 | else if (!item) console.warn("end of file");
82 | else if (item.text) console.log(item.text);
83 | }
84 | );
85 | ```
86 |
87 | ### Raw PDF reading from a PDF buffer
88 |
89 | As above, but reading from a buffer in memory rather than from a file referenced by path. For example:
90 |
91 | ```javascript
92 | import fs from "fs";
93 | import { PdfReader } from "pdfreader";
94 |
95 | fs.readFile("test/sample.pdf", (err, pdfBuffer) => {
96 | // pdfBuffer contains the file content
97 | new PdfReader().parseBuffer(pdfBuffer, (err, item) => {
98 | if (err) console.error("error:", err);
99 | else if (!item) console.warn("end of buffer");
100 | else if (item.text) console.log(item.text);
101 | });
102 | });
103 | ```
104 |
105 | ### Other examples of use
106 |
107 | 
108 |
109 | 
110 |
111 | Source code of the examples above: [parsing a CV/résumé](https://github.com/adrienjoly/npm-pdfreader-example).
112 |
113 | For more, see [Examples of use](https://github.com/adrienjoly/npm-pdfreader/discussions/categories/examples-of-use).
114 |
115 | ## Rule-based data extraction
116 |
117 | The `Rule` class can be used to define and process data extraction rules, while parsing a PDF document.
118 |
119 | `Rule` instances expose "accumulators": methods that define the data extraction strategy to be used for each rule.
120 |
121 | Example:
122 |
123 | ```javascript
124 | const processItem = Rule.makeItemProcessor([
125 | Rule.on(/^Hello \"(.*)\"$/)
126 | .extractRegexpValues()
127 | .then(displayValue),
128 | Rule.on(/^Value\:/)
129 | .parseNextItemValue()
130 | .then(displayValue),
131 | Rule.on(/^c1$/).parseTable(3).then(displayTable),
132 | Rule.on(/^Values\:/)
133 | .accumulateAfterHeading()
134 | .then(displayValue),
135 | ]);
136 | new PdfReader().parseFileItems("test/sample.pdf", (err, item) => {
137 | if (err) console.error(err);
138 | else processItem(item);
139 | });
140 | ```
141 |
142 | ## Troubleshooting & FAQ
143 |
144 | ### Is it possible to parse a PDF document from a web application?
145 |
146 | Solutions exist, but this module cannot be run directly by a web browser. If you really want to use this module, you will have to integrate it into your back-end so that PDF files can be read from your server.
147 |
148 | ### `Cannot read property 'userAgent' of undefined` error from an express-based node.js app
149 |
150 | Dmitry found out that you may need to run these instructions before including the `pdfreader` module:
151 |
152 | ```js
153 | global.navigator = {
154 | userAgent: "node",
155 | };
156 |
157 | window.navigator = {
158 | userAgent: "node",
159 | };
160 | ```
161 |
162 | Source: [express - TypeError: Cannot read property 'userAgent' of undefined error on node.js app run - Stack Overflow](https://stackoverflow.com/questions/49208414/typeerror-cannot-read-property-useragent-of-undefined-error-on-node-js-app-ru)
163 |
--------------------------------------------------------------------------------
/Rule.js:
--------------------------------------------------------------------------------
1 | /**
2 | * Rule: class that can be used to define and process data extraction rules, while parsing a PDF document.
3 | * @author Adrien Joly, http://github.com/adrienjoly
4 | * This content is released under the MIT License.
5 | **/
6 |
7 | import { log as LOG } from "./lib/LOG.js";
8 | import { parseColumns } from "./lib/parseColumns.js";
9 | import { parseTable } from "./lib/parseTable.js";
10 |
11 | /**
12 | * regexp: a regular expression which a PDF item's text must match in order to execute that rule.
13 | * => a Rule object exposes "accumulators": one method per entry of Rule.accumulators, that selects the data extraction strategy of the rule.
14 | **/
15 | export function Rule(regexp) {
16 | this.regexp = regexp;
17 | var self = this;
18 | // proxy accumulators methods
19 | Object.keys(Rule.accumulators).forEach(function (name) { // only the accumulators registered at construction time are proxied
20 | self[name] = function () {
21 | LOG("building rule:", regexp, "->", name);
22 | self.methodName = name;
23 | self.accumulatorParams = arguments; // kept as-is, then passed to the builder by Rule.prototype.test()
24 | self.accumulatorBuilder = Rule.accumulators[name]; // the builder is only stored here, not called
25 | return self; // allows chaining, e.g. with then()
26 | };
27 | });
28 | }
29 |
30 | // shortcut for defining Rule objects in a more concise manner: Rule.on(regexp) is equivalent to new Rule(regexp)
31 | Rule.on = function (regexp) {
32 | return new Rule(regexp);
33 | };
34 |
35 | Rule.after = function (regexp) { // like Rule.on(), but the returned rule has its skipCurrentItem flag set
36 | var rule = new Rule(regexp);
37 | rule.skipCurrentItem = true; // read by the item processor: rules are then not applied to the item that follows the matching one
38 | return rule;
39 | };
40 |
41 | /**
42 | * then(): defines a function to be called after a Rule's accumulator has finished processing items.
43 | * fct: the function to be called after a Rule's accumulator has finished processing items.
44 | * it is called with the Rule as `this`, and the output of the accumulator as first parameter. => returns the Rule.
45 | **/
46 | Rule.prototype.then = function (fct) {
47 | var self = this;
48 | this.terminate = function () { // replaces any termination handler set before
49 | fct.call(self, self.output); // self.output is read at termination time
50 | };
51 | return this;
52 | };
53 |
54 | // private function that checks a PDF item against the Rule's regexp, and returns the corresponding accumulator (undefined if the item's text does not match).
55 | Rule.prototype.test = function (item) {
56 | if (this.regexp.test(item.text)) {
57 | // lazy init of accumulators: build and init the accumulator on first match
58 | this.currentItem = item; // set before building, so that the builder can read the matching item
59 | if (!this.accumulatorImpl && this.accumulatorBuilder) {
60 | this.accumulatorImpl = this.accumulatorBuilder.apply(
61 | this,
62 | this.accumulatorParams
63 | );
64 | this.accumulatorImpl.methodName = this.methodName;
65 | this.accumulatorImpl.terminate = this.terminate; // copied after building, so a handler wrapped by whenDone() during the build is the one kept
66 | }
67 | return this.accumulatorImpl;
68 | }
69 | };
70 |
71 | // intended to be run from accumulator, in order to process output before calling termination then() handler.
72 | Rule.prototype.whenDone = function (fct) {
73 | var self = this;
74 | var then = this.terminate;
75 | this.terminate = function () {
76 | fct.call(self);
77 | then();
78 | };
79 | };
80 |
81 | /**
82 | * rules: array of Rule objects that will be executed one-by-one, whenever a PDF item matches a rule.
83 | * each rule can only be executed once: a rule is deleted from the `rules` array (modified in place) as soon as it matches.
84 | * => returns a function to be called for each item by the PdfReader, including the final falsy item.
85 | **/
86 | Rule.makeItemProcessor = function (rules) {
87 | var currentAccumulator = null; // accumulator of the last rule that matched
88 | function terminateAccumulator() {
89 | var terminatePreviousAcc = (currentAccumulator || {}).terminate;
90 | if (terminatePreviousAcc) {
91 | LOG("terminating accumulator:", currentAccumulator.methodName);
92 | terminatePreviousAcc(currentAccumulator); // TODO: remove currentAccumulator parameter
93 | }
94 | }
95 | var applyRulesOnNextItem = true;
96 | return function (item) {
97 | if (!item)
98 | // last item of the file => flush buffers
99 | return terminateAccumulator();
100 | else if (!item.text) return; // items without text are ignored
101 | //LOG("ITEM:", item.text, "=> apply rules:", applyRulesOnNextItem);
102 | if (applyRulesOnNextItem)
103 | for (var r in rules) {
104 | var accumulator = rules[r].test(item);
105 | if (accumulator) {
106 | terminateAccumulator(); // the previous rule's handler runs before switching accumulators
107 | LOG("current accumulator:", accumulator.methodName);
108 | if (rules[r].skipCurrentItem) applyRulesOnNextItem = false;
109 | currentAccumulator = accumulator;
110 | delete rules[r]; // the entry is removed, so this rule is not tested against later items
111 | return; // the matching item itself is not passed to the accumulator
112 | }
113 | }
114 | else applyRulesOnNextItem = true; // rules were skipped for this item only
115 | // if reaching this point, the current item matches none of the rules => accumulating data on current accumulator
116 | if (currentAccumulator) applyRulesOnNextItem = !currentAccumulator(item); // a truthy return value makes the next item bypass the rules
117 | };
118 | };
119 |
120 | /**
121 | * Rule.accumulators: object that maps the name of each accumulator that can be used for defining Rule objects, to its builder function.
122 | * An accumulator is a function that may (or may not) accept parameters, to be provided by the developer of a parser.
123 | * It returns another function that will be run on every following PDF item, in order to accumulate data.
124 | * The output of an accumulator is stored in this.output (field of its parent Rule object).
125 | **/
126 | Rule.accumulators = {
127 | stopAccumulating: function () {
128 | return function () {}; // ignores every item passed to it
129 | },
130 | };
131 |
132 | // method for adding accumulators: stores methodBuilder in Rule.accumulators under methodName, replacing any entry with the same name
133 | Rule.addAccumulator = function (methodName, methodBuilder) {
134 | Rule.accumulators[methodName] = methodBuilder;
135 | };
136 |
137 | /**
138 | * This accumulator will store the group values extracted by the regexp of the Rule object,
139 | * on the current matching PDF item, into an array (this.output).
140 | *
141 | * E.g. with regex: /hello ([a-z]+)/, the text "hello world" will yield ["world"].
142 | **/
143 | Rule.addAccumulator("extractRegexpValues", function () {
144 | var matches = this.regexp.exec(this.currentItem.text); // currentItem is set by Rule.prototype.test() before this builder is called
145 | this.output = matches.slice(1); // drops the full match, keeps the captured groups only
146 | return function () {}; // following lines are not to be processed by this accumulator
147 | });
148 |
149 | /**
150 | * This accumulator will store the text of the next PDF item it receives (into this.output), and ignore the following ones.
151 | **/
152 | Rule.addAccumulator("parseNextItemValue", function () {
153 | var self = this,
154 | done = false; // becomes true once the first item has been stored
155 | return function (item) {
156 | if (done) return;
157 | done = true;
158 | self.output = item.text;
159 | };
160 | });
161 |
162 | /**
163 | * This accumulator will store the text of all following PDF items into an array (this.output), until another accumulator takes over.
164 | **/
165 | Rule.addAccumulator("accumulateAfterHeading", function () {
166 | var output = (this.output = []); // the array is shared: items pushed below are visible through this.output
167 | return function accumulate(item) {
168 | output.push(item.text);
169 | };
170 | });
171 |
172 | /**
173 | * This accumulator will store the text of the following PDF items whose x-coordinate equals the one of the first item it receives.
174 | **/
175 | Rule.addAccumulator("accumulateFromSameX", function () {
176 | var output = (this.output = []),
177 | x = null; // reference x-coordinate, taken from the first item received
178 | return function accumulate(item) {
179 | if (x === null) x = item.x;
180 | if (x == item.x) output.push(item.text); // loose equality; items with another x are skipped, without stopping the accumulation
181 | };
182 | });
183 |
184 | /**
185 | * This accumulator will store a table by detecting its columns, given their names.
186 | **/
187 | Rule.addAccumulator("parseColumns", parseColumns);
188 |
189 | /**
190 | * This accumulator will store a table: rows are detected given their expected count (nbRows), and columns from the left positions of a header row.
191 | **/
192 | Rule.addAccumulator("parseTable", parseTable);
193 |
--------------------------------------------------------------------------------
/test/snapshots/test.js.md:
--------------------------------------------------------------------------------
1 | # Snapshot report for `test/test.js`
2 |
3 | The actual snapshot is saved in `test.js.snap`.
4 |
5 | Generated by [AVA](https://avajs.dev).
6 |
7 | ## parse raw items from pdf file
8 |
9 | > Snapshot 1
10 |
11 | [
12 | {
13 | file: {
14 | path: './test/sample.pdf',
15 | },
16 | },
17 | {
18 | height: 52.618,
19 | page: 1,
20 | width: 37.205,
21 | },
22 | {
23 | A: 'left',
24 | R: [
25 | {
26 | S: -1,
27 | T: 'Hello%20%22world%22',
28 | TS: [
29 | 0,
30 | 28,
31 | 1,
32 | 0,
33 | ],
34 | },
35 | ],
36 | clr: 0,
37 | sw: 0.32553125,
38 | text: 'Hello "world"',
39 | w: 6.138,
40 | x: 4.555,
41 | y: 5.154,
42 | },
43 | {
44 | A: 'left',
45 | R: [
46 | {
47 | S: -1,
48 | T: 'Value%3A',
49 | TS: [
50 | 0,
51 | 15,
52 | 0,
53 | 0,
54 | ],
55 | },
56 | ],
57 | clr: 0,
58 | sw: NaN,
59 | text: 'Value:',
60 | w: 2.666,
61 | x: 4.555,
62 | y: 7.174,
63 | },
64 | {
65 | A: 'left',
66 | R: [
67 | {
68 | S: -1,
69 | T: '4',
70 | TS: [
71 | 0,
72 | 15,
73 | 0,
74 | 0,
75 | ],
76 | },
77 | ],
78 | clr: 0,
79 | sw: NaN,
80 | text: '4',
81 | w: 0.5,
82 | x: 4.555,
83 | y: 8.761,
84 | },
85 | {
86 | A: 'left',
87 | R: [
88 | {
89 | S: -1,
90 | T: 'c1',
91 | TS: [
92 | 0,
93 | 16,
94 | 1,
95 | 0,
96 | ],
97 | },
98 | ],
99 | clr: 0,
100 | sw: 0.32553125,
101 | text: 'c1',
102 | w: 0.944,
103 | x: 5.095,
104 | y: 10.501,
105 | },
106 | {
107 | A: 'left',
108 | R: [
109 | {
110 | S: -1,
111 | T: 'c2',
112 | TS: [
113 | 0,
114 | 16,
115 | 1,
116 | 0,
117 | ],
118 | },
119 | ],
120 | clr: 0,
121 | sw: 0.32553125,
122 | text: 'c2',
123 | w: 0.944,
124 | x: 7.262,
125 | y: 10.501,
126 | },
127 | {
128 | A: 'left',
129 | R: [
130 | {
131 | S: -1,
132 | T: 'c3',
133 | TS: [
134 | 0,
135 | 16,
136 | 1,
137 | 0,
138 | ],
139 | },
140 | ],
141 | clr: 0,
142 | sw: 0.32553125,
143 | text: 'c3',
144 | w: 0.944,
145 | x: 10.131,
146 | y: 10.501,
147 | },
148 | {
149 | A: 'left',
150 | R: [
151 | {
152 | S: -1,
153 | T: '1',
154 | TS: [
155 | 0,
156 | 15,
157 | 0,
158 | 0,
159 | ],
160 | },
161 | ],
162 | clr: 0,
163 | sw: NaN,
164 | text: '1',
165 | w: 0.5,
166 | x: 5.288,
167 | y: 11.447,
168 | },
169 | {
170 | A: 'left',
171 | R: [
172 | {
173 | S: -1,
174 | T: '2.3',
175 | TS: [
176 | 0,
177 | 15,
178 | 0,
179 | 0,
180 | ],
181 | },
182 | ],
183 | clr: 0,
184 | sw: NaN,
185 | text: '2.3',
186 | w: 1.25,
187 | x: 10.477,
188 | y: 11.447,
189 | },
190 | {
191 | A: 'left',
192 | R: [
193 | {
194 | S: -1,
195 | T: 'hello',
196 | TS: [
197 | 0,
198 | 15,
199 | 0,
200 | 0,
201 | ],
202 | },
203 | ],
204 | clr: 0,
205 | sw: NaN,
206 | text: 'hello',
207 | w: 2,
208 | x: 6.937,
209 | y: 12.363,
210 | },
211 | {
212 | A: 'left',
213 | R: [
214 | {
215 | S: -1,
216 | T: 'world',
217 | TS: [
218 | 0,
219 | 15,
220 | 0,
221 | 0,
222 | ],
223 | },
224 | ],
225 | clr: 0,
226 | sw: NaN,
227 | text: 'world',
228 | w: 2.333,
229 | x: 9.684,
230 | y: 12.363,
231 | },
232 | {
233 | A: 'left',
234 | R: [
235 | {
236 | S: -1,
237 | T: 'Values%3A',
238 | TS: [
239 | 0,
240 | 15,
241 | 0,
242 | 0,
243 | ],
244 | },
245 | ],
246 | clr: 0,
247 | sw: NaN,
248 | text: 'Values:',
249 | w: 3.055,
250 | x: 4.555,
251 | y: 13.248,
252 | },
253 | {
254 | A: 'left',
255 | R: [
256 | {
257 | S: -1,
258 | T: '1',
259 | TS: [
260 | 0,
261 | 15,
262 | 0,
263 | 0,
264 | ],
265 | },
266 | ],
267 | clr: 0,
268 | sw: NaN,
269 | text: '1',
270 | w: 0.5,
271 | x: 4.555,
272 | y: 14.835,
273 | },
274 | {
275 | A: 'left',
276 | R: [
277 | {
278 | S: -1,
279 | T: '2',
280 | TS: [
281 | 0,
282 | 15,
283 | 0,
284 | 0,
285 | ],
286 | },
287 | ],
288 | clr: 0,
289 | sw: NaN,
290 | text: '2',
291 | w: 0.5,
292 | x: 4.555,
293 | y: 16.423,
294 | },
295 | {
296 | A: 'left',
297 | R: [
298 | {
299 | S: -1,
300 | T: '3',
301 | TS: [
302 | 0,
303 | 15,
304 | 0,
305 | 0,
306 | ],
307 | },
308 | ],
309 | clr: 0,
310 | sw: NaN,
311 | text: '3',
312 | w: 0.5,
313 | x: 4.555,
314 | y: 18.01,
315 | },
316 | ]
317 |
318 | ## parse structured content from pdf file, using rules
319 |
320 | > Snapshot 1
321 |
322 | [
323 | {
324 | extractRegexpValues: [
325 | 'world',
326 | ],
327 | },
328 | {
329 | parseNextItemValue: '4',
330 | },
331 | {
332 | 'parseTable.renderItems': `10.501 7.262 c2␊
333 | 10.501 10.131 c3␊
334 | 11.447 5.288 1␊
335 | 11.447 10.477 2.3␊
336 | 12.363 6.937 hello␊
337 | 12.363 9.684 world`,
338 | 'parseTable.renderMatrix': ` c2 c3␊
339 | 1 2.3␊
340 | hello world`,
341 | },
342 | {
343 | accumulateAfterHeading: [
344 | '1',
345 | '2',
346 | '3',
347 | ],
348 | },
349 | ]
350 |
351 | ## parse Table from PDF file, using TableParser
352 |
353 | > Snapshot 1
354 |
355 | [
356 | [
357 | 'Version',
358 | 'LTS',
359 | 'Date',
360 | 'V8',
361 | 'npm',
362 | 'NODE_MODULE_VERSION [1]',
363 | ],
364 | [
365 | 'Node.js 17.1.0',
366 | undefined,
367 | '2021-11-09',
368 | '9.5.172.25',
369 | '8.1.2',
370 | '102',
371 | 'Downloads',
372 | ' Changelog ',
373 | 'Docs',
374 | ],
375 | [
376 | 'Node.js 17.0.1',
377 | undefined,
378 | '2021-10-20',
379 | '9.5.172.21',
380 | '8.1.0',
381 | '102',
382 | 'Downloads',
383 | ' Changelog ',
384 | 'Docs',
385 | ],
386 | [
387 | 'Node.js 17.0.0',
388 | undefined,
389 | '2021-10-19',
390 | '9.5.172.21',
391 | '8.1.0',
392 | '102',
393 | 'Downloads',
394 | ' Changelog ',
395 | 'Docs',
396 | ],
397 | [
398 | 'Node.js 16.14.2',
399 | 'Gallium',
400 | '2022-03-17',
401 | '9.4.146.24',
402 | '8.5.0',
403 | '93',
404 | 'Downloads',
405 | ' Changelog ',
406 | 'Docs',
407 | ],
408 | [
409 | 'Node.js 16.14.1',
410 | 'Gallium',
411 | '2022-03-16',
412 | '9.4.146.24',
413 | '8.5.0',
414 | '93',
415 | 'Downloads',
416 | ' Changelog ',
417 | 'Docs',
418 | ],
419 | [
420 | 'Node.js 16.14.0',
421 | 'Gallium',
422 | '2022-02-08',
423 | '9.4.146.24',
424 | '8.3.1',
425 | '93',
426 | 'Downloads',
427 | ' Changelog ',
428 | 'Docs',
429 | ],
430 | [
431 | 'Node.js 16.13.2',
432 | 'Gallium',
433 | '2022-01-10',
434 | '9.4.146.24',
435 | '8.1.2',
436 | '93',
437 | 'Downloads',
438 | ' Changelog ',
439 | 'Docs',
440 | ],
441 | [
442 | 'Node.js 16.13.1',
443 | 'Gallium',
444 | '2021-12-01',
445 | '9.4.146.24',
446 | '8.1.2',
447 | '93',
448 | 'Downloads',
449 | ' Changelog ',
450 | 'Docs',
451 | ],
452 | [
453 | 'Node.js 16.13.0',
454 | 'Gallium',
455 | '2021-10-26',
456 | '9.4.146.19',
457 | '8.1.0',
458 | '93',
459 | 'Downloads',
460 | ' Changelog ',
461 | 'Docs',
462 | ],
463 | [
464 | 'Node.js 16.12.0',
465 | undefined,
466 | '2021-10-20',
467 | '9.4.146.19',
468 | '8.1.0',
469 | '93',
470 | 'Downloads',
471 | ' Changelog ',
472 | 'Docs',
473 | ],
474 | ]
475 |
476 | ## sample scripts should print raw items from pdf file
477 |
478 | > Snapshot 1
479 |
480 | {
481 | stderr: `printing raw items from file: test/sample.pdf ...␊
482 | done.␊
483 | printing raw items from file: test/sample.pdf ...␊
484 | done.`,
485 | stdout: `␊
486 | > pdfreader@0.0.0-development test:samples␊
487 | > node parse.js test/sample.pdf && node parseAsBuffer.js test/sample.pdf␊
488 | ␊
489 | file = test/sample.pdf␊
490 | page = 1␊
491 | 4.555 5.154 left 6 Hello "world"␊
492 | 4.555 7.174 left 2 Value:␊
493 | 4.555 8.761 left 0 4␊
494 | 5.095 10.501 left 0 c1␊
495 | 7.262 10.501 left 0 c2␊
496 | 10.131 10.501 left 0 c3␊
497 | 5.288 11.447 left 0 1␊
498 | 10.477 11.447 left 1 2.3␊
499 | 6.937 12.363 left 2 hello␊
500 | 9.684 12.363 left 2 world␊
501 | 4.555 13.248 left 3 Values:␊
502 | 4.555 14.835 left 0 1␊
503 | 4.555 16.423 left 0 2␊
504 | 4.555 18.01 left 0 3␊
505 | file = undefined␊
506 | page = 1␊
507 | 4.555 5.154 left 6 Hello "world"␊
508 | 4.555 7.174 left 2 Value:␊
509 | 4.555 8.761 left 0 4␊
510 | 5.095 10.501 left 0 c1␊
511 | 7.262 10.501 left 0 c2␊
512 | 10.131 10.501 left 0 c3␊
513 | 5.288 11.447 left 0 1␊
514 | 10.477 11.447 left 1 2.3␊
515 | 6.937 12.363 left 2 hello␊
516 | 9.684 12.363 left 2 world␊
517 | 4.555 13.248 left 3 Values:␊
518 | 4.555 14.835 left 0 1␊
519 | 4.555 16.423 left 0 2␊
520 | 4.555 18.01 left 0 3`,
521 | }
522 |
--------------------------------------------------------------------------------