├── .gitattributes
├── .github
    ├── ISSUE_TEMPLATE
    │   ├── bug_report.yml
    │   └── feature_request.yml
    └── workflows
    │   └── node.js.yml
├── .gitignore
├── LICENSE
├── README.md
├── eslint.config.mjs
├── package-lock.json
├── package.json
├── src
    ├── canvasapi.ts
    ├── index.ts
    ├── nodecanvas.ts
    ├── nodeskiacanvas.ts
    ├── ocrapi.ts
    ├── pdfdata.ts
    ├── pdfdataextractor.ts
    ├── pdfpagedata.ts
    ├── pureimagecanvas.ts
    ├── tesseractjsocr.ts
    └── types.ts
├── test
    ├── basic.extractor.test.ts
    ├── basic.pdf
    ├── basic.test.ts
    ├── empty_outline.extractor.test.ts
    ├── empty_outline.pdf
    ├── empty_outline.test.ts
    ├── outline.extractor.test.ts
    ├── outline.pdf
    ├── outline.test.ts
    ├── simple.extractor.test.ts
    ├── simple.pdf
    ├── specific_pages.extractor.test.ts
    ├── specific_pages.pdf
    └── specific_pages.test.ts
├── tsconfig.json
├── typedoc.json
└── vitest.config.mjs


/.gitattributes:
--------------------------------------------------------------------------------
1 | package.json text eol=lf
2 | package-lock.json text eol=lf


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.yml:
--------------------------------------------------------------------------------
 1 | name: Bug Report
 2 | description: Create a report to help us improve
 3 | labels: [bug]
 4 | assignees:
 5 |   - lublak
 6 | body:
 7 |   - type: markdown
 8 |     attributes:
 9 |       value: |
10 |         Thanks for taking the time to fill out this bug report!
11 |         Please always be sure to use the latest compatible version.
12 |   - type: textarea
13 |     id: bug-description
14 |     attributes:
15 |       label: Describe the bug
16 |       description: A clear and concise description of what the bug is.
17 |       placeholder: The description of the bug.
18 |     validations:
19 |       required: true
20 |   - type: textarea
21 |     id: expected-behavior
22 |     attributes:
23 |       label: Describe the expected behavior
24 |       description: A clear and concise description of what you expected to happen.
25 |       placeholder: The expected behavior.
26 |     validations:
27 |       required: true
28 |   - type: input
29 |     attributes:
30 |       label: What is your Node.js version?
31 |       placeholder: 20.X.X
32 |     validations:
33 |       required: true
34 |   - type: dropdown
35 |     id: os
36 |     attributes:
37 |       label: What operating system are you seeing the problem on?
38 |       multiple: true
39 |       options:
40 |         - Linux
41 |         - Windows
42 |         - MacOS
43 |         - Other (enter below with the version)
44 |   - type: input
45 |     attributes:
46 |       label: Operating system version (or if other, then please fill in complete name and version)
47 |     validations:
48 |       required: true
49 |   - type: textarea
50 |     id: logs
51 |     attributes:
52 |       label: Relevant log output
53 |       description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks.
54 |       render: shell
55 |   - type: textarea
56 |     id: pdf
57 |     attributes:
58 |       label: PDF File
59 |       description: Please upload a PDF file that can be used to reproduce the issue in the area below (drag and drop).
60 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.yml:
--------------------------------------------------------------------------------
 1 | name: Feature Request
 2 | description: Suggest an idea for this project
 3 | labels: [enhancement]
 4 | assignees:
 5 |   - lublak
 6 | body:
 7 |   - type: markdown
 8 |     attributes:
 9 |       value: |
10 |         Thanks for taking the time to fill out this feature request!
11 |   - type: textarea
12 |     id: feature-description
13 |     attributes:
14 |       label: Describe the feature you would like to have
15 |       description: A clear and concise description of what you want to happen.
16 |       placeholder: The description of the feature.
17 |     validations:
18 |       required: true
19 |   - type: textarea
20 |     id: alternative-solution
21 |     attributes:
22 |       label: Describe your current alternative solution.
23 |       description: Your current solution that you use, if there is one.
24 |       placeholder: Your alternative solution.
25 |   - type: textarea
26 |     id: pdf
27 |     attributes:
28 |       label: PDF File
29 |       description: If a PDF file helps to find a solution for this feature, please upload the file. (drag and drop)
30 | 


--------------------------------------------------------------------------------
/.github/workflows/node.js.yml:
--------------------------------------------------------------------------------
 1 | name: Node.js CI
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ main ]
 6 |   pull_request:
 7 |     branches: [ main ]
 8 | 
 9 | jobs:
10 |   build:
11 | 
12 |     runs-on: ubuntu-latest
13 | 
14 |     strategy:
15 |       matrix:
16 |         node-version: [20.x, 21.x, 22.x]
17 | 
18 |     steps:
19 |     - uses: actions/checkout@v4
20 |     - name: Use Node.js ${{ matrix.node-version }}
21 |       uses: actions/setup-node@v4
22 |       with:
23 |         node-version: ${{ matrix.node-version }}
24 |     - name: Install dependencies
25 |       run: npm ci
26 |     - name: Lint
27 |       run: npm run lint
28 |     - name: Build
29 |       run: npm run build --if-present
30 |     - name: Test
31 |       run: npm test


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Logs
  2 | logs
  3 | *.log
  4 | npm-debug.log*
  5 | yarn-debug.log*
  6 | yarn-error.log*
  7 | lerna-debug.log*
  8 | 
  9 | # Diagnostic reports (https://nodejs.org/api/report.html)
 10 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
 11 | 
 12 | # Runtime data
 13 | pids
 14 | *.pid
 15 | *.seed
 16 | *.pid.lock
 17 | 
 18 | # Directory for instrumented libs generated by jscoverage/JSCover
 19 | lib-cov
 20 | 
 21 | # Coverage directory used by tools like istanbul
 22 | coverage
 23 | *.lcov
 24 | 
 25 | # nyc test coverage
 26 | .nyc_output
 27 | 
 28 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
 29 | .grunt
 30 | 
 31 | # Bower dependency directory (https://bower.io/)
 32 | bower_components
 33 | 
 34 | # node-waf configuration
 35 | .lock-wscript
 36 | 
 37 | # Compiled binary addons (https://nodejs.org/api/addons.html)
 38 | build/Release
 39 | 
 40 | # Dependency directories
 41 | node_modules/
 42 | jspm_packages/
 43 | 
 44 | # TypeScript v1 declaration files
 45 | typings/
 46 | 
 47 | # TypeScript cache
 48 | *.tsbuildinfo
 49 | 
 50 | # Optional npm cache directory
 51 | .npm
 52 | 
 53 | # Optional eslint cache
 54 | .eslintcache
 55 | 
 56 | # Microbundle cache
 57 | .rpt2_cache/
 58 | .rts2_cache_cjs/
 59 | .rts2_cache_es/
 60 | .rts2_cache_umd/
 61 | 
 62 | # Optional REPL history
 63 | .node_repl_history
 64 | 
 65 | # Output of 'npm pack'
 66 | *.tgz
 67 | 
 68 | # Yarn Integrity file
 69 | .yarn-integrity
 70 | 
 71 | # dotenv environment variables file
 72 | .env
 73 | .env.test
 74 | 
 75 | # parcel-bundler cache (https://parceljs.org/)
 76 | .cache
 77 | 
 78 | # Next.js build output
 79 | .next
 80 | 
 81 | # Nuxt.js build / generate output
 82 | .nuxt
 83 | dist
 84 | 
 85 | # Gatsby files
 86 | .cache/
 87 | # Comment in the public line in if your project uses Gatsby and *not* Next.js
 88 | # https://nextjs.org/blog/next-9-1#public-directory-support
 89 | # public
 90 | 
 91 | # vuepress build output
 92 | .vuepress/dist
 93 | 
 94 | # Serverless directories
 95 | .serverless/
 96 | 
 97 | # FuseBox cache
 98 | .fusebox/
 99 | 
100 | # DynamoDB Local files
101 | .dynamodb/
102 | 
103 | # TernJS port file
104 | .tern-port
105 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 lublak
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # pdfdataextract
 2 | 
 3 | [![version](https://img.shields.io/npm/v/pdfdataextract.svg)](https://www.npmjs.org/package/pdfdataextract)
 4 | [![downloads](https://img.shields.io/npm/dt/pdfdataextract.svg)](https://www.npmjs.org/package/pdfdataextract)
 5 | [![status](https://github.com/lublak/pdfdataextract/actions/workflows/node.js.yml/badge.svg)](https://github.com/lublak/pdfdataextract/actions/workflows/node.js.yml)
 6 | 
 7 | Extract data from a pdf with pure javascript.
 8 | 
 9 | The PdfData wrapper over PdfDataExtractor is inspired by https://www.npmjs.com/package/pdf-parse, which is currently unmaintained.
10 | PdfDataExtractor itself is a simple interface to extract individual data from a pdf file.
11 | 
12 | ## Install
13 | 
14 | `npm install pdfdataextract`
15 | 
16 | ## Docs
17 | 
18 | Full documentation is available in the [wiki](https://github.com/lublak/pdfdataextract/wiki).
19 | 
20 | ## Usage
21 | 
22 | PdfData is a wrapper around PdfDataExtractor to directly get a complete json structure.
23 | 
24 | ```ts
25 | import { PdfData, VerbosityLevel } from 'pdfdataextract';
26 | import { readFileSync } from 'fs';
27 | const file_data = readFileSync('some_pdf_file.pdf');
28 | 
29 | // all options are optional
30 | PdfData.extract(file_data, {
31 | 	password: '123456', // password of the pdf file
32 | 	pages: 1, // how many pages should be read at most
33 | 	sort: true, // sort the text by text coordinates
34 | 	verbosity: VerbosityLevel.ERRORS, // set the verbosity level for parsing
35 | 	get: { // enable or disable data extraction (all are optional and enabled by default)
36 | 		pages: true, // get number of pages
37 | 		text: true, // get text of each page
38 | 		fingerprint: true, // get fingerprint
39 | 		outline: true, // get outline
40 | 		metadata: true, // get metadata
41 | 		info: true, // get info
42 | 		permissions: true, // get permissions
43 | 	},
44 | }).then((data) => {
45 | 	data.pages; // the number of pages
46 | 	data.text; // an array of text pages
47 | 	data.fingerprint; // fingerprint of the pdf document
48 | 	data.outline; // outline data of the pdf document
49 | 	data.info; // information of the pdf document, such as Author
50 | 	data.metadata; // metadata of the pdf document
51 | 	data.permissions; // permissions for the document
52 | });
53 | ```
54 | 
55 | ```ts
56 | import { PdfDataExtractor, VerbosityLevel } from 'pdfdataextract';
57 | import { readFileSync } from 'fs';
58 | const file_data = readFileSync('some_pdf_file.pdf');
59 | 
60 | // all options are optional
61 | PdfDataExtractor.get(file_data, {
62 | 	password: '123456', // password of the pdf file
63 | 	verbosity: VerbosityLevel.ERRORS, // set the verbosity level for parsing
64 | }).then((extractor) => {
65 | 	extractor.pages; // the number of pages
66 | 	extractor.fingerprint; // fingerprint of the pdf document
67 | 
68 | 	extractor.getText(1, true).then((text) => {
69 | 		// an array of text pages (only one page and sorted)
70 | 	});
71 | 
72 | 	extractor.getText([2]).then((text) => {
73 | 		// an array of text pages (only the second page)
74 | 	});
75 | 
76 | 	extractor.getOutline().then((outline) => {
77 | 		// outline data of the pdf document
78 | 	});
79 | 	
80 | 	extractor.getMetadata().then((metadata) => {
81 | 		// metadata of the pdf document
82 | 	});
83 | 
84 | 	extractor.getPermissions().then((permissions) => {
85 | 		// permissions for the document
86 | 	});
87 | 
88 | 	extractor.close();
89 | });
90 | ```
91 | 
92 | ## Test
93 | 
94 | `npm test`
95 | 
96 | ## License
97 | 
98 | [MIT licensed](/LICENSE)
99 | 


--------------------------------------------------------------------------------
/eslint.config.mjs:
--------------------------------------------------------------------------------
  1 | import { defineConfig } from "eslint/config";
  2 | import { fixupConfigRules, fixupPluginRules } from "@eslint/compat";
  3 | import stylisticTs from '@stylistic/eslint-plugin-ts';
  4 | import typescriptEslint from "@typescript-eslint/eslint-plugin";
  5 | import jsdoc from "eslint-plugin-jsdoc";
  6 | import globals from "globals";
  7 | import tsParser from "@typescript-eslint/parser";
  8 | import path from "node:path";
  9 | import { fileURLToPath } from "node:url";
 10 | import js from "@eslint/js";
 11 | import { FlatCompat } from "@eslint/eslintrc";
 12 | 
 13 | const __filename = fileURLToPath(import.meta.url); // ES modules provide no __filename, so it is derived from import.meta.url
 14 | const __dirname = path.dirname(__filename);
 15 | const compat = new FlatCompat({ // used below to load the eslintrc-style "extends" entries
 16 |     baseDirectory: __dirname,
 17 |     recommendedConfig: js.configs.recommended,
 18 |     allConfig: js.configs.all
 19 | });
 20 | 
 21 | export default defineConfig([{
 22 |     extends: fixupConfigRules(compat.extends(
 23 |         "eslint:recommended",
 24 |         "plugin:@typescript-eslint/recommended",
 25 |     )),
 26 | 
 27 |     plugins: {
 28 |         "@typescript-eslint": fixupPluginRules(typescriptEslint),
 29 |         jsdoc,
 30 |         "@stylistic/ts": stylisticTs
 31 |     },
 32 | 
 33 |     languageOptions: {
 34 |         globals: {
 35 |             ...globals.browser,
 36 |         },
 37 | 
 38 |         parser: tsParser,
 39 |         ecmaVersion: 12,
 40 |         sourceType: "module",
 41 | 
 42 |         parserOptions: {
 43 |             project: "./tsconfig.json",
 44 |         },
 45 |     },
 46 | 
 47 |     settings: {
 48 |         jsdoc: {
 49 |             mode: "typescript",
 50 | 
 51 |             structuredTags: {
 52 |                 type: {
 53 |                     type: true,
 54 |                     required: ["type"],
 55 |                 },
 56 |             },
 57 | 
 58 |             ignoreInternal: true,
 59 |         },
 60 |     },
 61 | 
 62 |     rules: {
 63 |         indent: ["error", "tab"],
 64 |         "linebreak-style": ["error", "unix"],
 65 |         quotes: ["error", "single"],
 66 |         semi: ["error", "always"],
 67 | 
 68 |         "no-empty": ["error", {
 69 |             allowEmptyCatch: true,
 70 |         }],
 71 | 
 72 |         "no-console": "error",
 73 | 
 74 |         "@typescript-eslint/no-unused-vars": ["error", {
 75 |             vars: "all",
 76 |             args: "after-used",
 77 |             ignoreRestSiblings: false,
 78 |             argsIgnorePattern: "^_",
 79 |             varsIgnorePattern: "^_",
 80 |             destructuredArrayIgnorePattern: "^_",
 81 |             caughtErrorsIgnorePattern: "^_",
 82 |         }],
 83 | 
 84 |         "@typescript-eslint/explicit-module-boundary-types": ["error"],
 85 | 
 86 |         "@typescript-eslint/ban-ts-comment": ["error", {
 87 |             "ts-expect-error": "allow-with-description",
 88 |             "ts-ignore": "allow-with-description",
 89 |             "ts-nocheck": false,
 90 |             "ts-check": false,
 91 |             minimumDescriptionLength: 10,
 92 |         }],
 93 | 
 94 |         "@typescript-eslint/no-restricted-types": ["error", {
 95 |             types: {
 96 |                 String: {
 97 |                     message: "Use string instead",
 98 |                     fixWith: "string",
 99 |                 },
100 | 
101 |                 Boolean: {
102 |                     message: "Use boolean instead",
103 |                     fixWith: "boolean",
104 |                 },
105 | 
106 |                 Number: {
107 |                     message: "Use number instead",
108 |                     fixWith: "number",
109 |                 },
110 | 
111 |                 Symbol: {
112 |                     message: "Use symbol instead",
113 |                     fixWith: "symbol",
114 |                 },
115 | 
116 |                 Function: {
117 |                     message: "The `Function` type accepts any function-like value.\nIt provides no type safety when calling the function, which can be a common source of bugs.\nIt also accepts things like class declarations, which will throw at runtime as they will not be called with `new`.\nIf you are expecting the function to accept certain arguments, you should explicitly define the function shape.",
118 |                 },
119 | 
120 |                 Object: {
121 |                     message: "The `Object` type actually means 'any non-nullish value', so it is marginally better than `unknown`.\n- If you want a type meaning 'any object', you probably want `Record<string, unknown>` instead.\n- If you want a type meaning 'any value', you probably want `unknown` instead.",
122 |                 },
123 | 
124 |                 "{}": {
125 |                     message: "`{}` actually means 'any non-nullish value'.\n- If you want a type meaning 'any object', you probably want `Record<string, unknown>` instead.\n- If you want a type meaning 'any value', you probably want `unknown` instead.\n- If you want a type meaning 'empty object', you probably want `Record<string, never>` instead.",
126 |                 },
127 |             },
128 |         }],
129 | 
130 |         "@typescript-eslint/no-confusing-non-null-assertion": "off",
131 | 
132 |         "@typescript-eslint/no-explicit-any": ["error", {
133 |             fixToUnknown: true,
134 |         }],
135 | 
136 |         "@stylistic/ts/type-annotation-spacing": ["error", {
137 |             before: false,
138 |             after: true,
139 | 
140 |             overrides: {
141 |                 arrow: {
142 |                     before: true,
143 |                     after: true,
144 |                 },
145 |             },
146 |         }],
147 | 
148 |         "@typescript-eslint/typedef": ["error", {
149 |             arrowParameter: true,
150 |             memberVariableDeclaration: true,
151 |             parameter: true,
152 |             propertyDeclaration: true,
153 |             variableDeclaration: true,
154 |         }],
155 | 
156 |         "@typescript-eslint/no-inferrable-types": "off",
157 |         "@stylistic/ts/func-call-spacing": ["error", "never"],
158 | 
159 |         "@stylistic/ts/keyword-spacing": ["error", {
160 |             before: true,
161 |             after: true,
162 |         }],
163 | 
164 |         "@stylistic/ts/space-before-function-paren": ["error", {
165 |             anonymous: "always",
166 |             named: "never",
167 |             asyncArrow: "always",
168 |         }],
169 | 
170 |         "@typescript-eslint/await-thenable": "error",
171 |         "@typescript-eslint/return-await": "error",
172 |         "@typescript-eslint/unified-signatures": "error",
173 | 
174 |         "@stylistic/ts/comma-spacing": ["error", {
175 |             before: false,
176 |             after: true,
177 |         }],
178 | 
179 |         "jsdoc/check-access": "error",
180 |         "jsdoc/check-alignment": "error",
181 |         "jsdoc/check-examples": "off",
182 | 
183 |         "jsdoc/check-indentation": ["error", {
184 |             excludeTags: ["param"],
185 |         }],
186 | 
187 |         "jsdoc/check-line-alignment": "error",
188 |         "jsdoc/check-param-names": "error",
189 |         "jsdoc/check-property-names": "error",
190 |         "jsdoc/check-syntax": "error",
191 |         "jsdoc/check-tag-names": "error",
192 |         "jsdoc/check-types": "error",
193 |         "jsdoc/check-values": "error",
194 |         "jsdoc/empty-tags": "error",
195 |         "jsdoc/implements-on-classes": "error",
196 | 
197 |         "jsdoc/match-description": ["error", {
198 |             matchDescription: "^[a-zA-Z0-9_\\- /\\\\()[\\]{}=?!:.,;*+~#'\"%&<>|\n]+$",
199 |             contexts: ["any"],
200 |         }],
201 | 
202 |         "jsdoc/no-bad-blocks": "error",
203 |         "jsdoc/no-defaults": "off",
204 |         "jsdoc/no-types": "off",
205 |         "jsdoc/no-undefined-types": "off",
206 | 
207 |         "jsdoc/require-description": ["error", {
208 |             contexts: [
209 |                 ":not(:matches(MethodDefinition[key.name=constructor], MethodDefinition[key.name=constructor] *))",
210 |             ],
211 |         }],
212 | 
213 |         "jsdoc/require-description-complete-sentence": "off",
214 |         "jsdoc/require-example": "off",
215 |         "jsdoc/require-file-overview": "off",
216 |         "jsdoc/require-hyphen-before-param-description": "error",
217 | 
218 |         "jsdoc/require-jsdoc": ["error", {
219 |             publicOnly: true,
220 | 
221 |             require: {
222 |                 ArrowFunctionExpression: true,
223 |                 ClassDeclaration: true,
224 |                 ClassExpression: true,
225 |                 FunctionDeclaration: true,
226 |                 FunctionExpression: true,
227 |                 MethodDefinition: false,
228 |             },
229 | 
230 |             contexts: [
231 |                 "MethodDefinition:not([accessibility=\"private\"]) > FunctionExpression",
232 |                 "ClassProperty:not([accessibility=\"private\"])",
233 |                 "TSPropertySignature",
234 |                 "TSInterfaceBody > TSMethodSignature",
235 |                 "TSEnumMember",
236 |             ],
237 |         }],
238 | 
239 |         "jsdoc/require-param": "error",
240 |         "jsdoc/require-param-description": "error",
241 |         "jsdoc/require-param-name": "error",
242 |         "jsdoc/require-param-type": "error",
243 |         "jsdoc/require-property": "error",
244 |         "jsdoc/require-property-description": "error",
245 |         "jsdoc/require-property-name": "error",
246 |         "jsdoc/require-property-type": "error",
247 |         "jsdoc/require-returns": "error",
248 |         "jsdoc/require-returns-check": "error",
249 |         "jsdoc/require-returns-description": "error",
250 |         "jsdoc/require-returns-type": "error",
251 |         "jsdoc/require-throws": "off",
252 |         "jsdoc/require-yields": "error",
253 |         "jsdoc/require-yields-check": "error",
254 |         "jsdoc/valid-types": "error",
255 |     },
256 | }]);


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "pdfdataextract",
 3 |   "version": "4.0.0",
 4 |   "description": "Extract data from a pdf with pure javascript",
 5 |   "main": "dist/index.js",
 6 |   "types": "dist/index.d.ts",
 7 |   "files": [
 8 |     "dist"
 9 |   ],
10 |   "scripts": {
11 |     "lint": "eslint \"src/**/*.ts\" --cache",
12 |     "lint:fix": "eslint \"src/**/*.ts\" --cache --fix",
13 |     "build": "tsc",
14 |     "test": "vitest",
15 |     "prepare": "npm run build",
16 |     "build:doc": "typedoc src/index.ts"
17 |   },
18 |   "keywords": [
19 |     "pdf",
20 |     "pdf.js",
21 |     "pdfjs",
22 |     "parse",
23 |     "parser",
24 |     "json",
25 |     "text",
26 |     "data",
27 |     "crawler",
28 |     "extract",
29 |     "extractor",
30 |     "ocr",
31 |     "converter",
32 |     "image"
33 |   ],
34 |   "dependencies": {
35 |     "pdfjs-dist": "5.0.375"
36 |   },
37 |   "devDependencies": {
38 |     "@eslint/compat": "^1.2.7",
39 |     "@eslint/eslintrc": "^3.3.1",
40 |     "@eslint/js": "^9.23.0",
41 |     "@stylistic/eslint-plugin-ts": "^4.2.0",
42 |     "@types/node": "^22.13.14",
43 |     "@types/opentype.js": "^1.3.8",
44 |     "@typescript-eslint/eslint-plugin": "^8.28.0",
45 |     "@typescript-eslint/parser": "^8.28.0",
46 |     "canvas": "^3.1.0",
47 |     "eslint": "^9.23.0",
48 |     "eslint-plugin-jsdoc": "^50.6.9",
49 |     "globals": "^16.0.0",
50 |     "pureimage": "^0.4.18",
51 |     "skia-canvas": "^2.0.2",
52 |     "tesseract.js": "^6.0.0",
53 |     "typedoc": "^0.28.1",
54 |     "typedoc-github-wiki-theme": "^2.1.0",
55 |     "typedoc-plugin-markdown": "^4.6.0",
56 |     "typescript": "5.8.2",
57 |     "vitest": "^3.0.9"
58 |   },
59 |   "homepage": "https://github.com/lublak/pdfdataextract#readme",
60 |   "bugs": {
61 |     "url": "https://github.com/lublak/pdfdataextract/issues"
62 |   },
63 |   "repository": {
64 |     "type": "git",
65 |     "url": "git+https://github.com/lublak/pdfdataextract.git"
66 |   },
67 |   "author": "lublak",
68 |   "license": "MIT",
69 |   "engines": {
70 |     "node": ">=20"
71 |   }
72 | }


--------------------------------------------------------------------------------
/src/canvasapi.ts:
--------------------------------------------------------------------------------
 1 | export type CanvasApiConstructor<T extends CanvasApi> = { new(width: number, height: number): T }; // constructor type: builds a CanvasApi implementation from a width and a height
 2 | 
 3 | export interface CanvasApi { // canvas abstraction: provides a 2d context, resizing, destruction and image export
 4 | 	/**
 5 | 	 * creates the 2d context of the canvas
 6 | 	 * 
 7 | 	 * @returns the 2d rendering context of the canvas
 8 | 	 */
 9 | 	createContext(): CanvasRenderingContext2D;
10 | 	/**
11 | 	 * resets the canvas to the given size
12 | 	 * 
13 | 	 * @param {number} width - the new canvas width
14 | 	 * @param {number} height - the new canvas height
15 | 	 */
16 | 	reset(width: number, height: number): void;
17 | 	/**
18 | 	 * destroys the canvas
19 | 	 */
20 | 	destroy(): void;
21 | 	/**
22 | 	 * converts the canvas to a png
23 | 	 * 
24 | 	 * @returns a promise that resolves with the png image as a {Buffer}
25 | 	 */
26 | 	toPNG(): Promise<Buffer>;
27 | 	/**
28 | 	 * converts the canvas to a jpeg
29 | 	 * 
30 | 	 * @param {number} [quality] - the optional quality of the jpeg
31 | 	 * @returns a promise that resolves with the jpeg image as a {Buffer}
32 | 	 */
33 | 	toJPEG(quality?: number): Promise<Buffer>;
34 | }


--------------------------------------------------------------------------------
/src/index.ts:
--------------------------------------------------------------------------------
 1 | import {
 2 | 	PdfData,
 3 | 	PdfDataOptions,
 4 | 	PdfDataGetOptions,
 5 | } from './pdfdata';
 6 | 
 7 | import {
 8 | 	PdfDataExtractor,
 9 | 	PdfDataExtractorOptions,
10 | } from './pdfdataextractor';
11 | 
12 | import {
13 | 	CanvasApiConstructor,
14 | 	CanvasApi,
15 | } from './canvasapi';
16 | 
17 | import {
18 | 	OcrApiConstructor,
19 | 	OcrApi,
20 | } from './ocrapi';
21 | 
22 | import {
23 | 	VerbosityLevel,
24 | 	Permissions,
25 | 	Metadata,
26 | 	Outline,
27 | 	PageNumberOutline,
28 | 	UrlOutline,
29 | 	PdfReferenceOutline,
30 | 	Info,
31 | 	Name,
32 | } from './types';
33 | 
34 | export {
35 | 	PdfData,
36 | 	PdfDataOptions,
37 | 	PdfDataGetOptions,
38 | 
39 | 	PdfDataExtractor,
40 | 	PdfDataExtractorOptions,
41 | 
42 | 	CanvasApiConstructor,
43 | 	CanvasApi,
44 | 
45 | 	OcrApiConstructor,
46 | 	OcrApi,
47 | 
48 | 	VerbosityLevel,
49 | 	Permissions,
50 | 	Metadata,
51 | 	Outline,
52 | 	PageNumberOutline,
53 | 	PdfReferenceOutline,
54 | 	UrlOutline,
55 | 	Info,
56 | 	Name,
57 | };


--------------------------------------------------------------------------------
/src/nodecanvas.ts:
--------------------------------------------------------------------------------
 1 | //@ts-ignore: ignore import errors because it is dynamically loaded from pdfdataextractor
 2 | import { Canvas, createCanvas, JpegConfig } from 'canvas';
 3 | import { promisify } from 'util';
 4 | import { CanvasApi } from './canvasapi';
 5 | 
 6 | /**
 7 |  * implementation for node-canvas
 8 |  */
 9 | export class NodeCanvas implements CanvasApi {
10 | 	private canvas: Canvas;
11 | 	/**
12 | 	 * creates the backing node-canvas instance
13 | 	 *
14 | 	 * @internal
15 | 	 * @param {number} width - the canvas width
16 | 	 * @param {number} height - the canvas height
17 | 	 */
18 | 	public constructor(width: number, height: number) {
19 | 		this.canvas = createCanvas(width, height);
20 | 	}
21 | 	/**
22 | 	 * converts the canvas to a png
23 | 	 *
24 | 	 * @internal
25 | 	 * @returns {Promise<Buffer>} a promise that resolves with the png image
26 | 	 */
27 | 	public toPNG(): Promise<Buffer> {
28 | 		// Canvas.toBuffer takes its callback as the FIRST argument and must be invoked on the
29 | 		// canvas itself, so it is adapted to the callback-last shape that promisify expects
30 | 		return promisify<'image/png', Buffer>(
31 | 			(mimeType: 'image/png', callback: (err: Error | null, result: Buffer) => void): void => {
32 | 				this.canvas.toBuffer(callback, mimeType);
33 | 			}
34 | 		)('image/png');
35 | 	}
36 | 	/**
37 | 	 * converts the canvas to a jpeg
38 | 	 *
39 | 	 * @internal
40 | 	 * @param {number} [quality] - the optional quality of the jpeg
41 | 	 * @returns {Promise<Buffer>} a promise that resolves with the jpeg image
42 | 	 */
43 | 	public toJPEG(quality?: number): Promise<Buffer> {
44 | 		// same adaptation as in toPNG: callback first, canvas kept as receiver
45 | 		return promisify<'image/jpeg', JpegConfig, Buffer>(
46 | 			(mimeType: 'image/jpeg', config: JpegConfig, callback: (err: Error | null, result: Buffer) => void): void => {
47 | 				this.canvas.toBuffer(callback, mimeType, config);
48 | 			}
49 | 		)('image/jpeg', {
50 | 			quality: quality
51 | 		});
52 | 	}
53 | 	/**
54 | 	 * creates the 2d context of the canvas
55 | 	 *
56 | 	 * @internal
57 | 	 * @returns {CanvasRenderingContext2D} the 2d context of the backing canvas
58 | 	 */
59 | 	public createContext(): CanvasRenderingContext2D {
60 | 		// the node-canvas context is cast to the DOM type that the CanvasApi interface requires
61 | 		return this.canvas.getContext('2d') as unknown as CanvasRenderingContext2D;
62 | 	}
63 | 	/**
64 | 	 * resets the canvas to the given size
65 | 	 *
66 | 	 * @internal
67 | 	 * @param {number} width - the new canvas width
68 | 	 * @param {number} height - the new canvas height
69 | 	 */
70 | 	public reset(width: number, height: number): void {
71 | 		this.canvas.width = width;
72 | 		this.canvas.height = height;
73 | 	}
74 | 	/**
75 | 	 * destroys the canvas by shrinking it to a size of 0 x 0
76 | 	 *
77 | 	 * @internal
78 | 	 */
79 | 	public destroy(): void {
80 | 		this.canvas.width = 0;
81 | 		this.canvas.height = 0;
82 | 	}
83 | }


--------------------------------------------------------------------------------
/src/nodeskiacanvas.ts:
--------------------------------------------------------------------------------
 1 | //@ts-ignore: ignore import errors because it is dynamically loaded from pdfdataextractor
 2 | import { Canvas, createCanvas } from '@napi-rs/canvas';
 3 | import { CanvasApi } from './canvasapi';
 4 | 
 5 | /**
 6 |  * implementation for node-skia
 7 |  */
 8 | export class NodeSkiaCanvas implements CanvasApi {
 9 | 	private canvas: Canvas;
10 | 	/**
11 | 	 * allocates the backing canvas of the napi-rs canvas package with the requested size
12 | 	 *
13 | 	 * @internal
14 | 	 */
15 | 	public constructor(width: number, height: number) {
16 | 		this.canvas = createCanvas(width, height);
17 | 	}
18 | 	/**
19 | 	 * gives access to the 2d context of the backing canvas
20 | 	 *
21 | 	 * @internal
22 | 	 */
23 | 	public createContext(): CanvasRenderingContext2D {
24 | 		const context: unknown = this.canvas.getContext('2d');
25 | 		return context as CanvasRenderingContext2D;
26 | 	}
27 | 	/**
28 | 	 * applies a new width and height to the backing canvas
29 | 	 *
30 | 	 * @internal
31 | 	 */
32 | 	public reset(width: number, height: number): void {
33 | 		const target: Canvas = this.canvas;
34 | 		target.width = width;
35 | 		target.height = height;
36 | 	}
37 | 	/**
38 | 	 * shrinks the backing canvas to a size of 0 x 0
39 | 	 *
40 | 	 * @internal
41 | 	 */
42 | 	public destroy(): void {
43 | 		const target: Canvas = this.canvas;
44 | 		target.width = 0;
45 | 		target.height = 0;
46 | 	}
47 | 	/**
48 | 	 * encodes the canvas content as png
49 | 	 *
50 | 	 * @internal
51 | 	 */
52 | 	public toPNG(): Promise<Buffer> {
53 | 		const encoded: Promise<Buffer> = this.canvas.encode('png');
54 | 		return encoded;
55 | 	}
56 | 	/**
57 | 	 * encodes the canvas content as jpeg, forwarding the optional quality unchanged
58 | 	 *
59 | 	 * @internal
60 | 	 */
61 | 	public toJPEG(quality?: number): Promise<Buffer> {
62 | 		const encoded: Promise<Buffer> = this.canvas.encode('jpeg', quality);
63 | 		return encoded;
64 | 	}
65 | }


--------------------------------------------------------------------------------
/src/ocrapi.ts:
--------------------------------------------------------------------------------
 1 | import { OCRLang } from './types';
 2 | 
 3 | export type OcrApiConstructor<T extends OcrApi> = { new(): T }; // constructor type: builds an OcrApi implementation without arguments
 4 | 
 5 | export interface OcrApi {
 6 | 	/**
 7 | 	 * recognizes the characters in the given image buffers
 8 | 	 * 
 9 | 	 * @param {Buffer[]} buffers - the image buffers
10 | 	 * @param {OCRLang[]} langs - the language traineddata used for recognition
11 | 	 * @returns {Promise<string[]>} a promise resolving to an array with the recognized text (presumably one entry per buffer, to be confirmed against the implementation)
12 | 	 */
13 | 	ocrBuffers(buffers: Buffer[], langs: OCRLang[]): Promise<string[]>;
14 | }


--------------------------------------------------------------------------------
/src/pdfdata.ts:
--------------------------------------------------------------------------------
  1 | import { PdfDataExtractor } from './pdfdataextractor';
  2 | import { VerbosityLevel, Permissions, Outline, Info, Metadata } from './types';
  3 | 
/**
 * switches that enable or disable the individual extraction steps of PdfData.extract,
 * a switch that is left undefined counts as enabled
 */
export type PdfDataGetOptions = {
	/**
	 * get number of pages, by default it is true
	 *
	 * @type {boolean|undefined}
	 */
	pages?: boolean,
	/**
	 * get text of each page, by default it is true
	 *
	 * @type {boolean|undefined}
	 */
	text?: boolean,
	/**
	 * get fingerprint, by default it is true
	 *
	 * @type {boolean|undefined}
	 */
	fingerprint?: boolean,
	/**
	 * get outline, by default it is true
	 *
	 * @type {boolean|undefined}
	 */
	outline?: boolean,
	/**
	 * get metadata, by default it is true
	 *
	 * @type {boolean|undefined}
	 */
	metadata?: boolean,
	/**
	 * get info, by default it is true
	 *
	 * @type {boolean|undefined}
	 */
	info?: boolean,
	/**
	 * get permissions, by default it is true
	 *
	 * @type {boolean|undefined}
	 */
	permissions?: boolean
};
 48 | 
/**
 * options for PdfData.extract
 */
export type PdfDataOptions = {
	/**
	 * password for a password-protected PDF
	 *
	 * @type {string|undefined}
	 */
	password?: string,
	/**
	 * the number of pages to be read, all pages are read by default
	 * only used when pages is not set (or is falsy)
	 *
	 * @deprecated use pages instead
	 *
	 * @type {number|undefined}
	 */
	max?: number,
	/**
	 * sort the text by text coordinates
	 *
	 * @type {boolean|undefined}
	 */
	sort?: boolean,
	/**
	 * the logging level
	 *
	 * @type {VerbosityLevel|undefined}
	 */
	verbosity?: VerbosityLevel,
	/**
	 * selects the pages whose text is read, can either be the number of pages to be read,
	 * a number array with the exact pages (sorted by page number)
	 * or a filter function (return true to parse the page)
	 * all pages are read by default
	 * only used for the text extraction, so it has no effect if get.text is false
	 *
	 * @type {number|number[]|((pageNumber: number) => boolean)|undefined}
	 */
	pages?: number | number[] | ((pageNumber: number) => boolean),
	/**
	 * options to enable or disable parsing methods
	 *
	 * @type {PdfDataGetOptions|undefined}
	 */
	get?: PdfDataGetOptions;
}
 93 | 
 94 | /**
 95 |  * the data of the pdf
 96 |  */
 97 | export class PdfData {
 98 | 	/**
 99 | 	 * the number of pages
100 | 	 * 
101 | 	 * @readonly
102 | 	 * @type {number|undefined}
103 | 	 */
104 | 	readonly pages?: number;
105 | 	/**
106 | 	 * extracted text per page
107 | 	 * 
108 | 	 * @readonly
109 | 	 * @type {string[]|undefined}
110 | 	 */
111 | 	readonly text?: readonly string[];
112 | 	/**
113 | 	 * the fingerprint
114 | 	 * 
115 | 	 * @readonly
116 | 	 * @type {string|undefined}
117 | 	 */
118 | 	readonly fingerprint?: string;
119 | 	/**
120 | 	 * the outline/bookmarks
121 | 	 * 
122 | 	 * @readonly
123 | 	 * @type {Outline[]|undefined}
124 | 	 */
125 | 	readonly outline?: readonly Outline[];
126 | 	/**
127 | 	 * the informations/description
128 | 	 * 
129 | 	 * @readonly
130 | 	 * @type {Info|undefined}
131 | 	 */
132 | 	readonly info?: Info;
133 | 	/**
134 | 	 * the metadata
135 | 	 * 
136 | 	 * @readonly
137 | 	 * @type {Metadata|undefined}
138 | 	 */
139 | 	readonly metadata?: Metadata;
140 | 	/**
141 | 	 * the permission flags
142 | 	 * 
143 | 	 * @readonly
144 | 	 * @type {Permissions | undefined}
145 | 	 */
146 | 	readonly permissions?: Permissions;
147 | 
148 | 	private constructor(pages: number | null, text: string[] | null, fingerprint: string | null, outline: Outline[] | null, info: Info | null, metadata: Metadata | null, permissions: Permissions | null) {
149 | 		if (pages != null) this.pages = pages;
150 | 		if (text != null) this.text = text;
151 | 		if (fingerprint != null) this.fingerprint = fingerprint;
152 | 		if (outline != null) this.outline = outline;
153 | 		if (info != null) this.info = info;
154 | 		if (metadata != null) this.metadata = metadata;
155 | 		if (permissions != null) this.permissions = permissions;
156 | 	}
157 | 
158 | 	/**
159 | 	 * get the data
160 | 	 * 
161 | 	 * @param {Uint8Array} data - the binary data file
162 | 	 * @param {PdfDataOptions} [options={}] - the options on how the data should be extracted
163 | 	 * @returns {Promise<PdfData>} a promise that is resolved with a {PdfData} object with the extracted data
164 | 	 */
165 | 	static async extract(data: Uint8Array, options: PdfDataOptions = {}): Promise<PdfData> {
166 | 		const extractor: PdfDataExtractor = await PdfDataExtractor.get(data, {
167 | 			password: options.password,
168 | 			verbosity: options.verbosity,
169 | 		});
170 | 
171 | 		if (!options.get) options.get = {};
172 | 
173 | 		const pages: number | number[] | ((pageNumber: number) => boolean) | undefined = options.pages ? options.pages : options.max;
174 | 
175 | 		let metadata: Metadata | null = null;
176 | 		let info: Info | null = null;
177 | 
178 | 		if (options.get.metadata === undefined || options.get.metadata || options.get.info === undefined || options.get.info) {
179 | 			const rawMetadata: {
180 | 				info: Info;
181 | 				metadata: Metadata;
182 | 			} | null = await extractor.getMetadata();
183 | 			if (rawMetadata != null) {
184 | 				if (options.get.info === undefined || options.get.info) info = rawMetadata.info;
185 | 				if (options.get.metadata === undefined || options.get.metadata) metadata = rawMetadata.metadata;
186 | 			}
187 | 		}
188 | 
189 | 		const pdfdata: PdfData = new PdfData(
190 | 			options.get.pages === undefined || options.get.pages ? extractor.pages : null,
191 | 			options.get.text === undefined || options.get.text ? await extractor.getText(pages, options.sort) : null,
192 | 			options.get.fingerprint === undefined || options.get.fingerprint ? extractor.fingerprint : null,
193 | 			options.get.outline === undefined || options.get.outline ? await extractor.getOutline() : null,
194 | 			info,
195 | 			metadata,
196 | 			options.get.permissions === undefined || options.get.permissions ? await extractor.getPermissions() : null
197 | 		);
198 | 
199 | 		extractor.close();
200 | 		return pdfdata;
201 | 	}
202 | }


--------------------------------------------------------------------------------
/src/pdfdataextractor.ts:
--------------------------------------------------------------------------------
  1 | import { getDocument, PermissionFlag } from 'pdfjs-dist/legacy/build/pdf.mjs';
  2 | import { PDFDocumentProxy, PDFPageProxy } from 'pdfjs-dist/types/src/display/api';
  3 | import { CanvasApi, CanvasApiConstructor } from './canvasapi';
  4 | import { OcrApi, OcrApiConstructor } from './ocrapi';
  5 | import { PdfPageData } from './pdfpagedata';
  6 | import { VerbosityLevel, Permissions, Outline, PageNumberOutline, UrlOutline, PdfReferenceOutline, MetadataInfo, Sort } from './types';
  7 | 
/**
 * options for PdfDataExtractor.get
 */
export type PdfDataExtractorOptions = {
	/**
	 * password for a password-protected PDF
	 *
	 * @type {string}
	 */
	password?: string,
	/**
	 * the logging level, VerbosityLevel.ERRORS is used if not set
	 *
	 * @type {VerbosityLevel}
	 */
	verbosity?: VerbosityLevel,
	/**
	 * the canvas api used for rendering,
	 * if not set the first installed canvas package that can be loaded is used
	 *
	 * @type {CanvasApiConstructor}
	 */
	canvasApi?: CanvasApiConstructor<CanvasApi>,
	/**
	 * the ocr api used for text detection,
	 * if not set tesseract.js is used when it is installed
	 *
	 * @type {OcrApiConstructor}
	 */
	ocrApi?: OcrApiConstructor<OcrApi>,
}
 34 | 
/**
 * one node of the outline tree as it is returned by the getOutline call of the pdf document,
 * parseOutline only reads title, dest, url, unsafeUrl and items
 */
interface RawOutline {
	// the title of the outline entry
	title: string;
	bold: boolean;
	italic: boolean;
	color: Uint8ClampedArray;
	// a string is resolved through getDestination of the document, an array is used as it is
	dest: string | Array<unknown> | null;
	// only read when there is no destination; when it is set it is used as the url of the entry
	url: string | null;
	// only read when there is no destination; split at '#' to detect references to other pdf files
	unsafeUrl: string | undefined;
	newWindow: boolean | undefined;
	count: number | undefined;
	// the child entries, parsed recursively
	items: RawOutline[] | undefined;
}
 47 | 
 48 | async function getPageNumber(pdf_document: PDFDocumentProxy, pageRef: { num: number, gen: number }, cache: { [key: string]: number; }) {
 49 | 	const ref: string = pageRef.gen === 0 ? `${pageRef.num}R` : `${pageRef.num}R${pageRef.gen}`;
 50 | 	let number: number = cache[ref];
 51 | 	if (number == null) {
 52 | 		number = await pdf_document.getPageIndex(pageRef) as unknown as number;
 53 | 		cache[ref] = number;
 54 | 	}
 55 | 	return number;
 56 | }
 57 | 
 58 | function parseRemoteUrlDest(remoteUrlDest: string) {
 59 | 	try {
 60 | 		const remoteDest: unknown = JSON.parse(remoteUrlDest);
 61 | 		if (Array.isArray(remoteDest) && Number.isInteger(remoteDest[0])) {
 62 | 			return remoteDest[0];
 63 | 		}
 64 | 	} catch { }
 65 | 	return undefined;
 66 | }
 67 | 
 68 | async function parseOutline(pdf_document: PDFDocumentProxy, outlineData: RawOutline[], cache: { [key: string]: number; }) {
 69 | 	const outline: Outline[] = [];
 70 | 	for (const o of outlineData) {
 71 | 		const dest: unknown = typeof (o.dest) === 'string' ? await pdf_document.getDestination(o.dest) : o.dest;
 72 | 		if (dest == null) {
 73 | 			if (o.unsafeUrl != null) {
 74 | 				if (o.url == null) {
 75 | 					const remoteUrl: string[] = o.unsafeUrl.split('#', 2);
 76 | 					const remoteBaseUrl: string = remoteUrl[0];
 77 | 					if (remoteBaseUrl.toLowerCase().endsWith('.pdf')) {
 78 | 						if (remoteUrl.length == 2) {
 79 | 							outline.push(new PdfReferenceOutline(o.title, remoteBaseUrl, parseRemoteUrlDest(remoteUrl[1]), o.items ? await parseOutline(pdf_document, o.items, cache) : undefined));
 80 | 						} else {
 81 | 							outline.push(new PdfReferenceOutline(o.title, remoteBaseUrl, undefined, o.items ? await parseOutline(pdf_document, o.items, cache) : undefined));
 82 | 						}
 83 | 					} else {
 84 | 						outline.push(new UrlOutline(o.title, o.unsafeUrl, false, o.items ? await parseOutline(pdf_document, o.items, cache) : undefined));
 85 | 					}
 86 | 				} else {
 87 | 					outline.push(new UrlOutline(o.title, o.url, true, o.items ? await parseOutline(pdf_document, o.items, cache) : undefined));
 88 | 				}
 89 | 			} else {
 90 | 				// TODO: ?
 91 | 			}
 92 | 		} else if (Array.isArray(dest)) {
 93 | 			if (typeof dest[0] === 'object') {
 94 | 				outline.push(new PageNumberOutline(
 95 | 					o.title,
 96 | 					await getPageNumber(pdf_document, dest[0] as { num: number, gen: number }, cache),
 97 | 					o.items ? await parseOutline(pdf_document, o.items, cache) : undefined
 98 | 				));
 99 | 			} else if (Number.isInteger(dest[0])) {
100 | 				outline.push(new PageNumberOutline(o.title, dest[0], o.items ? await parseOutline(pdf_document, o.items, cache) : undefined));
101 | 			} else {
102 | 				// TODO: ?
103 | 			}
104 | 		}
105 | 	}
106 | 	return outline;
107 | }
108 | 
/**
 * looks for an installed canvas package and loads the matching implementation
 *
 * the packages are tried in the order canvas, @napi-rs/canvas, pureimage;
 * a package that cannot be resolved or whose implementation fails to load is skipped
 *
 * @returns the first implementation that could be loaded or null if there is none
 */
async function getInstalledCanvasApi(): Promise<CanvasApiConstructor<CanvasApi> | null> {
	const loaders: (() => Promise<CanvasApiConstructor<CanvasApi>>)[] = [
		async () => {
			require.resolve('canvas');
			return (await import('./nodecanvas')).NodeCanvas;
		},
		async () => {
			require.resolve('@napi-rs/canvas');
			return (await import('./nodeskiacanvas')).NodeSkiaCanvas;
		},
		async () => {
			require.resolve('pureimage');
			return (await import('./pureimagecanvas')).PureimageCanvas;
		},
	];
	for (const load of loaders) {
		try {
			// awaited inside of the try so that a failing import continues with the next package
			return await load();
		} catch (_e) { }
	}
	return null;
}
124 | 
/**
 * loads the tesseract.js implementation if the tesseract.js package can be resolved
 *
 * @returns the implementation or null if the package is missing or the implementation fails to load
 */
async function getInstalledOcrApi(): Promise<OcrApiConstructor<OcrApi> | null> {
	let ocrApi: OcrApiConstructor<OcrApi> | null = null;
	try {
		require.resolve('tesseract.js');
		const ocrModule = await import('./tesseractjsocr');
		ocrApi = ocrModule.TesseractJsOcr;
	} catch (_e) { }
	return ocrApi;
}
132 | 
/**
 * the extractor for the data of the pdf
 */
export class PdfDataExtractor {
	private constructor(
		private readonly pdf_document: PDFDocumentProxy,
		private readonly canvasApi: CanvasApiConstructor<CanvasApi> | null,
		private readonly ocrApi: OcrApiConstructor<OcrApi> | null,
	) { }

	/**
	 * get the extractor for the data
	 *
	 * @param {Uint8Array} data - the binary data file
	 * @param {PdfDataExtractorOptions} [options={}] - the options on how to open the data in the extractor
	 * @returns {Promise<PdfDataExtractor>} a promise that is resolved with a {PdfDataExtractor} object to pull the extracted data from
	 */
	static async get(data: Uint8Array, options: PdfDataExtractorOptions = {}): Promise<PdfDataExtractor> {
		// a Buffer is copied into a plain Uint8Array before it is handed to the pdf loader
		if (data instanceof Buffer) {
			data = new Uint8Array(data);
		}
		const pdf_document: PDFDocumentProxy = await getDocument({
			data: data,
			password: options.password,
			verbosity: options.verbosity ?? VerbosityLevel.ERRORS,
			isEvalSupported: false,
		}).promise;
		// explicitly passed apis win, otherwise the installed packages are probed
		const canvasApi: CanvasApiConstructor<CanvasApi> | null = options.canvasApi ?? await getInstalledCanvasApi();
		const ocrApi: OcrApiConstructor<OcrApi> | null = options.ocrApi ?? await getInstalledOcrApi();
		return new PdfDataExtractor(pdf_document, canvasApi, ocrApi);
	}

	/**
	 * get the fingerprint
	 *
	 * @returns {string | null} the first fingerprint of the document
	 */
	get fingerprint(): string | null {
		return this.pdf_document.fingerprints[0];
	}

	/**
	 * get the number of pages
	 *
	 * @returns {number} the number of pages
	 */
	get pages(): number {
		return this.pdf_document.numPages;
	}

	/**
	 * get the permission flags
	 *
	 * @returns {Promise<Permissions | null>} a promise that is resolved with a {Permissions | null} object that contains the permission flags for the PDF,
	 * null if the document returns no permission list
	 */
	async getPermissions(): Promise<Permissions | null> {
		const permission_flag_array: number[] | null = await this.pdf_document.getPermissions();
		if (permission_flag_array == null) return null;
		// a permission is granted when its flag is part of the returned list
		const has = (flag: number): boolean => permission_flag_array.includes(flag);
		return {
			assemble: has(PermissionFlag.ASSEMBLE),
			copy: has(PermissionFlag.COPY),
			copyForAccessibility: has(PermissionFlag.COPY_FOR_ACCESSIBILITY),
			fillInteractiveForms: has(PermissionFlag.FILL_INTERACTIVE_FORMS),
			modifyAnnotations: has(PermissionFlag.MODIFY_ANNOTATIONS),
			print: has(PermissionFlag.PRINT),
			printHQ: has(PermissionFlag.PRINT_HIGH_QUALITY),
			modifyContents: has(PermissionFlag.MODIFY_CONTENTS),
		};
	}

	/**
	 * get the text
	 *
	 * @param {number|number[]|((pageNumber: number) => boolean)} [pages] - can either be the number of pages to be read,
	 * a number array with the specific pages (sorted by page number)
	 * or a filter function (return true to parse the page)
	 * @param {boolean|Sort} [sort=false] - sort the text by text coordinates
	 * @returns {Promise<string[]>} a promise that is resolved with a {string[]} array with the extracted text per page,
	 * a page that could not be loaded results in an empty string
	 */
	async getText(pages?: number | number[] | ((pageNumber: number) => boolean), sort: boolean | Sort = false): Promise<string[]> {
		const pageData: (PdfPageData | null)[] = await this.getPageData(pages);
		return Promise.all(pageData.map(async (page: PdfPageData | null) => page == null ? '' : page.toText(sort)));
	}

	/**
	 * get the page data
	 *
	 * @param {number|number[]|((pageNumber: number) => boolean)} [pages] - can either be the number of pages to be read,
	 * a number array with the specific pages (duplicates are removed and the pages are read sorted by page number,
	 * page numbers above the page count are skipped)
	 * or a filter function (return true to parse the page)
	 * @returns {Promise<(PdfPageData | null)[]>} a promise that is resolved with one entry per selected page,
	 * the entry is null if the page could not be loaded
	 */
	async getPageData(pages?: number | number[] | ((pageNumber: number) => boolean)): Promise<(PdfPageData | null)[]> {
		const page_array: (PdfPageData | null)[] = [];
		const numPages: number = this.pdf_document.numPages;

		if (pages === undefined) {
			for (let pageNumber: number = 1; pageNumber <= numPages; pageNumber++) {
				page_array.push(await this.loadPageData(pageNumber));
			}
		} else if (typeof (pages) === 'number') {
			// never read more pages than the document has
			const counter: number = pages > numPages ? numPages : pages;

			for (let pageNumber: number = 1; pageNumber <= counter; pageNumber++) {
				page_array.push(await this.loadPageData(pageNumber));
			}
		} else if (typeof (pages) === 'function') {
			for (let pageNumber: number = 1; pageNumber <= numPages; pageNumber++) {
				if (pages(pageNumber)) {
					page_array.push(await this.loadPageData(pageNumber));
				}
			}
		} else {
			// work on a deduplicated and sorted copy, the array of the caller is not changed
			const uniquePages: number[] = pages.filter((value: number, index: number, self: number[]) => self.indexOf(value) === index).sort((a: number, b: number) => a - b);
			for (const pageNumber of uniquePages) {
				if (pageNumber <= numPages) {
					page_array.push(await this.loadPageData(pageNumber));
				}
			}
		}

		return page_array;
	}

	/**
	 * loads a single page and wraps it into a {PdfPageData}
	 *
	 * @param {number} pageNumber - the page number that is passed to the document
	 * @returns {Promise<PdfPageData | null>} the page data or null if the page could not be loaded
	 */
	private async loadPageData(pageNumber: number): Promise<PdfPageData | null> {
		const page: PDFPageProxy | null = await this.pdf_document.getPage(pageNumber).catch(() => null);
		return page == null ? null : new PdfPageData(page, this.canvasApi, this.ocrApi);
	}

	/**
	 * get the outline/bookmarks
	 *
	 * @returns {Promise<Outline[] | null>} a promise that is resolved with a {Outline[]} array with information from the tree outline,
	 * null if the document returns no outline
	 */
	async getOutline(): Promise<Outline[] | null> {
		const outlineData: RawOutline[] = await this.pdf_document.getOutline();
		if (outlineData == null) return null;
		// every call starts with an empty cache for the resolved page references
		return parseOutline(this.pdf_document, outlineData, {});
	}

	/**
	 * get the metadata
	 *
	 * @returns {Promise<MetadataInfo | null>} a promise that is resolved with a {MetadataInfo | null} object with information from the metadata section,
	 * null if the metadata could not be read
	 */
	async getMetadata(): Promise<MetadataInfo | null> {
		return await this.pdf_document.getMetadata().catch(() => null) as MetadataInfo | null;
	}

	/**
	 * close the extractor
	 *
	 * @returns {Promise<void>} a promise that is resolved when destruction is completed
	 */
	async close(): Promise<void> {
		return this.pdf_document.destroy();
	}
}


--------------------------------------------------------------------------------
/src/pdfpagedata.ts:
--------------------------------------------------------------------------------
  1 | import { PDFPageProxy, TextContent, TextItem } from 'pdfjs-dist/types/src/display/api';
  2 | import { OCRLang, Sort } from './types';
  3 | import { PageViewport } from 'pdfjs-dist/types/src/display/display_utils';
  4 | import { CanvasApi, CanvasApiConstructor } from './canvasapi';
  5 | import { OcrApi, OcrApiConstructor } from './ocrapi';
  6 | 
  7 | /**
  8 |  * pdf data information per page
  9 |  */
 10 | export class PdfPageData {
 11 | 	/**
 12 | 	 * @internal
 13 | 	 */
 14 | 	public constructor(
 15 | 		private page: PDFPageProxy,
 16 | 		private readonly canvasApi: CanvasApiConstructor<CanvasApi> | null,
 17 | 		private readonly ocrApi: OcrApiConstructor<OcrApi> | null,
 18 | 	) { }
 19 | 
 20 | 	/**
 21 | 	 * get the text of the page
 22 | 	 * 
 23 | 	 * @param {boolean|Sort} [sort=false] - sort the text by text coordinates
 24 | 	 * @returns {Promise<string>} a promise that is resolved with a {string} with the extracted text of the page
 25 | 	 */
 26 | 	public async toText(sort: boolean | Sort = false): Promise<string> {
 27 | 		const sortOption: Sort | null = typeof sort === 'boolean' ? (sort ? Sort.ASC : null) : sort;
 28 | 		return this.page.getTextContent({
 29 | 			disableNormalization: false,
 30 | 			includeMarkedContent: false,
 31 | 		}).then((textContent: TextContent) => {
 32 | 			const items: TextItem[] = textContent.items as TextItem[];
 33 | 			/*
 34 | 				transform is a array with a transform matrix [scale x,shear x,shear y,scale y,offset x,offset y]
 35 | 			
 36 | 				0,1         1,1
 37 | 				  -----------
 38 | 				  |         |
 39 | 				  |         |
 40 | 				  |   pdf   |
 41 | 				  |         |
 42 | 				  |         |
 43 | 				  -----------
 44 | 				0,0         1,0
 45 | 			*/
 46 | 
 47 | 			//coordinate based sorting
 48 | 			if (sortOption !== null) {
 49 | 				if (sortOption === Sort.ASC) {
 50 | 					items.sort((e1: TextItem, e2: TextItem) => {
 51 | 						if (e1.transform[5] < e2.transform[5]) return 1;
 52 | 						else if (e1.transform[5] > e2.transform[5]) return -1;
 53 | 						else if (e1.transform[4] < e2.transform[4]) return -1;
 54 | 						else if (e1.transform[4] > e2.transform[4]) return 1;
 55 | 						else return 0;
 56 | 					});
 57 | 				} else {
 58 | 					items.sort((e1: TextItem, e2: TextItem) => {
 59 | 						if (e1.transform[5] < e2.transform[5]) return -1;
 60 | 						else if (e1.transform[5] > e2.transform[5]) return 1;
 61 | 						else if (e1.transform[4] < e2.transform[4]) return 1;
 62 | 						else if (e1.transform[4] > e2.transform[4]) return -1;
 63 | 						else return 0;
 64 | 					});
 65 | 				}
 66 | 			}
 67 | 
 68 | 			let lastLineY: number = -1, text: string = '';
 69 | 			for (const item of items) {
 70 | 				if (lastLineY === -1 || lastLineY == item.transform[5]) {
 71 | 					text += item.str;
 72 | 					// TODO if spaced by coordinates (x + text width + space width = next x)
 73 | 					//textContent.styles[item.fontName];
 74 | 					//dummyContext.font = '';
 75 | 					//dummyContext.measureText(item.str);
 76 | 				} else {
 77 | 					text += '\n' + item.str;
 78 | 				}
 79 | 				lastLineY = item.transform[5];
 80 | 			}
 81 | 			return text;
 82 | 		}, () => '');
 83 | 	}
 84 | 
 85 | 	/**
 86 | 	 * recognizes the text from the image information of this pdf page
 87 | 	 * requires node-canvas/node-pureimage and tesseract.js as additional installation
 88 | 	 * 
 89 | 	 * @param {OCRLang[]} langs - the language traineddata used for recognition
 90 | 	 * @returns {Promise<string>} the result as text
 91 | 	 */
 92 | 	public async ocr(langs: OCRLang[]): Promise<string> {
 93 | 		if (!this.ocrApi) throw new Error('OcrFactory.ocrApi is not set (tesseractjs)');
 94 | 		const ocr: OcrApi = new this.ocrApi();
 95 | 		const result: string[] = await ocr.ocrBuffers([await this.toJPEG()], langs);
 96 | 		return result[0];
 97 | 	}
 98 | 
 99 | 	/**
100 | 	 * creates a canvas and renders 
101 | 	 *
102 | 	 * @param {T} canvasApi - the canvas api that is used to create the canvas
103 | 	 * @returns {Promise<T>} the canvas
104 | 	 */
105 | 	public async toCanvasApi<T extends CanvasApi>(canvasApi: CanvasApiConstructor<T>): Promise<T> {
106 | 		const viewport: PageViewport = this.page.getViewport({ scale: 1.0 });
107 | 		const canvas: T = new canvasApi(viewport.width, viewport.height);
108 | 		await this.page.render({
109 | 			canvasContext: canvas.createContext(),
110 | 			viewport: viewport,
111 | 		}).promise;
112 | 		return canvas;
113 | 	}
114 | 
115 | 	/**
116 | 	 * converts to a jpeg image
117 | 	 *
118 | 	 * @param {number} [quality=0.8] - the quality of the image (0.0-1.0)
119 | 	 * @returns {Promise<Buffer>} the jpeg image as a {Buffer}
120 | 	 */
121 | 	public async toJPEG(quality: number = 0.8): Promise<Buffer> {
122 | 		if (!this.canvasApi) throw new Error('canvasApi is not set (node-canvas or pureimage is not installed)');
123 | 		return (await this.toCanvasApi(this.canvasApi)).toJPEG(quality);
124 | 	}
125 | 
126 | 	/**
127 | 	 * converts to a png image
128 | 	 *
129 | 	 * @returns {Promise<Buffer>} the png image as a {Buffer}
130 | 	 */
131 | 	public async toPNG(): Promise<Buffer> {
132 | 		if (!this.canvasApi) throw new Error('canvasApi is not set (node-canvas or pureimage is not installed)');
133 | 		return (await this.toCanvasApi(this.canvasApi)).toPNG();
134 | 	}
135 | 
136 | 	/**
137 | 	 * close the page data
138 | 	 * @returns {boolean} — if close was successfully
139 | 	 */
140 | 	public close(): boolean {
141 | 		return this.page.cleanup();
142 | 	}
143 | }


--------------------------------------------------------------------------------
/src/pureimagecanvas.ts:
--------------------------------------------------------------------------------
 1 | //@ts-ignore: ignore import errors because its dynamicly loaded from pdfdataextractor
 2 | import { encodeJPEGToStream, encodePNGToStream, make } from 'pureimage';
 3 | //@ts-ignore: ignore import errors because its dynamicly loaded from pdfdataextractor
 4 | import { Bitmap } from 'pureimage/types/bitmap';
 5 | import { PassThrough } from 'stream';
 6 | import { CanvasApi } from './canvasapi';
 7 | 
 8 | /**
 9 |  * default implementation for pureimage
10 |  * look at the {CanvasApi} doc
11 |  */
12 | export class PureimageCanvas implements CanvasApi {
13 | 	private bitmap: Bitmap;
14 | 	/**
15 | 	 * @internal
16 | 	 */
17 | 	public constructor(width: number, height: number) {
18 | 		this.bitmap = make(width, height);
19 | 	}
20 | 	/**
21 | 	 * @internal
22 | 	 */
23 | 	public async toPNG(): Promise<Buffer> {
24 | 		const result: Uint8Array[] = [];
25 | 		const stream: PassThrough = new PassThrough();
26 | 		stream.on('data', (data: Uint8Array) => result.push(data));
27 | 		await encodePNGToStream(this.bitmap, stream);
28 | 		return Buffer.concat(result);
29 | 	}
30 | 	/**
31 | 	 * @internal
32 | 	 */
33 | 	public async toJPEG(quality?: number): Promise<Buffer> {
34 | 		const result: Uint8Array[] = [];
35 | 		const stream: PassThrough = new PassThrough();
36 | 		stream.on('data', (data: Uint8Array) => result.push(data));
37 | 		await encodeJPEGToStream(this.bitmap, stream, quality);
38 | 		return Buffer.concat(result);
39 | 	}
40 | 	/**
41 | 	 * @internal
42 | 	 */
43 | 	public createContext(): CanvasRenderingContext2D {
44 | 		return this.bitmap.getContext('2d');
45 | 	}
46 | 	/**
47 | 	 * @internal
48 | 	 */
49 | 	public reset(width: number, height: number): void {
50 | 		this.bitmap.width = width;
51 | 		this.bitmap.height = height;
52 | 	}
53 | 	/**
54 | 	 * @internal
55 | 	 */
56 | 	public destroy(): void {
57 | 		this.bitmap.width = 0;
58 | 		this.bitmap.height = 0;
59 | 	}
60 | }


--------------------------------------------------------------------------------
/src/tesseractjsocr.ts:
--------------------------------------------------------------------------------
 1 | //@ts-ignore: ignore import errors because its dynamicly loaded from pdfdataextractor
 2 | import { createScheduler, createWorker, RecognizeResult, Scheduler, Worker } from 'tesseract.js';
 3 | import { OcrApi } from './ocrapi';
 4 | import { OCRLang } from './types';
 5 | 
 6 | /**
 7 |  * implementation for tesseractjs
 8 |  */
 9 | export class TesseractJsOcr implements OcrApi {
10 | 	/**
11 | 	 * recognize characters of buffers
12 | 	 * 
13 | 	 * @param {Buffer[]} buffers - the image buffers
14 | 	 * @param {OCRLang[]} langs - the language traineddata used for recognition
15 | 	 * @returns {Promise<string[]>} an array with text from each side
16 | 	 */
17 | 	async ocrBuffers(buffers: Buffer[], langs: OCRLang[]): Promise<string[]> {
18 | 		if (buffers.length == 0) return [];
19 | 		if (buffers.length == 1) {
20 | 			const lang: string = langs.join('+');
21 | 			const worker: Worker = await createWorker(lang);
22 | 			const data: RecognizeResult = await worker.recognize(buffers[0]);
23 | 			await worker.terminate();
24 | 			return [data.data.text];
25 | 		}
26 | 		const lang: string = langs.join('+');
27 | 		const scheduler: Scheduler = createScheduler();
28 | 		for (let i: number = 0; i < buffers.length; i++) {
29 | 			const worker: Worker = await createWorker(lang);
30 | 			scheduler.addWorker(worker);
31 | 		}
32 | 		const result: RecognizeResult[] = await Promise.all(buffers.map(async (buffer: Buffer) => scheduler.addJob('recognize', buffer))) as RecognizeResult[];
33 | 		await scheduler.terminate();
34 | 		return result.map((r: RecognizeResult) => r.data.text);
35 | 	}
36 | }


--------------------------------------------------------------------------------
/src/types.ts:
--------------------------------------------------------------------------------
  1 | import { VerbosityLevel as RawVerbosityLevel } from 'pdfjs-dist/legacy/build/pdf.mjs';
  2 | 
/**
 * the logging levels for the verbosity options,
 * every member takes its value from the pdf.js VerbosityLevel member with the same name
 */
export enum VerbosityLevel {
	/**
	 * logs all errors
	 */
	ERRORS = RawVerbosityLevel.ERRORS,
	/**
	 * logs all infos
	 */
	INFOS = RawVerbosityLevel.INFOS,
	/**
	 * logs all warnings
	 */
	WARNINGS = RawVerbosityLevel.WARNINGS,
}
 17 | 
/**
 * the direction used when the text items of a page are sorted by their coordinates
 */
export enum Sort {
	/**
	 * if it should sort ascending
	 * (items with a higher y offset first, items with the same y offset by rising x offset)
	 */
	ASC,
	/**
	 * if it should sort descending
	 * (the exact reverse of the ascending order)
	 */
	DESC
}
 28 | 
 29 | export enum OCRLang {
 30 | 	/**
 31 | 	 * AFR
 32 | 	 */
 33 | 	AFR = 'afr',
 34 | 	/**
 35 | 	 * AMH
 36 | 	 */
 37 | 	AMH = 'amh',
 38 | 	/**
 39 | 	 * ARA
 40 | 	 */
 41 | 	ARA = 'ara',
 42 | 	/**
 43 | 	 * ASM
 44 | 	 */
 45 | 	ASM = 'asm',
 46 | 	/**
 47 | 	 * AZE
 48 | 	 */
 49 | 	AZE = 'aze',
 50 | 	/**
 51 | 	 * AZE_CYRL
 52 | 	 */
 53 | 	AZE_CYRL = 'aze_cyrl',
 54 | 	/**
 55 | 	 * BEL
 56 | 	 */
 57 | 	BEL = 'bel',
 58 | 	/**
 59 | 	 * BEN
 60 | 	 */
 61 | 	BEN = 'ben',
 62 | 	/**
 63 | 	 * BOD
 64 | 	 */
 65 | 	BOD = 'bod',
 66 | 	/**
 67 | 	 * BOS
 68 | 	 */
 69 | 	BOS = 'bos',
 70 | 	/**
 71 | 	 * BUL
 72 | 	 */
 73 | 	BUL = 'bul',
 74 | 	/**
 75 | 	 * CAT
 76 | 	 */
 77 | 	CAT = 'cat',
 78 | 	/**
 79 | 	 * CEB
 80 | 	 */
 81 | 	CEB = 'ceb',
 82 | 	/**
 83 | 	 * CES
 84 | 	 */
 85 | 	CES = 'ces',
 86 | 	/**
 87 | 	 * CHI_SIM
 88 | 	 */
 89 | 	CHI_SIM = 'chi_sim',
 90 | 	/**
 91 | 	 * CHI_TRA
 92 | 	 */
 93 | 	CHI_TRA = 'chi_tra',
 94 | 	/**
 95 | 	 * CHR
 96 | 	 */
 97 | 	CHR = 'chr',
 98 | 	/**
 99 | 	 * CYM
100 | 	 */
101 | 	CYM = 'cym',
102 | 	/**
103 | 	 * DAN
104 | 	 */
105 | 	DAN = 'dan',
106 | 	/**
107 | 	 * DEU
108 | 	 */
109 | 	DEU = 'deu',
110 | 	/**
111 | 	 * DZO
112 | 	 */
113 | 	DZO = 'dzo',
114 | 	/**
115 | 	 * ELL
116 | 	 */
117 | 	ELL = 'ell',
118 | 	/**
119 | 	 * ENG
120 | 	 */
121 | 	ENG = 'eng',
122 | 	/**
123 | 	 * ENM
124 | 	 */
125 | 	ENM = 'enm',
126 | 	/**
127 | 	 * EPO
128 | 	 */
129 | 	EPO = 'epo',
130 | 	/**
131 | 	 * EST
132 | 	 */
133 | 	EST = 'est',
134 | 	/**
135 | 	 * EUS
136 | 	 */
137 | 	EUS = 'eus',
138 | 	/**
139 | 	 * FAS
140 | 	 */
141 | 	FAS = 'fas',
142 | 	/**
143 | 	 * FIN
144 | 	 */
145 | 	FIN = 'fin',
146 | 	/**
147 | 	 * FRA
148 | 	 */
149 | 	FRA = 'fra',
150 | 	/**
151 | 	 * FRK
152 | 	 */
153 | 	FRK = 'frk',
154 | 	/**
155 | 	 * FRM
156 | 	 */
157 | 	FRM = 'frm',
158 | 	/**
159 | 	 * GLE
160 | 	 */
161 | 	GLE = 'gle',
162 | 	/**
163 | 	 * GLG
164 | 	 */
165 | 	GLG = 'glg',
166 | 	/**
167 | 	 * GRC
168 | 	 */
169 | 	GRC = 'grc',
170 | 	/**
171 | 	 * GUJ
172 | 	 */
173 | 	GUJ = 'guj',
174 | 	/**
175 | 	 * HAT
176 | 	 */
177 | 	HAT = 'hat',
178 | 	/**
179 | 	 * HEB
180 | 	 */
181 | 	HEB = 'heb',
182 | 	/**
183 | 	 * HIN
184 | 	 */
185 | 	HIN = 'hin',
186 | 	/**
187 | 	 * HRV
188 | 	 */
189 | 	HRV = 'hrv',
190 | 	/**
191 | 	 * HUN
192 | 	 */
193 | 	HUN = 'hun',
194 | 	/**
195 | 	 * IKU
196 | 	 */
197 | 	IKU = 'iku',
198 | 	/**
199 | 	 * IND
200 | 	 */
201 | 	IND = 'ind',
202 | 	/**
203 | 	 * ISL
204 | 	 */
205 | 	ISL = 'isl',
206 | 	/**
207 | 	 * ITA
208 | 	 */
209 | 	ITA = 'ita',
210 | 	/**
211 | 	 * ITA_OLD
212 | 	 */
213 | 	ITA_OLD = 'ita_old',
214 | 	/**
215 | 	 * JAV
216 | 	 */
217 | 	JAV = 'jav',
218 | 	/**
219 | 	 * JPN
220 | 	 */
221 | 	JPN = 'jpn',
222 | 	/**
223 | 	 * KAN
224 | 	 */
225 | 	KAN = 'kan',
226 | 	/**
227 | 	 * KAT
228 | 	 */
229 | 	KAT = 'kat',
230 | 	/**
231 | 	 * KAT_OLD
232 | 	 */
233 | 	KAT_OLD = 'kat_old',
234 | 	/**
235 | 	 * KAZ
236 | 	 */
237 | 	KAZ = 'kaz',
238 | 	/**
239 | 	 * KHM
240 | 	 */
241 | 	KHM = 'khm',
242 | 	/**
243 | 	 * KIR
244 | 	 */
245 | 	KIR = 'kir',
246 | 	/**
247 | 	 * KOR
248 | 	 */
249 | 	KOR = 'kor',
250 | 	/**
251 | 	 * KUR
252 | 	 */
253 | 	KUR = 'kur',
254 | 	/**
255 | 	 * LAO
256 | 	 */
257 | 	LAO = 'lao',
258 | 	/**
259 | 	 * LAT
260 | 	 */
261 | 	LAT = 'lat',
262 | 	/**
263 | 	 * LAV
264 | 	 */
265 | 	LAV = 'lav',
266 | 	/**
267 | 	 * LIT
268 | 	 */
269 | 	LIT = 'lit',
270 | 	/**
271 | 	 * MAL
272 | 	 */
273 | 	MAL = 'mal',
274 | 	/**
275 | 	 * MAR
276 | 	 */
277 | 	MAR = 'mar',
278 | 	/**
279 | 	 * MKD
280 | 	 */
281 | 	MKD = 'mkd',
282 | 	/**
283 | 	 * MLT
284 | 	 */
285 | 	MLT = 'mlt',
286 | 	/**
287 | 	 * MSA
288 | 	 */
289 | 	MSA = 'msa',
290 | 	/**
291 | 	 * MYA
292 | 	 */
293 | 	MYA = 'mya',
294 | 	/**
295 | 	 * NEP
296 | 	 */
297 | 	NEP = 'nep',
298 | 	/**
299 | 	 * NLD
300 | 	 */
301 | 	NLD = 'nld',
302 | 	/**
303 | 	 * NOR
304 | 	 */
305 | 	NOR = 'nor',
306 | 	/**
307 | 	 * ORI
308 | 	 */
309 | 	ORI = 'ori',
310 | 	/**
311 | 	 * PAN
312 | 	 */
313 | 	PAN = 'pan',
314 | 	/**
315 | 	 * POL
316 | 	 */
317 | 	POL = 'pol',
318 | 	/**
319 | 	 * POR
320 | 	 */
321 | 	POR = 'por',
322 | 	/**
323 | 	 * PUS
324 | 	 */
325 | 	PUS = 'pus',
326 | 	/**
327 | 	 * RON
328 | 	 */
329 | 	RON = 'ron',
330 | 	/**
331 | 	 * RUS
332 | 	 */
333 | 	RUS = 'rus',
334 | 	/**
335 | 	 * SAN
336 | 	 */
337 | 	SAN = 'san',
338 | 	/**
339 | 	 * SIN
340 | 	 */
341 | 	SIN = 'sin',
342 | 	/**
343 | 	 * SLK
344 | 	 */
345 | 	SLK = 'slk',
346 | 	/**
347 | 	 * SLV
348 | 	 */
349 | 	SLV = 'slv',
350 | 	/**
351 | 	 * SPA
352 | 	 */
353 | 	SPA = 'spa',
354 | 	/**
355 | 	 * SPA_OLD
356 | 	 */
357 | 	SPA_OLD = 'spa_old',
358 | 	/**
359 | 	 * SQI
360 | 	 */
361 | 	SQI = 'sqi',
362 | 	/**
363 | 	 * SRP
364 | 	 */
365 | 	SRP = 'srp',
366 | 	/**
367 | 	 * SRP_LATN
368 | 	 */
369 | 	SRP_LATN = 'srp_latn',
370 | 	/**
371 | 	 * SWA
372 | 	 */
373 | 	SWA = 'swa',
374 | 	/**
375 | 	 * SWE
376 | 	 */
377 | 	SWE = 'swe',
378 | 	/**
379 | 	 * SYR
380 | 	 */
381 | 	SYR = 'syr',
382 | 	/**
383 | 	 * TAM
384 | 	 */
385 | 	TAM = 'tam',
386 | 	/**
387 | 	 * TEL
388 | 	 */
389 | 	TEL = 'tel',
390 | 	/**
391 | 	 * TGK
392 | 	 */
393 | 	TGK = 'tgk',
394 | 	/**
395 | 	 * TGL
396 | 	 */
397 | 	TGL = 'tgl',
398 | 	/**
399 | 	 * THA
400 | 	 */
401 | 	THA = 'tha',
402 | 	/**
403 | 	 * TIR
404 | 	 */
405 | 	TIR = 'tir',
406 | 	/**
407 | 	 * TUR
408 | 	 */
409 | 	TUR = 'tur',
410 | 	/**
411 | 	 * UIG
412 | 	 */
413 | 	UIG = 'uig',
414 | 	/**
415 | 	 * UKR
416 | 	 */
417 | 	UKR = 'ukr',
418 | 	/**
419 | 	 * URD
420 | 	 */
421 | 	URD = 'urd',
422 | 	/**
423 | 	 * UZB
424 | 	 */
425 | 	UZB = 'uzb',
426 | 	/**
427 | 	 * UZB_CYRL
428 | 	 */
429 | 	UZB_CYRL = 'uzb_cyrl',
430 | 	/**
431 | 	 * VIE
432 | 	 */
433 | 	VIE = 'vie',
434 | 	/**
435 | 	 * YID
436 | 	 */
437 | 	YID = 'yid'
438 | }
439 | 
440 | export interface Permissions {
441 | 	/**
442 | 	 * whether the document may be assembled
443 | 	 * 
444 | 	 * @readonly
445 | 	 * @type {boolean}
446 | 	 */
447 | 	readonly assemble: boolean,
448 | 	/**
449 | 	 * whether the content may be copied
450 | 	 * 
451 | 	 * @readonly
452 | 	 * @type {boolean}
453 | 	 */
454 | 	readonly copy: boolean,
455 | 	/**
456 | 	 * whether interactive forms may be filled in
457 | 	 * 
458 | 	 * @readonly
459 | 	 * @type {boolean}
460 | 	 */
461 | 	readonly fillInteractiveForms: boolean,
462 | 	/**
463 | 	 * whether annotations may be modified
464 | 	 * 
465 | 	 * @readonly
466 | 	 * @type {boolean}
467 | 	 */
468 | 	readonly modifyAnnotations: boolean,
469 | 	/**
470 | 	 * whether the contents may be modified
471 | 	 * 
472 | 	 * @readonly
473 | 	 * @type {boolean}
474 | 	 */
475 | 	readonly modifyContents: boolean,
476 | 	/**
477 | 	 * whether the document may be printed
478 | 	 * 
479 | 	 * @readonly
480 | 	 * @type {boolean}
481 | 	 */
482 | 	readonly print: boolean,
483 | 	/**
484 | 	 * whether the document may be printed in high quality
485 | 	 * 
486 | 	 * @readonly
487 | 	 * @type {boolean}
488 | 	 */
489 | 	readonly printHQ: boolean,
490 | 	/**
491 | 	 * whether the content may be copied for accessibility purposes
492 | 	 * 
493 | 	 * @readonly
494 | 	 * @type {boolean}
495 | 	 */
496 | 	readonly copyForAccessibility: boolean,
497 | }
498 | 
499 | export interface Metadata {
500 | 	/**
501 | 	 * get the raw metadata
502 | 	 * 
503 | 	 * @returns {string} the raw metadata
504 | 	 */
505 | 	getRaw(): string;
506 | 	/**
507 | 	 * get data by name
508 | 	 * @param {string} name - the name of the data to get
509 | 	 * @returns {string | string[]} the data
510 | 	 */
511 | 	get(name: string): string | string[];
512 | 	/**
513 | 	 * get all data
514 | 	 * 
515 | 	 * @returns {{ [key: string]: string | string[] | undefined }} all data, keyed by name
516 | 	 */
517 | 	getAll(): { [key: string]: string | string[] | undefined };
518 | 	/**
519 | 	 * check whether data with the given name is available
520 | 	 * @param {string} name - the name of the data to check
521 | 	 * @returns {boolean} true if the data is available
522 | 	 */
523 | 	has(name: string): boolean;
524 | }
525 | 
526 | export interface MetadataInfo {
527 | 	/**
528 | 	 * the meta information of the document (see {@link Info})
529 | 	 */
530 | 	info: Info;
531 | 	/**
532 | 	 * the metadata of the document (see {@link Metadata})
533 | 	 */
534 | 	metadata: Metadata;
535 | }
536 | 
537 | export interface Name {
538 | 	/**
539 | 	 * the name
540 | 	 * @readonly
541 | 	 * @type {string}
542 | 	 */
543 | 	readonly name: string
544 | }
545 | 
546 | export interface Info {
547 | 	/**
548 | 	 * the title
549 | 	 * 
550 | 	 * @readonly
551 | 	 * @type {string | undefined}
552 | 	 */
553 | 	readonly Title?: string,
554 | 	/**
555 | 	 * the author
556 | 	 * 
557 | 	 * @readonly
558 | 	 * @type {string | undefined}
559 | 	 */
560 | 	readonly Author?: string,
561 | 	/**
562 | 	 * the subject
563 | 	 * 
564 | 	 * @readonly
565 | 	 * @type {string | undefined}
566 | 	 */
567 | 	readonly Subject?: string,
568 | 	/**
569 | 	 * the keywords
570 | 	 * 
571 | 	 * @readonly
572 | 	 * @type {string | undefined}
573 | 	 */
574 | 	readonly Keywords?: string,
575 | 	/**
576 | 	 * the creator
577 | 	 * 
578 | 	 * @readonly
579 | 	 * @type {string | undefined}
580 | 	 */
581 | 	readonly Creator?: string,
582 | 	/**
583 | 	 * the producer
584 | 	 * 
585 | 	 * @readonly
586 | 	 * @type {string | undefined}
587 | 	 */
588 | 	readonly Producer?: string,
589 | 	/**
590 | 	 * the creation date (kept as a string)
591 | 	 * 
592 | 	 * @readonly
593 | 	 * @type {string | undefined}
594 | 	 */
595 | 	readonly CreationDate?: string,
596 | 	/**
597 | 	 * the modification date (kept as a string)
598 | 	 * 
599 | 	 * @readonly
600 | 	 * @type {string | undefined}
601 | 	 */
602 | 	readonly ModDate?: string,
603 | 	/**
604 | 	 * the trapped entry, given as a {@link Name}
605 | 	 * 
606 | 	 * @readonly
607 | 	 * @type {Name | undefined}
608 | 	 */
609 | 	readonly Trapped?: Name,
610 | 	/**
611 | 	 * the format version
612 | 	 * 
613 | 	 * @readonly
614 | 	 * @type {string | undefined}
615 | 	 */
616 | 	readonly PDFFormatVersion?: string,
617 | 	/**
618 | 	 * whether the document is linearized
619 | 	 * 
620 | 	 * @readonly
621 | 	 * @type {boolean | undefined}
622 | 	 */
623 | 	readonly IsLinearized?: boolean,
624 | 	/**
625 | 	 * whether an acro form is present
626 | 	 * 
627 | 	 * @readonly
628 | 	 * @type {boolean | undefined}
629 | 	 */
630 | 	readonly IsAcroFormPresent?: boolean,
631 | 	/**
632 | 	 * whether an xfa form is present
633 | 	 * 
634 | 	 * @readonly
635 | 	 * @type {boolean | undefined}
636 | 	 */
637 | 	readonly IsXFAPresent?: boolean,
638 | 	/**
639 | 	 * whether a collection is present
640 | 	 * 
641 | 	 * @readonly
642 | 	 * @type {boolean | undefined}
643 | 	 */
644 | 	readonly IsCollectionPresent?: boolean,
645 | 	/**
646 | 	 * whether signatures are present
647 | 	 * 
648 | 	 * @readonly
649 | 	 * @type {boolean | undefined}
650 | 	 */
651 | 	readonly IsSignaturesPresent?: boolean,
652 | 	readonly [key: string]: string | number | boolean | Name | undefined, // any further entry not listed above
653 | }
654 | 
655 | /**
656 |  * an outline entry (bookmark) of the pdf document
657 |  */
658 | export interface Outline {
659 | 	/**
660 | 	 * the title
661 | 	 * 
662 | 	 * @readonly
663 | 	 * @type {string}
664 | 	 */
665 | 	readonly title: string,
666 | 	/**
667 | 	 * the child outlines, if there are any
668 | 	 * 
669 | 	 * @readonly
670 | 	 * @type {ReadonlyArray<Outline> | undefined}
671 | 	 */
672 | 	readonly childs?: readonly Outline[],
673 | }
674 | 
675 | /**
676 |  * an outline which points to a url
677 |  */
678 | export class UrlOutline implements Outline {
679 | 	/**
680 | 	 * @param {string} title - the title
681 | 	 * @param {string} url - the url to which the outline points
682 | 	 * @param {boolean} absolute - whether the url is absolute
683 | 	 * @param {ReadonlyArray<Outline>} [childs] - the child outlines
684 | 	 */
685 | 	constructor(
686 | 		readonly title: string,
687 | 		readonly url: string,
688 | 		readonly absolute: boolean,
689 | 		readonly childs?: readonly Outline[],
690 | 	) { }
691 | }
692 | 
693 | 
694 | /**
695 |  * an outline which points to a page number
696 |  */
697 | export class PageNumberOutline implements Outline {
698 | 	/**
699 | 	 * @param {string} title - the title
700 | 	 * @param {number} page - the page number to which the outline points
701 | 	 * @param {ReadonlyArray<Outline>} [childs] - the child outlines
702 | 	 */
703 | 	constructor(
704 | 		readonly title: string,
705 | 		readonly page: number,
706 | 		readonly childs?: readonly Outline[],
707 | 	) { }
708 | }
709 | 
710 | /**
711 |  * an outline which points to another pdf
712 |  */
713 | export class PdfReferenceOutline implements Outline {
714 | 	/**
715 | 	 * @param {string} title - the title
716 | 	 * @param {string} url - the url to which the outline points
717 | 	 * @param {number} [page] - the remote page number to which the outline points (optional)
718 | 	 * @param {ReadonlyArray<Outline>} [childs] - the child outlines
719 | 	 */
720 | 	constructor(
721 | 		readonly title: string,
722 | 		readonly url: string,
723 | 		readonly page?: number,
724 | 		readonly childs?: readonly Outline[],
725 | 	) { }
726 | }


--------------------------------------------------------------------------------
/test/basic.extractor.test.ts:
--------------------------------------------------------------------------------
 1 | const PDF_TEST_FILE = './test/basic.pdf';
 2 | 
 3 | import { PdfDataExtractor, VerbosityLevel } from '../src';
 4 | import { readFileSync } from 'fs';
 5 | import { test, describe, expect } from 'vitest';
 6 | 
 7 | describe(`parse ${PDF_TEST_FILE}`, () => {
 8 | 	const buffer = readFileSync(PDF_TEST_FILE); // read once, shared by all tests of this suite
 9 | 	test('without password should fail', async () => {
10 | 		await expect(PdfDataExtractor.get(buffer)).rejects.toThrow(); // no password passed: opening must reject
11 | 	});
12 | 	test('extract basic data', async () => {
13 | 		const extractor = await PdfDataExtractor.get(buffer, {
14 | 			password: '123456',
15 | 			verbosity: VerbosityLevel.ERRORS,
16 | 		});
17 | 		// disabled debug output, kept for reference:
18 | 		//(await extractor.getPageData()).forEach(async cf => {
19 | 		//	console.log(await cf?.contentInfo()); });
20 | 		expect(extractor.pages).toEqual(2);
21 | 		const text = await extractor.getText(); // one string per page
22 | 		expect(text.length).toEqual(2);
23 | 		const first_page_lines = text[0].split('\n');
24 | 		expect(first_page_lines.length).toEqual(31);
25 | 		expect(first_page_lines[7]).toMatch(/^dapibus mattis/);
26 | 		const permissions = await extractor.getPermissions();
27 | 		expect(permissions).not.toBeNull();
28 | 		if (permissions) { // narrows the type for the property checks below
29 | 			expect(permissions.print).toEqual(true);
30 | 			expect(permissions.modifyAnnotations).toEqual(false);
31 | 		}
32 | 	});
33 | });


--------------------------------------------------------------------------------
/test/basic.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lublak/pdfdataextract/58af98ff20885e864148cd3e324669842bc4fc82/test/basic.pdf


--------------------------------------------------------------------------------
/test/basic.test.ts:
--------------------------------------------------------------------------------
  1 | const PDF_TEST_FILE = './test/basic.pdf';
  2 | 
  3 | import { PdfData, VerbosityLevel } from '../src';
  4 | import { readFileSync } from 'fs';
  5 | import { test, describe, expect } from 'vitest';
  6 | 
  7 | describe(`parse ${PDF_TEST_FILE}`, () => {
  8 | 	const buffer = readFileSync(PDF_TEST_FILE); // read once, shared by all tests of this suite
  9 | 	test('without password should fail', async () => {
 10 | 		await expect(PdfData.extract(buffer)).rejects.toThrow(); // no password passed: extraction must reject
 11 | 	});
 12 | 	test('extract basic data', async () => {
 13 | 		const data = await PdfData.extract(buffer, {
 14 | 			password: '123456',
 15 | 			verbosity: VerbosityLevel.ERRORS,
 16 | 		});
 17 | 		expect(data.pages).toEqual(2);
 18 | 		expect(data.text).toBeDefined(); // absent PdfData fields are undefined, so a null check would never fail
 19 | 		if (data.text) {
 20 | 			expect(data.text.length).toEqual(2);
 21 | 			const first_page_lines = data.text[0].split('\n');
 22 | 			expect(first_page_lines.length).toEqual(31);
 23 | 			expect(first_page_lines[7]).toMatch(/^dapibus mattis/);
 24 | 		}
 25 | 		expect(data.permissions).toBeDefined(); // same reasoning as for data.text above
 26 | 		if (data.permissions) {
 27 | 			expect(data.permissions.print).toEqual(true);
 28 | 			expect(data.permissions.modifyAnnotations).toEqual(false);
 29 | 		}
 30 | 	});
 31 | 	test('extract separated basic data', async () => {
 32 | 		let data: PdfData; // reassigned for every `get` combination below
 33 | 
 34 | 		data = await PdfData.extract(buffer, {
 35 | 			password: '123456',
 36 | 			verbosity: VerbosityLevel.ERRORS,
 37 | 			get: {
 38 | 				pages: false,
 39 | 				text: false,
 40 | 				fingerprint: false,
 41 | 				outline: false,
 42 | 				metadata: false,
 43 | 				info: false,
 44 | 				permissions: false
 45 | 			}
 46 | 		});
 47 | 		expect(data.pages).toBeUndefined();
 48 | 		expect(data.text).toBeUndefined();
 49 | 		expect(data.fingerprint).toBeUndefined();
 50 | 		expect(data.outline).toBeUndefined();
 51 | 		expect(data.metadata).toBeUndefined();
 52 | 		expect(data.info).toBeUndefined();
 53 | 		expect(data.permissions).toBeUndefined();
 54 | 
 55 | 		data = await PdfData.extract(buffer, {
 56 | 			password: '123456',
 57 | 			verbosity: VerbosityLevel.ERRORS,
 58 | 			get: {
 59 | 				pages: true,
 60 | 				text: false,
 61 | 				fingerprint: false,
 62 | 				outline: false,
 63 | 				metadata: false,
 64 | 				info: false,
 65 | 				permissions: false
 66 | 			}
 67 | 		});
 68 | 		expect(data.pages).toBeDefined();
 69 | 		expect(data.text).toBeUndefined();
 70 | 		expect(data.fingerprint).toBeUndefined();
 71 | 		expect(data.outline).toBeUndefined();
 72 | 		expect(data.metadata).toBeUndefined();
 73 | 		expect(data.info).toBeUndefined();
 74 | 		expect(data.permissions).toBeUndefined();
 75 | 
 76 | 		data = await PdfData.extract(buffer, {
 77 | 			password: '123456',
 78 | 			verbosity: VerbosityLevel.ERRORS,
 79 | 			get: {
 80 | 				pages: false,
 81 | 				text: true,
 82 | 				fingerprint: false,
 83 | 				outline: false,
 84 | 				metadata: false,
 85 | 				info: false,
 86 | 				permissions: false
 87 | 			}
 88 | 		});
 89 | 		expect(data.pages).toBeUndefined();
 90 | 		expect(data.text).toBeDefined();
 91 | 		expect(data.fingerprint).toBeUndefined();
 92 | 		expect(data.outline).toBeUndefined();
 93 | 		expect(data.metadata).toBeUndefined();
 94 | 		expect(data.info).toBeUndefined();
 95 | 		expect(data.permissions).toBeUndefined();
 96 | 
 97 | 		data = await PdfData.extract(buffer, {
 98 | 			password: '123456',
 99 | 			verbosity: VerbosityLevel.ERRORS,
100 | 			get: {
101 | 				pages: false,
102 | 				text: false,
103 | 				fingerprint: true,
104 | 				outline: false,
105 | 				metadata: false,
106 | 				info: false,
107 | 				permissions: false
108 | 			}
109 | 		});
110 | 		expect(data.pages).toBeUndefined();
111 | 		expect(data.text).toBeUndefined();
112 | 		expect(data.fingerprint).toBeDefined();
113 | 		expect(data.outline).toBeUndefined();
114 | 		expect(data.metadata).toBeUndefined();
115 | 		expect(data.info).toBeUndefined();
116 | 		expect(data.permissions).toBeUndefined();
117 | 
118 | 		data = await PdfData.extract(buffer, {
119 | 			password: '123456',
120 | 			verbosity: VerbosityLevel.ERRORS,
121 | 			get: {
122 | 				pages: false,
123 | 				text: false,
124 | 				fingerprint: false,
125 | 				outline: true,
126 | 				metadata: false,
127 | 				info: false,
128 | 				permissions: false
129 | 			}
130 | 		});
131 | 		expect(data.pages).toBeUndefined();
132 | 		expect(data.text).toBeUndefined();
133 | 		expect(data.fingerprint).toBeUndefined();
134 | 		expect(data.outline).toBeDefined();
135 | 		expect(data.metadata).toBeUndefined();
136 | 		expect(data.info).toBeUndefined();
137 | 		expect(data.permissions).toBeUndefined();
138 | 
139 | 		// TODO: the metadata-only case below is still disabled
140 | 		//data = await PdfData.extract(buffer, {
141 | 		//	password: '123456',
142 | 		//	verbosity: VerbosityLevel.ERRORS,
143 | 		//	get: {
144 | 		//		pages: false,
145 | 		//		text: false,
146 | 		//		fingerprint: false,
147 | 		//		outline: false,
148 | 		//		metadata: true,
149 | 		//		info: false,
150 | 		//		permissions: false
151 | 		//	}
152 | 		//});
153 | 		//expect(data.pages).toBeUndefined();
154 | 		//expect(data.text).toBeUndefined();
155 | 		//expect(data.fingerprint).toBeUndefined();
156 | 		//expect(data.outline).toBeUndefined();
157 | 		//expect(data.metadata).toBeDefined();
158 | 		//expect(data.info).toBeUndefined();
159 | 		//expect(data.permissions).toBeUndefined();
160 | 
161 | 		data = await PdfData.extract(buffer, {
162 | 			password: '123456',
163 | 			verbosity: VerbosityLevel.ERRORS,
164 | 			get: {
165 | 				pages: false,
166 | 				text: false,
167 | 				fingerprint: false,
168 | 				outline: false,
169 | 				metadata: false,
170 | 				info: true,
171 | 				permissions: false
172 | 			}
173 | 		});
174 | 		expect(data.pages).toBeUndefined();
175 | 		expect(data.text).toBeUndefined();
176 | 		expect(data.fingerprint).toBeUndefined();
177 | 		expect(data.outline).toBeUndefined();
178 | 		expect(data.metadata).toBeUndefined();
179 | 		expect(data.info).toBeDefined();
180 | 		expect(data.permissions).toBeUndefined();
181 | 
182 | 		data = await PdfData.extract(buffer, {
183 | 			password: '123456',
184 | 			verbosity: VerbosityLevel.ERRORS,
185 | 			get: {
186 | 				pages: false,
187 | 				text: false,
188 | 				fingerprint: false,
189 | 				outline: false,
190 | 				metadata: false,
191 | 				info: false,
192 | 				permissions: true
193 | 			}
194 | 		});
195 | 		expect(data.pages).toBeUndefined();
196 | 		expect(data.text).toBeUndefined();
197 | 		expect(data.fingerprint).toBeUndefined();
198 | 		expect(data.outline).toBeUndefined();
199 | 		expect(data.metadata).toBeUndefined();
200 | 		expect(data.info).toBeUndefined();
201 | 		expect(data.permissions).toBeDefined();
202 | 	});
203 | });


--------------------------------------------------------------------------------
/test/empty_outline.extractor.test.ts:
--------------------------------------------------------------------------------
 1 | const PDF_TEST_FILE = './test/empty_outline.pdf';
 2 | 
 3 | import { PdfDataExtractor, VerbosityLevel } from '../src';
 4 | import { readFileSync } from 'fs';
 5 | import { test, describe, expect } from 'vitest';
 6 | 
 7 | describe(`parse ${PDF_TEST_FILE}`, () => {
 8 | 	const buffer = readFileSync(PDF_TEST_FILE); // read once for the whole suite
 9 | 	test('extract empty outline', async () => {
10 | 		const extractor = await PdfDataExtractor.get(buffer, {
11 | 			verbosity: VerbosityLevel.ERRORS,
12 | 		});
13 | 		expect(await extractor.getOutline()).toBeNull(); // the extractor is expected to report the missing outline as null
14 | 	});
15 | });


--------------------------------------------------------------------------------
/test/empty_outline.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lublak/pdfdataextract/58af98ff20885e864148cd3e324669842bc4fc82/test/empty_outline.pdf


--------------------------------------------------------------------------------
/test/empty_outline.test.ts:
--------------------------------------------------------------------------------
 1 | const PDF_TEST_FILE = './test/empty_outline.pdf';
 2 | 
 3 | import { PdfData, VerbosityLevel } from '../src';
 4 | import { readFileSync } from 'fs';
 5 | import { test, describe, expect } from 'vitest';
 6 | 
 7 | describe(`parse ${PDF_TEST_FILE}`, () => {
 8 | 	const buffer = readFileSync(PDF_TEST_FILE); // read once for the whole suite
 9 | 	test('extract empty outline', async () => {
10 | 		const data = await PdfData.extract(buffer, {
11 | 			verbosity: VerbosityLevel.ERRORS,
12 | 		});
13 | 		expect(data.outline).toBeUndefined(); // outline is a plain property of the extracted data, no await needed
14 | 	});
15 | });


--------------------------------------------------------------------------------
/test/outline.extractor.test.ts:
--------------------------------------------------------------------------------
 1 | const PDF_TEST_FILE = './test/outline.pdf';
 2 | 
 3 | import { PdfDataExtractor, VerbosityLevel, PageNumberOutline, UrlOutline, PdfReferenceOutline } from '../src';
 4 | import { readFileSync } from 'fs';
 5 | import { test, describe, expect } from 'vitest';
 6 | 
 7 | describe(`parse ${PDF_TEST_FILE}`, () => {
 8 |   const buffer = readFileSync(PDF_TEST_FILE); // read once for the whole suite
 9 |   test('extract outline', async () => {
10 |     const extractor = await PdfDataExtractor.get(buffer, {
11 |       verbosity: VerbosityLevel.ERRORS,
12 |     });
13 |     const outline = await extractor.getOutline();
14 |     expect(outline).not.toBeNull();
15 |     if (outline) { // narrows the type for the entry checks below
16 |       const outline0 = outline[0];
17 |       expect(outline0.title).toEqual('to_page_1');
18 |       expect(outline0).toBeInstanceOf(PageNumberOutline);
19 |       if (outline0 instanceof PageNumberOutline) {
20 |         expect(outline0.page).toEqual(0);
21 |       }
22 | 
23 |       const outline1 = outline[1];
24 |       expect(outline1.title).toEqual('to_page_1_reference');
25 |       expect(outline1).toBeInstanceOf(PageNumberOutline);
26 |       if (outline1 instanceof PageNumberOutline) {
27 |         expect(outline1.page).toEqual(0);
28 |       }
29 | 
30 |       const outline2 = outline[2];
31 |       expect(outline2.title).toEqual('url');
32 |       expect(outline2).toBeInstanceOf(UrlOutline);
33 |       if (outline2 instanceof UrlOutline) {
34 |         expect(outline2.url).toEqual('https://github.com/lublak/pdfdataextract');
35 |         expect(outline2.absolute).toEqual(true);
36 |       }
37 | 
38 |       const outline3 = outline[3];
39 |       expect(outline3.title).toEqual('to_pdf');
40 |       expect(outline3).toBeInstanceOf(PdfReferenceOutline);
41 |       if (outline3 instanceof PdfReferenceOutline) {
42 |         expect(outline3.url).toEqual('specific_pages.pdf');
43 |         expect(outline3.page).toEqual(0);
44 |       }
45 | 
46 |       const outline4 = outline[4];
47 |       expect(outline4.title).toEqual('open');
48 |       expect(outline4).toBeInstanceOf(UrlOutline);
49 |       if (outline4 instanceof UrlOutline) {
50 |         expect(outline4.url).toEqual('specific_pages.test.ts');
51 |         expect(outline4.absolute).toEqual(false);
52 |       }
53 |     }
54 |   });
55 | });


--------------------------------------------------------------------------------
/test/outline.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lublak/pdfdataextract/58af98ff20885e864148cd3e324669842bc4fc82/test/outline.pdf


--------------------------------------------------------------------------------
/test/outline.test.ts:
--------------------------------------------------------------------------------
 1 | const PDF_TEST_FILE = './test/outline.pdf';
 2 | 
 3 | import { PdfData, VerbosityLevel, PageNumberOutline, UrlOutline, PdfReferenceOutline } from '../src';
 4 | import { readFileSync } from 'fs';
 5 | import { test, describe, expect } from 'vitest';
 6 | 
 7 | describe(`parse ${PDF_TEST_FILE}`, () => {
 8 |   const buffer = readFileSync(PDF_TEST_FILE); // read once for the whole suite
 9 |   test('extract outline', async () => {
10 |     const data = await PdfData.extract(buffer, {
11 |       verbosity: VerbosityLevel.ERRORS,
12 |       get: {
13 |         pages: false,
14 |         text: false,
15 |         fingerprint: false,
16 |         outline: true,
17 |         metadata: false,
18 |         info: false,
19 |         permissions: false
20 |       }
21 |     });
22 |     expect(data.outline).toBeDefined(); // a missing outline is undefined on PdfData, so a null check would never fail
23 |     if (data.outline) {
24 |       const outline0 = data.outline[0];
25 |       expect(outline0.title).toEqual('to_page_1');
26 |       expect(outline0).toBeInstanceOf(PageNumberOutline);
27 |       if (outline0 instanceof PageNumberOutline) {
28 |         expect(outline0.page).toEqual(0);
29 |       }
30 | 
31 |       const outline1 = data.outline[1];
32 |       expect(outline1.title).toEqual('to_page_1_reference');
33 |       expect(outline1).toBeInstanceOf(PageNumberOutline);
34 |       if (outline1 instanceof PageNumberOutline) {
35 |         expect(outline1.page).toEqual(0);
36 |       }
37 | 
38 |       const outline2 = data.outline[2];
39 |       expect(outline2.title).toEqual('url');
40 |       expect(outline2).toBeInstanceOf(UrlOutline);
41 |       if (outline2 instanceof UrlOutline) {
42 |         expect(outline2.url).toEqual('https://github.com/lublak/pdfdataextract');
43 |         expect(outline2.absolute).toEqual(true);
44 |       }
45 | 
46 |       const outline3 = data.outline[3];
47 |       expect(outline3.title).toEqual('to_pdf');
48 |       expect(outline3).toBeInstanceOf(PdfReferenceOutline);
49 |       if (outline3 instanceof PdfReferenceOutline) {
50 |         expect(outline3.url).toEqual('specific_pages.pdf');
51 |         expect(outline3.page).toEqual(0);
52 |       }
53 | 
54 |       const outline4 = data.outline[4];
55 |       expect(outline4.title).toEqual('open');
56 |       expect(outline4).toBeInstanceOf(UrlOutline);
57 |       if (outline4 instanceof UrlOutline) {
58 |         expect(outline4.url).toEqual('specific_pages.test.ts');
59 |         expect(outline4.absolute).toEqual(false);
60 |       }
61 |     }
62 |   });
63 | });


--------------------------------------------------------------------------------
/test/simple.extractor.test.ts:
--------------------------------------------------------------------------------
 1 | const PDF_TEST_FILE = './test/simple.pdf';
 2 | 
 3 | import { PdfDataExtractor, VerbosityLevel } from '../src';
 4 | import { readFileSync } from 'fs';
 5 | import { test, describe, expect } from 'vitest';
 6 | 
 7 | describe(`parse ${PDF_TEST_FILE}`, () => {
 8 | 	const buffer = readFileSync(PDF_TEST_FILE); // read once for the whole suite
 9 | 	test('extract basic data', async () => {
10 | 		const extractor = await PdfDataExtractor.get(buffer, { // NOTE(review): no assertions follow; this only checks that opening does not throw
11 | 			verbosity: VerbosityLevel.ERRORS,
12 | 		});
13 | 		// disabled debug output, kept for reference:
14 | 		//(await extractor.getPageData()).forEach(async cf => {
15 | 		//	console.log(await cf?.contentInfo()); });
16 | 	});
17 | });


--------------------------------------------------------------------------------
/test/simple.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lublak/pdfdataextract/58af98ff20885e864148cd3e324669842bc4fc82/test/simple.pdf


--------------------------------------------------------------------------------
/test/specific_pages.extractor.test.ts:
--------------------------------------------------------------------------------
 1 | const PDF_TEST_FILE = './test/specific_pages.pdf';
 2 | 
 3 | import { PdfDataExtractor, VerbosityLevel } from '../src';
 4 | import { readFileSync } from 'fs';
 5 | import { test, describe, expect } from 'vitest';
 6 | 
 7 | describe(`parse ${PDF_TEST_FILE}`, () => {
 8 | 	const buffer = readFileSync(PDF_TEST_FILE); // read once for the whole suite
 9 | 	test('extract specific pages', async () => {
10 | 		const extractor = await PdfDataExtractor.get(buffer, {
11 | 			verbosity: VerbosityLevel.ERRORS,
12 | 		});
13 | 		await extractor.getPageData([1]); // result unused: only checks that requesting a single page does not throw
14 | 		expect(await extractor.getText([2])).toEqual(['2']);
15 | 		expect(await extractor.getText([5, 9])).toEqual(['5', '9']);
16 | 		expect(await extractor.getText((pageNumber) => pageNumber === 7)).toEqual(['7']);
17 | 		expect(await extractor.getText([5, 9, 5])).toEqual(['5', '9']); // a repeated page number is returned only once
18 | 	});
19 | });


--------------------------------------------------------------------------------
/test/specific_pages.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lublak/pdfdataextract/58af98ff20885e864148cd3e324669842bc4fc82/test/specific_pages.pdf


--------------------------------------------------------------------------------
/test/specific_pages.test.ts:
--------------------------------------------------------------------------------
 1 | const PDF_TEST_FILE = './test/specific_pages.pdf';
 2 | 
 3 | import { PdfData, VerbosityLevel } from '../src';
 4 | import { readFileSync } from 'fs';
 5 | import { test, describe, expect } from 'vitest';
 6 | 
 7 | describe(`parse ${PDF_TEST_FILE}`, () => {
 8 | 	const buffer = readFileSync(PDF_TEST_FILE); // read once for the whole suite
 9 | 	test('extract specific pages', async () => {
10 | 
11 | 		expect((await PdfData.extract(buffer, {
12 | 			verbosity: VerbosityLevel.ERRORS,
13 | 			pages: [2]
14 | 		})).text).toEqual(['2']);
15 | 
16 | 		expect((await PdfData.extract(buffer, {
17 | 			verbosity: VerbosityLevel.ERRORS,
18 | 			pages: [5, 9]
19 | 		})).text).toEqual(['5', '9']);
20 | 
21 | 		expect((await PdfData.extract(buffer, {
22 | 			verbosity: VerbosityLevel.ERRORS,
23 | 			pages: (pageNumber) => pageNumber === 7
24 | 		})).text).toEqual(['7']);
25 | 
26 | 		expect((await PdfData.extract(buffer, {
27 | 			verbosity: VerbosityLevel.ERRORS,
28 | 			pages: [5, 9, 5] // a repeated page number is returned only once
29 | 		})).text).toEqual(['5', '9']);
30 | 	});
31 | });


--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
 1 | {
 2 | 	"compilerOptions": {
 3 | 		"target": "es2017",
 4 | 		"module": "commonjs",
 5 | 		"lib": [
 6 | 			"es2017",
 7 | 			"dom"
 8 | 		],
 9 | 		"outDir": "dist",
10 | 		"rootDir": "src",
11 | 		"strict": true,
12 | 		"strictNullChecks": true,
13 | 		"noImplicitAny": true,
14 | 		"esModuleInterop": true,
15 | 		"resolveJsonModule": true,
16 | 		"forceConsistentCasingInFileNames": true,
17 | 		"declaration": true,
18 | 		"types": [
19 | 			"node"
20 | 		],
21 | 		"sourceMap": true
22 | 	},
23 | 	"include": [
24 | 		"src/**/*.ts"
25 | 	],
26 | 	"exclude": [
27 | 		"node_modules",
28 | 		"**/*.spec.ts"
29 | 	]
30 | }


--------------------------------------------------------------------------------
/typedoc.json:
--------------------------------------------------------------------------------
1 | {
2 |   "plugin": ["typedoc-plugin-markdown"],
3 |   "theme": "./node_modules/typedoc-github-wiki-theme/dist",
4 |   "readme": "none",
5 |   "out": "doc",
6 |   "excludePrivate": true
7 | }


--------------------------------------------------------------------------------
/vitest.config.mjs:
--------------------------------------------------------------------------------
1 | import { defineConfig } from 'vite'
2 | export default defineConfig({
3 | 	test: {},
4 | })


--------------------------------------------------------------------------------