├── .gitattributes ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.yml │ └── feature_request.yml └── workflows │ └── node.js.yml ├── .gitignore ├── LICENSE ├── README.md ├── eslint.config.mjs ├── package-lock.json ├── package.json ├── src ├── canvasapi.ts ├── index.ts ├── nodecanvas.ts ├── nodeskiacanvas.ts ├── ocrapi.ts ├── pdfdata.ts ├── pdfdataextractor.ts ├── pdfpagedata.ts ├── pureimagecanvas.ts ├── tesseractjsocr.ts └── types.ts ├── test ├── basic.extractor.test.ts ├── basic.pdf ├── basic.test.ts ├── empty_outline.extractor.test.ts ├── empty_outline.pdf ├── empty_outline.test.ts ├── outline.extractor.test.ts ├── outline.pdf ├── outline.test.ts ├── simple.extractor.test.ts ├── simple.pdf ├── specific_pages.extractor.test.ts ├── specific_pages.pdf └── specific_pages.test.ts ├── tsconfig.json ├── typedoc.json └── vitest.config.mjs /.gitattributes: -------------------------------------------------------------------------------- 1 | package.json text eol=lf 2 | package-lock.json text eol=lf -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.yml: -------------------------------------------------------------------------------- 1 | name: Bug Report 2 | description: Create a report to help us improve 3 | labels: [bug] 4 | assignees: 5 | - lublak 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: | 10 | Thanks for taking the time to fill out this bug report! 11 | Please always be sure to use the latest compatible version. 12 | - type: textarea 13 | id: bug-description 14 | attributes: 15 | label: Describe the bug 16 | description: A clear and concise description of what the bug is. 17 | placeholder: The description of the bug. 18 | validations: 19 | required: true 20 | - type: textarea 21 | id: expected-behavior 22 | attributes: 23 | label: Describe the expected behavior 24 | description: A clear and concise description of what you expected to happen. 25 | placeholder: The expected behavior. 
26 | validations: 27 | required: true 28 | - type: input 29 | attributes: 30 | label: What is your Node.js version? 31 | placeholder: 14.X.X 32 | validations: 33 | required: true 34 | - type: dropdown 35 | id: os 36 | attributes: 37 | label: What operating system are you seeing the problem on? 38 | multiple: true 39 | options: 40 | - Linux 41 | - Windows 42 | - MacOS 43 | - Other (enter below with the version) 44 | - type: input 45 | attributes: 46 | label: Operating system version (or if other, then please fill in complete name and version) 47 | validations: 48 | required: true 49 | - type: textarea 50 | id: logs 51 | attributes: 52 | label: Relevant log output 53 | description: Please copy and paste any relevant log output. This will be automatically formatted into code, so no need for backticks. 54 | render: shell 55 | - type: textarea 56 | id: pdf 57 | attributes: 58 | label: PDF File 59 | description: Please upload the pdf file that can be used to reproduce issues in the area below. (drag and drop) 60 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.yml: -------------------------------------------------------------------------------- 1 | name: Feature Request 2 | description: Suggest an idea for this project 3 | labels: [enhancement] 4 | assignees: 5 | - lublak 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: | 10 | Thanks for taking the time to fill out this feature request! 11 | - type: textarea 12 | id: feature-description 13 | attributes: 14 | label: Describe the function you would like to have 15 | description: A clear and concise description of what you want to happen. 16 | placeholder: The description of the function. 17 | validations: 18 | required: true 19 | - type: textarea 20 | id: alternative-solution 21 | attributes: 22 | label: Describe your current alternative solution. 23 | description: Your current solution that you use, if there is one. 
24 | placeholder: Your alternative solution. 25 | - type: textarea 26 | id: pdf 27 | attributes: 28 | label: PDF File 29 | description: If a PDF file helps to find a solution for this function, please upload the file. (drag and drop) 30 | -------------------------------------------------------------------------------- /.github/workflows/node.js.yml: -------------------------------------------------------------------------------- 1 | name: Node.js CI 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | jobs: 10 | build: 11 | 12 | runs-on: ubuntu-latest 13 | 14 | strategy: 15 | matrix: 16 | node-version: [20.x, 21.x, 22.x] 17 | 18 | steps: 19 | - uses: actions/checkout@v4 20 | - name: Use Node.js ${{ matrix.node-version }} 21 | uses: actions/setup-node@v4 22 | with: 23 | node-version: ${{ matrix.node-version }} 24 | - name: Install dependencies 25 | run: npm ci 26 | - name: Lint 27 | run: npm run lint 28 | - name: Build 29 | run: npm run build --if-present 30 | - name: Test 31 | run: npm test -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | yarn-debug.log* 6 | yarn-error.log* 7 | lerna-debug.log* 8 | 9 | # Diagnostic reports (https://nodejs.org/api/report.html) 10 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 11 | 12 | # Runtime data 13 | pids 14 | *.pid 15 | *.seed 16 | *.pid.lock 17 | 18 | # Directory for instrumented libs generated by jscoverage/JSCover 19 | lib-cov 20 | 21 | # Coverage directory used by tools like istanbul 22 | coverage 23 | *.lcov 24 | 25 | # nyc test coverage 26 | .nyc_output 27 | 28 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 29 | .grunt 30 | 31 | # Bower dependency directory (https://bower.io/) 32 | bower_components 33 | 34 | # node-waf configuration 35 | .lock-wscript 36 | 37 | # 
Compiled binary addons (https://nodejs.org/api/addons.html) 38 | build/Release 39 | 40 | # Dependency directories 41 | node_modules/ 42 | jspm_packages/ 43 | 44 | # TypeScript v1 declaration files 45 | typings/ 46 | 47 | # TypeScript cache 48 | *.tsbuildinfo 49 | 50 | # Optional npm cache directory 51 | .npm 52 | 53 | # Optional eslint cache 54 | .eslintcache 55 | 56 | # Microbundle cache 57 | .rpt2_cache/ 58 | .rts2_cache_cjs/ 59 | .rts2_cache_es/ 60 | .rts2_cache_umd/ 61 | 62 | # Optional REPL history 63 | .node_repl_history 64 | 65 | # Output of 'npm pack' 66 | *.tgz 67 | 68 | # Yarn Integrity file 69 | .yarn-integrity 70 | 71 | # dotenv environment variables file 72 | .env 73 | .env.test 74 | 75 | # parcel-bundler cache (https://parceljs.org/) 76 | .cache 77 | 78 | # Next.js build output 79 | .next 80 | 81 | # Nuxt.js build / generate output 82 | .nuxt 83 | dist 84 | 85 | # Gatsby files 86 | .cache/ 87 | # Comment in the public line in if your project uses Gatsby and *not* Next.js 88 | # https://nextjs.org/blog/next-9-1#public-directory-support 89 | # public 90 | 91 | # vuepress build output 92 | .vuepress/dist 93 | 94 | # Serverless directories 95 | .serverless/ 96 | 97 | # FuseBox cache 98 | .fusebox/ 99 | 100 | # DynamoDB Local files 101 | .dynamodb/ 102 | 103 | # TernJS port file 104 | .tern-port 105 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 lublak 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the 
following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pdfdataextract 2 | 3 | [![version](https://img.shields.io/npm/v/pdfdataextract.svg)](https://www.npmjs.org/package/pdfdataextract) 4 | [![downloads](https://img.shields.io/npm/dt/pdfdataextract.svg)](https://www.npmjs.org/package/pdfdataextract) 5 | [![status](https://github.com/lublak/pdfdataextract/actions/workflows/node.js.yml/badge.svg)](https://github.com/lublak/pdfdataextract/actions/workflows/node.js.yml) 6 | 7 | Extract data from a pdf with pure javascript. 8 | 9 | The PdfData wrapper over PdfDataExtractor is inspired by https://www.npmjs.com/package/pdf-parse, which is currently unmaintained. 10 | PdfDataExtractor itself is a simple interface to extract individual data from a pdf file. 11 | 12 | ## Install 13 | 14 | `npm install pdfdataextract` 15 | 16 | ## Docs 17 | 18 | Full documentation is available at the [wiki](https://github.com/lublak/pdfdataextract/wiki) 19 | 20 | ## Usage 21 | 22 | PdfData is a wrapper around PdfDataExtractor to directly get a complete json structure. 
23 | 24 | ```ts 25 | import { PdfData, VerbosityLevel } from 'pdfdataextract'; 26 | import { readFileSync } from 'fs'; 27 | const file_data = readFileSync('some_pdf_file.pdf'); 28 | 29 | // all options are optional 30 | PdfData.extract(file_data, { 31 | password: '123456', // password of the pdf file 32 | pages: 1, // how many pages should be read at most 33 | sort: true, // sort the text by text coordinates 34 | verbosity: VerbosityLevel.ERRORS, // set the verbosity level for parsing 35 | get: { // enable or disable data extraction (all are optional and enabled by default) 36 | pages: true, // get number of pages 37 | text: true, // get text of each page 38 | fingerprint: true, // get fingerprint 39 | outline: true, // get outline 40 | metadata: true, // get metadata 41 | info: true, // get info 42 | permissions: true, // get permissions 43 | }, 44 | }).then((data) => { 45 | data.pages; // the number of pages 46 | data.text; // an array of text pages 47 | data.fingerprint; // fingerprint of the pdf document 48 | data.outline; // outline data of the pdf document 49 | data.info; // information of the pdf document, such as Author 50 | data.metadata; // metadata of the pdf document 51 | data.permissions; // permissions for the document 52 | }); 53 | ``` 54 | 55 | ```ts 56 | import { PdfDataExtractor, VerbosityLevel } from 'pdfdataextract'; 57 | import { readFileSync } from 'fs'; 58 | const file_data = readFileSync('some_pdf_file.pdf'); 59 | 60 | // all options are optional 61 | PdfDataExtractor.get(file_data, { 62 | password: '123456', // password of the pdf file 63 | verbosity: VerbosityLevel.ERRORS, // set the verbosity level for parsing 64 | }).then((extractor) => { 65 | extractor.pages; // the number of pages 66 | extractor.fingerprint; // fingerprint of the pdf document 67 | 68 | extractor.getText(1, true).then((text) => { 69 | // an array of text pages (only one page and sorted) 70 | }); 71 | 72 | extractor.getText([2]).then((text) => { 73 | // an array of text 
pages (only the second page) 74 | }); 75 | 76 | extractor.getOutline().then((outline) => { 77 | // outline data of the pdf document 78 | }); 79 | 80 | extractor.getMetadata().then((metadata) => { 81 | // metadata of the pdf document 82 | }); 83 | 84 | extractor.getPermissions().then((permissions) => { 85 | // permissions for the document 86 | }); 87 | 88 | extractor.close(); 89 | }); 90 | ``` 91 | 92 | ## Test 93 | 94 | `npm test` 95 | 96 | ## License 97 | 98 | [MIT licensed](/LICENSE) 99 | -------------------------------------------------------------------------------- /eslint.config.mjs: -------------------------------------------------------------------------------- 1 | import { defineConfig } from "eslint/config"; 2 | import { fixupConfigRules, fixupPluginRules } from "@eslint/compat"; 3 | import stylisticTs from '@stylistic/eslint-plugin-ts'; 4 | import typescriptEslint from "@typescript-eslint/eslint-plugin"; 5 | import jsdoc from "eslint-plugin-jsdoc"; 6 | import globals from "globals"; 7 | import tsParser from "@typescript-eslint/parser"; 8 | import path from "node:path"; 9 | import { fileURLToPath } from "node:url"; 10 | import js from "@eslint/js"; 11 | import { FlatCompat } from "@eslint/eslintrc"; 12 | 13 | const __filename = fileURLToPath(import.meta.url); 14 | const __dirname = path.dirname(__filename); 15 | const compat = new FlatCompat({ 16 | baseDirectory: __dirname, 17 | recommendedConfig: js.configs.recommended, 18 | allConfig: js.configs.all 19 | }); 20 | 21 | export default defineConfig([{ 22 | extends: fixupConfigRules(compat.extends( 23 | "eslint:recommended", 24 | "plugin:@typescript-eslint/recommended", 25 | )), 26 | 27 | plugins: { 28 | "@typescript-eslint": fixupPluginRules(typescriptEslint), 29 | jsdoc, 30 | "@stylistic/ts": stylisticTs 31 | }, 32 | 33 | languageOptions: { 34 | globals: { 35 | ...globals.browser, 36 | }, 37 | 38 | parser: tsParser, 39 | ecmaVersion: 12, 40 | sourceType: "module", 41 | 42 | parserOptions: { 43 | project: 
"./tsconfig.json", 44 | }, 45 | }, 46 | 47 | settings: { 48 | jsdoc: { 49 | mode: "typescript", 50 | 51 | structuredTags: { 52 | type: { 53 | type: true, 54 | required: ["type"], 55 | }, 56 | }, 57 | 58 | ignoreInternal: true, 59 | }, 60 | }, 61 | 62 | rules: { 63 | indent: ["error", "tab"], 64 | "linebreak-style": ["error", "unix"], 65 | quotes: ["error", "single"], 66 | semi: ["error", "always"], 67 | 68 | "no-empty": ["error", { 69 | allowEmptyCatch: true, 70 | }], 71 | 72 | "no-console": "error", 73 | 74 | "@typescript-eslint/no-unused-vars": ["error", { 75 | vars: "all", 76 | args: "after-used", 77 | ignoreRestSiblings: false, 78 | argsIgnorePattern: "^_", 79 | varsIgnorePattern: "^_", 80 | destructuredArrayIgnorePattern: "^_", 81 | caughtErrorsIgnorePattern: "^_", 82 | }], 83 | 84 | "@typescript-eslint/explicit-module-boundary-types": ["error"], 85 | 86 | "@typescript-eslint/ban-ts-comment": ["error", { 87 | "ts-expect-error": "allow-with-description", 88 | "ts-ignore": "allow-with-description", 89 | "ts-nocheck": false, 90 | "ts-check": false, 91 | minimumDescriptionLength: 10, 92 | }], 93 | 94 | "@typescript-eslint/no-restricted-types": ["error", { 95 | types: { 96 | String: { 97 | message: "Use string instead", 98 | fixWith: "string", 99 | }, 100 | 101 | Boolean: { 102 | message: "Use boolean instead", 103 | fixWith: "boolean", 104 | }, 105 | 106 | Number: { 107 | message: "Use number instead", 108 | fixWith: "number", 109 | }, 110 | 111 | Symbol: { 112 | message: "Use symbol instead", 113 | fixWith: "symbol", 114 | }, 115 | 116 | Function: { 117 | message: "The `Function` type accepts any function-like value.\nIt provides no type safety when calling the function, which can be a common source of bugs.\nIt also accepts things like class declarations, which will throw at runtime as they will not be called with `new`.\nIf you are expecting the function to accept certain arguments, you should explicitly define the function shape.", 118 | }, 119 | 120 | Object: 
{ 121 | message: "The `Object` type actually means 'any non-nullish value', so it is marginally better than `unknown`.\n- If you want a type meaning 'any object', you probably want `Record` instead.\n- If you want a type meaning 'any value', you probably want `unknown` instead.", 122 | }, 123 | 124 | "{}": { 125 | message: "`{}` actually means 'any non-nullish value'.\n- If you want a type meaning 'any object', you probably want `Record` instead.\n- If you want a type meaning 'any value', you probably want `unknown` instead.\n- If you want a type meaning 'empty object', you probably want `Record` instead.", 126 | }, 127 | }, 128 | }], 129 | 130 | "@typescript-eslint/no-confusing-non-null-assertion": "off", 131 | 132 | "@typescript-eslint/no-explicit-any": ["error", { 133 | fixToUnknown: true, 134 | }], 135 | 136 | "@stylistic/ts/type-annotation-spacing": ["error", { 137 | before: false, 138 | after: true, 139 | 140 | overrides: { 141 | arrow: { 142 | before: true, 143 | after: true, 144 | }, 145 | }, 146 | }], 147 | 148 | "@typescript-eslint/typedef": ["error", { 149 | arrowParameter: true, 150 | memberVariableDeclaration: true, 151 | parameter: true, 152 | propertyDeclaration: true, 153 | variableDeclaration: true, 154 | }], 155 | 156 | "@typescript-eslint/no-inferrable-types": "off", 157 | "@stylistic/ts/func-call-spacing": ["error", "never"], 158 | 159 | "@stylistic/ts/keyword-spacing": ["error", { 160 | before: true, 161 | after: true, 162 | }], 163 | 164 | "@stylistic/ts/space-before-function-paren": ["error", { 165 | anonymous: "always", 166 | named: "never", 167 | asyncArrow: "always", 168 | }], 169 | 170 | "@typescript-eslint/await-thenable": "error", 171 | "@typescript-eslint/return-await": "error", 172 | "@typescript-eslint/unified-signatures": "error", 173 | 174 | "@stylistic/ts/comma-spacing": ["error", { 175 | before: false, 176 | after: true, 177 | }], 178 | 179 | "jsdoc/check-access": "error", 180 | "jsdoc/check-alignment": "error", 181 | 
"jsdoc/check-examples": "off", 182 | 183 | "jsdoc/check-indentation": ["error", { 184 | excludeTags: ["param"], 185 | }], 186 | 187 | "jsdoc/check-line-alignment": "error", 188 | "jsdoc/check-param-names": "error", 189 | "jsdoc/check-property-names": "error", 190 | "jsdoc/check-syntax": "error", 191 | "jsdoc/check-tag-names": "error", 192 | "jsdoc/check-types": "error", 193 | "jsdoc/check-values": "error", 194 | "jsdoc/empty-tags": "error", 195 | "jsdoc/implements-on-classes": "error", 196 | 197 | "jsdoc/match-description": ["error", { 198 | matchDescription: "^[a-zA-Z0-9_\\- /\\\\()[\\]{}=?!:.,;*+~#'\"%&<>|\n]+$", 199 | contexts: ["any"], 200 | }], 201 | 202 | "jsdoc/no-bad-blocks": "error", 203 | "jsdoc/no-defaults": "off", 204 | "jsdoc/no-types": "off", 205 | "jsdoc/no-undefined-types": "off", 206 | 207 | "jsdoc/require-description": ["error", { 208 | contexts: [ 209 | ":not(:matches(MethodDefinition[key.name=constructor], MethodDefinition[key.name=constructor] *))", 210 | ], 211 | }], 212 | 213 | "jsdoc/require-description-complete-sentence": "off", 214 | "jsdoc/require-example": "off", 215 | "jsdoc/require-file-overview": "off", 216 | "jsdoc/require-hyphen-before-param-description": "error", 217 | 218 | "jsdoc/require-jsdoc": ["error", { 219 | publicOnly: true, 220 | 221 | require: { 222 | ArrowFunctionExpression: true, 223 | ClassDeclaration: true, 224 | ClassExpression: true, 225 | FunctionDeclaration: true, 226 | FunctionExpression: true, 227 | MethodDefinition: false, 228 | }, 229 | 230 | contexts: [ 231 | "MethodDefinition:not([accessibility=\"private\"]) > FunctionExpression", 232 | "ClassProperty:not([accessibility=\"private\"])", 233 | "TSPropertySignature", 234 | "TSInterfaceBody > TSMethodSignature", 235 | "TSEnumMember", 236 | ], 237 | }], 238 | 239 | "jsdoc/require-param": "error", 240 | "jsdoc/require-param-description": "error", 241 | "jsdoc/require-param-name": "error", 242 | "jsdoc/require-param-type": "error", 243 | "jsdoc/require-property": 
"error", 244 | "jsdoc/require-property-description": "error", 245 | "jsdoc/require-property-name": "error", 246 | "jsdoc/require-property-type": "error", 247 | "jsdoc/require-returns": "error", 248 | "jsdoc/require-returns-check": "error", 249 | "jsdoc/require-returns-description": "error", 250 | "jsdoc/require-returns-type": "error", 251 | "jsdoc/require-throws": "off", 252 | "jsdoc/require-yields": "error", 253 | "jsdoc/require-yields-check": "error", 254 | "jsdoc/valid-types": "error", 255 | }, 256 | }]); -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "pdfdataextract", 3 | "version": "4.0.0", 4 | "description": "Extract data from a pdf with pure javascript", 5 | "main": "dist/index.js", 6 | "types": "dist/index.d.ts", 7 | "files": [ 8 | "dist" 9 | ], 10 | "scripts": { 11 | "lint": "eslint \"src/**/*.ts\" --cache", 12 | "lint:fix": "eslint \"src/**/*.ts\" --cache --fix", 13 | "build": "tsc", 14 | "test": "vitest", 15 | "prepare": "npm run build", 16 | "build:doc": "typedoc src/index.ts" 17 | }, 18 | "keywords": [ 19 | "pdf", 20 | "pdf.js", 21 | "pdfjs", 22 | "parse", 23 | "parser", 24 | "json", 25 | "text", 26 | "data", 27 | "crawler", 28 | "extract", 29 | "extractor", 30 | "ocr", 31 | "converter", 32 | "image" 33 | ], 34 | "dependencies": { 35 | "pdfjs-dist": "5.0.375" 36 | }, 37 | "devDependencies": { 38 | "@eslint/compat": "^1.2.7", 39 | "@eslint/eslintrc": "^3.3.1", 40 | "@eslint/js": "^9.23.0", 41 | "@stylistic/eslint-plugin-ts": "^4.2.0", 42 | "@types/node": "^22.13.14", 43 | "@types/opentype.js": "^1.3.8", 44 | "@typescript-eslint/eslint-plugin": "^8.28.0", 45 | "@typescript-eslint/parser": "^8.28.0", 46 | "canvas": "^3.1.0", 47 | "eslint": "^9.23.0", 48 | "eslint-plugin-jsdoc": "^50.6.9", 49 | "globals": "^16.0.0", 50 | "pureimage": "^0.4.18", 51 | "skia-canvas": "^2.0.2", 52 | "tesseract.js": "^6.0.0", 53 | 
/**
 * constructor signature of a canvas implementation
 *
 * the type parameter defaults to CanvasApi, so the alias can be used with or without a type argument
 */
export type CanvasApiConstructor<T extends CanvasApi = CanvasApi> = { new(width: number, height: number): T };

/**
 * the operations a canvas implementation has to provide
 */
export interface CanvasApi {
	/**
	 * create the 2d context of the canvas
	 *
	 * @returns canvas 2d context
	 */
	createContext(): CanvasRenderingContext2D;
	/**
	 * resets the canvas to the given size
	 *
	 * @param {number} width - the canvas width
	 * @param {number} height - the canvas height
	 */
	reset(width: number, height: number): void;
	/**
	 * destroys the canvas
	 */
	destroy(): void;
	/**
	 * converts the canvas to a png
	 *
	 * @returns a promise that is resolved with the image as a {Buffer}
	 */
	toPNG(): Promise<Buffer>;
	/**
	 * converts the canvas to a jpeg
	 *
	 * @param {number} quality - the quality of the jpeg
	 * @returns a promise that is resolved with the image as a {Buffer}
	 */
	toJPEG(quality?: number): Promise<Buffer>;
}
/**
 * implementation for node-canvas
 */
export class NodeCanvas implements CanvasApi {
	private canvas: Canvas;
	/**
	 * creates the underlying node-canvas with the given size
	 *
	 * @internal
	 */
	public constructor(width: number, height: number) {
		this.canvas = createCanvas(width, height);
	}
	/**
	 * encodes the canvas as a png
	 *
	 * @internal
	 */
	public toPNG(): Promise<Buffer> {
		// The asynchronous toBuffer overload of node-canvas takes the callback as its FIRST argument
		// and must be invoked on the canvas itself. Passing the bare method to promisify would drop
		// `this` and append the callback last, so a callback-only arrow function is promisified instead.
		return promisify((callback: (err: Error | null, result: Buffer) => void): void => {
			this.canvas.toBuffer(callback, 'image/png');
		})();
	}
	/**
	 * encodes the canvas as a jpeg, the quality is handed to node-canvas unchanged
	 *
	 * @internal
	 */
	public toJPEG(quality?: number): Promise<Buffer> {
		const config: JpegConfig = {
			quality: quality
		};
		// same callback-first wrapping as in toPNG
		return promisify((callback: (err: Error | null, result: Buffer) => void): void => {
			this.canvas.toBuffer(callback, 'image/jpeg', config);
		})();
	}
	/**
	 * returns the 2d context of the underlying canvas
	 *
	 * @internal
	 */
	public createContext(): CanvasRenderingContext2D {
		return this.canvas.getContext('2d') as unknown as CanvasRenderingContext2D;
	}
	/**
	 * resizes the underlying canvas
	 *
	 * @internal
	 */
	public reset(width: number, height: number): void {
		this.canvas.width = width;
		this.canvas.height = height;
	}
	/**
	 * shrinks the underlying canvas to a size of zero
	 *
	 * @internal
	 */
	public destroy(): void {
		this.canvas.width = 0;
		this.canvas.height = 0;
	}

}
/**
 * switches that select which parts of the pdf are extracted
 * every switch is optional
 */
export type PdfDataGetOptions = {
	/**
	 * whether the number of pages is returned, true by default
	 *
	 * @type {boolean|undefined}
	 */
	pages?: boolean;
	/**
	 * whether the text of each page is returned, true by default
	 *
	 * @type {boolean|undefined}
	 */
	text?: boolean;
	/**
	 * whether the fingerprint is returned, true by default
	 *
	 * @type {boolean|undefined}
	 */
	fingerprint?: boolean;
	/**
	 * whether the outline is returned, true by default
	 *
	 * @type {boolean|undefined}
	 */
	outline?: boolean;
	/**
	 * whether the metadata is returned, true by default
	 *
	 * @type {boolean|undefined}
	 */
	metadata?: boolean;
	/**
	 * whether the info is returned, true by default
	 *
	 * @type {boolean|undefined}
	 */
	info?: boolean;
	/**
	 * whether the permissions are returned, true by default
	 *
	 * @type {boolean|undefined}
	 */
	permissions?: boolean;
};
/**
 * the data of the pdf
 */
export class PdfData {
	/**
	 * the number of pages
	 *
	 * @readonly
	 * @type {number|undefined}
	 */
	readonly pages?: number;
	/**
	 * extracted text per page
	 *
	 * @readonly
	 * @type {string[]|undefined}
	 */
	readonly text?: readonly string[];
	/**
	 * the fingerprint
	 *
	 * @readonly
	 * @type {string|undefined}
	 */
	readonly fingerprint?: string;
	/**
	 * the outline/bookmarks
	 *
	 * @readonly
	 * @type {Outline[]|undefined}
	 */
	readonly outline?: readonly Outline[];
	/**
	 * the information/description
	 *
	 * @readonly
	 * @type {Info|undefined}
	 */
	readonly info?: Info;
	/**
	 * the metadata
	 *
	 * @readonly
	 * @type {Metadata|undefined}
	 */
	readonly metadata?: Metadata;
	/**
	 * the permission flags
	 *
	 * @readonly
	 * @type {Permissions | undefined}
	 */
	readonly permissions?: Permissions;

	private constructor(pages: number | null, text: string[] | null, fingerprint: string | null, outline: Outline[] | null, info: Info | null, metadata: Metadata | null, permissions: Permissions | null) {
		// a property is only set when a value was extracted, skipped parts stay absent
		if (pages != null) this.pages = pages;
		if (text != null) this.text = text;
		if (fingerprint != null) this.fingerprint = fingerprint;
		if (outline != null) this.outline = outline;
		if (info != null) this.info = info;
		if (metadata != null) this.metadata = metadata;
		if (permissions != null) this.permissions = permissions;
	}

	// a part is extracted when its switch is left undefined or is truthy
	private static enabled(flag?: boolean): boolean {
		return flag === undefined || flag;
	}

	/**
	 * get the data
	 *
	 * @param {Uint8Array} data - the binary data file
	 * @param {PdfDataOptions} [options={}] - the options on how the data should be extracted, the object is not modified
	 * @returns {Promise<PdfData>} a promise that is resolved with a {PdfData} object with the extracted data
	 */
	static async extract(data: Uint8Array, options: PdfDataOptions = {}): Promise<PdfData> {
		const extractor: PdfDataExtractor = await PdfDataExtractor.get(data, {
			password: options.password,
			verbosity: options.verbosity,
		});

		try {
			// work on a local reference so the options object of the caller is never written to
			const get: PdfDataGetOptions = options.get || {};

			// the deprecated max option is only used when pages is not set
			const pages: number | number[] | ((pageNumber: number) => boolean) | undefined = options.pages ? options.pages : options.max;

			let metadata: Metadata | null = null;
			let info: Info | null = null;

			// info and metadata come from the same extractor call, so it is made once if either is wanted
			if (PdfData.enabled(get.metadata) || PdfData.enabled(get.info)) {
				const rawMetadata: {
					info: Info;
					metadata: Metadata;
				} | null = await extractor.getMetadata();
				if (rawMetadata != null) {
					if (PdfData.enabled(get.info)) info = rawMetadata.info;
					if (PdfData.enabled(get.metadata)) metadata = rawMetadata.metadata;
				}
			}

			return new PdfData(
				PdfData.enabled(get.pages) ? extractor.pages : null,
				PdfData.enabled(get.text) ? await extractor.getText(pages, options.sort) : null,
				PdfData.enabled(get.fingerprint) ? extractor.fingerprint : null,
				PdfData.enabled(get.outline) ? await extractor.getOutline() : null,
				info,
				metadata,
				PdfData.enabled(get.permissions) ? await extractor.getPermissions() : null
			);
		} finally {
			// close the extractor on failure as well, otherwise it stays open when an extraction step throws
			extractor.close();
		}
	}
}
items: RawOutline[] | undefined; 46 | } 47 | 48 | async function getPageNumber(pdf_document: PDFDocumentProxy, pageRef: { num: number, gen: number }, cache: { [key: string]: number; }) { 49 | const ref: string = pageRef.gen === 0 ? `${pageRef.num}R` : `${pageRef.num}R${pageRef.gen}`; 50 | let number: number = cache[ref]; 51 | if (number == null) { 52 | number = await pdf_document.getPageIndex(pageRef) as unknown as number; 53 | cache[ref] = number; 54 | } 55 | return number; 56 | } 57 | 58 | function parseRemoteUrlDest(remoteUrlDest: string) { 59 | try { 60 | const remoteDest: unknown = JSON.parse(remoteUrlDest); 61 | if (Array.isArray(remoteDest) && Number.isInteger(remoteDest[0])) { 62 | return remoteDest[0]; 63 | } 64 | } catch { } 65 | return undefined; 66 | } 67 | 68 | async function parseOutline(pdf_document: PDFDocumentProxy, outlineData: RawOutline[], cache: { [key: string]: number; }) { 69 | const outline: Outline[] = []; 70 | for (const o of outlineData) { 71 | const dest: unknown = typeof (o.dest) === 'string' ? await pdf_document.getDestination(o.dest) : o.dest; 72 | if (dest == null) { 73 | if (o.unsafeUrl != null) { 74 | if (o.url == null) { 75 | const remoteUrl: string[] = o.unsafeUrl.split('#', 2); 76 | const remoteBaseUrl: string = remoteUrl[0]; 77 | if (remoteBaseUrl.toLowerCase().endsWith('.pdf')) { 78 | if (remoteUrl.length == 2) { 79 | outline.push(new PdfReferenceOutline(o.title, remoteBaseUrl, parseRemoteUrlDest(remoteUrl[1]), o.items ? await parseOutline(pdf_document, o.items, cache) : undefined)); 80 | } else { 81 | outline.push(new PdfReferenceOutline(o.title, remoteBaseUrl, undefined, o.items ? await parseOutline(pdf_document, o.items, cache) : undefined)); 82 | } 83 | } else { 84 | outline.push(new UrlOutline(o.title, o.unsafeUrl, false, o.items ? await parseOutline(pdf_document, o.items, cache) : undefined)); 85 | } 86 | } else { 87 | outline.push(new UrlOutline(o.title, o.url, true, o.items ? 
await parseOutline(pdf_document, o.items, cache) : undefined)); 88 | } 89 | } else { 90 | // TODO: ? (an entry without destination and without unsafeUrl is currently skipped, together with its children) 91 | } 92 | } else if (Array.isArray(dest)) { 93 | if (typeof dest[0] === 'object') { 94 | outline.push(new PageNumberOutline( 95 | o.title, 96 | await getPageNumber(pdf_document, dest[0] as { num: number, gen: number }, cache), 97 | o.items ? await parseOutline(pdf_document, o.items, cache) : undefined 98 | )); 99 | } else if (Number.isInteger(dest[0])) { 100 | outline.push(new PageNumberOutline(o.title, dest[0], o.items ? await parseOutline(pdf_document, o.items, cache) : undefined)); 101 | } else { 102 | // TODO: ? (a destination whose first element is neither an object nor an integer is currently skipped) 103 | } 104 | } 105 | } 106 | return outline; 107 | } 108 | /** returns the first canvas implementation whose package can be resolved (canvas, then @napi-rs/canvas, then pureimage), or null if none can */ 109 | async function getInstalledCanvasApi(): Promise | null> { 110 | try { 111 | require.resolve('canvas'); 112 | return (await import('./nodecanvas')).NodeCanvas; 113 | } catch (_e) { } 114 | try { 115 | require.resolve('@napi-rs/canvas'); 116 | return (await import('./nodeskiacanvas')).NodeSkiaCanvas; 117 | } catch (_e) { } 118 | try { 119 | require.resolve('pureimage'); 120 | return (await import('./pureimagecanvas')).PureimageCanvas; 121 | } catch (_e) { } 122 | return null; 123 | } 124 | /** returns the tesseract.js based ocr implementation if tesseract.js can be resolved, otherwise null */ 125 | async function getInstalledOcrApi(): Promise | null> { 126 | try { 127 | require.resolve('tesseract.js'); 128 | return (await import('./tesseractjsocr')).TesseractJsOcr; 129 | } catch (_e) { } 130 | return null; 131 | } 132 | 133 | /** 134 | * the extractor for the data of the pdf 135 | */ 136 | export class PdfDataExtractor { 137 | private constructor( 138 | private readonly pdf_document: PDFDocumentProxy, 139 | private readonly canvasApi: CanvasApiConstructor | null, 140 | private readonly ocrApi: OcrApiConstructor | null, 141 | ) { } 142 | 143 | /** 144 | * get the extractor for the data 145 | * 146 | * @param {Uint8Array} data - the binary data file 147 | * @param {PdfDataExtractorOptions} [options={}] - the options on how to open the data in the extractor 148 | * @returns {Promise} a promise
that is resolved with a {PdfDataExtractor} object to pull the extracted data from 149 | */ 150 | static async get(data: Uint8Array, options: PdfDataExtractorOptions = {}): Promise { 151 | if (data instanceof Buffer) { 152 | data = new Uint8Array(data); 153 | } 154 | const pdf_document: PDFDocumentProxy = await getDocument({ 155 | data: data, 156 | password: options.password, 157 | verbosity: options.verbosity ?? VerbosityLevel.ERRORS, 158 | isEvalSupported: false, 159 | }).promise; 160 | return new PdfDataExtractor(pdf_document, options.canvasApi ?? await getInstalledCanvasApi(), options.ocrApi ?? await getInstalledOcrApi()); 161 | } 162 | 163 | /** 164 | * get the fingerprint (the first entry of the fingerprints of the document) 165 | * 166 | * @returns {string | null} the fingerprint 167 | */ 168 | get fingerprint(): string | null { 169 | return this.pdf_document.fingerprints[0]; 170 | } 171 | 172 | /** 173 | * get the number of pages 174 | * 175 | * @returns {number} the number of pages 176 | */ 177 | get pages(): number { 178 | return this.pdf_document.numPages; 179 | } 180 | 181 | /** 182 | * get the permission flags 183 | * 184 | * @returns {Promise} a promise that is resolved with a {Permissions | null} object that contains the permission flags for the PDF 185 | */ 186 | async getPermissions(): Promise { 187 | const permission_flag_array: number[] | null = await this.pdf_document.getPermissions(); 188 | return permission_flag_array == null ?
null : { 189 | assemble: permission_flag_array.includes(PermissionFlag.ASSEMBLE), 190 | copy: permission_flag_array.includes(PermissionFlag.COPY), 191 | copyForAccessibility: permission_flag_array.includes(PermissionFlag.COPY_FOR_ACCESSIBILITY), 192 | fillInteractiveForms: permission_flag_array.includes(PermissionFlag.FILL_INTERACTIVE_FORMS), 193 | modifyAnnotations: permission_flag_array.includes(PermissionFlag.MODIFY_ANNOTATIONS), 194 | print: permission_flag_array.includes(PermissionFlag.PRINT), 195 | printHQ: permission_flag_array.includes(PermissionFlag.PRINT_HIGH_QUALITY), 196 | modifyContents: permission_flag_array.includes(PermissionFlag.MODIFY_CONTENTS), 197 | }; 198 | } 199 | 200 | /** 201 | * get the text 202 | * 203 | * @param {number|number[]|((pageNumber: number) => boolean)} [pages] - can either be the number of pages to be read, 204 | * a number array with the specific pages (sorted by page number) 205 | * or a filter function (return true to parse the page) 206 | * @param {boolean|Sort} [sort=false] - sort the text by text coordinates 207 | * @returns {Promise} a promise that is resolved with a {string[]} array with the extracted text per page 208 | */ 209 | async getText(pages?: number | number[] | ((pageNumber: number) => boolean), sort: boolean | Sort = false): Promise { 210 | return Promise.all((await this.getPageData(pages)).map(async (page: PdfPageData | null) => page == null ? 
'' : page.toText(sort))); 211 | } 212 | 213 | /** 214 | * get the page data 215 | * 216 | * @param {number|number[]|((pageNumber: number) => boolean)} [pages] - can either be the number of pages to be read, 217 | * a number array with the specific pages (sorted by page number) 218 | * or a filter function (return true to parse the page) 219 | * @returns {Promise} a promise that is resolved with a {(PdfPageData | null)[]} array with the page data of each selected page (null for a page that could not be loaded) 220 | */ 221 | async getPageData(pages?: number | number[] | ((pageNumber: number) => boolean)): Promise<(PdfPageData | null)[]> { 222 | const page_array: (PdfPageData | null)[] = []; 223 | const numPages: number = this.pdf_document.numPages; 224 | 225 | if (pages === undefined) { 226 | for (let pageNumber: number = 1; pageNumber <= numPages; pageNumber++) { 227 | const page: PDFPageProxy | null = await this.pdf_document.getPage(pageNumber).catch(() => null); 228 | page_array.push(page == null ? null : new PdfPageData(page, this.canvasApi, this.ocrApi)); 229 | } 230 | } else if (typeof (pages) === 'number') { 231 | const counter: number = pages > numPages ? numPages : pages; 232 | 233 | for (let pageNumber: number = 1; pageNumber <= counter; pageNumber++) { 234 | const page: PDFPageProxy | null = await this.pdf_document.getPage(pageNumber).catch(() => null); 235 | page_array.push(page == null ? null : new PdfPageData(page, this.canvasApi, this.ocrApi)); 236 | } 237 | } else if (typeof (pages) === 'function') { 238 | for (let pageNumber: number = 1; pageNumber <= numPages; pageNumber++) { 239 | if (pages(pageNumber)) { 240 | const page: PDFPageProxy | null = await this.pdf_document.getPage(pageNumber).catch(() => null); 241 | page_array.push(page == null ?
null : new PdfPageData(page, this.canvasApi, this.ocrApi)); 242 | } 243 | } 244 | } else { 245 | pages = pages.filter((value: number, index: number, self: number[]) => self.indexOf(value) === index).sort((a: number, b: number) => a - b); 246 | for (const pageNumber of pages) { 247 | if (pageNumber <= numPages) { 248 | const page: PDFPageProxy | null = await this.pdf_document.getPage(pageNumber).catch(() => null); 249 | page_array.push(page == null ? null : new PdfPageData(page, this.canvasApi, this.ocrApi)); 250 | } 251 | } 252 | } 253 | 254 | return page_array; 255 | } 256 | 257 | /** 258 | * get the outline/bookmarks 259 | * 260 | * @returns {Promise} a promise that is resolved with a {Outline[]} array with information from the tree outline, or with null if the document provides no outline 261 | */ 262 | async getOutline(): Promise { 263 | const outlineData: RawOutline[] = await this.pdf_document.getOutline(); 264 | if (outlineData == null) return null; 265 | return parseOutline(this.pdf_document, outlineData, {}); 266 | } 267 | 268 | /** 269 | * get the metadata 270 | * 271 | * @returns {Promise} a promise that is resolved with a {MetadataInfo | null} object with information from the metadata section (null if reading the metadata fails) 272 | */ 273 | async getMetadata(): Promise { 274 | return await this.pdf_document.getMetadata().catch(() => null) as MetadataInfo | null; 275 | } 276 | 277 | /** 278 | * close the extractor 279 | * 280 | * @returns {Promise} a promise that is resolved when destruction is completed 281 | */ 282 | async close(): Promise { 283 | return this.pdf_document.destroy(); 284 | } 285 | } -------------------------------------------------------------------------------- /src/pdfpagedata.ts: -------------------------------------------------------------------------------- 1 | import { PDFPageProxy, TextContent, TextItem } from 'pdfjs-dist/types/src/display/api'; 2 | import { OCRLang, Sort } from './types'; 3 | import { PageViewport } from 'pdfjs-dist/types/src/display/display_utils'; 4 | import { CanvasApi, CanvasApiConstructor }
from './canvasapi'; 5 | import { OcrApi, OcrApiConstructor } from './ocrapi'; 6 | 7 | /** 8 | * pdf data information per page 9 | */ 10 | export class PdfPageData { 11 | /** 12 | * @internal 13 | */ 14 | public constructor( 15 | private page: PDFPageProxy, 16 | private readonly canvasApi: CanvasApiConstructor | null, 17 | private readonly ocrApi: OcrApiConstructor | null, 18 | ) { } 19 | 20 | /** 21 | * get the text of the page 22 | * 23 | * @param {boolean|Sort} [sort=false] - sort the text by text coordinates 24 | * @returns {Promise} a promise that is resolved with a {string} with the extracted text of the page 25 | */ 26 | public async toText(sort: boolean | Sort = false): Promise { 27 | const sortOption: Sort | null = typeof sort === 'boolean' ? (sort ? Sort.ASC : null) : sort; 28 | return this.page.getTextContent({ 29 | disableNormalization: false, 30 | includeMarkedContent: false, 31 | }).then((textContent: TextContent) => { 32 | const items: TextItem[] = textContent.items as TextItem[]; 33 | /* 34 | transform is a array with a transform matrix [scale x,shear x,shear y,scale y,offset x,offset y] 35 | 36 | 0,1 1,1 37 | ----------- 38 | | | 39 | | | 40 | | pdf | 41 | | | 42 | | | 43 | ----------- 44 | 0,0 1,0 45 | */ 46 | 47 | //coordinate based sorting 48 | if (sortOption !== null) { 49 | if (sortOption === Sort.ASC) { 50 | items.sort((e1: TextItem, e2: TextItem) => { 51 | if (e1.transform[5] < e2.transform[5]) return 1; 52 | else if (e1.transform[5] > e2.transform[5]) return -1; 53 | else if (e1.transform[4] < e2.transform[4]) return -1; 54 | else if (e1.transform[4] > e2.transform[4]) return 1; 55 | else return 0; 56 | }); 57 | } else { 58 | items.sort((e1: TextItem, e2: TextItem) => { 59 | if (e1.transform[5] < e2.transform[5]) return -1; 60 | else if (e1.transform[5] > e2.transform[5]) return 1; 61 | else if (e1.transform[4] < e2.transform[4]) return 1; 62 | else if (e1.transform[4] > e2.transform[4]) return -1; 63 | else return 0; 64 | }); 65 | } 66 | } 
67 | /* null means "no item seen yet"; a numeric sentinel such as -1 would be confused with an item whose y offset really is -1 */ 68 | let lastLineY: number | null = null, text: string = ''; 69 | for (const item of items) { 70 | if (lastLineY === null || lastLineY === item.transform[5]) { 71 | text += item.str; 72 | // TODO if spaced by coordinates (x + text width + space width = next x) 73 | //textContent.styles[item.fontName]; 74 | //dummyContext.font = ''; 75 | //dummyContext.measureText(item.str); 76 | } else { 77 | text += '\n' + item.str; 78 | } 79 | lastLineY = item.transform[5]; 80 | } 81 | return text; 82 | }, () => ''); 83 | } 84 | 85 | /** 86 | * recognizes the text from the image information of this pdf page 87 | * requires node-canvas/node-pureimage and tesseract.js as additional installation 88 | * 89 | * @param {OCRLang[]} langs - the language traineddata used for recognition 90 | * @returns {Promise} the result as text 91 | */ 92 | public async ocr(langs: OCRLang[]): Promise { 93 | if (!this.ocrApi) throw new Error('OcrFactory.ocrApi is not set (tesseractjs)'); 94 | const ocr: OcrApi = new this.ocrApi(); 95 | const result: string[] = await ocr.ocrBuffers([await this.toJPEG()], langs); 96 | return result[0]; 97 | } 98 | 99 | /** 100 | * creates a canvas and renders 101 | * 102 | * @param {T} canvasApi - the canvas api that is used to create the canvas 103 | * @returns {Promise} the canvas 104 | */ 105 | public async toCanvasApi(canvasApi: CanvasApiConstructor): Promise { 106 | const viewport: PageViewport = this.page.getViewport({ scale: 1.0 }); 107 | const canvas: T = new canvasApi(viewport.width, viewport.height); 108 | await this.page.render({ 109 | canvasContext: canvas.createContext(), 110 | viewport: viewport, 111 | }).promise; 112 | return canvas; 113 | } 114 | 115 | /** 116 | * converts to a jpeg image 117 | * 118 | * @param {number} [quality=0.8] - the quality of the image (0.0-1.0) 119 | * @returns {Promise} the jpeg image as a {Buffer} 120 | */ 121 | public async toJPEG(quality: number = 0.8): Promise { 122 | if (!this.canvasApi) throw new Error('canvasApi is not set
(node-canvas or pureimage is not installed)'); 123 | return (await this.toCanvasApi(this.canvasApi)).toJPEG(quality); 124 | } 125 | 126 | /** 127 | * converts to a png image 128 | * 129 | * @returns {Promise} the png image as a {Buffer} 130 | */ 131 | public async toPNG(): Promise { 132 | if (!this.canvasApi) throw new Error('canvasApi is not set (node-canvas or pureimage is not installed)'); 133 | return (await this.toCanvasApi(this.canvasApi)).toPNG(); 134 | } 135 | 136 | /** 137 | * close the page data 138 | * @returns {boolean} — if close was successfully 139 | */ 140 | public close(): boolean { 141 | return this.page.cleanup(); 142 | } 143 | } -------------------------------------------------------------------------------- /src/pureimagecanvas.ts: -------------------------------------------------------------------------------- 1 | //@ts-ignore: ignore import errors because its dynamicly loaded from pdfdataextractor 2 | import { encodeJPEGToStream, encodePNGToStream, make } from 'pureimage'; 3 | //@ts-ignore: ignore import errors because its dynamicly loaded from pdfdataextractor 4 | import { Bitmap } from 'pureimage/types/bitmap'; 5 | import { PassThrough } from 'stream'; 6 | import { CanvasApi } from './canvasapi'; 7 | 8 | /** 9 | * default implementation for pureimage 10 | * look at the {CanvasApi} doc 11 | */ 12 | export class PureimageCanvas implements CanvasApi { 13 | private bitmap: Bitmap; 14 | /** 15 | * @internal 16 | */ 17 | public constructor(width: number, height: number) { 18 | this.bitmap = make(width, height); 19 | } 20 | /** 21 | * @internal 22 | */ 23 | public async toPNG(): Promise { 24 | const result: Uint8Array[] = []; 25 | const stream: PassThrough = new PassThrough(); 26 | stream.on('data', (data: Uint8Array) => result.push(data)); 27 | await encodePNGToStream(this.bitmap, stream); 28 | return Buffer.concat(result); 29 | } 30 | /** 31 | * @internal 32 | */ 33 | public async toJPEG(quality?: number): Promise { 34 | const result: 
Uint8Array[] = []; 35 | const stream: PassThrough = new PassThrough(); 36 | stream.on('data', (data: Uint8Array) => result.push(data)); 37 | await encodeJPEGToStream(this.bitmap, stream, quality); 38 | return Buffer.concat(result); 39 | } 40 | /** 41 | * @internal 42 | */ 43 | public createContext(): CanvasRenderingContext2D { 44 | return this.bitmap.getContext('2d'); 45 | } 46 | /** 47 | * @internal 48 | */ 49 | public reset(width: number, height: number): void { 50 | this.bitmap.width = width; 51 | this.bitmap.height = height; 52 | } 53 | /** 54 | * @internal 55 | */ 56 | public destroy(): void { 57 | this.bitmap.width = 0; 58 | this.bitmap.height = 0; 59 | } 60 | } -------------------------------------------------------------------------------- /src/tesseractjsocr.ts: -------------------------------------------------------------------------------- 1 | //@ts-ignore: ignore import errors because its dynamicly loaded from pdfdataextractor 2 | import { createScheduler, createWorker, RecognizeResult, Scheduler, Worker } from 'tesseract.js'; 3 | import { OcrApi } from './ocrapi'; 4 | import { OCRLang } from './types'; 5 | 6 | /** 7 | * implementation for tesseractjs 8 | */ 9 | export class TesseractJsOcr implements OcrApi { 10 | /** 11 | * recognize characters of buffers 12 | * 13 | * @param {Buffer[]} buffers - the image buffers 14 | * @param {OCRLang[]} langs - the language traineddata used for recognition 15 | * @returns {Promise} an array with the recognized text of each buffer (same order as the buffers) 16 | */ 17 | async ocrBuffers(buffers: Buffer[], langs: OCRLang[]): Promise { 18 | if (buffers.length == 0) return []; 19 | const lang: string = langs.join('+'); 20 | if (buffers.length == 1) { 21 | const worker: Worker = await createWorker(lang); 22 | try { 23 | const data: RecognizeResult = await worker.recognize(buffers[0]); 24 | return [data.data.text]; 25 | } finally { 26 | /* terminate the worker even if the recognition fails */ 27 | await worker.terminate(); 28 | } 29 | } 30 | const scheduler: Scheduler = createScheduler(); 31 | try { 32 | for (let i: number =
0; i < buffers.length; i++) { 33 | const worker: Worker = await createWorker(lang); 34 | scheduler.addWorker(worker); 35 | } 36 | const result: RecognizeResult[] = await Promise.all(buffers.map(async (buffer: Buffer) => scheduler.addJob('recognize', buffer))) as RecognizeResult[]; 37 | return result.map((r: RecognizeResult) => r.data.text); 38 | } finally { 39 | /* terminate the scheduler (and the workers added so far) even if creating a worker or a job fails */ 40 | await scheduler.terminate(); 41 | } 42 | } 43 | } -------------------------------------------------------------------------------- /src/types.ts: -------------------------------------------------------------------------------- 1 | import { VerbosityLevel as RawVerbosityLevel } from 'pdfjs-dist/legacy/build/pdf.mjs'; 2 | 3 | export enum VerbosityLevel { 4 | /** 5 | * logs all errors 6 | */ 7 | ERRORS = RawVerbosityLevel.ERRORS, 8 | /** 9 | * logs all infos 10 | */ 11 | INFOS = RawVerbosityLevel.INFOS, 12 | /** 13 | * logs all warnings 14 | */ 15 | WARNINGS = RawVerbosityLevel.WARNINGS, 16 | } 17 | 18 | export enum Sort { 19 | /** 20 | * if it should sort ascending 21 | */ 22 | ASC, 23 | /** 24 | * if it should sort descending 25 | */ 26 | DESC 27 | } 28 | 29 | export enum OCRLang { 30 | /** 31 | * AFR 32 | */ 33 | AFR = 'afr', 34 | /** 35 | * AMH 36 | */ 37 | AMH = 'amh', 38 | /** 39 | * ARA 40 | */ 41 | ARA = 'ara', 42 | /** 43 | * ASM 44 | */ 45 | ASM = 'asm', 46 | /** 47 | * AZE 48 | */ 49 | AZE = 'aze', 50 | /** 51 | * AZE_CYRL 52 | */ 53 | AZE_CYRL = 'aze_cyrl', 54 | /** 55 | * BEL 56 | */ 57 | BEL = 'bel', 58 | /** 59 | * BEN 60 | */ 61 | BEN = 'ben', 62 | /** 63 | * BOD 64 | */ 65 | BOD = 'bod', 66 | /** 67 | * BOS 68 | */ 69 | BOS = 'bos', 70 | /** 71 | * BUL 72 | */ 73 | BUL = 'bul', 74 | /** 75 | * CAT 76 | */ 77 | CAT = 'cat', 78 | /** 79 | * CEB 80 | */ 81 | CEB = 'ceb', 82 | /** 83 | * CES 84 | */ 85 | CES = 'ces', 86 | /** 87 | * CHI_SIM 88 | */ 89 | CHI_SIM = 'chi_sim', 90 | /** 91 | * CHI_TRA 92 | */ 93 | CHI_TRA = 'chi_tra', 94 | /** 95 | * CHR 96 | */ 97 | CHR = 'chr', 98 | /** 99 | * CYM 100 | */ 101 | CYM =
'cym', 102 | /** 103 | * DAN 104 | */ 105 | DAN = 'dan', 106 | /** 107 | * DEU 108 | */ 109 | DEU = 'deu', 110 | /** 111 | * DZO 112 | */ 113 | DZO = 'dzo', 114 | /** 115 | * ELL 116 | */ 117 | ELL = 'ell', 118 | /** 119 | * ENG 120 | */ 121 | ENG = 'eng', 122 | /** 123 | * ENM 124 | */ 125 | ENM = 'enm', 126 | /** 127 | * EPO 128 | */ 129 | EPO = 'epo', 130 | /** 131 | * EST 132 | */ 133 | EST = 'est', 134 | /** 135 | * EUS 136 | */ 137 | EUS = 'eus', 138 | /** 139 | * FAS 140 | */ 141 | FAS = 'fas', 142 | /** 143 | * FIN 144 | */ 145 | FIN = 'fin', 146 | /** 147 | * FRA 148 | */ 149 | FRA = 'fra', 150 | /** 151 | * FRK 152 | */ 153 | FRK = 'frk', 154 | /** 155 | * FRM 156 | */ 157 | FRM = 'frm', 158 | /** 159 | * GLE 160 | */ 161 | GLE = 'gle', 162 | /** 163 | * GLG 164 | */ 165 | GLG = 'glg', 166 | /** 167 | * GRC 168 | */ 169 | GRC = 'grc', 170 | /** 171 | * GUJ 172 | */ 173 | GUJ = 'guj', 174 | /** 175 | * HAT 176 | */ 177 | HAT = 'hat', 178 | /** 179 | * HEB 180 | */ 181 | HEB = 'heb', 182 | /** 183 | * HIN 184 | */ 185 | HIN = 'hin', 186 | /** 187 | * HRV 188 | */ 189 | HRV = 'hrv', 190 | /** 191 | * HUN 192 | */ 193 | HUN = 'hun', 194 | /** 195 | * IKU 196 | */ 197 | IKU = 'iku', 198 | /** 199 | * IND 200 | */ 201 | IND = 'ind', 202 | /** 203 | * ISL 204 | */ 205 | ISL = 'isl', 206 | /** 207 | * ITA 208 | */ 209 | ITA = 'ita', 210 | /** 211 | * ITA_OLD 212 | */ 213 | ITA_OLD = 'ita_old', 214 | /** 215 | * JAV 216 | */ 217 | JAV = 'jav', 218 | /** 219 | * JPN 220 | */ 221 | JPN = 'jpn', 222 | /** 223 | * KAN 224 | */ 225 | KAN = 'kan', 226 | /** 227 | * KAT 228 | */ 229 | KAT = 'kat', 230 | /** 231 | * KAT_OLD 232 | */ 233 | KAT_OLD = 'kat_old', 234 | /** 235 | * KAZ 236 | */ 237 | KAZ = 'kaz', 238 | /** 239 | * KHM 240 | */ 241 | KHM = 'khm', 242 | /** 243 | * KIR 244 | */ 245 | KIR = 'kir', 246 | /** 247 | * KOR 248 | */ 249 | KOR = 'kor', 250 | /** 251 | * KUR 252 | */ 253 | KUR = 'kur', 254 | /** 255 | * LAO 256 | */ 257 | LAO = 'lao', 258 | /** 259 | * 
LAT 260 | */ 261 | LAT = 'lat', 262 | /** 263 | * LAV 264 | */ 265 | LAV = 'lav', 266 | /** 267 | * LIT 268 | */ 269 | LIT = 'lit', 270 | /** 271 | * MAL 272 | */ 273 | MAL = 'mal', 274 | /** 275 | * MAR 276 | */ 277 | MAR = 'mar', 278 | /** 279 | * MKD 280 | */ 281 | MKD = 'mkd', 282 | /** 283 | * MLT 284 | */ 285 | MLT = 'mlt', 286 | /** 287 | * MSA 288 | */ 289 | MSA = 'msa', 290 | /** 291 | * MYA 292 | */ 293 | MYA = 'mya', 294 | /** 295 | * NEP 296 | */ 297 | NEP = 'nep', 298 | /** 299 | * NLD 300 | */ 301 | NLD = 'nld', 302 | /** 303 | * NOR 304 | */ 305 | NOR = 'nor', 306 | /** 307 | * ORI 308 | */ 309 | ORI = 'ori', 310 | /** 311 | * PAN 312 | */ 313 | PAN = 'pan', 314 | /** 315 | * POL 316 | */ 317 | POL = 'pol', 318 | /** 319 | * POR 320 | */ 321 | POR = 'por', 322 | /** 323 | * PUS 324 | */ 325 | PUS = 'pus', 326 | /** 327 | * RON 328 | */ 329 | RON = 'ron', 330 | /** 331 | * RUS 332 | */ 333 | RUS = 'rus', 334 | /** 335 | * SAN 336 | */ 337 | SAN = 'san', 338 | /** 339 | * SIN 340 | */ 341 | SIN = 'sin', 342 | /** 343 | * SLK 344 | */ 345 | SLK = 'slk', 346 | /** 347 | * SLV 348 | */ 349 | SLV = 'slv', 350 | /** 351 | * SPA 352 | */ 353 | SPA = 'spa', 354 | /** 355 | * SPA_OLD 356 | */ 357 | SPA_OLD = 'spa_old', 358 | /** 359 | * SQI 360 | */ 361 | SQI = 'sqi', 362 | /** 363 | * SRP 364 | */ 365 | SRP = 'srp', 366 | /** 367 | * SRP_LATN 368 | */ 369 | SRP_LATN = 'srp_latn', 370 | /** 371 | * SWA 372 | */ 373 | SWA = 'swa', 374 | /** 375 | * SWE 376 | */ 377 | SWE = 'swe', 378 | /** 379 | * SYR 380 | */ 381 | SYR = 'syr', 382 | /** 383 | * TAM 384 | */ 385 | TAM = 'tam', 386 | /** 387 | * TEL 388 | */ 389 | TEL = 'tel', 390 | /** 391 | * TGK 392 | */ 393 | TGK = 'tgk', 394 | /** 395 | * TGL 396 | */ 397 | TGL = 'tgl', 398 | /** 399 | * THA 400 | */ 401 | THA = 'tha', 402 | /** 403 | * TIR 404 | */ 405 | TIR = 'tir', 406 | /** 407 | * TUR 408 | */ 409 | TUR = 'tur', 410 | /** 411 | * UIG 412 | */ 413 | UIG = 'uig', 414 | /** 415 | * UKR 416 | */ 417 | UKR 
= 'ukr', 418 | /** 419 | * URD 420 | */ 421 | URD = 'urd', 422 | /** 423 | * UZB 424 | */ 425 | UZB = 'uzb', 426 | /** 427 | * UZB_CYRL 428 | */ 429 | UZB_CYRL = 'uzb_cyrl', 430 | /** 431 | * VIE 432 | */ 433 | VIE = 'vie', 434 | /** 435 | * YID 436 | */ 437 | YID = 'yid' 438 | } 439 | 440 | export interface Permissions { 441 | /** 442 | * allow to assemble 443 | * 444 | * @readonly 445 | * @type {boolean} 446 | */ 447 | readonly assemble: boolean, 448 | /** 449 | * allow to copy the content 450 | * 451 | * @readonly 452 | * @type {boolean} 453 | */ 454 | readonly copy: boolean, 455 | /** 456 | * allow to fill interactive forms 457 | * 458 | * @readonly 459 | * @type {boolean} 460 | */ 461 | readonly fillInteractiveForms: boolean, 462 | /** 463 | * allow to modify annotations 464 | * 465 | * @readonly 466 | * @type {boolean} 467 | */ 468 | readonly modifyAnnotations: boolean, 469 | /** 470 | * allow to modify contents 471 | * 472 | * @readonly 473 | * @type {boolean} 474 | */ 475 | readonly modifyContents: boolean, 476 | /** 477 | * allow to print 478 | * 479 | * @readonly 480 | * @type {boolean} 481 | */ 482 | readonly print: boolean, 483 | /** 484 | * allow to print in highquality 485 | * 486 | * @readonly 487 | * @type {boolean} 488 | */ 489 | readonly printHQ: boolean, 490 | /** 491 | * allow to copy the content for accessibility 492 | * 493 | * @readonly 494 | * @type {boolean} 495 | */ 496 | readonly copyForAccessibility: boolean, 497 | } 498 | 499 | export interface Metadata { 500 | /** 501 | * get the raw metadata 502 | * 503 | * @returns {string} the raw metadata 504 | */ 505 | getRaw(): string; 506 | /** 507 | * get data by name 508 | * 509 | * @returns {string} the data 510 | */ 511 | get(name: string): string | string[]; 512 | /** 513 | * get all data 514 | * 515 | * @returns {{ [key: string]: string | string[] | undefined }} all data 516 | */ 517 | getAll(): { [key: string]: string | string[] | undefined }; 518 | /** 519 | * check whether data with the 
/**
 * bundles the meta information and the metadata of a document
 */
export interface MetadataInfo {
	/**
	 * the meta information of the document
	 */
	info: Info;
	/**
	 * the metadata of the document
	 */
	metadata: Metadata;
}

/**
 * a value that is represented by a name
 */
export interface Name {
	/**
	 * the name
	 *
	 * @readonly
	 * @type {string}
	 */
	readonly name: string
}

/**
 * the meta information of a document
 *
 * every entry is optional; entries that are not listed explicitly
 * are still reachable through the index signature
 */
export interface Info {
	/**
	 * the title
	 *
	 * @readonly
	 * @type {string | undefined}
	 */
	readonly Title?: string,
	/**
	 * the author
	 *
	 * @readonly
	 * @type {string | undefined}
	 */
	readonly Author?: string,
	/**
	 * the subject
	 *
	 * @readonly
	 * @type {string | undefined}
	 */
	readonly Subject?: string,
	/**
	 * the keywords
	 *
	 * @readonly
	 * @type {string | undefined}
	 */
	readonly Keywords?: string,
	/**
	 * the creator
	 *
	 * @readonly
	 * @type {string | undefined}
	 */
	readonly Creator?: string,
	/**
	 * the producer
	 *
	 * @readonly
	 * @type {string | undefined}
	 */
	readonly Producer?: string,
	/**
	 * the creation date
	 *
	 * @readonly
	 * @type {string | undefined}
	 */
	readonly CreationDate?: string,
	/**
	 * the modification date
	 *
	 * @readonly
	 * @type {string | undefined}
	 */
	readonly ModDate?: string,
	/**
	 * the trapped entry
	 *
	 * @readonly
	 * @type {Name | undefined}
	 */
	readonly Trapped?: Name,
	/**
	 * the format version
	 *
	 * @readonly
	 * @type {string | undefined}
	 */
	readonly PDFFormatVersion?: string,
	/**
	 * if it is linearized
	 *
	 * @readonly
	 * @type {boolean | undefined}
	 */
	readonly IsLinearized?: boolean,
	/**
	 * if acro form is present
	 *
	 * @readonly
	 * @type {boolean | undefined}
	 */
	readonly IsAcroFormPresent?: boolean,
	/**
	 * if xfa form is present
	 *
	 * @readonly
	 * @type {boolean | undefined}
	 */
	readonly IsXFAPresent?: boolean,
	/**
	 * if collection is present
	 *
	 * @readonly
	 * @type {boolean | undefined}
	 */
	readonly IsCollectionPresent?: boolean,
	/**
	 * if signatures are present
	 *
	 * @readonly
	 * @type {boolean | undefined}
	 */
	readonly IsSignaturesPresent?: boolean,
	/**
	 * any further entry, addressed by its key
	 */
	readonly [key: string]: string | number | boolean | Name | undefined,
}

/**
 * it is an outline (bookmark) of the pdf document
 */
export interface Outline {
	/**
	 * the title
	 *
	 * @readonly
	 * @type {string}
	 */
	readonly title: string,
	/**
	 * the children
	 *
	 * @readonly
	 * @type {ReadonlyArray<Outline> | undefined}
	 */
	readonly childs?: readonly Outline[],
}

/**
 * the outline which includes a url
 */
export class UrlOutline implements Outline {
	/**
	 * @param {string} title - the title
	 * @param {string} url - the url to which the outline points
	 * @param {boolean} absolute - if the url is absolute
	 * @param {ReadonlyArray<Outline>} [childs] - the children
	 */
	constructor(
		readonly title: string,
		readonly url: string,
		readonly absolute: boolean,
		readonly childs?: readonly Outline[],
	) { }
}

/**
 * the outline which includes a page number
 */
export class PageNumberOutline implements Outline {
	/**
	 * @param {string} title - the title
	 * @param {number} page - the page number to which the outline points
	 * @param {ReadonlyArray<Outline>} [childs] - the children
	 */
	constructor(
		readonly title: string,
		readonly page: number,
		readonly childs?: readonly Outline[],
	) { }
}

/**
 * the outline which includes a reference to another pdf
 */
export class PdfReferenceOutline implements Outline {
	/**
	 * @param {string} title - the title
	 * @param {string} url - the url to which the outline points
	 * @param {number} [page] - the remote page number to which the outline points (optional)
	 * @param {ReadonlyArray<Outline>} [childs] - the children
	 */
	constructor(
		readonly title: string,
		readonly url: string,
		readonly page?: number,
		readonly childs?: readonly Outline[],
	) { }
}
const PDF_TEST_FILE = './test/basic.pdf';

import { PdfData, VerbosityLevel } from '../src';
import { readFileSync } from 'fs';
import { test, describe, expect } from 'vitest';

// every switch of the `get` option, in the order in which the fields are checked
const GET_KEYS = ['pages', 'text', 'fingerprint', 'outline', 'metadata', 'info', 'permissions'] as const;
type GetKey = typeof GET_KEYS[number];

/**
 * builds a `get` option in which every switch is off except the given one
 *
 * @param {GetKey} [enabled] - the only switch to turn on; when omitted everything is off
 * @returns the `get` option with all seven switches set explicitly
 */
function getOnly(enabled?: GetKey) {
	return {
		pages: enabled === 'pages',
		text: enabled === 'text',
		fingerprint: enabled === 'fingerprint',
		outline: enabled === 'outline',
		metadata: enabled === 'metadata',
		info: enabled === 'info',
		permissions: enabled === 'permissions',
	};
}

describe(`parse ${PDF_TEST_FILE}`, () => {
	const buffer = readFileSync(PDF_TEST_FILE);

	test('without password should fail', async () => {
		await expect(PdfData.extract(buffer)).rejects.toThrow();
	});

	test('extract basic data', async () => {
		const data = await PdfData.extract(buffer, {
			password: '123456',
			verbosity: VerbosityLevel.ERRORS,
		});
		expect(data.pages).toEqual(2);

		// assert the same condition the guard below uses, otherwise a missing
		// value would skip the nested checks without failing the test
		expect(data.text).toBeTruthy();
		if (data.text) {
			expect(data.text.length).toEqual(2);
			const first_page_lines = data.text[0].split('\n');
			expect(first_page_lines.length).toEqual(31);
			expect(first_page_lines[7]).toMatch(/^dapibus mattis/);
		}

		expect(data.permissions).toBeTruthy();
		if (data.permissions) {
			expect(data.permissions.print).toEqual(true);
			expect(data.permissions.modifyAnnotations).toEqual(false);
		}
	});

	test('extract seperated basic data', async () => {
		// first nothing at all, then each switch on its own
		// TODO: 'metadata' on its own is not checked yet
		const cases: (GetKey | undefined)[] = [
			undefined,
			'pages',
			'text',
			'fingerprint',
			'outline',
			'info',
			'permissions',
		];

		for (const enabled of cases) {
			const data = await PdfData.extract(buffer, {
				password: '123456',
				verbosity: VerbosityLevel.ERRORS,
				get: getOnly(enabled),
			});

			for (const key of GET_KEYS) {
				// the label tells which combination failed, since all cases share one test
				const label = `get only ${enabled === undefined ? 'nothing' : enabled}: ${key}`;
				if (key === enabled) {
					expect(data[key], label).toBeDefined();
				} else {
					expect(data[key], label).toBeUndefined();
				}
			}
		}
	});
});
const PDF_TEST_FILE = './test/empty_outline.pdf';

import { PdfData, VerbosityLevel } from '../src';
import { readFileSync } from 'fs';
import { test, describe, expect } from 'vitest';

describe(`parse ${PDF_TEST_FILE}`, () => {
	const buffer = readFileSync(PDF_TEST_FILE);

	// a document without outline entries must not expose an outline at all
	test('extract empty outline', async () => {
		const data = await PdfData.extract(buffer, {
			verbosity: VerbosityLevel.ERRORS,
		});
		// `outline` is a plain property of the extracted data, so it is not awaited
		expect(data.outline).toBeUndefined();
	});
});
const PDF_TEST_FILE = './test/outline.pdf';

import { PdfData, VerbosityLevel, PageNumberOutline, UrlOutline, PdfReferenceOutline } from '../src';
import { readFileSync } from 'fs';
import { test, describe, expect } from 'vitest';

describe(`parse ${PDF_TEST_FILE}`, () => {
	const buffer = readFileSync(PDF_TEST_FILE);

	// only the outline is requested, every other part of the data is switched off
	test('extract outline', async () => {
		const data = await PdfData.extract(buffer, {
			verbosity: VerbosityLevel.ERRORS,
			get: {
				pages: false,
				text: false,
				fingerprint: false,
				outline: true,
				metadata: false,
				info: false,
				permissions: false
			}
		});

		// assert the same condition the guard below uses, otherwise a missing
		// outline would skip every nested check without failing the test
		expect(data.outline).toBeTruthy();
		if (data.outline) {
			const outline0 = data.outline[0];
			expect(outline0.title).toEqual('to_page_1');
			expect(outline0).toBeInstanceOf(PageNumberOutline);
			if (outline0 instanceof PageNumberOutline) {
				expect(outline0.page).toEqual(0);
			}

			const outline1 = data.outline[1];
			expect(outline1.title).toEqual('to_page_1_reference');
			expect(outline1).toBeInstanceOf(PageNumberOutline);
			if (outline1 instanceof PageNumberOutline) {
				expect(outline1.page).toEqual(0);
			}

			const outline2 = data.outline[2];
			expect(outline2.title).toEqual('url');
			expect(outline2).toBeInstanceOf(UrlOutline);
			if (outline2 instanceof UrlOutline) {
				expect(outline2.url).toEqual('https://github.com/lublak/pdfdataextract');
				expect(outline2.absolute).toEqual(true);
			}

			const outline3 = data.outline[3];
			expect(outline3.title).toEqual('to_pdf');
			expect(outline3).toBeInstanceOf(PdfReferenceOutline);
			if (outline3 instanceof PdfReferenceOutline) {
				expect(outline3.url).toEqual('specific_pages.pdf');
				expect(outline3.page).toEqual(0);
			}

			const outline4 = data.outline[4];
			expect(outline4.title).toEqual('open');
			expect(outline4).toBeInstanceOf(UrlOutline);
			if (outline4 instanceof UrlOutline) {
				expect(outline4.url).toEqual('specific_pages.test.ts');
				expect(outline4.absolute).toEqual(false);
			}
		}
	});
});
const PDF_TEST_FILE = './test/specific_pages.pdf';

import { PdfDataExtractor, VerbosityLevel } from '../src';
import { readFileSync } from 'fs';
import { test, describe, expect } from 'vitest';

describe(`parse ${PDF_TEST_FILE}`, () => {
	const buffer = readFileSync(PDF_TEST_FILE);

	test('extract specific pages', async () => {
		const extractor = await PdfDataExtractor.get(buffer, {
			verbosity: VerbosityLevel.ERRORS,
		});

		// the result is not inspected; the call only has to resolve without an error
		await extractor.getPageData([1]);

		// pages selected by a list of page numbers
		expect(await extractor.getText([2])).toEqual(['2']);
		expect(await extractor.getText([5, 9])).toEqual(['5', '9']);

		// pages selected by a filter function
		expect(await extractor.getText((pageNumber) => pageNumber === 7)).toEqual(['7']);

		// a page number that is listed twice is expected only once in the result
		expect(await extractor.getText([5, 9, 5])).toEqual(['5', '9']);
	});
});
const PDF_TEST_FILE = './test/specific_pages.pdf';

import { PdfData, VerbosityLevel } from '../src';
import { readFileSync } from 'fs';
import { test, describe, expect } from 'vitest';

describe(`parse ${PDF_TEST_FILE}`, () => {
	const buffer = readFileSync(PDF_TEST_FILE);

	test('extract specific pages', async () => {
		// each entry pairs a page selection with the text expected for it
		const cases: { pages: number[] | ((pageNumber: number) => boolean), text: string[] }[] = [
			// a single page number
			{ pages: [2], text: ['2'] },
			// several page numbers
			{ pages: [5, 9], text: ['5', '9'] },
			// a filter function instead of a list
			{ pages: (pageNumber) => pageNumber === 7, text: ['7'] },
			// a page number that is listed twice is expected only once in the result
			{ pages: [5, 9, 5], text: ['5', '9'] },
		];

		for (const { pages, text } of cases) {
			const data = await PdfData.extract(buffer, {
				verbosity: VerbosityLevel.ERRORS,
				pages,
			});
			expect(data.text).toEqual(text);
		}
	});
});
// `defineConfig` is taken from 'vitest/config' because the `test` key below
// is a Vitest option
import { defineConfig } from 'vitest/config'

export default defineConfig({
	// no overrides, the Vitest defaults are used
	test: {},
})