├── .gitattributes ├── .github └── workflows │ ├── build-for-gh-pages.yml │ ├── tagged-release.yml │ └── tests.yml ├── .gitignore ├── .npmignore ├── .nvmrc ├── .prettierignore ├── .prettierrc.cjs ├── .vscode └── settings.json ├── LICENSE-CC0 ├── LICENSE-MIT ├── README.md ├── configs ├── dep-cruiser.config.ts ├── mocha.config.cjs ├── ncu.config.ts └── typedoc.config.ts ├── cspell.config.cjs ├── package-lock.json ├── package.json ├── src ├── esm-support.test.ts ├── index.ts ├── read-pdf.test.ts ├── read-pdf.ts ├── readme-examples │ ├── lower-level-controls.example.ts │ ├── read-pdf-pages.example.ts │ └── read-pdf-text.example.ts └── repo-paths.test-helper.ts ├── test-files ├── dummy-with-password.pdf ├── dummy.pdf └── pdfkit-out.pdf └── tsconfig.json /.gitattributes: -------------------------------------------------------------------------------- 1 | # helps with line endings for Windows in automated tests 2 | * text=auto eol=lf -------------------------------------------------------------------------------- /.github/workflows/build-for-gh-pages.yml: -------------------------------------------------------------------------------- 1 | name: build-for-gh-pages 2 | 3 | on: 4 | push: 5 | branches: 6 | - dev 7 | 8 | jobs: 9 | build-for-gh-pages: 10 | runs-on: 'ubuntu-latest' 11 | 12 | steps: 13 | - name: Checkout Repository 14 | uses: actions/checkout@v4.1.1 15 | - name: Setup Node 16 | uses: actions/setup-node@v4 17 | with: 18 | node-version-file: '.nvmrc' 19 | cache: 'npm' 20 | - name: build 21 | run: | 22 | npm ci 23 | npm run docs 24 | - uses: JamesIves/github-pages-deploy-action@v4 25 | with: 26 | branch: 'gh-pages' 27 | folder: 'dist-docs' 28 | -------------------------------------------------------------------------------- /.github/workflows/tagged-release.yml: -------------------------------------------------------------------------------- 1 | name: 'tagged-release' 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*' 7 | 8 | jobs: 9 | tagged-release: 10 | name: 'Tagged 
Release' 11 | runs-on: 'ubuntu-latest' 12 | 13 | steps: 14 | - name: Checkout Repository 15 | uses: actions/checkout@v4.1.1 16 | - name: Setup Node 17 | uses: actions/setup-node@v4 18 | with: 19 | node-version-file: '.nvmrc' 20 | cache: 'npm' 21 | - name: run test 22 | run: | 23 | npm ci 24 | npx playwright install --with-deps 25 | npm run test:all 26 | - name: pack and set vars 27 | id: vars 28 | run: | 29 | tagName="${GITHUB_REF#refs/*/}" 30 | originalTarName="$(npm pack)" 31 | packageNameWithoutFileExtension="${originalTarName%.tgz}" 32 | tagNameWithoutV="${tagName##v}" 33 | packageName="${packageNameWithoutFileExtension%-$tagNameWithoutV}" 34 | newTarName="$packageName-$tagName.tgz" 35 | mv "$originalTarName" "$newTarName" 36 | echo ::set-output name=tarName::"$newTarName" 37 | 38 | - uses: 'marvinpinto/action-automatic-releases@latest' 39 | with: 40 | repo_token: '${{ secrets.GITHUB_TOKEN }}' 41 | prerelease: false 42 | files: | 43 | ${{ steps.vars.outputs.tarName }} 44 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - '**' 7 | 8 | jobs: 9 | build: 10 | runs-on: ${{ matrix.os }} 11 | strategy: 12 | fail-fast: false 13 | matrix: 14 | os: [ubuntu-latest, macos-latest] 15 | steps: 16 | - name: Checkout Repository 17 | uses: actions/checkout@v4.1.1 18 | - name: Setup Node 19 | uses: actions/setup-node@v4 20 | with: 21 | node-version-file: '.nvmrc' 22 | cache: 'npm' 23 | - name: run test 24 | run: | 25 | npm ci 26 | npm run test:all 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.tgz 2 | .DS_Store 3 | .not-committed/ 4 | /Icon? 
5 | coverage/ 6 | dist-*/ 7 | dist/ 8 | generated-config-* 9 | graphics/ 10 | node_modules/ 11 | src/all-files-for-code-coverage.test.ts 12 | ts.out 13 | tsconfig.tsbuildinfo -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | **/*/readme-examples/ 2 | **/*/test/ 3 | *.book-helper.* 4 | *.book.* 5 | *.d.ts.map 6 | *.test-helper.* 7 | *.test.* 8 | *.tgz 9 | .* 10 | .not-committed/ 11 | /Icon? 12 | /configs/ 13 | /coverage/ 14 | /cspell.config.js 15 | /graphics/ 16 | /src/ 17 | /www-static/ 18 | bash-scripts/ 19 | dist-docs/ 20 | generated-config-* 21 | index.html 22 | test-files/ 23 | ts.out/ 24 | tsconfig*.json 25 | tsconfig.tsbuildinfo -------------------------------------------------------------------------------- /.nvmrc: -------------------------------------------------------------------------------- 1 | 22 -------------------------------------------------------------------------------- /.prettierignore: -------------------------------------------------------------------------------- 1 | **/all-files-for-code-coverage.test.ts 2 | **/coverage/ 3 | **/dist-*/ 4 | **/dist/ 5 | **/generated-config-* 6 | **/graphics/ 7 | **/node_modules/ 8 | **/package-lock.json 9 | **/ts.out/ 10 | **/tsconfig.tsbuildinfo 11 | /Icon? 
12 | 13 | coverage/ 14 | dist/ 15 | graphics/ 16 | node_modules/ 17 | package-lock.json 18 | src/all-files-for-code-coverage.test.ts -------------------------------------------------------------------------------- /.prettierrc.cjs: -------------------------------------------------------------------------------- 1 | const {basePrettierConfig} = require('virmator/base-configs/base-prettierrc.js'); 2 | 3 | /** 4 | * @typedef {import('prettier-plugin-multiline-arrays').MultilineArrayOptions} MultilineOptions 5 | * 6 | * @typedef {import('prettier').Options} PrettierOptions 7 | * @type {PrettierOptions & MultilineOptions} 8 | */ 9 | const prettierConfig = { 10 | ...basePrettierConfig, 11 | }; 12 | 13 | module.exports = prettierConfig; 14 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "[css]": { 3 | "editor.defaultFormatter": "esbenp.prettier-vscode" 4 | }, 5 | "[html]": { 6 | "editor.defaultFormatter": "esbenp.prettier-vscode" 7 | }, 8 | "[javascript]": { 9 | "editor.defaultFormatter": "esbenp.prettier-vscode" 10 | }, 11 | "[javascriptreact]": { 12 | "editor.defaultFormatter": "esbenp.prettier-vscode" 13 | }, 14 | "[json]": { 15 | "editor.defaultFormatter": "esbenp.prettier-vscode" 16 | }, 17 | "[jsonc]": { 18 | "editor.defaultFormatter": "esbenp.prettier-vscode" 19 | }, 20 | "[less]": { 21 | "editor.defaultFormatter": "esbenp.prettier-vscode" 22 | }, 23 | "[markdown]": { 24 | "editor.defaultFormatter": "esbenp.prettier-vscode" 25 | }, 26 | "[ruby]": { 27 | "editor.defaultFormatter": "esbenp.prettier-vscode" 28 | }, 29 | "[scss]": { 30 | "editor.defaultFormatter": "esbenp.prettier-vscode" 31 | }, 32 | "[toml]": { 33 | "editor.defaultFormatter": "esbenp.prettier-vscode" 34 | }, 35 | "[typescript]": { 36 | "editor.defaultFormatter": "esbenp.prettier-vscode" 37 | }, 38 | "[yaml]": { 39 | "editor.defaultFormatter": 
"esbenp.prettier-vscode" 40 | }, 41 | "editor.formatOnSave": true, 42 | "editor.rulers": [ 43 | 100 44 | ], 45 | "editor.wordWrapColumn": 100, 46 | "files.associations": { 47 | "*.svg": "html" 48 | }, 49 | "typescript.tsdk": "node_modules/typescript/lib" 50 | } 51 | -------------------------------------------------------------------------------- /LICENSE-CC0: -------------------------------------------------------------------------------- 1 | CC0 1.0 Universal 2 | 3 | Creative Commons Legal Code 4 | 5 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE 6 | LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN 7 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS 8 | INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES 9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS 10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM 11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED 12 | HEREUNDER. 13 | 14 | Statement of Purpose 15 | 16 | The laws of most jurisdictions throughout the world automatically confer 17 | exclusive Copyright and Related Rights (defined below) upon the creator 18 | and subsequent owner(s) (each and all, an "owner") of an original work of 19 | authorship and/or a database (each, a "Work"). 20 | 21 | Certain owners wish to permanently relinquish those rights to a Work for 22 | the purpose of contributing to a commons of creative, cultural and 23 | scientific works ("Commons") that the public can reliably and without fear 24 | of later claims of infringement build upon, modify, incorporate in other 25 | works, reuse and redistribute as freely as possible in any form whatsoever 26 | and for any purposes, including without limitation commercial purposes. 
27 | These owners may contribute to the Commons to promote the ideal of a free 28 | culture and the further production of creative, cultural and scientific 29 | works, or to gain reputation or greater distribution for their Work in 30 | part through the use and efforts of others. 31 | 32 | For these and/or other purposes and motivations, and without any 33 | expectation of additional consideration or compensation, the person 34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she 35 | is an owner of Copyright and Related Rights in the Work, voluntarily 36 | elects to apply CC0 to the Work and publicly distribute the Work under its 37 | terms, with knowledge of his or her Copyright and Related Rights in the 38 | Work and the meaning and intended legal effect of CC0 on those rights. 39 | 40 | 1. Copyright and Related Rights. A Work made available under CC0 may be 41 | protected by copyright and related or neighboring rights ("Copyright and 42 | Related Rights"). Copyright and Related Rights include, but are not 43 | limited to, the following: 44 | 45 | i. the right to reproduce, adapt, distribute, perform, display, 46 | communicate, and translate a Work; 47 | ii. moral rights retained by the original author(s) and/or performer(s); 48 | iii. publicity and privacy rights pertaining to a person's image or 49 | likeness depicted in a Work; 50 | iv. rights protecting against unfair competition in regards to a Work, 51 | subject to the limitations in paragraph 4(a), below; 52 | v. rights protecting the extraction, dissemination, use and reuse of data 53 | in a Work; 54 | vi. database rights (such as those arising under Directive 96/9/EC of the 55 | European Parliament and of the Council of 11 March 1996 on the legal 56 | protection of databases, and under any national implementation 57 | thereof, including any amended or successor version of such 58 | directive); and 59 | vii. 
other similar, equivalent or corresponding rights throughout the 60 | world based on applicable law or treaty, and any national 61 | implementations thereof. 62 | 63 | 2. Waiver. To the greatest extent permitted by, but not in contravention 64 | of, applicable law, Affirmer hereby overtly, fully, permanently, 65 | irrevocably and unconditionally waives, abandons, and surrenders all of 66 | Affirmer's Copyright and Related Rights and associated claims and causes 67 | of action, whether now known or unknown (including existing as well as 68 | future claims and causes of action), in the Work (i) in all territories 69 | worldwide, (ii) for the maximum duration provided by applicable law or 70 | treaty (including future time extensions), (iii) in any current or future 71 | medium and for any number of copies, and (iv) for any purpose whatsoever, 72 | including without limitation commercial, advertising or promotional 73 | purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each 74 | member of the public at large and to the detriment of Affirmer's heirs and 75 | successors, fully intending that such Waiver shall not be subject to 76 | revocation, rescission, cancellation, termination, or any other legal or 77 | equitable action to disrupt the quiet enjoyment of the Work by the public 78 | as contemplated by Affirmer's express Statement of Purpose. 79 | 80 | 3. Public License Fallback. Should any part of the Waiver for any reason 81 | be judged legally invalid or ineffective under applicable law, then the 82 | Waiver shall be preserved to the maximum extent permitted taking into 83 | account Affirmer's express Statement of Purpose. 
In addition, to the 84 | extent the Waiver is so judged Affirmer hereby grants to each affected 85 | person a royalty-free, non transferable, non sublicensable, non exclusive, 86 | irrevocable and unconditional license to exercise Affirmer's Copyright and 87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the 88 | maximum duration provided by applicable law or treaty (including future 89 | time extensions), (iii) in any current or future medium and for any number 90 | of copies, and (iv) for any purpose whatsoever, including without 91 | limitation commercial, advertising or promotional purposes (the 92 | "License"). The License shall be deemed effective as of the date CC0 was 93 | applied by Affirmer to the Work. Should any part of the License for any 94 | reason be judged legally invalid or ineffective under applicable law, such 95 | partial invalidity or ineffectiveness shall not invalidate the remainder 96 | of the License, and in such case Affirmer hereby affirms that he or she 97 | will not (i) exercise any of his or her remaining Copyright and Related 98 | Rights in the Work or (ii) assert any associated claims and causes of 99 | action with respect to the Work, in either case contrary to Affirmer's 100 | express Statement of Purpose. 101 | 102 | 4. Limitations and Disclaimers. 103 | 104 | a. No trademark or patent rights held by Affirmer are waived, abandoned, 105 | surrendered, licensed or otherwise affected by this document. 106 | b. Affirmer offers the Work as-is and makes no representations or 107 | warranties of any kind concerning the Work, express, implied, 108 | statutory or otherwise, including without limitation warranties of 109 | title, merchantability, fitness for a particular purpose, non 110 | infringement, or the absence of latent or other defects, accuracy, or 111 | the present or absence of errors, whether or not discoverable, all to 112 | the greatest extent permissible under applicable law. 113 | c. 
Affirmer disclaims responsibility for clearing rights of other persons 114 | that may apply to the Work or any use thereof, including without 115 | limitation any person's Copyright and Related Rights in the Work. 116 | Further, Affirmer disclaims responsibility for obtaining any necessary 117 | consents, permissions or other rights required for any use of the 118 | Work. 119 | d. Affirmer understands and acknowledges that Creative Commons is not a 120 | party to this document and has no duty or obligation with respect to 121 | this CC0 or use of the Work. -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 electrovir 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PDF Text Reader 2 | 3 | Dead simple PDF text reader for Node.js. Uses Mozilla's [`pdfjs-dist`](https://www.npmjs.com/package/pdfjs-dist) package. 4 | 5 | Requires ESM and Node.js v22 or greater. (These are requirements from Mozilla's `pdfjs-dist` package itself.) 6 | 7 | # Install 8 | 9 | ``` 10 | npm install pdf-text-reader 11 | ``` 12 | 13 | # Usage 14 | 15 | - Read all pages into a single string with `readPdfText`: 16 | 17 | 18 | 19 | ```TypeScript 20 | import {readPdfText} from 'pdf-text-reader'; 21 | 22 | async function main() { 23 | const pdfText: string = await readPdfText({url: 'path/to/pdf/file.pdf'}); 24 | console.info(pdfText); 25 | } 26 | 27 | main(); 28 | ``` 29 | 30 | - Read a PDF into individual pages with `readPdfPages`: 31 | 32 | 33 | ```TypeScript 34 | import {readPdfPages} from 'pdf-text-reader'; 35 | 36 | async function main() { 37 | const pages = await readPdfPages({url: 'path/to/pdf/file.pdf'}); 38 | console.info(pages[0]?.lines); 39 | } 40 | 41 | main(); 42 | ``` 43 | 44 | See [the types](https://github.com/electrovir/pdf-text-reader/tree/master/src/read-pdf.ts) for detailed argument and return value types. 45 | 46 | # Details 47 | 48 | This package simply reads the output of `pdfjs.getDocument` and sorts it into lines based on text position in the document. It also inserts spaces for text on the same line that is far apart horizontally and new lines in between lines that are far apart vertically. 49 | 50 | Example: 51 | 52 | The text below in a PDF will be read as having spaces in between them even if the space characters aren't in the PDF. 53 | 54 | ``` 55 | cell 1 cell 2 cell 3 56 | ``` 57 | 58 | The number of spaces to insert is calculated by an extremely naive but very simple calculation of `Math.ceil(distance-between-text/text-height)`. 
59 | 60 | # Low Level Control 61 | 62 | If you need lower level parsing control, you can also use the exported `parsePageItems` function. This only reads one page at a time as seen below. This function is used by `readPdfPages` so the output will be identical for the same pdf page. 63 | 64 | You may need to independently install the [`pdfjs-dist`](https://www.npmjs.com/package/pdfjs-dist) npm package for this to work. 65 | 66 | 67 | 68 | ```TypeScript 69 | import * as pdfjs from 'pdfjs-dist'; 70 | import type {TextItem} from 'pdfjs-dist/types/src/display/api'; 71 | import {parsePageItems} from 'pdf-text-reader'; 72 | 73 | async function main() { 74 | const doc = await pdfjs.getDocument('myDocument.pdf').promise; 75 | const page = await doc.getPage(1); // 1-indexed 76 | const content = await page.getTextContent(); 77 | const items: TextItem[] = content.items.filter((item): item is TextItem => 'str' in item); 78 | const parsedPage = parsePageItems(items); 79 | console.info(parsedPage.lines); 80 | } 81 | 82 | main(); 83 | ``` 84 | -------------------------------------------------------------------------------- /configs/dep-cruiser.config.ts: -------------------------------------------------------------------------------- 1 | import type {IConfiguration} from 'dependency-cruiser'; 2 | import {generateDepCruiserConfig} from 'virmator/dist/compiled-base-configs/base-dep-cruiser.config'; 3 | 4 | const baseConfig = generateDepCruiserConfig({ 5 | fileExceptions: { 6 | // enter file exceptions by rule name here 7 | 'no-orphans': { 8 | from: [ 9 | 'src/index.ts', 10 | ], 11 | }, 12 | }, 13 | omitRules: [ 14 | // enter rule names here to omit 15 | ], 16 | }); 17 | 18 | const depCruiserConfig: IConfiguration = { 19 | ...baseConfig, 20 | }; 21 | 22 | module.exports = depCruiserConfig; 23 | -------------------------------------------------------------------------------- /configs/mocha.config.cjs: -------------------------------------------------------------------------------- 1 | 
const baseOptions = require('virmator/base-configs/base-mocharc.js'); 2 | 3 | /** @type {import('mocha').MochaOptions} */ 4 | const mochaConfig = { 5 | ...baseOptions, 6 | require: ['tsx'], 7 | }; 8 | 9 | module.exports = mochaConfig; 10 | -------------------------------------------------------------------------------- /configs/ncu.config.ts: -------------------------------------------------------------------------------- 1 | import {RunOptions} from 'npm-check-updates'; 2 | import {baseNcuConfig} from 'virmator/dist/compiled-base-configs/base-ncu'; 3 | 4 | export const ncuConfig: RunOptions = { 5 | ...baseNcuConfig, 6 | // exclude these 7 | reject: [ 8 | ...baseNcuConfig.reject, 9 | ], 10 | // include only these 11 | filter: [], 12 | }; 13 | -------------------------------------------------------------------------------- /configs/typedoc.config.ts: -------------------------------------------------------------------------------- 1 | import {join, resolve} from 'path'; 2 | import type {TypeDocOptions} from 'typedoc'; 3 | import {baseTypedocConfig} from 'virmator/dist/compiled-base-configs/base-typedoc'; 4 | 5 | const repoRoot = resolve(__dirname, '..'); 6 | const indexTsFile = join(repoRoot, 'src', 'index.ts'); 7 | 8 | export const typeDocConfig: Partial = { 9 | ...baseTypedocConfig, 10 | out: join(repoRoot, 'dist-docs'), 11 | entryPoints: [ 12 | indexTsFile, 13 | ], 14 | intentionallyNotExported: [], 15 | }; 16 | -------------------------------------------------------------------------------- /cspell.config.cjs: -------------------------------------------------------------------------------- 1 | const {baseConfig} = require('virmator/base-configs/base-cspell.js'); 2 | 3 | module.exports = { 4 | ...baseConfig, 5 | ignorePaths: [ 6 | ...baseConfig.ignorePaths, 7 | ], 8 | words: [ 9 | ...baseConfig.words, 10 | ], 11 | }; 12 | -------------------------------------------------------------------------------- /package.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "name": "pdf-text-reader", 3 | "version": "5.1.0", 4 | "description": "Dead simple pdf text reader", 5 | "keywords": [ 6 | "pdf", 7 | "text", 8 | "converter", 9 | "pdf-text-reader", 10 | "reader" 11 | ], 12 | "homepage": "https://github.com/electrovir/pdf-text-reader", 13 | "bugs": { 14 | "url": "https://github.com/electrovir/pdf-text-reader/issues" 15 | }, 16 | "repository": { 17 | "type": "git", 18 | "url": "https://github.com/electrovir/pdf-text-reader" 19 | }, 20 | "license": "(MIT or CC0 1.0)", 21 | "author": { 22 | "name": "electrovir", 23 | "url": "https://github.com/electrovir" 24 | }, 25 | "type": "module", 26 | "main": "dist/index.js", 27 | "types": "dist/index.d.ts", 28 | "scripts": { 29 | "compile": "rm -rf dist && tsc --pretty", 30 | "docs": "virmator docs", 31 | "format": "prettier --color --cache --cache-strategy content \"./**/*.+(cjs|css|graphql|html|js|json|jsx|less|md|mjs|scss|toml|ts|tsx|yaml|yml)\"", 32 | "publish": "virmator publish \"npm run compile && npm run test:all\"", 33 | "test": "npm run compile && test-as-package mocha --colors --config 'configs/mocha.config.cjs'", 34 | "test:all": "concurrently --colors --kill-others-on-fail -c auto --names types,tests,spelling,format,docs,deps \"npm run test:types\" \"npm run test\" \"npm run test:spelling\" \"npm run test:format\" \"npm run test:docs\" \"npm run test:deps\"", 35 | "test:deps": "virmator deps check", 36 | "test:docs": "virmator docs check", 37 | "test:format": "npm run format -- --check", 38 | "test:spelling": "virmator spellcheck", 39 | "test:types": "tsc --noEmit" 40 | }, 41 | "dependencies": { 42 | "pdfjs-dist": "4.2.67" 43 | }, 44 | "devDependencies": { 45 | "@augment-vir/node-js": "^28.0.0", 46 | "@electrovir/nyc": "^15.1.0-fix0", 47 | "@istanbuljs/nyc-config-typescript": "^1.0.2", 48 | "@types/chai": "^4.3.16", 49 | "@types/mocha": "^10.0.6", 50 | "@types/node": "20.12.10", 51 | "chai": 
"^5.1.0", 52 | "cspell": "^8.8.0", 53 | "dependency-cruiser": "^16.3.1", 54 | "esbuild": "^0.21.0", 55 | "istanbul-smart-text-reporter": "^1.1.4", 56 | "markdown-code-example-inserter": "^1.0.0", 57 | "mocha": "^10.4.0", 58 | "mocha-spec-reporter-with-file-names": "^0.0.3", 59 | "npm-check-updates": "~16.12.3", 60 | "prettier": "^3.2.5", 61 | "prettier-plugin-interpolated-html-tags": "^1.0.5", 62 | "prettier-plugin-jsdoc": "^1.3.0", 63 | "prettier-plugin-multiline-arrays": "^3.0.4", 64 | "prettier-plugin-organize-imports": "^3.2.4", 65 | "prettier-plugin-packagejson": "^2.5.0", 66 | "prettier-plugin-sort-json": "^4.0.0", 67 | "prettier-plugin-toml": "^2.0.1", 68 | "run-time-assertions": "^1.2.0", 69 | "test-as-package": "^1.0.0", 70 | "tsx": "^4.9.3", 71 | "type-fest": "^4.18.2", 72 | "typedoc": "^0.25.13", 73 | "typescript": "^5.4.5", 74 | "virmator": "^11.5.2" 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/esm-support.test.ts: -------------------------------------------------------------------------------- 1 | import {runShellCommand} from '@augment-vir/node-js'; 2 | 3 | describe('test as esm package', () => { 4 | it('runs', async () => { 5 | await runShellCommand( 6 | `node --experimental-default-type=module -e "console.log(await import('pdf-text-reader'))"`, 7 | {rejectOnError: true}, 8 | ); 9 | }); 10 | }); 11 | -------------------------------------------------------------------------------- /src/index.ts: -------------------------------------------------------------------------------- 1 | export * from './read-pdf.js'; 2 | -------------------------------------------------------------------------------- /src/read-pdf.test.ts: -------------------------------------------------------------------------------- 1 | import {assert} from 'chai'; 2 | import {existsSync} from 'node:fs'; 3 | import {join} from 'node:path'; 4 | import type {TypedArray} from 'pdfjs-dist/types/src/display/api.js'; 5 | import 
{assertTypeOf} from 'run-time-assertions'; 6 | import type {ReadonlyDeep} from 'type-fest'; 7 | import {PdfProgressData, ReadPdfTextParams, readPdfPages, readPdfText} from './read-pdf.js'; 8 | import {nodeModulesDir, sampleFilesDir} from './repo-paths.test-helper.js'; 9 | 10 | type PdfTestFile = { 11 | filePath: string; 12 | expectedContent: string; 13 | lineCounts: number[]; 14 | pdfPassword?: string | undefined; 15 | }; 16 | 17 | const testFiles = [ 18 | { 19 | filePath: join(sampleFilesDir, 'dummy.pdf'), 20 | expectedContent: 'Dummy PDF file', 21 | lineCounts: [1], 22 | }, 23 | { 24 | filePath: join(sampleFilesDir, 'dummy-with-password.pdf'), 25 | expectedContent: 'Dummy PDF file', 26 | lineCounts: [1], 27 | pdfPassword: 'test password', 28 | }, 29 | { 30 | filePath: join(sampleFilesDir, 'pdfkit-out.pdf'), 31 | // cspell:disable 32 | expectedContent: `Some text with an embedded font! 33 | PNG and JPEG images: 34 | Here is some vector graphics... 35 | 36 | And here is some wrapped text... 37 | Lorem ipsum dolor sit amet, consectetur adipiscing elit. Etiam in 38 | suscipit purus. Vestibulum ante ipsum primis in faucibus orci luctus et 39 | ultrices posuere cubilia Curae; Vivamus nec hendrerit felis. Morbi 40 | aliquam facilisis risus eu lacinia. Sed eu leo in turpis fringilla hendrerit. 41 | Ut nec accumsan nisl. Suspendisse rhoncus nisl posuere tortor 42 | tempus et dapibus elit porta. Cras leo neque, elementum a rhoncus ut, 43 | vestibulum non nibh. Phasellus pretium justo turpis. Etiam vulputate, 44 | odio vitae tincidunt ultricies, eros odio dapibus nisi, ut tincidunt lacus 45 | arcu eu elit. Aenean velit erat, vehicula eget lacinia ut, dignissim non 46 | tellus. Aliquam nec lacus mi, sed vestibulum nunc. Suspendisse 47 | potenti. Curabitur vitae sem turpis. Vestibulum sed neque eget dolor 48 | dapibus porttitor at sit amet sem. Fusce a turpis lorem. 
Vestibulum ante 49 | ipsum primis in faucibus orci luctus et ultrices posuere cubilia Curae; 50 | Mauris at ante tellus. Vestibulum a metus lectus. Praesent tempor 51 | purus a lacus blandit eget gravida ante hendrerit. Cras et eros metus. 52 | Sed commodo malesuada eros, vitae interdum augue semper quis. 53 | Fusce id magna nunc. Curabitur sollicitudin placerat semper. Cras et 54 | mi neque, a dignissim risus. Nulla venenatis porta lacus, vel rhoncus 55 | lectus tempor vitae. Duis sagittis venenatis rutrum. Curabitur tempor 56 | massa tortor. 57 | Rendering some SVG paths... 58 | Here is a link! 59 | One 60 | Two 61 | Three`, 62 | // cspell:enable 63 | lineCounts: [ 64 | 2, 65 | 23, 66 | 1, 67 | 4, 68 | ], 69 | }, 70 | ] as const satisfies ReadonlyDeep; 71 | 72 | function forEachTestFile( 73 | callback: (testFile: ReadonlyDeep) => T, 74 | ): T extends Promise ? Promise[]> : T[] { 75 | let hasPromise = false; 76 | const mapped = testFiles.map((testFile) => { 77 | const result = callback(testFile); 78 | if (result instanceof Promise) { 79 | hasPromise = true; 80 | } 81 | return result; 82 | }); 83 | if (hasPromise) { 84 | return Promise.all(mapped) as T extends Promise ? Promise[]> : T[]; 85 | } else { 86 | return mapped as T extends Promise ? 
Promise[]> : T[]; 87 | } 88 | } 89 | 90 | describe(readPdfPages.name, () => { 91 | it('outputs expected pages', async () => { 92 | await forEachTestFile(async ({filePath, pdfPassword, lineCounts}) => { 93 | const pages = await readPdfPages({ 94 | url: filePath, 95 | password: pdfPassword, 96 | }); 97 | assert.lengthOf( 98 | pages, 99 | lineCounts.length, 100 | `file does not have expected line count: '${filePath}'`, 101 | ); 102 | assert.deepStrictEqual( 103 | pages.map((page) => page.lines.length), 104 | lineCounts, 105 | `file does not have expected line lengths: '${filePath}'`, 106 | ); 107 | }); 108 | }); 109 | 110 | it('sends back progress data', async () => { 111 | const allProgressData: PdfProgressData[] = []; 112 | await readPdfPages({ 113 | url: testFiles[0].filePath, 114 | progressCallback(progressData) { 115 | allProgressData.push(progressData); 116 | }, 117 | options: { 118 | isEvalSupported: false, 119 | }, 120 | }); 121 | 122 | assert.isAbove(allProgressData.length, 0, 'got no progress data'); 123 | }); 124 | }); 125 | 126 | describe('PDF test files', () => { 127 | it('all exist', () => { 128 | forEachTestFile(({filePath}) => { 129 | assert.isTrue(existsSync(filePath), `test file not found: '${filePath}'`); 130 | }); 131 | }); 132 | }); 133 | 134 | describe(readPdfText.name, () => { 135 | it('outputs expected strings', async () => { 136 | await forEachTestFile(async ({expectedContent, filePath, pdfPassword}) => { 137 | const stringOutput = await readPdfText({ 138 | filePath, 139 | password: pdfPassword, 140 | pathToPdfJsDistNodeModule: nodeModulesDir, 141 | }); 142 | assert.strictEqual( 143 | stringOutput.trim(), 144 | expectedContent, 145 | `file does not have expected content: '${filePath}'`, 146 | ); 147 | }); 148 | }); 149 | }); 150 | 151 | describe('ReadPdfTextParams', () => { 152 | it('matches expected types', () => { 153 | assertTypeOf['data']>().toEqualTypeOf< 154 | string | number[] | ArrayBuffer | TypedArray 155 | >(); 156 | 
assertTypeOf['url']>().toEqualTypeOf(); 157 | assertTypeOf['filePath']>().toEqualTypeOf(); 158 | }); 159 | }); 160 | -------------------------------------------------------------------------------- /src/read-pdf.ts: -------------------------------------------------------------------------------- 1 | import {join} from 'node:path'; 2 | import {getDocument} from 'pdfjs-dist'; 3 | import type { 4 | DocumentInitParameters, 5 | PDFPageProxy, 6 | TextItem, 7 | } from 'pdfjs-dist/types/src/display/api.js'; 8 | import type {RequireExactlyOne} from 'type-fest'; 9 | 10 | export type {DocumentInitParameters} from 'pdfjs-dist/types/src/display/api.js'; 11 | 12 | /** A single page within a PDF file. */ 13 | export type PdfPage = { 14 | lines: string[]; 15 | }; 16 | 17 | /** Progress updates sent by the `pdfjs-dist` dependency. */ 18 | export type PdfProgressData = { 19 | loaded: number; 20 | total: number; 21 | }; 22 | 23 | /** 24 | * Same as the built-in Partial type but also allows each property to be undefined. Compatible with 25 | * PartialAndUndefined from @augment-vir/common. 26 | */ 27 | export type PartialWithUndefined = { 28 | [Prop in keyof T]?: T[Prop] | undefined; 29 | }; 30 | 31 | /** All options for reading pdf text to function. Most are optional. */ 32 | export type ReadPdfTextParams = PartialWithUndefined<{ 33 | /** Password used to open a PDF that's password protected. */ 34 | password: string; 35 | /** This callback will be periodically called while PDF reading is in progress. */ 36 | progressCallback: (progressData: PdfProgressData) => void; 37 | /** 38 | * Set this as an absolute path to the `pdfjs-dist` directory in your `node_modules` directory. 
* This is not required for proper operation, but may help with a warning like the following:
     *
     * Warning: fetchStandardFontData: failed to fetch file "" with
     * "UnknownErrorException: The standard font "baseUrl" parameter must be specified, ensure that
     * the "standardFontDataUrl" API parameter is provided.".
     *
     * For more details see https://github.com/mozilla/pdf.js/issues/4244
     *
     * Example: /home/ubuntu/this-repo/node_modules/pdfjs-dist
     */
    pathToPdfJsDistNodeModule: string;
    /**
     * All options that Mozilla's `pdfjs-dist` package supports. This will override any options
     * that this package passes to `pdfjs-dist`.
     */
    // NOTE(review): the type arguments of `Partial` here, and of each `NonNullable` below, look
    // like they were lost in extraction; restore them from the upstream file.
    options: Partial>;
}> &
    RequireExactlyOne<{
        /** File path to the PDF file to read. */
        filePath: NonNullable;
        /** URL to the PDF. */
        url: NonNullable;
        /** PDF file data that has already been read from a PDF file. */
        data: NonNullable;
    }>;

/**
 * Read a PDF and convert it into lines of text.
 *
 * If a URL is used to fetch the PDF data a standard XMLHttpRequest(XHR) is used, which means it
 * must follow the same origin rules that any XHR does e.g. No cross domain requests without CORS.
 *
 * Pages are read one at a time, in document order. The returned promise rejects if `pdfjs-dist`
 * fails to load the document or any of its pages; nothing is caught here.
 *
 * @returns One entry per page of the PDF, in page order.
 */
export async function readPdfPages({
    data,
    filePath,
    password,
    pathToPdfJsDistNodeModule,
    progressCallback,
    url,
    options,
}: ReadPdfTextParams): Promise<PdfPage[]> {
    // pdf.js appends the font file name directly to this value, so it must end with a separator
    const standardFontDataUrl = pathToPdfJsDistNodeModule
        ? `${join(pathToPdfJsDistNodeModule, 'standard_fonts')}/`
        : undefined;

    const documentLoadingTask = getDocument({
        data,
        url: url || filePath,
        useSystemFonts: true,
        password,
        standardFontDataUrl,
        // spread last so that caller supplied options win over the defaults above
        ...options,
    });
    if (progressCallback) {
        documentLoadingTask.onProgress = progressCallback;
    }

    try {
        const pdfDocument = await documentLoadingTask.promise;

        const pages: PdfPage[] = [];
        // pdf.js page numbers are 1-indexed
        for (let pageNumber = 1; pageNumber <= pdfDocument.numPages; pageNumber++) {
            const page = await pdfDocument.getPage(pageNumber);
            pages.push(await parsePage(page));
        }

        return pages;
    } finally {
        /**
         * This is populated by the pdfjs-dist package. We're deleting it here to prevent memory
         * leaks. It lives in `finally` so that a failed read (bad password, corrupt file, etc.)
         * cleans up as well.
         */
        delete (globalThis as any).pdfjsWorker;
    }
}

/**
 * Reads a PDF into a single string, with every line of every page separated by a newline.
 *
 * @param params The same parameters that `readPdfPages` accepts.
 */
export async function readPdfText(params: ReadPdfTextParams): Promise<string> {
    const pdfPages = await readPdfPages(params);
    return combinePagesIntoSingleString(pdfPages);
}

/**
 * Combine all PDF pages into a single string. Lines are joined with `\n`; no extra separator is
 * inserted between pages.
 */
export function combinePagesIntoSingleString(pages: PdfPage[]): string {
    return pages.flatMap((page) => page.lines).join('\n');
}

/** Parse a single PDF page. Only content items that have a `str` property are parsed. */
export async function parsePage(pdfPage: PDFPageProxy): Promise<PdfPage> {
    const rawContent = await pdfPage.getTextContent();
    return parsePageItems(rawContent.items.filter((item): item is TextItem => 'str' in item));
}

/**
 * Parses individual text items generated by pdf.js. This allows lower level control of what
 * actually gets parsed. For example, a consumer of this function may remove entire sections of the
 * pdf text prior to passing items in here. See parsePage function above for example usage.
 *
 * @param pdfItems An array of TextItem items.
* @returns The parsed page: one string per visual line, ordered from the top of the page down.
 */
export function parsePageItems(pdfItems: TextItem[]): PdfPage {
    // items grouped by the y coordinate (transform[5]) of the line that they sit on
    const lineData = new Map<number, TextItem[]>();

    for (const item of pdfItems) {
        // idk how to intentionally test this
        /* istanbul ignore next */
        if (!item) {
            // skip holes instead of grouping them under a bogus coordinate
            continue;
        }
        const y: number = item.transform[5];
        const existingLine = lineData.get(y);
        if (existingLine) {
            existingLine.push(item);
        } else {
            lineData.set(y, [item]);
        }
    }

    // b - a here because the bottom is y = 0 so we want that to be last
    const sortedYCoords = Array.from(lineData.keys()).sort((a, b) => b - a);

    const yCoords: number[] = [];
    sortedYCoords.forEach((currentY, index) => {
        yCoords.push(currentY);

        const nextY = sortedYCoords[index + 1];
        if (nextY === undefined) {
            // the last line has nothing below it to measure a gap against
            return;
        }

        // the tallest item decides the height of the line
        const currentLineHeight = lineData
            .get(currentY)!
            .reduce((tallest, current) => Math.max(tallest, current.height), -1);

        /**
         * Insert an empty line between any 2 lines whose distance is at least twice the upper
         * line's height. currentY - nextY because currentY will be higher than nextY.
         *
         * A line without a positive height gives no scale to measure the gap with. Dividing by a
         * height of 0 used to place the "empty" line at currentY itself, which replaced the
         * current line's items with nothing, so such lines are left alone.
         */
        if (currentLineHeight > 0 && Math.floor((currentY - nextY) / currentLineHeight) > 1) {
            const blankLineY = currentY - currentLineHeight;
            lineData.set(blankLineY, []);
            yCoords.push(blankLineY);
        }
    });

    const lines = yCoords.map((y) => {
        const lineItems = lineData
            .get(y)!
            // sort by x position (position in line)
            .sort((a, b) => a.transform[4] - b.transform[4])
            .filter((item) => !!item.str);

        return lineItems.reduce((line, item, index) => {
            const lastItem = lineItems[index - 1];
            return line + (lastItem ? getHorizontalGap(lastItem, item) : '') + item.str;
        }, '');
    });

    return {
        lines,
    };
}

/**
 * Builds the spaces to insert between two neighboring items of the same line. Returns an empty
 * string unless both items have a non-zero height and the horizontal distance between them is
 * greater than the height of at least one of them.
 */
// idk how to trigger this
/* istanbul ignore next */
function getHorizontalGap(lastItem: TextItem, item: TextItem): string {
    const xDiff = item.transform[4] - (lastItem.transform[4] + lastItem.width);

    const isFarApart =
        item.height !== 0 &&
        lastItem.height !== 0 &&
        (xDiff > item.height || xDiff > lastItem.height);
    if (!isFarApart) {
        return '';
    }

    const spaceCount = Math.max(
        Math.ceil(xDiff / item.height),
        Math.ceil(xDiff / lastItem.height),
    );
    // one less than spaceCount, which is what joining spaceCount empty strings with ' ' produced;
    // clamped so that odd (negative) measurements yield no spaces instead of throwing
    return ' '.repeat(Math.max(0, spaceCount - 1));
}

--------------------------------------------------------------------------------
/src/readme-examples/lower-level-controls.example.ts:
--------------------------------------------------------------------------------
import * as pdfjs from 'pdfjs-dist';
import type {TextItem} from 'pdfjs-dist/types/src/display/api';
import {parsePageItems} from '..';

async function main() {
    const doc = await pdfjs.getDocument('myDocument.pdf').promise;
    const page = await doc.getPage(1); // 1-indexed
    const content = await page.getTextContent();
    const items: TextItem[] = content.items.filter((item): item is TextItem => 'str' in item);
    const parsedPage = parsePageItems(items);
    console.info(parsedPage.lines);
}

main();

--------------------------------------------------------------------------------
/src/readme-examples/read-pdf-pages.example.ts:
--------------------------------------------------------------------------------
import {readPdfPages} from '..';

async function main() {
    // only the first page is logged; it is undefined when the PDF has no pages
    const [firstPage] = await readPdfPages({url: 'path/to/pdf/file.pdf'});
    console.info(firstPage?.lines);
}

main();

--------------------------------------------------------------------------------
/src/readme-examples/read-pdf-text.example.ts:
--------------------------------------------------------------------------------
import {readPdfText} from '..';

async function main() {
    // the whole document comes back as one string
    const text: string = await readPdfText({url: 'path/to/pdf/file.pdf'});
    console.info(text);
}

main();

--------------------------------------------------------------------------------
/src/repo-paths.test-helper.ts:
--------------------------------------------------------------------------------
import {dirname, join, resolve} from 'node:path';
import {fileURLToPath} from 'node:url';

/** Absolute path of this module, derived from its `import.meta.url`. */
const thisFilePath = fileURLToPath(import.meta.url);
/** Directory that contains this module. */
const thisDir = dirname(thisFilePath);

/**
 * Root directory of the repo, which is the parent of the directory that holds this file. The
 * package name is not part of the lookup because the repo may be cloned into a folder with any
 * name. This works both from "src" (the TS sources, which CAN be run directly without transpiling)
 * and from "dist" (the transpiled JS output directory).
 */
const repoRootDir = dirname(thisDir);

/** Directory that holds the sample PDF files used by the tests. */
export const sampleFilesDir = join(repoRootDir, 'test-files');
/** Path to the `pdfjs-dist` package inside this repo's `node_modules` directory. */
export const nodeModulesDir = resolve(repoRootDir, 'node_modules', 'pdfjs-dist');

--------------------------------------------------------------------------------
/test-files/dummy-with-password.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/electrovir/pdf-text-reader/12568d5cbb52c25450700f520961953d784413ed/test-files/dummy-with-password.pdf

--------------------------------------------------------------------------------
/test-files/dummy.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/electrovir/pdf-text-reader/12568d5cbb52c25450700f520961953d784413ed/test-files/dummy.pdf

--------------------------------------------------------------------------------
/test-files/pdfkit-out.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/electrovir/pdf-text-reader/12568d5cbb52c25450700f520961953d784413ed/test-files/pdfkit-out.pdf

--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
{
    "compilerOptions": {
        "module": "ES2022",
        "outDir": "./dist",
        "rootDir": "./src"
    },
    "exclude": [
        "./configs",
        "./coverage",
        "./dist",
        "./node_modules",
        "./test-files"
    ],
    "extends": "virmator/base-configs/tsconfig.base.json"
}

--------------------------------------------------------------------------------