├── .gitignore ├── README.md ├── app ├── index.htm └── js │ └── app.js └── server ├── .gitignore ├── bank-statement.pdf ├── index.js ├── package-lock.json └── package.json /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Extract PDF Content 2 | 3 | This repository contains examples demonstrating how to use [PDF.js](https://mozilla.github.io/pdf.js/) in conjunction with [Lodash](https://lodash.com/) to extract data from a PDF. 4 | 5 | There are two example applications: a web application to ease data exploration and a CLI application to ease data entry from a Node.js application. 6 | 7 | 8 | _NOTE: These are prototypes for further exploration and will need to be customised to a specific use case._ 9 | 10 | 11 | ## Usage 12 | 13 | Choose between: 14 | 15 | - a CLI implementation, ideally set up and run on a server (requires [Node.js](https://nodejs.org) to be installed) 16 | 17 | ```bash 18 | cd server/ 19 | npm install 20 | node index.js 21 | ``` 22 | - a web application implementation (open the `app` folder, then open `index.htm` in a web browser) 23 | 24 | 25 | ## License 26 | [Apache-2.0](https://choosealicense.com/licenses/apache-2.0/) -------------------------------------------------------------------------------- /app/index.htm: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | Import PDF Data 9 | 10 | 11 | 12 | 13 |
14 |
15 |
16 |
17 |
18 |
19 |

Transactions

20 |

21 | Upload "bank-statement.pdf" provided in the repo 22 |

23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
32 |
33 | 40 |
41 | 47 |

or drag and drop

48 |
49 |

50 | PDF up to 10MB 51 |

52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 | 65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 | 73 | 74 | 75 | 79 | 83 | 87 | 91 | 95 | 99 | 100 | 101 | 102 |
77 | id 78 | 81 | Date 82 | 85 | Amount 86 | 89 | Description 90 | 93 | Reconciled 94 | 97 | Transaction Type 98 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 | 112 |
113 |
114 | 115 | 118 | 120 | 121 | 122 | 123 | -------------------------------------------------------------------------------- /app/js/app.js: -------------------------------------------------------------------------------- 1 | ((pdfjs, lodash) => { 2 | const reader = new FileReader() 3 | const fileUpload = document.querySelector("#file-upload") 4 | const transactionsTable = document.querySelector("#transactions-table") 5 | 6 | fileUpload.addEventListener("change", (event) => { 7 | let file = event.target.files[0] 8 | 9 | // probably check if we've got the right file 10 | 11 | reader.addEventListener('load', (event) => { 12 | getContent(event.target.result) 13 | .then((content) => { 14 | return content.items 15 | .filter((item) => item.str.trim().length) 16 | .map((item) => item.str) 17 | .filter((item, i) => i > 7) 18 | }) 19 | .then(items => { 20 | const trs = lodash.chunk(items, 6) 21 | .reduce((trs, [id, date, amount, description, reconciled, transaction_type]) => { 22 | let tr = `${id}${date}${amount}${description}${reconciled}${transaction_type}` 23 | trs.push(tr) 24 | return trs 25 | }, []) 26 | 27 | transactionsTable.querySelector('tbody').innerHTML = trs.join('') 28 | }) 29 | .catch(error => { console.log(error) }) // handle errors 30 | }) 31 | 32 | reader.readAsDataURL(file) 33 | }) 34 | 35 | async function getContent(file) { 36 | const doc = await pdfjs.getDocument(file).promise 37 | let page = await doc.getPage(1) // if doc has many pages use doc.numPages to iterate and pass index to doc.getPage 38 | return await page.getTextContent() 39 | } 40 | 41 | })(pdfjsLib, _) -------------------------------------------------------------------------------- /server/.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ -------------------------------------------------------------------------------- /server/bank-statement.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ammonvictor/extract-pdf-content/a3e76f82b8d763aa0ef8eb6c2e28969e111f1efc/server/bank-statement.pdf -------------------------------------------------------------------------------- /server/index.js: -------------------------------------------------------------------------------- 1 | const pdfjs = require('pdfjs-dist/es5/build/pdf') 2 | const _ = require("lodash") 3 | 4 | async function getContent(src) { 5 | const doc = await pdfjs.getDocument(src).promise 6 | const page = await doc.getPage(1) // if doc has many pages use doc.numPages to iterate and pass index to doc.getPage 7 | return await page.getTextContent() 8 | } 9 | 10 | async function getItems(src) { 11 | // Perform pre-processing 12 | const content = await getContent(src) 13 | // console.log(content.items) 14 | 15 | return content.items 16 | .filter((item) => item.str.trim().length) 17 | .map((item) => item.str) 18 | .filter((item, i) => i > 7) 19 | } 20 | 21 | 22 | function processItems(items) { 23 | const records = _.chunk(items, 6) 24 | .reduce((records, [id, date, amount, description, reconciled, transaction_type]) => { 25 | records.push({ id, date, amount, description, reconciled, transaction_type }) 26 | return records 27 | }, []) 28 | 29 | // save json or save csv or write to db 30 | console.log(records) 31 | } 32 | 33 | function handleErrors(error) { 34 | // handle errors 35 | console.log(error) 36 | } 37 | 38 | getItems("./bank-statement.pdf") 39 | .then(processItems) 40 | .catch(handleErrors) -------------------------------------------------------------------------------- /server/package-lock.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "extract-pdf-content", 3 | "version": "1.0.0", 4 | "lockfileVersion": 2, 5 | "requires": true, 6 | "packages": { 7 | "": { 8 | "name": "extract-pdf-content", 9 | "version": "1.0.0", 10 | "license": "Apache-2.0", 11 | "dependencies": { 12 | "lodash": "^4.17.20", 13 | 
"pdfjs-dist": "^2.6.347" 14 | } 15 | }, 16 | "node_modules/lodash": { 17 | "version": "4.17.20", 18 | "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.20.tgz", 19 | "integrity": "sha512-PlhdFcillOINfeV7Ni6oF1TAEayyZBoZ8bcshTHqOYJYlrqzRK5hagpagky5o4HfCzzd1TRkXPMFq6cKk9rGmA==" 20 | }, 21 | "node_modules/pdfjs-dist": { 22 | "version": "2.6.347", 23 | "resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-2.6.347.tgz", 24 | "integrity": "sha512-QC+h7hG2su9v/nU1wEI3SnpPIrqJODL7GTDFvR74ANKGq1AFJW16PH8VWnhpiTi9YcLSFV9xLeWSgq+ckHLdVQ==" 25 | } 26 | }, 27 | "dependencies": { 28 | "lodash": { 29 | "version": "4.17.20", 30 | "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.20.tgz", 31 | "integrity": "sha512-PlhdFcillOINfeV7Ni6oF1TAEayyZBoZ8bcshTHqOYJYlrqzRK5hagpagky5o4HfCzzd1TRkXPMFq6cKk9rGmA==" 32 | }, 33 | "pdfjs-dist": { 34 | "version": "2.6.347", 35 | "resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-2.6.347.tgz", 36 | "integrity": "sha512-QC+h7hG2su9v/nU1wEI3SnpPIrqJODL7GTDFvR74ANKGq1AFJW16PH8VWnhpiTi9YcLSFV9xLeWSgq+ckHLdVQ==" 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /server/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "extract-pdf-content", 3 | "version": "1.0.0", 4 | "description": "Extract pdf content using pdfjs", 5 | "main": "index.js", 6 | "scripts": { 7 | "test": "echo \"Error: no test specified\" && exit 1" 8 | }, 9 | "keywords": [ 10 | "extract", 11 | "pdf", 12 | "pdfjs" 13 | ], 14 | "author": "Ammon Victor", 15 | "license": "Apache-2.0", 16 | "dependencies": { 17 | "lodash": "^4.17.20", 18 | "pdfjs-dist": "^2.6.347" 19 | } 20 | } 21 | --------------------------------------------------------------------------------