├── .eslintignore ├── .eslintrc.js ├── .gitignore ├── .prettierignore ├── .prettierrc ├── .vscode └── settings.json ├── README.md ├── next-env.d.ts ├── next.config.js ├── package-lock.json ├── package.json ├── public ├── favicon.ico ├── favicon.png ├── favicon.svg └── pdf.worker.js ├── src ├── components │ ├── FileUpload │ │ ├── FileUpload.tsx │ │ └── index.ts │ ├── Footer │ │ ├── Footer.tsx │ │ └── index.ts │ └── SEO │ │ ├── SEO.tsx │ │ └── index.ts ├── lib │ ├── OCRImages.ts │ ├── download.ts │ └── pdfToImages.ts └── pages │ ├── _app.tsx │ ├── _document.tsx │ └── index.tsx ├── tsconfig.json └── yarn.lock /.eslintignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | .next 3 | public/pdf.worker.js -------------------------------------------------------------------------------- /.eslintrc.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs'); 2 | const path = require('path'); 3 | 4 | const prettierOptions = JSON.parse(fs.readFileSync(path.resolve(__dirname, '.prettierrc'), 'utf8')); 5 | 6 | module.exports = { 7 | extends: ['next/core-web-vitals', 'prettier'], 8 | parser: '@typescript-eslint/parser', 9 | plugins: ['@typescript-eslint', 'react', 'prettier', 'react-hooks', 'sonarjs', 'import'], 10 | rules: { 11 | 'prettier/prettier': ['error', prettierOptions], 12 | }, 13 | overrides: [ 14 | { 15 | files: ['**/*.ts?(x)'], 16 | rules: { 17 | 'prettier/prettier': ['warn', prettierOptions], 18 | 'no-console': ['warn', { allow: ['info'] }], 19 | 'no-unused-vars': 'off', 20 | '@typescript-eslint/no-unused-vars': [ 21 | 'warn', 22 | { vars: 'all', args: 'all', argsIgnorePattern: '^_', varsIgnorePattern: '^_' }, 23 | ], 24 | '@typescript-eslint/no-empty-function': 'warn', 25 | }, 26 | }, 27 | ], 28 | }; 29 | -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | # See https://help.github.com/articles/ignoring-files/ for more about ignoring files. 2 | 3 | # dependencies 4 | /node_modules 5 | /.pnp 6 | .pnp.js 7 | 8 | # testing 9 | /coverage 10 | 11 | # next.js 12 | /.next/ 13 | /out/ 14 | 15 | # production 16 | /build 17 | 18 | # misc 19 | .DS_Store 20 | *.pem 21 | 22 | # debug 23 | npm-debug.log* 24 | yarn-debug.log* 25 | yarn-error.log* 26 | 27 | # local env files 28 | .env.local 29 | .env.development.local 30 | .env.test.local 31 | .env.production.local 32 | 33 | # vercel 34 | .vercel 35 | -------------------------------------------------------------------------------- /.prettierignore: -------------------------------------------------------------------------------- 1 | build/ 2 | node_modules/ 3 | public/pdf.worker.js 4 | yarn.lock 5 | -------------------------------------------------------------------------------- /.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "arrowParens": "avoid", 3 | "printWidth": 120, 4 | "singleQuote": true, 5 | "endOfLine": "auto" 6 | } 7 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "editor.codeActionsOnSave": { 3 | "source.fixAll.eslint": true 4 | }, 5 | "editor.formatOnSave": true, 6 | "editor.defaultFormatter": "esbenp.prettier-vscode" 7 | } 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PDF Parser 2 | 3 | A tool to parse PDF files and extract text. 
4 | 5 | ## Getting Started 6 | 7 | First, run the development server: 8 | 9 | ```bash 10 | npm run dev 11 | # or 12 | yarn dev 13 | ``` 14 | 15 | Open [http://localhost:3000](http://localhost:3000) with your browser to view the app. 16 | -------------------------------------------------------------------------------- /next-env.d.ts: -------------------------------------------------------------------------------- 1 | /// 2 | /// 3 | /// 4 | 5 | // NOTE: This file should not be edited 6 | // see https://nextjs.org/docs/basic-features/typescript for more information. 7 | -------------------------------------------------------------------------------- /next.config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | reactStrictMode: true, 3 | } 4 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "loan-receipts-processor", 3 | "private": true, 4 | "scripts": { 5 | "dev": "next dev", 6 | "build": "next build", 7 | "start": "next start", 8 | "lint": "next lint" 9 | }, 10 | "dependencies": { 11 | "@chakra-ui/react": "^1.7.2", 12 | "@emotion/react": "^11", 13 | "@emotion/styled": "^11", 14 | "@react-pdf-viewer/core": "^2.11.0", 15 | "framer-motion": "^4", 16 | "next": "12.0.4", 17 | "pdfjs-dist": "^2.10.377", 18 | "react": "17.0.2", 19 | "react-dom": "17.0.2", 20 | "react-dropzone": "^11.4.2", 21 | "react-icons": "^4.3.1", 22 | "tesseract.js": "^2.1.5" 23 | }, 24 | "devDependencies": { 25 | "@types/pdfjs-dist": "^2.10.378", 26 | "@types/react": "^17.0.37", 27 | "@typescript-eslint/eslint-plugin": "^5.5.0", 28 | "eslint": "7.32.0", 29 | "eslint-config-next": "12.0.4", 30 | "eslint-config-prettier": "8.3.0", 31 | "eslint-plugin-prettier": "3.4.0", 32 | "eslint-plugin-react": "^7.27.1", 33 | "eslint-plugin-sonarjs": "^0.6.0", 34 | "prettier": "^2.5.0", 35 | "typescript": 
"^4.5.2" 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /public/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fredericoo/pdf-parser/5e6851063a892ebd68338f8cfd2ff77d0802e0e8/public/favicon.ico -------------------------------------------------------------------------------- /public/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/fredericoo/pdf-parser/5e6851063a892ebd68338f8cfd2ff77d0802e0e8/public/favicon.png -------------------------------------------------------------------------------- /public/favicon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /src/components/FileUpload/FileUpload.tsx: -------------------------------------------------------------------------------- 1 | import { useCallback } from 'react'; 2 | import { useDropzone } from 'react-dropzone'; 3 | import { Center, useColorModeValue, Icon } from '@chakra-ui/react'; 4 | import { AiFillFileAdd } from 'react-icons/ai'; 5 | 6 | type FileUploadProps = { 7 | onFileAccepted: (file: File) => void; 8 | }; 9 | 10 | const FileUpload: React.FC = ({ onFileAccepted }) => { 11 | const onDrop = useCallback( 12 | acceptedFiles => { 13 | onFileAccepted(acceptedFiles[0]); 14 | }, 15 | [onFileAccepted] 16 | ); 17 | 18 | const { getRootProps, getInputProps, isDragActive } = useDropzone({ 19 | onDrop, 20 | accept: '.pdf', 21 | maxFiles: 1, 22 | multiple: false, 23 | }); 24 | 25 | const dropText = isDragActive ? 'Drop it like it’s hot…' : 'Drag‘n’drop your file here, or click to select files'; 26 | 27 | const activeBg = useColorModeValue('gray.100', 'gray.600'); 28 | const borderColor = useColorModeValue(isDragActive ? 
'teal.300' : 'gray.300', isDragActive ? 'teal.500' : 'gray.500'); 29 | 30 | return ( 31 |
44 | 45 | 46 |

{dropText}

47 |
48 | ); 49 | }; 50 | 51 | export default FileUpload; 52 | -------------------------------------------------------------------------------- /src/components/FileUpload/index.ts: -------------------------------------------------------------------------------- 1 | export { default } from './FileUpload'; 2 | -------------------------------------------------------------------------------- /src/components/Footer/Footer.tsx: -------------------------------------------------------------------------------- 1 | import { Text, Icon } from '@chakra-ui/react'; 2 | import { AiFillGithub } from 'react-icons/ai'; 3 | 4 | const Footer: React.VFC = () => { 5 | return ( 6 | 16 | 17 | 18 | ); 19 | }; 20 | 21 | export default Footer; 22 | -------------------------------------------------------------------------------- /src/components/Footer/index.ts: -------------------------------------------------------------------------------- 1 | export { default } from './Footer'; 2 | -------------------------------------------------------------------------------- /src/components/SEO/SEO.tsx: -------------------------------------------------------------------------------- 1 | import Head from 'next/head'; 2 | import { useRouter } from 'next/router'; 3 | 4 | interface SEOProps { 5 | title?: string; 6 | desc?: string; 7 | type?: string; 8 | imageUrl?: string; 9 | } 10 | 11 | const SEO: React.VFC = ({ title, desc, type, imageUrl }) => { 12 | const { asPath } = useRouter(); 13 | const tabInfo = { 14 | title, 15 | desc: desc || title, 16 | }; 17 | 18 | const seoImage = imageUrl || ''; 19 | 20 | return ( 21 | 22 | {tabInfo.title} 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | ); 36 | }; 37 | 38 | export default SEO; 39 | -------------------------------------------------------------------------------- /src/components/SEO/index.ts: -------------------------------------------------------------------------------- 1 | export { default } from './SEO'; 2 | 
// -----------------------------------------------------------------------------
// /src/lib/OCRImages.ts
// -----------------------------------------------------------------------------
import Tesseract from 'tesseract.js';

type OCRImagesOptions = {
  /** Tesseract language code(s). Defaults to 'isl', the value that used to be hard-coded. */
  lang?: string;
  /** Called after each image finishes; `current` is the number of images completed so far. */
  onProgress?: (progress: { current: number; total: number }) => void;
  /** Called once, before any recognition starts. */
  onStart?: (progress: { current: 0; total: number }) => void;
};

/**
 * Runs Tesseract OCR over every image URL and returns the recognised text
 * keyed by 1-based position in `urls` (`{ 1: 'text of first image', ... }`).
 *
 * All recognitions are started at once and awaited together, so `onProgress`
 * reports completion order, not page order.
 *
 * @param urls    Image sources accepted by `Tesseract.recognize` (e.g. data URLs).
 * @param options Optional callbacks and language; may be omitted entirely.
 * @returns       Recognised text per image, keyed by `index + 1`.
 */
const OCRImages = async (urls: string[], options: OCRImagesOptions = {}): Promise<Record<number, string>> => {
  // Destructure once so a missing `options` (or missing callbacks) can never be dereferenced.
  const { lang = 'isl', onStart, onProgress } = options;
  const total = urls.length;
  let current = 0;

  onStart?.({ current: 0, total });

  const texts = await Promise.all(
    urls.map(async url => {
      const {
        data: { text },
      } = await Tesseract.recognize(url, lang);
      current += 1;
      // Hand out a fresh snapshot: listeners that store the object must not see later mutations.
      onProgress?.({ current, total });
      return text;
    })
  );

  // Plain assignment instead of spread-in-reduce, which re-copied the accumulator for every page.
  const textByPage: Record<number, string> = {};
  texts.forEach((text, index) => {
    textByPage[index + 1] = text;
  });
  return textByPage;
};

export default OCRImages;

// -----------------------------------------------------------------------------
// /src/lib/download.ts
// -----------------------------------------------------------------------------
/**
 * Triggers a browser download of `data` as a file called `filename`.
 *
 * The content is wrapped in a Blob prefixed with a UTF-8 byte-order mark and
 * offered through a temporary, programmatically clicked `<a download>` element.
 * Does nothing when no `window` exists (e.g. during server-side rendering).
 *
 * @param data     Text content of the file.
 * @param filename Name suggested to the browser for the saved file.
 */
const download = (data: string, filename: string): void => {
  // `!window` throws a ReferenceError when `window` is undeclared; `typeof` is the safe probe.
  if (typeof window === 'undefined') return;

  const blob = new Blob(['\ufeff', data]);
  const url = URL.createObjectURL(blob);

  const link = document.createElement('a');
  link.download = filename;
  link.target = '_blank';
  link.href = url;
  document.body.appendChild(link);
  link.click();
  document.body.removeChild(link);

  // Release the blob once the click has been dispatched; deferred so the download can start first.
  setTimeout(() => URL.revokeObjectURL(url), 0);
};

export default download;

// -----------------------------------------------------------------------------
// /src/lib/pdfToImages.ts
// -----------------------------------------------------------------------------
import * as pdfjsLib from 'pdfjs-dist';

pdfjsLib.GlobalWorkerOptions.workerSrc = '/pdf.worker.js'; 4 | 5 | type PDFToImagesOptions = { 6 | scale?: number; 7 | onProgress?: (progress: { current: number; total: number }) => void; 8 | onStart?: (progress: { current: 0; total: number }) => void; 9 | }; 10 | 11 | const pdfToImages = async (pdf: string, options?: PDFToImagesOptions): Promise => { 12 | const output = []; 13 | const doc = await pdfjsLib.getDocument(pdf).promise; 14 | 15 | options.onStart && options.onStart({ current: 0, total: doc.numPages }); 16 | 17 | for (let i = 1; i < doc.numPages + 1; i++) { 18 | const canvas = document.createElement('canvas'); 19 | 20 | const page = await doc.getPage(i); 21 | const context = canvas.getContext('2d'); 22 | const viewport = page.getViewport({ scale: options.scale || 1 }); 23 | canvas.height = viewport.height; 24 | canvas.width = viewport.width; 25 | 26 | await page.render({ 27 | canvasContext: context, 28 | viewport, 29 | }).promise; 30 | 31 | options.onProgress && options.onProgress({ current: i, total: doc.numPages }); 32 | 33 | output.push(canvas.toDataURL('image/png')); 34 | } 35 | 36 | return output; 37 | }; 38 | 39 | export default pdfToImages; 40 | -------------------------------------------------------------------------------- /src/pages/_app.tsx: -------------------------------------------------------------------------------- 1 | import { AppComponent } from 'next/dist/shared/lib/router/router'; 2 | import { ChakraProvider } from '@chakra-ui/react'; 3 | 4 | const MyApp: AppComponent = ({ Component, pageProps }) => { 5 | return ( 6 | 7 | 8 | 9 | ); 10 | }; 11 | 12 | export default MyApp; 13 | -------------------------------------------------------------------------------- /src/pages/_document.tsx: -------------------------------------------------------------------------------- 1 | import Document, { Html, Head, Main, NextScript } from 'next/document'; 2 | import { ReactElement } from 'react'; 3 | 4 | class MyDocument extends Document { 5 | render(): 
ReactElement { 6 | return ( 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | {/* WEB APP */} 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 |
25 | 26 | 27 | 28 | ); 29 | } 30 | } 31 | 32 | export default MyDocument; 33 | -------------------------------------------------------------------------------- /src/pages/index.tsx: -------------------------------------------------------------------------------- 1 | import { useState } from 'react'; 2 | import { useToast } from '@chakra-ui/toast'; 3 | import download from '../lib/download'; 4 | import { 5 | Progress, 6 | Text, 7 | Container, 8 | Button, 9 | HStack, 10 | styled, 11 | Heading, 12 | OrderedList, 13 | ListItem, 14 | Box, 15 | Icon, 16 | } from '@chakra-ui/react'; 17 | import SEO from '../components/SEO'; 18 | import FileUpload from '../components/FileUpload'; 19 | import Footer from '../components/Footer'; 20 | import pdfToImages from '../lib/pdfToImages'; 21 | import OCRImages from '../lib/OCRImages'; 22 | import { AiFillInfoCircle } from 'react-icons/ai'; 23 | 24 | const PageContent = styled(Container, { 25 | baseStyle: { 26 | maxW: 'container.lg', 27 | display: 'flex', 28 | flexDirection: 'column', 29 | justifyContent: 'center', 30 | alignItems: 'center', 31 | py: 8, 32 | minH: '100vh', 33 | }, 34 | }); 35 | 36 | const pageInfo = { 37 | title: 'Scanned PDF to JSON', 38 | desc: "Upload a PDF file and we'll convert it to a JSON file with the text on each page.", 39 | }; 40 | 41 | const Home = () => { 42 | const toast = useToast(); 43 | 44 | const [progress, setProgress] = useState<{ current: number; total: number; type?: 'Processing' | 'Recognising' }>({ 45 | current: 0, 46 | total: 0, 47 | }); 48 | const [results, setResults] = useState>({}); 49 | 50 | const handleFileSelect = async (file: File) => { 51 | if (file?.type !== 'application/pdf') { 52 | toast({ status: 'error', title: 'Invalid file type' }); 53 | return; 54 | } 55 | const pdfUrl = URL.createObjectURL(file); 56 | 57 | const imageUrls = await pdfToImages(pdfUrl, { 58 | scale: 2, 59 | onStart: progress => setProgress({ ...progress, total: progress.total * 2, type: 'Processing' }), 60 | 
onProgress: progress => setProgress({ ...progress, total: progress.total * 2, type: 'Processing' }), 61 | }); 62 | const recognisedImages = await OCRImages(imageUrls, { 63 | onStart: progress => 64 | setProgress({ current: progress.total + progress.current, total: progress.total * 2, type: 'Recognising' }), 65 | onProgress: progress => 66 | setProgress({ current: progress.total + progress.current, total: progress.total * 2, type: 'Recognising' }), 67 | }); 68 | 69 | setResults(recognisedImages); 70 | }; 71 | 72 | if (!progress.total) { 73 | return ( 74 | 75 | 76 | 77 | Recognise text in your PDF files 78 | 79 | 80 | 81 | How it works 82 | 83 | Each page of the uploaded PDF file will be converted to a PNG image 84 | Each PNG image will go through an OCR that will read the text on the screen 85 | A JSON file download will be available for you with the contents of each page. 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | No data about your files is collected or stored. All the processing and text recognition happens on your 95 | device. 96 | 97 | 98 |