├── test ├── basic.test.js ├── simple.test.js ├── newline-optimization.test.js ├── simple-newline.test.js └── scenarios.test.js ├── examples ├── key.pem ├── cert.pem ├── .dockerignore ├── Dockerfile ├── README.md ├── usage-examples.js └── demo.html ├── .npmignore ├── .gitignore ├── LICENSE ├── config.example.json ├── package.json ├── webpack.config.js ├── src ├── types │ └── index.d.ts ├── utils │ ├── SystemPrompts.js │ ├── OutputParser.js │ └── ConfigValidator.js ├── engines │ └── WebLLMEngine.js ├── index.js └── converters │ └── Extract2MDConverter.js ├── MIGRATION.md ├── scripts ├── validate-deployment.js └── postinstall.js ├── DEPLOYMENT.md └── README.md /test/basic.test.js: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/key.pem: -------------------------------------------------------------------------------- 1 | -----BEGIN PRIVATE KEY----- 2 | 3 | -----END PRIVATE KEY----- 4 | -------------------------------------------------------------------------------- /examples/cert.pem: -------------------------------------------------------------------------------- 1 | -----BEGIN CERTIFICATE----- 2 | 3 | -----END CERTIFICATE----- 4 | -------------------------------------------------------------------------------- /examples/.dockerignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | .git 3 | test 4 | src 5 | scripts 6 | *.log 7 | .DS_Store 8 | *.md 9 | package*.json -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | # Node.js dependencies 2 | node_modules/ 3 | 4 | # Test files 5 | test/ 6 | 7 | # Example files 8 | examples/ 9 | 10 | # Build configuration 11 | webpack.config.js 12 | 13 | # Development files 14 | package-lock.json 15 | config.example.json 16 | 17 | # Documentation 18 | DEPLOYMENT.md 19 | LICENSE 20 | MIGRATION.md 21 | 22 | # Source map files 23 | dist/assets/*.map -------------------------------------------------------------------------------- /examples/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM node:20-alpine 2 | 3 | WORKDIR /app 4 | 5 | # Install extract2md and serve 6 | RUN npm install extract2md serve 7 | 8 | # Copy demo.html into the image 9 | COPY demo.html ./demo.html 10 | 11 | # Copy the dist directory from the installed package to /app/dist 12 | RUN mkdir -p dist && \ 13 | cp -r node_modules/extract2md/dist/* dist/ 14 | 15 | # Expose port for the static server 16 | EXPOSE 8080 17 | 18 | # Serve the current directory (demo.html and dist/) 19 | CMD ["npx", "serve", "--listen", "8080", "."] -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Dependencies 2 | node_modules/ 3 | 4 | # Build output 5 | dist/ 6 | build/ 7 | 8 | # Logs 9 | npm-debug.log* 10 | yarn-debug.log* 11 | yarn-error.log* 12 | *.log 13 | 14 | # OS generated files 15 | .DS_Store 16 | .DS_Store? 17 | ._* 18 | .Spotlight-V100 19 | .Trashes 20 | ehthumbs.db 21 | Thumbs.db 22 | 23 | # Editor directories and files 24 | .idea/ 25 | .vscode/ 26 | *.suo 27 | *.ntvs* 28 | *.njsproj 29 | *.sln 30 | *.sw? 
31 | 32 | # Optional: Environment variables file 33 | .env 34 | .env.local 35 | .env.development.local 36 | .env.test.local 37 | .env.production.local 38 | .pem 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright <2025> 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /config.example.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "Extract2MD Configuration Schema - Complete example with all available options", 3 | 4 | "ocr": { 5 | "language": "eng", 6 | "oem": 1, 7 | "psm": 6, 8 | "workerPath": "./tesseract-worker.min.js", 9 | "corePath": "./tesseract-core.wasm.js", 10 | "langPath": "./lang-data/", 11 | "options": { 12 | "logger": null, 13 | "errorHandler": null 14 | } 15 | }, 16 | 17 | "webllm": { 18 | "modelId": "Llama-3.2-1B-Instruct-q4f16_1-MLC", 19 | "temperature": 0.7, 20 | "maxTokens": 4000, 21 | "streamingEnabled": false, 22 | "customModel": { 23 | "model": "https://huggingface.co/mlc-ai/custom-model/resolve/main/", 24 | "model_id": "Custom-Model-ID", 25 | "model_lib": "https://example.com/path/to/custom-model.wasm", 26 | "required_features": ["shader-f16"], 27 | "overrides": { 28 | "conv_template": "llama" 29 | } 30 | } 31 | }, 32 | 33 | "systemPrompts": { 34 | "singleExtraction": "Focus on technical accuracy and preserve all code examples exactly as they appear.", 35 | "combinedExtraction": "Pay special attention to diagrams and tables that might be better captured in the OCR version." 36 | }, 37 | 38 | "processing": { 39 | "splitPascalCase": false, 40 | "pdfRenderScale": 2.5, 41 | "postProcessRules": [ 42 | { 43 | "find": "\\bAPI\\b", 44 | "replace": "API" 45 | }, 46 | { 47 | "find": "\\bJSON\\b", 48 | "replace": "JSON" 49 | } 50 | ] 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "extract2md", 3 | "version": "2.0.0", 4 | "description": "Client-side PDF to Markdown conversion with OCR and optional LLM rewrite. 
Core dependencies bundled for offline use.", 5 | "main": "dist/assets/extract2md.umd.js", 6 | "module": "dist/assets/extract2md.esm.js", 7 | "type": "module", 8 | "types": "dist/assets/extract2md.d.ts", 9 | "scripts": { 10 | "build": "webpack", 11 | "prepublishOnly": "npm run build", 12 | "postinstall": "node scripts/postinstall.js", 13 | "test": "node test/simple.test.js" 14 | }, 15 | "keywords": [ 16 | "pdf", 17 | "markdown", 18 | "ocr", 19 | "tesseract.js", 20 | "pdf.js", 21 | "webllm", 22 | "llm", 23 | "client-side", 24 | "text-extraction", 25 | "pdf to markdown", 26 | "offline" 27 | ], 28 | "author": "Hashan Wickramasinghe ", 29 | "license": "MIT", 30 | "dependencies": { 31 | "@mlc-ai/web-llm": "^0.2.79", 32 | "pdfjs-dist": "^5.2.133", 33 | "tesseract.js": "^5.0.5" 34 | }, 35 | "devDependencies": { 36 | "@babel/core": "^7.24.0", 37 | "@babel/preset-env": "^7.24.0", 38 | "babel-loader": "^9.1.3", 39 | "copy-webpack-plugin": "^12.0.2", 40 | "webpack": "^5.90.3", 41 | "webpack-cli": "^5.1.4" 42 | }, 43 | "files": [ 44 | "dist", 45 | "scripts", 46 | "README.md", 47 | "MIGRATION.md" 48 | ], 49 | "homepage": "https://github.com/hashangit/Extract2MD#readme", 50 | "repository": { 51 | "type": "git", 52 | "url": "git+https://github.com/hashangit/Extract2MD.git" 53 | }, 54 | "bugs": { 55 | "url": "https://github.com/hashangit/Extract2MD/issues" 56 | } 57 | } -------------------------------------------------------------------------------- /test/simple.test.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Simple test to validate package structure 3 | */ 4 | 5 | import fs from 'fs'; 6 | import path from 'path'; 7 | import { fileURLToPath } from 'url'; 8 | 9 | const __filename = fileURLToPath(import.meta.url); 10 | const __dirname = path.dirname(__filename); 11 | 12 | console.log('Testing Extract2MD package structure...'); 13 | 14 | // Test 1: Check TypeScript definitions exist 15 | const typesPath = path.resolve(__dirname, '../src/types/index.d.ts'); 16 | if (fs.existsSync(typesPath)) { 17 | console.log('✅ TypeScript definitions found'); 18 | } else { 19 | console.log('❌ TypeScript definitions missing'); 20 | process.exit(1); 21 | } 22 | 23 | // Test 2: Check configuration example exists 24 | const configPath = path.resolve(__dirname, '../config.example.json'); 25 | if (fs.existsSync(configPath)) { 26 | console.log('✅ Configuration example found'); 27 | } else { 28 | console.log('❌ Configuration example missing'); 29 | process.exit(1); 30 | } 31 | 32 | // Test 3: Check core files exist 33 | const coreFiles = [ 34 | '../src/converters/Extract2MDConverter.js', 35 | '../src/engines/WebLLMEngine.js', 36 | '../src/utils/ConfigValidator.js', 37 | '../src/utils/OutputParser.js', 38 | '../src/utils/SystemPrompts.js' 39 | ]; 40 | 41 | for (const file of coreFiles) { 42 | const filePath = path.resolve(__dirname, file); 43 | if (fs.existsSync(filePath)) { 44 | console.log(`✅ ${file.split('/').pop()} found`); 45 | } else { 46 | console.log(`❌ ${file.split('/').pop()} missing`); 47 | process.exit(1); 48 | } 49 | } 50 | 51 | console.log('\n🎉 All basic structure tests passed!'); 52 | process.exit(0); 53 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Extract2MD Demo Docker Instructions 2 | 3 | This guide explains how to build and run a Docker container to serve the `demo.html` file for Extract2MD, including all required 
assets. 4 | 5 | ## Prerequisites 6 | 7 | - [Docker](https://www.docker.com/get-started) installed on your system 8 | - Internet connection (to pull the Node.js image and npm packages) 9 | 10 | ## Steps 11 | 12 | ### 1. Build the Docker Image 13 | 14 | From the project root (where the `examples/` folder is located), run: 15 | 16 | ```sh 17 | docker build -t extract2md-demo ./examples 18 | ``` 19 | 20 | This will: 21 | - Use the provided `Dockerfile` in the `examples/` folder 22 | - Install `extract2md` and `serve` via npm 23 | - Copy `demo.html` and all required assets into the image 24 | 25 | ### 2. Run the Docker Container 26 | 27 | Run the following command to start the server (mapping container port 8080 to your local port 8081): 28 | 29 | ```sh 30 | docker run -p 8081:8080 extract2md-demo 31 | ``` 32 | 33 | - The server will be accessible at [http://localhost:8081/demo.html](http://localhost:8081/demo.html) 34 | 35 | ### 3. Using the Demo 36 | 37 | - Open your browser and go to [http://localhost:8081/demo.html](http://localhost:8081/demo.html) 38 | - Upload a PDF and select a scenario to test the Extract2MD conversion features 39 | 40 | ### 4. Troubleshooting 41 | 42 | - If you get a 404 error, make sure you are visiting `/demo.html` (not `/demo` or `/`). 43 | - If you see errors about missing assets, ensure the Docker build completed successfully and that the `dist` directory is present in the container (it should be automatically copied from the npm package). 44 | - For WebLLM scenarios, ensure your browser supports WebGPU. 45 | 46 | ### 5. Stopping the Container 47 | 48 | Press `Ctrl+C` in the terminal where the container is running, or run: 49 | 50 | ```sh 51 | docker ps # Find the container ID 52 | # Then: 53 | docker stop 54 | ``` 55 | 56 | --- 57 | 58 | **For development or advanced usage, you can modify `demo.html` and rebuild the image to see your changes.** -------------------------------------------------------------------------------- /webpack.config.js: -------------------------------------------------------------------------------- 1 | import path from 'path'; 2 | import CopyWebpackPlugin from 'copy-webpack-plugin'; 3 | import { fileURLToPath } from 'url'; 4 | 5 | const __filename = fileURLToPath(import.meta.url); 6 | const __dirname = path.dirname(__filename); 7 | 8 | const commonConfig = { 9 | mode: 'production', // or 'development' 10 | entry: './src/index.js', 11 | module: { 12 | rules: [ 13 | { 14 | test: /\.js$/, 15 | exclude: /node_modules/, 16 | use: { 17 | loader: 'babel-loader', 18 | options: { 19 | presets: ['@babel/preset-env'] 20 | } 21 | } 22 | } 23 | ] 24 | }, 25 | resolve: { 26 | extensions: ['.js'] 27 | }, 28 | devtool: 'source-map', 29 | // externals: { // Keep externals commented unless specifically needed 30 | // 'pdfjs-dist/build/pdf.js': 'pdfjsLib', 31 | // 'tesseract.js': 'Tesseract', 32 | // '@mlc-ai/web-llm': 'webLLM' 33 | // } 34 | }; 35 | 36 | const umdConfig = { 37 | ...commonConfig, 38 | output: { 39 | path: path.resolve(__dirname, 'dist/assets'), 40 | filename: 'extract2md.umd.js', 41 | library: { 42 | name: 'Extract2MD', 43 | type: 'umd', 44 | }, 45 | globalObject: 'this', 46 | }, 47 | plugins: [ 48 | new CopyWebpackPlugin({ 49 | patterns: [ 50 | { 51 | from: path.resolve(__dirname, 'node_modules/pdfjs-dist/build/pdf.worker.min.mjs'), 52 | to: path.resolve(__dirname, 'dist/pdf.worker.min.mjs') 53 | }, 54 | { 55 | from: path.resolve(__dirname, 'node_modules/tesseract.js/dist/worker.min.js'), 56 | to: path.resolve(__dirname, 
'dist/assets/tesseract-worker.min.js') 57 | }, 58 | { 59 | from: path.resolve(__dirname, 'node_modules/tesseract.js-core/tesseract-core.wasm.js'), 60 | to: path.resolve(__dirname, 'dist/assets/tesseract-core.wasm.js') 61 | }, 62 | // Copy the main type definition file 63 | { 64 | from: path.resolve(__dirname, 'src/types/index.d.ts'), 65 | to: path.resolve(__dirname, 'dist/assets/extract2md.d.ts') 66 | } 67 | ] 68 | }) 69 | ], 70 | }; 71 | 72 | const esmConfig = { 73 | ...commonConfig, 74 | output: { 75 | path: path.resolve(__dirname, 'dist/assets'), 76 | filename: 'extract2md.esm.js', 77 | library: { 78 | type: 'module', 79 | }, 80 | }, 81 | experiments: { 82 | outputModule: true, 83 | }, 84 | // ESM build typically doesn't need to run CopyWebpackPlugin again if UMD build handles it. 85 | // If you run builds separately or want to ensure assets are copied for both, include it. 86 | // For simplicity, assuming UMD build's CopyWebpackPlugin handles all asset copying. 87 | // If you have a build script that runs webpack once with an array of configs, 88 | // the plugins from one of them (e.g., UMD) will handle the copying. 89 | }; 90 | 91 | export default [umdConfig, esmConfig]; 92 | -------------------------------------------------------------------------------- /test/newline-optimization.test.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Test script to verify newline optimization improvements 3 | */ 4 | 5 | // Import the converter classes 6 | import { Extract2MDConverter } from '../src/converters/Extract2MDConverter.js'; 7 | import { LegacyExtract2MDConverter } from '../src/index.js'; 8 | 9 | // Test data with excessive newlines 10 | const testText = ` 11 | Title Here 12 | 13 | 14 | Some text with multiple spaces. 15 | 16 | 17 | 18 | Another paragraph with lots of newlines. 19 | 20 | 21 | 22 | 23 | And more text. 
24 | 25 | 26 | `; 27 | 28 | // Mock progress callback 29 | const mockProgressCallback = () => {}; 30 | 31 | console.log('Testing newline optimization improvements...\n'); 32 | 33 | // Test new converter 34 | console.log('Testing new Extract2MDConverter...'); 35 | try { 36 | const newConverter = new Extract2MDConverter({ progressCallback: mockProgressCallback }); 37 | const newResult = newConverter._convertToMarkdown(testText); 38 | 39 | console.log('✅ New converter executed successfully'); 40 | console.log('Result length:', newResult.length); 41 | console.log('Number of consecutive newlines (should be minimal):'); 42 | 43 | const tripleNewlines = (newResult.match(/\n{3,}/g) || []).length; 44 | const doubleNewlines = (newResult.match(/\n{2}/g) || []).length; 45 | 46 | console.log(` - Triple+ newlines: ${tripleNewlines} (should be 0)`); 47 | console.log(` - Double newlines: ${doubleNewlines}`); 48 | 49 | if (tripleNewlines === 0) { 50 | console.log('✅ Newline optimization working correctly for new converter'); 51 | } else { 52 | console.log('❌ Newline optimization needs improvement for new converter'); 53 | } 54 | } catch (error) { 55 | console.log('❌ New converter failed:', error.message); 56 | } 57 | 58 | console.log('\n' + '='.repeat(50) + '\n'); 59 | 60 | // Test legacy converter 61 | console.log('Testing legacy LegacyExtract2MDConverter...'); 62 | try { 63 | const legacyConverter = new LegacyExtract2MDConverter({ progressCallback: mockProgressCallback }); 64 | const legacyResult = legacyConverter._convertToMarkdownLogic(testText); 65 | 66 | console.log('✅ Legacy converter executed successfully'); 67 | console.log('Result length:', legacyResult.length); 68 | console.log('Number of consecutive newlines (should be minimal):'); 69 | 70 | const tripleNewlines = (legacyResult.match(/\n{3,}/g) || []).length; 71 | const doubleNewlines = (legacyResult.match(/\n{2}/g) || []).length; 72 | 73 | console.log(` - Triple+ newlines: ${tripleNewlines} (should be 0)`); 74 | console.log(` - Double newlines: ${doubleNewlines}`); 75 | 76 | if (tripleNewlines === 0) { 77 | console.log('✅ Newline optimization working correctly for legacy converter'); 78 | } else { 79 | console.log('❌ Newline optimization needs improvement for legacy converter'); 80 | } 81 | } catch (error) { 82 | console.log('❌ Legacy converter failed:', error.message); 83 | } 84 | 85 | console.log('\n' + '='.repeat(50) + '\n'); 86 | 87 | // Test post-processing optimization 88 | console.log('Testing post-processing optimization...'); 89 | try { 90 | const newConverter = new Extract2MDConverter({ progressCallback: mockProgressCallback }); 91 | const testPostProcessText = 'Text with filigature and unicode\u2018quotes\u2019 and bullets\u2022'; 92 | 93 | const processedText = newConverter._postProcessText(testPostProcessText); 94 | console.log('Original:', testPostProcessText); 95 | console.log('Processed:', processedText); 96 | 97 | if (processedText.includes('filigature') && processedText.includes("'quotes'") && processedText.includes('-')) { 98 | console.log('✅ Post-processing optimization working correctly'); 99 | } else { 100 | console.log('❌ Post-processing optimization needs review'); 101 | } 102 | } catch (error) { 103 | console.log('❌ Post-processing test failed:', error.message); 104 | } 105 | 106 | console.log('\n🎉 Newline optimization test completed!'); 107 | -------------------------------------------------------------------------------- /test/simple-newline.test.js: 
-------------------------------------------------------------------------------- 1 | /** 2 | * Simple test to verify newline optimization improvements 3 | * This test focuses on the core markdown conversion logic 4 | */ 5 | 6 | // Test data with excessive newlines 7 | const testText = ` 8 | Title Here 9 | 10 | 11 | Some text with multiple spaces. 12 | 13 | 14 | 15 | Another paragraph with lots of newlines. 16 | 17 | 18 | 19 | 20 | And more text. 21 | 22 | 23 | `; 24 | 25 | // Mock helper functions similar to what's in the converters 26 | function addSeparatorLine(outputLines) { 27 | if (outputLines.length > 0 && outputLines[outputLines.length - 1] !== '') { 28 | outputLines.push(''); 29 | } 30 | } 31 | 32 | function normalizeMarkdownNewlines(lines) { 33 | const normalizedLines = []; 34 | let consecutiveEmptyLines = 0; 35 | 36 | for (const line of lines) { 37 | if (line.trim() === '') { 38 | consecutiveEmptyLines++; 39 | // Allow maximum of 1 consecutive empty line 40 | if (consecutiveEmptyLines <= 1) { 41 | normalizedLines.push(''); 42 | } 43 | } else { 44 | consecutiveEmptyLines = 0; 45 | normalizedLines.push(line.trimEnd()); 46 | } 47 | } 48 | 49 | // Join and do final cleanup 50 | let finalMarkdown = normalizedLines.join('\n'); 51 | // Remove any remaining triple+ newlines and trim 52 | finalMarkdown = finalMarkdown.replace(/\n{3,}/g, '\n\n').trim(); 53 | return finalMarkdown; 54 | } 55 | 56 | // Test the newline optimization 57 | function testNewlineOptimization() { 58 | console.log('Testing newline optimization improvements...\n'); 59 | 60 | // Simulate the basic markdown conversion with newline optimization 61 | let markdownOutputLines = []; 62 | const inputLines = testText.split(/\n/); 63 | 64 | let currentParagraphCollector = []; 65 | 66 | const flushCurrentParagraph = () => { 67 | if (currentParagraphCollector.length > 0) { 68 | markdownOutputLines.push(currentParagraphCollector.join(' ').trim()); 69 | currentParagraphCollector = []; 70 | addSeparatorLine(markdownOutputLines); 71 | } 72 | }; 73 | 74 | for (let i = 0; i < inputLines.length; i++) { 75 | const trimmedLine = inputLines[i].trim(); 76 | 77 | if (trimmedLine === '') { 78 | flushCurrentParagraph(); 79 | continue; 80 | } 81 | 82 | // Simple header detection 83 | if (trimmedLine === 'Title Here') { 84 | flushCurrentParagraph(); 85 | markdownOutputLines.push(`# ${trimmedLine}`); 86 | addSeparatorLine(markdownOutputLines); 87 | continue; 88 | } 89 | 90 | // Regular text 91 | if (trimmedLine) { 92 | currentParagraphCollector.push(trimmedLine); 93 | } 94 | } 95 | 96 | flushCurrentParagraph(); 97 | 98 | // Apply the optimization 99 | const optimizedResult = normalizeMarkdownNewlines(markdownOutputLines); 100 | 101 | console.log('Original text length:', testText.length); 102 | console.log('Optimized result length:', optimizedResult.length); 103 | console.log('\nOptimized result:'); 104 | console.log('---'); 105 | console.log(optimizedResult); 106 | console.log('---'); 107 | 108 | // Count newline patterns 109 | const tripleNewlines = (optimizedResult.match(/\n{3,}/g) || []).length; 110 | const doubleNewlines = (optimizedResult.match(/\n{2}/g) || []).length; 111 | 112 | console.log('\nNewline analysis:'); 113 | console.log(` - Triple+ newlines: ${tripleNewlines} (should be 0)`); 114 | console.log(` - Double newlines: ${doubleNewlines}`); 115 | 116 | if (tripleNewlines === 0) { 117 | console.log('✅ Newline optimization working correctly'); 118 | } else { 119 | console.log('❌ Newline optimization needs improvement'); 120 | } 121 | 
122 | // Test post-processing simulation 123 | console.log('\n' + '='.repeat(50)); 124 | console.log('Testing post-processing optimization...'); 125 | 126 | const testPostProcessText = 'Text with filigature and unicode\u2018quotes\u2019 and bullets\u2022'; 127 | 128 | // Simulate the optimized post-processing 129 | const rules = [ 130 | { find: /[\u2018\u2019]/g, replace: "'" }, 131 | { find: /[\u2022\u2023\u25E6\u2043\u2219\u25CF\u25CB\u2981\u2619\u2765]/g, replace: '-' } 132 | ]; 133 | 134 | let processedText = testPostProcessText; 135 | for (const rule of rules) { 136 | processedText = processedText.replace(rule.find, rule.replace); 137 | } 138 | 139 | console.log('Original:', testPostProcessText); 140 | console.log('Processed:', processedText); 141 | 142 | if (processedText.includes("'quotes'") && processedText.includes('-')) { 143 | console.log('✅ Post-processing optimization working correctly'); 144 | } else { 145 | console.log('❌ Post-processing optimization needs review'); 146 | } 147 | 148 | return tripleNewlines === 0; 149 | } 150 | 151 | // Run the test 152 | const success = testNewlineOptimization(); 153 | console.log('\n' + '='.repeat(50)); 154 | console.log('🎉 Newline optimization test completed!'); 155 | console.log(success ? '✅ All tests passed' : '❌ Some tests failed'); 156 | 157 | process.exit(success ? 0 : 1); 158 | -------------------------------------------------------------------------------- /src/types/index.d.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * TypeScript definitions for Extract2MD 3 | */ 4 | 5 | // Core configuration interfaces 6 | export interface OCRConfig { 7 | language?: string; 8 | oem?: number; 9 | psm?: number; 10 | workerPath?: string; 11 | corePath?: string; 12 | langPath?: string; 13 | options?: any; 14 | } 15 | 16 | export interface WebLLMConfig { 17 | modelId?: string; 18 | temperature?: number; 19 | maxTokens?: number; 20 | streamingEnabled?: boolean; 21 | customModel?: CustomModelConfig; 22 | options?: any; 23 | } 24 | 25 | export interface PostProcessRule { 26 | find: RegExp | string; 27 | replace: string; 28 | } 29 | 30 | export interface ProgressReport { 31 | stage: string; 32 | message: string; 33 | currentPage?: number; 34 | totalPages?: number; 35 | progress?: number; 36 | usage?: any; 37 | error?: any; 38 | } 39 | 40 | export interface TesseractConfig { 41 | workerPath?: string; 42 | corePath?: string; 43 | langPath?: string; 44 | language?: string; 45 | options?: any; 46 | } 47 | 48 | export interface CustomModelConfig { 49 | model: string; 50 | model_id: string; 51 | model_lib: string; 52 | required_features?: string[]; 53 | overrides?: any; 54 | } 55 | 56 | export interface LLMConfig { 57 | model?: string; 58 | customModel?: CustomModelConfig; 59 | options?: { 60 | temperature?: number; 61 | maxTokens?: number; 62 | [key: string]: any; 63 | }; 64 | } 65 | 66 | export interface SystemPromptsConfig { 67 | singleExtraction?: string; 68 | combinedExtraction?: string; 69 | } 70 | 71 | export interface ProcessingConfig { 72 | splitPascalCase?: boolean; 73 | pdfRenderScale?: number; 74 | postProcessRules?: PostProcessRule[]; 75 | } 76 | 77 | export interface Extract2MDConfig { 78 | pdfJsWorkerSrc?: string; 79 | tesseract?: TesseractConfig; 80 | llm?: LLMConfig; 81 | systemPrompts?: SystemPromptsConfig; 82 | processing?: ProcessingConfig; 83 | progressCallback?: (report: ProgressReport) => void; 84 | } 85 | 86 | export interface WebLLMEngineConfig { 87 | progressCallback?: (report: 
ProgressReport) => void;
88 |     defaultModel?: string;
89 |     customModelConfig?: CustomModelConfig;
90 | }
91 |
92 | export interface GenerationOptions {
93 |     temperature?: number;
94 |     maxTokens?: number;
95 |     [key: string]: any;
96 | }
97 |
98 | export interface ModelInfo {
99 |     isInitialized: boolean;
100 |     currentModelId: string | null;
101 |     isReady: boolean;
102 | }
103 |
104 | export interface ValidationResult {
105 |     isValid: boolean;
106 |     issues: string[];
107 | }
108 |
109 | export class WebLLMEngine {
110 |     constructor(config?: WebLLMEngineConfig);
111 |
112 |     initialize(modelId?: string | null, modelConfig?: any): Promise<void>;
113 |     generate(prompt: string, options?: GenerationOptions): Promise<string>;
114 |     generateStream(
115 |         prompt: string,
116 |         options?: GenerationOptions,
117 |         onChunk?: (chunk: string, fullResponse: string) => void
118 |     ): Promise<string>;
119 |     isReady(): boolean;
120 |     getModelInfo(): ModelInfo;
121 |     cleanup(): Promise<void>;
122 | }
123 |
124 | export class OutputParser {
125 |     constructor();
126 |
127 |     parse(rawOutput: string): string;
128 |     removeThinkingBlocks(text: string): string;
129 |     applyCleanupPatterns(text: string): string;
130 |     ensureMarkdownStructure(text: string): string;
131 |     extractMarkdownContent(text: string): string;
132 |     validateMarkdown(text: string): ValidationResult;
133 |     applyCustomRules(text: string, customRules?: PostProcessRule[]): string;
134 | }
135 |
136 | export class SystemPrompts {
137 |     static getSingleExtractionPrompt(customization?: string): string;
138 |     static getCombinedExtractionPrompt(customization?: string): string;
139 |     static getSingleExtractionUserPrompt(extractedText: string): string;
140 |     static getCombinedExtractionUserPrompt(quickExtraction: string, ocrExtraction: string): string;
141 |     static buildSystemPrompt(scenarioType: 'single' | 'combined', customization?: string): string;
142 |     static buildUserPrompt(scenarioType: 'single' | 'combined', ...extractionResults: string[]): string;
143 |     static getThinkingEnabledPrompt(basePrompt: string): string;
144 | }
145 |
146 | export class ConfigValidator {
147 |     static getDefaultConfig(): Extract2MDConfig;
148 |     static validate(config?: any): Extract2MDConfig;
149 |     static validateTesseractConfig(tesseractConfig: any): void;
150 |     static validateLLMConfig(llmConfig: any): void;
151 |     static validateCustomModel(customModel: any): void;
152 |     static validateLLMOptions(options: any): void;
153 |     static validateProcessingConfig(processingConfig: any): void;
154 |     static validateSystemPrompts(systemPrompts: any): void;
155 |     static deepMerge(target: any, source: any): any;
156 |     static isObject(value: any): boolean;
157 |     static fromJSON(jsonString: string): Extract2MDConfig;
158 |     static getSchema(): any;
159 | }
160 |
161 | export class Extract2MDConverter {
162 |     constructor(config?: Extract2MDConfig);
163 |
164 |     // Scenario-specific static methods
165 |     static quickConvertOnly(pdfFile: File, options?: Extract2MDConfig): Promise<string>;
166 |     static highAccuracyConvertOnly(pdfFile: File, options?: Extract2MDConfig): Promise<string>;
167 |     static quickConvertWithLLM(pdfFile: File, options?: Extract2MDConfig): Promise<string>;
168 |     static highAccuracyConvertWithLLM(pdfFile: File, options?: Extract2MDConfig): Promise<string>;
169 |     static combinedConvertWithLLM(pdfFile: File, options?: Extract2MDConfig): Promise<string>;
170 | }
171 |
172 | // Legacy support - keeping the old interface available
173 | export interface Extract2MDOptions extends Extract2MDConfig {}
174 |
175 | export interface ConvertOptions {
176 | 
postProcessRules?: PostProcessRule[]; 177 | } 178 | 179 | export interface HighAccuracyConvertOptions extends ConvertOptions { 180 | tesseractLanguage?: string; 181 | tesseractOptions?: any; 182 | pdfRenderScale?: number; 183 | } 184 | 185 | export interface LLMRewriteOptions { 186 | llmModel?: string; 187 | llmPromptTemplate?: (text: string) => string; 188 | chatOpts?: any; 189 | } 190 | 191 | // Legacy class for backwards compatibility 192 | export class LegacyExtract2MDConverter { 193 | constructor(options?: Extract2MDOptions); 194 | 195 | quickConvert(pdfFile: File, options?: ConvertOptions): Promise; 196 | highAccuracyConvert(pdfFile: File, options?: HighAccuracyConvertOptions): Promise; 197 | llmRewrite(textToRewrite: string, options?: LLMRewriteOptions): Promise; 198 | unloadLLM(): Promise; 199 | } 200 | 201 | // Default export 202 | export default Extract2MDConverter; 203 | -------------------------------------------------------------------------------- /src/utils/SystemPrompts.js: -------------------------------------------------------------------------------- 1 | /** 2 | * SystemPrompts.js 3 | * System prompts for different LLM rewrite scenarios 4 | */ 5 | 6 | export class SystemPrompts { 7 | /** 8 | * Base system prompt for single extraction method scenarios (3 & 4) 9 | */ 10 | static getSingleExtractionPrompt(customization = '') { 11 | const basePrompt = `You are an expert text editor specializing in converting extracted PDF content into clean, well-formatted Markdown. Your task is to: 12 | 13 | 1. **Preserve Original Content**: Maintain all original information, context, and meaning 14 | 2. **Improve Clarity**: Enhance readability and flow while keeping the professional tone 15 | 3. **Fix Errors**: Correct grammatical errors, spelling mistakes in common words (preserve proper nouns, names, places, brands) 16 | 4. **Structure Enhancement**: Organize content with appropriate Markdown formatting (headers, lists, emphasis, code blocks, etc.) 17 | 5. **Remove Artifacts**: Clean up PDF extraction artifacts like weird spacing, broken words, or formatting issues 18 | 19 | **Important Guidelines:** 20 | - Do not change technical terms, names, places, or brand names 21 | - Maintain the original document structure and hierarchy 22 | - Use proper Markdown syntax for formatting 23 | - Do not add information that wasn't in the original text 24 | - Output ONLY the improved Markdown content 25 | 26 | The text you receive was extracted from a PDF and may contain formatting issues or extraction artifacts.`; 27 | 28 | return customization ? `${basePrompt}\n\n**Additional Instructions:**\n${customization}` : basePrompt; 29 | } 30 | 31 | /** 32 | * Specialized system prompt for combined extraction scenarios (scenario 5) 33 | */ 34 | static getCombinedExtractionPrompt(customization = '') { 35 | const basePrompt = `You are an expert text editor specializing in creating a single, comprehensive Markdown document by intelligently combining content from two different PDF extraction sources. Your goal is to produce the most complete, accurate, and well-formatted Markdown output possible. 36 | 37 | You will receive content extracted using two methods. Your task is to: 38 | 39 | 1. **Synthesize Information**: Intelligently merge the content from both extraction sources. Prioritize completeness and accuracy, ensuring no critical information, context, or meaning is lost from either source. 40 | 2. 
**Preserve Entities and Relationships**: Pay special attention to accurately retaining all names, places, dates, objects, technical terms, brand names, and the relationships between them. 41 | 3. **Enhance Clarity and Structure**: Improve readability and flow. Organize the combined content with appropriate Markdown formatting (headers, lists, emphasis, code blocks, etc.) to create a unified and coherent document. 42 | 4. **Correct Errors and Artifacts**: Fix grammatical errors and spelling mistakes in common words. Preserve proper nouns and specialized terms. Clean up PDF extraction artifacts (e.g., weird spacing, broken words, formatting issues) from both sources. 43 | 5. **Avoid Over-Summarization**: The primary goal is comprehensive extraction and combination, not summarization. Retain all details unless they are clear duplicates. 44 | 45 | **Important Guidelines:** 46 | - Do not change technical terms, names, places, or brand names unless correcting an obvious extraction error. 47 | - Create a single, unified Markdown document. 48 | - Use proper Markdown syntax for all formatting. 49 | - Do not add any information that was not present in the original texts. 50 | - Output ONLY the combined and improved Markdown content. Do not include any explanations, categorizations, or section titles like "Combined Results" or similar. 51 | 52 | The two sets of extracted text will be provided. Your task is to process them and return a single block of clean Markdown.`; 53 | 54 | return customization ? `${basePrompt}\n\n**Additional Instructions:**\n${customization}` : basePrompt; 55 | } 56 | 57 | /** 58 | * Get user prompt for single extraction scenarios 59 | */ 60 | static getSingleExtractionUserPrompt(extractedText) { 61 | return `Please improve and format the following extracted PDF content into clean Markdown: 62 | 63 | **Extracted Content:** 64 | ${extractedText} 65 | 66 | **Improved Markdown:**`; 67 | } 68 | 69 | /** 70 | * Get user prompt for combined extraction scenarios 71 | */ 72 | static getCombinedExtractionUserPrompt(quickExtraction, ocrExtraction) { 73 | return `Please combine, improve, and format the following two sets of extracted PDF content into a single, clean Markdown document: 74 | 75 | **Source 1 Extracted Content:** 76 | ${quickExtraction} 77 | 78 | **Source 2 Extracted Content:** 79 | ${ocrExtraction} 80 | 81 | **Combined and Improved Markdown:**`; 82 | } 83 | 84 | /** 85 | * Build complete system prompt with customizations 86 | */ 87 | static buildSystemPrompt(scenarioType, customization = '') { 88 | switch (scenarioType) { 89 | case 'single': 90 | return this.getSingleExtractionPrompt(customization); 91 | case 'combined': 92 | return this.getCombinedExtractionPrompt(customization); 93 | default: 94 | throw new Error(`Unknown scenario type: ${scenarioType}`); 95 | } 96 | } 97 | 98 | /** 99 | * Build complete user prompt based on scenario 100 | */ 101 | static buildUserPrompt(scenarioType, ...extractionResults) { 102 | switch (scenarioType) { 103 | case 'single': 104 | if (extractionResults.length !== 1) { 105 | throw new Error('Single extraction scenario requires exactly one extraction result'); 106 | } 107 | return this.getSingleExtractionUserPrompt(extractionResults[0]); 108 | case 'combined': 109 | if (extractionResults.length !== 2) { 110 | throw new Error('Combined extraction scenario requires exactly two extraction results'); 111 | } 112 | return this.getCombinedExtractionUserPrompt(extractionResults[0], extractionResults[1]); 113 | default: 114 | throw new 
Error(`Unknown scenario type: ${scenarioType}`);
115 |         }
116 |     }
117 |
118 |     /**
119 |      * Get thinking-enabled prompt variations for models that support it
120 |      */
121 |     static getThinkingEnabledPrompt(basePrompt) {
122 |         return `${basePrompt}
123 |
124 | Take time to think through your approach before providing the final output. Consider the extraction quality, potential issues, and the best way to structure the content.`;
125 |     }
126 | }
127 |
128 | export default SystemPrompts;
129 |
--------------------------------------------------------------------------------
/src/utils/OutputParser.js:
--------------------------------------------------------------------------------
1 | /**
2 |  * OutputParser.js
3 |  * Utility for parsing and cleaning LLM output
4 |  * Removes thinking tags and ensures proper markdown formatting
5 |  */
6 |
7 | export class OutputParser {
8 |     constructor() {
9 |         // Regex pattern to match <think>...</think> blocks with optional line breaks
10 |         this.thinkRegex = /<think>.*?<\/think>\n?\n?/gs;
11 |
12 |         // Patterns for cleaning up common LLM output issues
13 |         this.cleanupPatterns = [
14 |             // Remove excessive whitespace
15 |             { find: /\n{3,}/g, replace: '\n\n' },
16 |             // Fix spacing around headers
17 |             { find: /^(#{1,6})\s*(.+)$/gm, replace: '$1 $2' },
18 |             // Ensure proper list formatting
19 |             { find: /^(\s*[-*+])\s+/gm, replace: '$1 ' },
20 |             // Clean up numbered lists
21 |             { find: /^(\s*\d+\.)\s+/gm, replace: '$1 ' },
22 |             // Remove trailing spaces
23 |             { find: /[ \t]+$/gm, replace: '' },
24 |             // Normalize line endings
25 |             { find: /\r\n/g, replace: '\n' },
26 |             // Clean up code block formatting
27 |             { find: /```\s*\n\s*\n/g, replace: '```\n' },
28 |             { find: /\n\s*\n\s*```/g, replace: '\n```' }
29 |         ];
30 |     }
31 |
32 |     /**
33 |      * Parse LLM output by removing thinking blocks and cleaning formatting
34 |      * @param {string} rawOutput - Raw output from LLM
35 |      * @returns {string} Cleaned and formatted output
36 |      */
37 |     parse(rawOutput) {
38 |         if (!rawOutput || typeof rawOutput !== 'string') {
39 |             return '';
40 |         }
41 |
42 |         let cleanedOutput = rawOutput;
43 |
44 |         // Step 1: Remove <think>...</think> blocks
45 |         cleanedOutput = this.removeThinkingBlocks(cleanedOutput);
46 |
47 |         // Step 2: Apply general cleanup patterns
48 |         cleanedOutput = this.applyCleanupPatterns(cleanedOutput);
49 |
50 |         // Step 3: Ensure proper markdown structure
51 |         cleanedOutput = this.ensureMarkdownStructure(cleanedOutput);
52 |
53 |         return cleanedOutput.trim();
54 |     }
55 |
56 |     /**
57 |      * Remove <think>...</think> blocks from the output
58 |      * @param {string} text - Input text
59 |      * @returns {string} Text with thinking blocks removed
60 |      */
61 |     removeThinkingBlocks(text) {
62 |         // Remove all <think>...</think> blocks including optional line breaks after
63 |         let cleaned = text.replace(this.thinkRegex, '');
64 |
65 |         // Clean up any remaining empty lines at the start
66 |         cleaned = cleaned.replace(/^\s*\n+/, '');
67 |
68 |         return cleaned;
69 |     }
70 |
71 |     /**
72 |      * Apply general cleanup patterns to improve formatting
73 |      * @param {string} text - Input text
74 |      * @returns {string} Cleaned text
75 |      */
76 |     applyCleanupPatterns(text) {
77 |         let cleaned = text;
78 |
79 |         for (const pattern of this.cleanupPatterns) {
80 |             cleaned = cleaned.replace(pattern.find, pattern.replace);
81 |         }
82 |
83 |         return cleaned;
84 |     }
85 |
86 |     /**
87 |      * Ensure proper markdown structure and formatting
88 |      * @param {string} text - Input text
89 |      * @returns {string} Properly structured markdown
90 |      */
91 |     ensureMarkdownStructure(text) {
92 |         let structured = text;
93 |
94 |         // Ensure headers have proper spacing
95 |         structured = structured.replace(/^(#{1,6}\s.+)$/gm, (match, header) => {
96 |             return `\n${header}\n`;
97 |         });
98 |
99 |         // Ensure code blocks have proper spacing
100 |         structured = structured.replace(/^```/gm, '\n```');
101 |         structured = structured.replace(/```$/gm, '```\n');
102 |
103 |         // Ensure lists have proper spacing
104 |         structured = structured.replace(/^(\s*[-*+]\s.+)$/gm, (match, listItem, offset, string) => {
105 |             const prevChar = string[offset - 1];
106 |             return prevChar && prevChar !== '\n' ? `\n${listItem}` : listItem;
107 |         });
108 |
109 |         // Final cleanup of excessive line breaks
110 |         structured = structured.replace(/\n{3,}/g, '\n\n');
111 |
112 |         return structured;
113 |     }
114 |
115 |     /**
116 |      * Extract only the markdown content, removing any prefacing text
117 |      * @param {string} text - Input text
118 |      * @returns {string} Pure markdown content
119 |      */
120 |     extractMarkdownContent(text) {
121 |         // Look for common LLM response patterns and extract just the markdown
122 |         const patterns = [
123 |             // "Here's the rewritten text:" followed by content
124 |             /(?:here'?s?\s+(?:the\s+)?(?:rewritten|improved|cleaned|formatted)\s+(?:text|content|version)[:.]?\s*\n)(.*)/is,
125 |             // "Rewritten text:" followed by content
126 |             /(?:rewritten\s+text[:.]?\s*\n)(.*)/is,
127 |             // Just return the whole thing if no pattern matches
128 |             /(.*)/s
129 |         ];
130 |
131 |         for (const pattern of patterns) {
132 |             const match = text.match(pattern);
133 |             if (match && match[1]) {
134 |                 return match[1].trim();
135 |             }
136 |         }
137 |
138 |         return text.trim();
139 |     }
140 |
141 |     /**
142 |      * Validate if the output is properly formatted markdown
143 |      * @param {string} text - Text to validate
144 |      * @returns {Object} Validation result with status and issues
145 |      */
146 |     validateMarkdown(text) {
147 |         const issues = [];
148 |
149 |         // Check for common markdown issues
150 |         if (text.includes('<think>')) {
151 |             issues.push('Contains thinking blocks that should be removed');
152 |         }
153 |
154 |         // Check for malformed headers
155 |         const malformedHeaders = text.match(/^#{7,}/gm);
156 |         if (malformedHeaders) {
157 |             issues.push('Contains headers with too many # symbols');
158 |         }
159 |
160 |         // Check for unclosed code blocks
161 |         const codeBlockCount = (text.match(/```/g) || []).length;
162 |         if (codeBlockCount % 2 !== 0) {
163 |             issues.push('Contains unclosed code blocks');
164 |         }
165 |
166 |         // Check for excessive line breaks
167 |         if (text.includes('\n\n\n\n')) {
168 |             issues.push('Contains excessive line breaks');
169 |         }
170 |
171 |         return {
172 |             isValid: issues.length === 0,
173 |             issues: issues
174 |         };
175 | 
} 176 | 177 | /** 178 | * Apply custom parsing rules 179 | * @param {string} text - Input text 180 | * @param {Array} customRules - Array of custom parsing rules 181 | * @returns {string} Text with custom rules applied 182 | */ 183 | applyCustomRules(text, customRules = []) { 184 | if (!Array.isArray(customRules) || customRules.length === 0) { 185 | return text; 186 | } 187 | 188 | let processed = text; 189 | 190 | for (const rule of customRules) { 191 | if (rule.find && typeof rule.replace === 'string') { 192 | processed = processed.replace(rule.find, rule.replace); 193 | } 194 | } 195 | 196 | return processed; 197 | } 198 | } 199 | 200 | export default OutputParser; 201 | -------------------------------------------------------------------------------- /MIGRATION.md: -------------------------------------------------------------------------------- 1 | # Migration Guide: Extract2MD v1.0.6 2 | 3 | This guide helps you migrate from the legacy Extract2MD API to the new scenario-based API introduced in v1.0.6. 4 | 5 | ## Overview of Changes 6 | 7 | The Extract2MD package has been restructured to provide clear, scenario-specific methods instead of a single class with multiple configuration options. This makes the API more intuitive and provides better TypeScript support. 8 | 9 | ## Key Changes 10 | 11 | ### Before (Legacy API) 12 | ```javascript 13 | import Extract2MD from 'extract2md'; 14 | 15 | const converter = new Extract2MD(); 16 | const result = await converter.convertPDFToMarkdown(pdfFile, { 17 | useOCR: true, 18 | useLLM: false, 19 | ocrLanguage: 'eng' 20 | }); 21 | ``` 22 | 23 | ### After (New API) 24 | ```javascript 25 | import { Extract2MDConverter } from 'extract2md'; 26 | 27 | const result = await Extract2MDConverter.quickConvertOnly(pdfFile, { 28 | ocr: { 29 | language: 'eng', 30 | oem: 1, 31 | psm: 6 32 | } 33 | }); 34 | ``` 35 | 36 | ## Migration by Use Case 37 | 38 | ### 1. Basic PDF Text Extraction (No OCR, No LLM) 39 | 40 | **Legacy:** 41 | ```javascript 42 | const converter = new Extract2MD(); 43 | const result = await converter.convertPDFToMarkdown(pdfFile, { 44 | useOCR: false, 45 | useLLM: false 46 | }); 47 | ``` 48 | 49 | **New:** 50 | ```javascript 51 | const result = await Extract2MDConverter.quickConvertOnly(pdfFile, { 52 | // OCR config is optional - will use PDF text extraction only 53 | }); 54 | ``` 55 | 56 | ### 2. PDF with OCR (No LLM) 57 | 58 | **Legacy:** 59 | ```javascript 60 | const converter = new Extract2MD(); 61 | const result = await converter.convertPDFToMarkdown(pdfFile, { 62 | useOCR: true, 63 | useLLM: false, 64 | ocrLanguage: 'eng', 65 | ocrPSM: 6 66 | }); 67 | ``` 68 | 69 | **New - Quick OCR:** 70 | ```javascript 71 | const result = await Extract2MDConverter.quickConvertOnly(pdfFile, { 72 | ocr: { 73 | language: 'eng', 74 | oem: 1, 75 | psm: 6 76 | } 77 | }); 78 | ``` 79 | 80 | **New - High Accuracy OCR:** 81 | ```javascript 82 | const result = await Extract2MDConverter.highAccuracyConvertOnly(pdfFile, { 83 | ocr: { 84 | language: 'eng', 85 | oem: 1, 86 | psm: 8 87 | } 88 | }); 89 | ``` 90 | 91 | ### 3. 
PDF with OCR and LLM Rewrite 92 | 93 | **Legacy:** 94 | ```javascript 95 | const converter = new Extract2MD(); 96 | const result = await converter.convertPDFToMarkdown(pdfFile, { 97 | useOCR: true, 98 | useLLM: true, 99 | ocrLanguage: 'eng', 100 | llmModel: 'Llama-3.2-1B-Instruct-q4f16_1-MLC', 101 | llmTemperature: 0.7 102 | }); 103 | ``` 104 | 105 | **New - Quick OCR + LLM:** 106 | ```javascript 107 | const result = await Extract2MDConverter.quickConvertWithLLM(pdfFile, { 108 | ocr: { 109 | language: 'eng', 110 | oem: 1, 111 | psm: 6 112 | }, 113 | webllm: { 114 | modelId: 'Llama-3.2-1B-Instruct-q4f16_1-MLC', 115 | temperature: 0.7, 116 | maxTokens: 4000, 117 | streamingEnabled: false 118 | } 119 | }); 120 | ``` 121 | 122 | **New - High Accuracy OCR + LLM:** 123 | ```javascript 124 | const result = await Extract2MDConverter.highAccuracyConvertWithLLM(pdfFile, { 125 | ocr: { 126 | language: 'eng', 127 | oem: 1, 128 | psm: 8 129 | }, 130 | webllm: { 131 | modelId: 'Llama-3.2-1B-Instruct-q4f16_1-MLC', 132 | temperature: 0.7, 133 | maxTokens: 4000, 134 | streamingEnabled: false 135 | } 136 | }); 137 | ``` 138 | 139 | ### 4. Combined Extraction Methods with LLM 140 | 141 | This is a new feature not available in the legacy API: 142 | 143 | ```javascript 144 | const result = await Extract2MDConverter.combinedConvertWithLLM(pdfFile, { 145 | ocr: { 146 | language: 'eng', 147 | oem: 1, 148 | psm: 6 149 | }, 150 | webllm: { 151 | modelId: 'Llama-3.2-1B-Instruct-q4f16_1-MLC', 152 | temperature: 0.7, 153 | maxTokens: 4000, 154 | streamingEnabled: false 155 | } 156 | }); 157 | ``` 158 | 159 | ## Configuration Changes 160 | 161 | ### OCR Configuration 162 | 163 | **Legacy:** 164 | ```javascript 165 | { 166 | ocrLanguage: 'eng', 167 | ocrPSM: 6, 168 | ocrOEM: 1 169 | } 170 | ``` 171 | 172 | **New:** 173 | ```javascript 174 | { 175 | ocr: { 176 | language: 'eng', 177 | psm: 6, 178 | oem: 1 179 | } 180 | } 181 | ``` 182 | 183 | ### LLM Configuration 184 | 185 | **Legacy:** 186 | ```javascript 187 | { 188 | llmModel: 'Llama-3.2-1B-Instruct-q4f16_1-MLC', 189 | llmTemperature: 0.7, 190 | llmMaxTokens: 4000 191 | } 192 | ``` 193 | 194 | **New:** 195 | ```javascript 196 | { 197 | webllm: { 198 | modelId: 'Llama-3.2-1B-Instruct-q4f16_1-MLC', 199 | temperature: 0.7, 200 | maxTokens: 4000, 201 | streamingEnabled: false 202 | } 203 | } 204 | ``` 205 | 206 | ## Backwards Compatibility 207 | 208 | The legacy API is still available for backwards compatibility: 209 | 210 | ```javascript 211 | import { LegacyExtract2MDConverter } from 'extract2md'; 212 | 213 | const converter = new LegacyExtract2MDConverter(); 214 | // Use old API as before 215 | ``` 216 | 217 | **Note:** The legacy API is deprecated and will be removed in v2.0.0. Please migrate to the new API. 218 | 219 | ## Benefits of the New API 220 | 221 | 1. **Clear Scenarios**: Each method has a specific purpose, making it easier to choose the right approach 222 | 2. **Better TypeScript Support**: Full type definitions for all configurations and return types 223 | 3. **Modular Architecture**: Better code organization and maintainability 224 | 4. **Configuration Validation**: Built-in validation for all configuration options 225 | 5. **Improved Error Handling**: More specific error messages and better error recovery 226 | 6. 
**Better Documentation**: Each scenario method is well-documented with examples 227 | 228 | ## Configuration Files 229 | 230 | You can now use external configuration files: 231 | 232 | ```javascript 233 | // config.json 234 | { 235 | "ocr": { 236 | "language": "eng", 237 | "oem": 1, 238 | "psm": 6 239 | }, 240 | "webllm": { 241 | "modelId": "Llama-3.2-1B-Instruct-q4f16_1-MLC", 242 | "temperature": 0.7, 243 | "maxTokens": 4000, 244 | "streamingEnabled": false 245 | } 246 | } 247 | ``` 248 | 249 | ```javascript 250 | // In your code 251 | const result = await Extract2MDConverter.quickConvertWithLLM(pdfFile, 'config.json'); 252 | ``` 253 | 254 | ## Error Handling 255 | 256 | The new API provides more specific error types: 257 | 258 | ```javascript 259 | try { 260 | const result = await Extract2MDConverter.quickConvertOnly(pdfFile, config); 261 | } catch (error) { 262 | if (error.name === 'ConfigurationError') { 263 | console.error('Configuration issue:', error.message); 264 | } else if (error.name === 'OCRError') { 265 | console.error('OCR processing failed:', error.message); 266 | } else if (error.name === 'WebLLMError') { 267 | console.error('LLM processing failed:', error.message); 268 | } else { 269 | console.error('General error:', error.message); 270 | } 271 | } 272 | ``` 273 | 274 | ## Testing Your Migration 275 | 276 | Use the provided test files to validate your migration: 277 | 278 | ```javascript 279 | import { Extract2MDTests } from 'extract2md/test/scenarios.test.js'; 280 | 281 | const tests = new Extract2MDTests(); 282 | await tests.runBasicTests(); 283 | 284 | // With a PDF file 285 | await tests.runFullTests(pdfFile); 286 | ``` 287 | 288 | ## Need Help? 289 | 290 | - Check the [examples](./examples/) folder for complete usage examples 291 | - See the [README.md](./README.md) for full API documentation 292 | - Open an issue on GitHub if you encounter migration problems 293 | 294 | ## Timeline 295 | 296 | - **v1.0.6**: New API introduced, legacy API deprecated 297 | - **v1.1.0**: Legacy API will show deprecation warnings 298 | - **v2.0.0**: Legacy API will be removed (planned for 6 months after v1.0.6) 299 | 300 | Migrate to the new API as soon as possible to take advantage of the improved features and ensure compatibility with future versions. 
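
## Progress Reporting

Both the legacy and the new API accept a `progressCallback` for surfacing conversion progress in your UI. The sketch below is a minimal example based on the `ProgressReport` shape in `src/types/index.d.ts`; which fields are populated at each stage (and the stage names themselves) depend on the scenario you run, so treat it as a starting point rather than a reference implementation:

```javascript
import { Extract2MDConverter } from 'extract2md';

const result = await Extract2MDConverter.quickConvertWithLLM(pdfFile, {
  ocr: { language: 'eng', oem: 1, psm: 6 },
  webllm: { modelId: 'Llama-3.2-1B-Instruct-q4f16_1-MLC', temperature: 0.7 },
  // Invoked throughout PDF extraction, OCR, and the LLM rewrite.
  progressCallback: (report) => {
    const pages = report.totalPages ? ` (page ${report.currentPage}/${report.totalPages})` : '';
    console.log(`[${report.stage}] ${report.message}${pages}`);
  }
});
```
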
301 | -------------------------------------------------------------------------------- /test/scenarios.test.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Test file for validating Extract2MD scenarios 3 | * This is a basic validation test - for production, consider using a testing framework like Jest 4 | */ 5 | 6 | import { Extract2MDConverter } from '../src/index.js'; 7 | 8 | class Extract2MDTests { 9 | constructor() { 10 | this.testResults = []; 11 | this.testPdf = null; // Will be set via file input in demo 12 | } 13 | 14 | log(message, type = 'info') { 15 | console.log(`[${type.toUpperCase()}] ${message}`); 16 | this.testResults.push({ message, type, timestamp: new Date().toISOString() }); 17 | } 18 | 19 | async runBasicTests() { 20 | this.log('Starting Extract2MD basic tests...'); 21 | 22 | try { 23 | // Test 1: Check if all scenario methods exist 24 | this.testScenarioMethodsExist(); 25 | 26 | // Test 2: Check configuration validation 27 | await this.testConfigurationValidation(); 28 | 29 | this.log('Basic tests completed successfully!', 'success'); 30 | } catch (error) { 31 | this.log(`Basic tests failed: ${error.message}`, 'error'); 32 | throw error; 33 | } 34 | } 35 | 36 | testScenarioMethodsExist() { 37 | this.log('Testing scenario methods existence...'); 38 | 39 | const requiredMethods = [ 40 | 'quickConvertOnly', 41 | 'highAccuracyConvertOnly', 42 | 'quickConvertWithLLM', 43 | 'highAccuracyConvertWithLLM', 44 | 'combinedConvertWithLLM' 45 | ]; 46 | 47 | for (const method of requiredMethods) { 48 | if (typeof Extract2MDConverter[method] !== 'function') { 49 | throw new Error(`Method ${method} does not exist or is not a function`); 50 | } 51 | } 52 | 53 | this.log(`All ${requiredMethods.length} scenario methods exist`, 'success'); 54 | } 55 | 56 | async testConfigurationValidation() { 57 | this.log('Testing configuration validation...'); 58 | 59 | // Test valid configuration 60 | const validConfig = { 61 | ocr: { 62 | language: 'eng', 63 | oem: 1, 64 | psm: 6 65 | }, 66 | webllm: { 67 | modelId: 'Llama-3.2-1B-Instruct-q4f16_1-MLC', 68 | temperature: 0.7, 69 | maxTokens: 4000, 70 | streamingEnabled: false 71 | } 72 | }; 73 | 74 | try { 75 | // This should not throw 76 | const result = await Extract2MDConverter.quickConvertOnly( 77 | null, // No actual PDF for basic test 78 | validConfig, 79 | true // dry run mode (if implemented) 80 | ); 81 | this.log('Configuration validation passed', 'success'); 82 | } catch (error) { 83 | if (error.message.includes('PDF file is required')) { 84 | this.log('Configuration validation passed (expected PDF error)', 'success'); 85 | } else { 86 | throw error; 87 | } 88 | } 89 | 90 | // Test invalid configuration 91 | try { 92 | await Extract2MDConverter.quickConvertOnly(null, { invalid: 'config' }); 93 | throw new Error('Should have thrown validation error'); 94 | } catch (error) { 95 | if (error.message.includes('validation') || error.message.includes('PDF file is required')) { 96 | this.log('Invalid configuration properly rejected', 'success'); 97 | } else { 98 | throw error; 99 | } 100 | } 101 | } 102 | 103 | async runFullTests(pdfFile) { 104 | if (!pdfFile) { 105 | this.log('No PDF file provided for full tests', 'warning'); 106 | return; 107 | } 108 | 109 | this.log('Starting full Extract2MD tests with PDF file...'); 110 | this.testPdf = pdfFile; 111 | 112 | try { 113 | // Test Scenario 1: Quick Convert Only 114 | await this.testScenario1(); 115 | 116 | // Test Scenario 2: High Accuracy Convert 
Only 117 | await this.testScenario2(); 118 | 119 | // Note: LLM scenarios would require actual model loading which takes time 120 | // For now, we'll just test that they don't throw immediate errors 121 | await this.testLLMScenariosBasic(); 122 | 123 | this.log('Full tests completed successfully!', 'success'); 124 | } catch (error) { 125 | this.log(`Full tests failed: ${error.message}`, 'error'); 126 | throw error; 127 | } 128 | } 129 | 130 | async testScenario1() { 131 | this.log('Testing Scenario 1: Quick Convert Only...'); 132 | 133 | const config = { 134 | ocr: { 135 | language: 'eng', 136 | oem: 1, 137 | psm: 6 138 | } 139 | }; 140 | 141 | const result = await Extract2MDConverter.quickConvertOnly(this.testPdf, config); 142 | 143 | if (!result || typeof result !== 'string') { 144 | throw new Error('Scenario 1 should return a string result'); 145 | } 146 | 147 | this.log(`Scenario 1 completed. Result length: ${result.length} characters`, 'success'); 148 | } 149 | 150 | async testScenario2() { 151 | this.log('Testing Scenario 2: High Accuracy Convert Only...'); 152 | 153 | const config = { 154 | ocr: { 155 | language: 'eng', 156 | oem: 1, 157 | psm: 8 // Different PSM for high accuracy 158 | } 159 | }; 160 | 161 | const result = await Extract2MDConverter.highAccuracyConvertOnly(this.testPdf, config); 162 | 163 | if (!result || typeof result !== 'string') { 164 | throw new Error('Scenario 2 should return a string result'); 165 | } 166 | 167 | this.log(`Scenario 2 completed. Result length: ${result.length} characters`, 'success'); 168 | } 169 | 170 | async testLLMScenariosBasic() { 171 | this.log('Testing LLM scenarios (basic validation only)...'); 172 | 173 | const config = { 174 | ocr: { 175 | language: 'eng', 176 | oem: 1, 177 | psm: 6 178 | }, 179 | webllm: { 180 | modelId: 'Llama-3.2-1B-Instruct-q4f16_1-MLC', 181 | temperature: 0.7, 182 | maxTokens: 1000, 183 | streamingEnabled: false 184 | } 185 | }; 186 | 187 | // Test that LLM scenarios at least start without immediate errors 188 | try { 189 | // These will likely fail at model loading, but should not have immediate syntax errors 190 | const scenarios = [ 191 | 'quickConvertWithLLM', 192 | 'highAccuracyConvertWithLLM', 193 | 'combinedConvertWithLLM' 194 | ]; 195 | 196 | for (const scenario of scenarios) { 197 | this.log(`Testing ${scenario} initialization...`); 198 | 199 | try { 200 | // Start the process but don't wait for completion (model loading takes time) 201 | const promise = Extract2MDConverter[scenario](this.testPdf, config); 202 | 203 | // Give it a moment to start, then we'll assume it's working if no immediate error 204 | setTimeout(() => { 205 | this.log(`${scenario} started successfully`, 'success'); 206 | }, 100); 207 | 208 | // Don't await full completion for basic test 209 | break; // Only test one scenario to avoid model loading overhead 210 | } catch (error) { 211 | if (error.message.includes('model') || error.message.includes('WebLLM')) { 212 | this.log(`${scenario} - model loading issue (expected): ${error.message}`, 'warning'); 213 | } else { 214 | throw error; 215 | } 216 | } 217 | } 218 | } catch (error) { 219 | this.log(`LLM scenarios basic test error: ${error.message}`, 'warning'); 220 | } 221 | } 222 | 223 | getTestResults() { 224 | return this.testResults; 225 | } 226 | 227 | clearResults() { 228 | this.testResults = []; 229 | } 230 | } 231 | 232 | // Export for use in demo 233 | export { Extract2MDTests }; 234 | 235 | // Auto-run basic tests if this file is run directly 236 | if (typeof window === 
'undefined') { 237 | const tests = new Extract2MDTests(); 238 | tests.runBasicTests().catch(console.error); 239 | } 240 | -------------------------------------------------------------------------------- /scripts/validate-deployment.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | /** 4 | * Deployment validation script for Extract2MD 5 | * This script validates the package is ready for deployment 6 | */ 7 | 8 | import fs from 'fs'; 9 | import path from 'path'; 10 | import { fileURLToPath } from 'url'; 11 | 12 | const __filename = fileURLToPath(import.meta.url); 13 | const __dirname = path.dirname(__filename); 14 | 15 | class DeploymentValidator { 16 | constructor() { 17 | this.errors = []; 18 | this.warnings = []; 19 | this.success = []; 20 | } 21 | 22 | log(message, type = 'info') { 23 | const timestamp = new Date().toISOString(); 24 | const prefix = type === 'error' ? '❌' : type === 'warning' ? '⚠️' : '✅'; 25 | console.log(`${prefix} [${timestamp}] ${message}`); 26 | 27 | if (type === 'error') this.errors.push(message); 28 | else if (type === 'warning') this.warnings.push(message); 29 | else this.success.push(message); 30 | } 31 | 32 | async validate() { 33 | console.log('🚀 Starting Extract2MD Deployment Validation...\n'); 34 | 35 | // Check package structure 36 | this.validatePackageStructure(); 37 | 38 | // Check build outputs 39 | this.validateBuildOutputs(); 40 | 41 | // Check documentation 42 | this.validateDocumentation(); 43 | 44 | // Check configuration files 45 | this.validateConfiguration(); 46 | 47 | // Check TypeScript definitions 48 | this.validateTypeDefinitions(); 49 | 50 | // Summary 51 | this.printSummary(); 52 | 53 | return this.errors.length === 0; 54 | } 55 | 56 | validatePackageStructure() { 57 | this.log('Validating package structure...'); 58 | 59 | const requiredFiles = [ 60 | 'package.json', 61 | 'src/index.js', 62 | 'src/types/index.d.ts', 63 | 'dist/assets/extract2md.umd.js', 64 | 'README.md', 65 | 'MIGRATION.md', 66 | 'DEPLOYMENT.md', 67 | 'config.example.json' 68 | ]; 69 | 70 | const requiredDirs = [ 71 | 'src/converters', 72 | 'src/engines', 73 | 'src/utils', 74 | 'examples', 75 | 'test' 76 | ]; 77 | 78 | for (const file of requiredFiles) { 79 | const filePath = path.resolve(__dirname, '..', file); 80 | if (fs.existsSync(filePath)) { 81 | this.log(`Required file found: ${file}`); 82 | } else { 83 | this.log(`Missing required file: ${file}`, 'error'); 84 | } 85 | } 86 | 87 | for (const dir of requiredDirs) { 88 | const dirPath = path.resolve(__dirname, '..', dir); 89 | if (fs.existsSync(dirPath) && fs.statSync(dirPath).isDirectory()) { 90 | this.log(`Required directory found: ${dir}`); 91 | } else { 92 | this.log(`Missing required directory: ${dir}`, 'error'); 93 | } 94 | } 95 | } 96 | 97 | validateBuildOutputs() { 98 | this.log('Validating build outputs...'); 99 | 100 | const buildFiles = [ 101 | 'dist/assets/extract2md.umd.js', 102 | 'dist/assets/tesseract-worker.min.js', 103 | 'dist/assets/tesseract-core.wasm.js', 104 | 'dist/pdf.worker.min.mjs' 105 | ]; 106 | 107 | for (const file of buildFiles) { 108 | const filePath = path.resolve(__dirname, '..', file); 109 | if (fs.existsSync(filePath)) { 110 | const stats = fs.statSync(filePath); 111 | const sizeInMB = (stats.size / (1024 * 1024)).toFixed(2); 112 | this.log(`Build file found: ${file} (${sizeInMB} MB)`); 113 | 114 | if (stats.size === 0) { 115 | this.log(`Build file is empty: ${file}`, 'error'); 116 | } 117 | } else { 118 | 
this.log(`Missing build file: ${file}`, 'error'); 119 | } 120 | } 121 | } 122 | 123 | validateDocumentation() { 124 | this.log('Validating documentation...'); 125 | 126 | const docFiles = { 127 | 'README.md': ['Installation', 'Scenarios', 'Configuration'], 128 | 'MIGRATION.md': ['Legacy API', 'Migration', 'Backwards Compatibility'], 129 | 'DEPLOYMENT.md': ['Distribution', 'Performance', 'Security'] 130 | }; 131 | 132 | for (const [file, keywords] of Object.entries(docFiles)) { 133 | const filePath = path.resolve(__dirname, '..', file); 134 | if (fs.existsSync(filePath)) { 135 | const content = fs.readFileSync(filePath, 'utf8'); 136 | 137 | for (const keyword of keywords) { 138 | if (content.toLowerCase().includes(keyword.toLowerCase())) { 139 | this.log(`Documentation section found in ${file}: ${keyword}`); 140 | } else { 141 | this.log(`Missing documentation section in ${file}: ${keyword}`, 'warning'); 142 | } 143 | } 144 | } else { 145 | this.log(`Missing documentation file: ${file}`, 'error'); 146 | } 147 | } 148 | } 149 | 150 | validateConfiguration() { 151 | this.log('Validating configuration files...'); 152 | 153 | // Check package.json 154 | const packagePath = path.resolve(__dirname, '..', 'package.json'); 155 | if (fs.existsSync(packagePath)) { 156 | try { 157 | const pkg = JSON.parse(fs.readFileSync(packagePath, 'utf8')); 158 | 159 | const requiredFields = ['name', 'version', 'main', 'module', 'types']; 160 | for (const field of requiredFields) { 161 | if (pkg[field]) { 162 | this.log(`package.json has required field: ${field}`); 163 | } else { 164 | this.log(`package.json missing field: ${field}`, 'error'); 165 | } 166 | } 167 | 168 | // Check if main file exists 169 | if (pkg.main && fs.existsSync(path.resolve(__dirname, '..', pkg.main))) { 170 | this.log(`Main entry point exists: ${pkg.main}`); 171 | } else { 172 | this.log(`Main entry point missing: ${pkg.main}`, 'error'); 173 | } 174 | 175 | } catch (error) { 176 | this.log(`Invalid JSON in package.json: ${error.message}`, 'error'); 177 | } 178 | } 179 | 180 | // Check example config 181 | const configPath = path.resolve(__dirname, '..', 'config.example.json'); 182 | if (fs.existsSync(configPath)) { 183 | try { 184 | const config = JSON.parse(fs.readFileSync(configPath, 'utf8')); 185 | if (config.ocr && config.webllm) { 186 | this.log('Example configuration is valid'); 187 | } else { 188 | this.log('Example configuration missing required sections', 'warning'); 189 | } 190 | } catch (error) { 191 | this.log(`Invalid example configuration: ${error.message}`, 'error'); 192 | } 193 | } 194 | } 195 | 196 | validateTypeDefinitions() { 197 | this.log('Validating TypeScript definitions...'); 198 | 199 | const typesPath = path.resolve(__dirname, '..', 'src/types/index.d.ts'); 200 | if (fs.existsSync(typesPath)) { 201 | const content = fs.readFileSync(typesPath, 'utf8'); 202 | 203 | const requiredInterfaces = [ 204 | 'OCRConfig', 205 | 'WebLLMConfig', 206 | 'Extract2MDConfig', 207 | 'Extract2MDConverter' 208 | ]; 209 | 210 | for (const interfaceName of requiredInterfaces) { 211 | if (content.includes(`interface ${interfaceName}`) || 212 | content.includes(`declare class ${interfaceName}`) || 213 | content.includes(`export class ${interfaceName}`)) { 214 | this.log(`TypeScript interface found: ${interfaceName}`); 215 | } else { 216 | this.log(`Missing TypeScript interface: ${interfaceName}`, 'error'); 217 | } 218 | } 219 | } else { 220 | this.log('TypeScript definitions file not found', 'error'); 221 | } 222 | } 223 | 224 | 
printSummary() { 225 | console.log('\n📊 Deployment Validation Summary'); 226 | console.log('====================================='); 227 | console.log(`✅ Successful checks: ${this.success.length}`); 228 | console.log(`⚠️ Warnings: ${this.warnings.length}`); 229 | console.log(`❌ Errors: ${this.errors.length}`); 230 | 231 | if (this.errors.length > 0) { 232 | console.log('\n🚨 Critical Issues Found:'); 233 | this.errors.forEach(error => console.log(` - ${error}`)); 234 | } 235 | 236 | if (this.warnings.length > 0) { 237 | console.log('\n⚠️ Warnings:'); 238 | this.warnings.forEach(warning => console.log(` - ${warning}`)); 239 | } 240 | 241 | console.log('\n' + (this.errors.length === 0 ? '🎉 Package is ready for deployment!' : '🔧 Please fix the errors before deployment.')); 242 | } 243 | } 244 | 245 | // Run validation 246 | const validator = new DeploymentValidator(); 247 | validator.validate().then(isValid => { 248 | process.exit(isValid ? 0 : 1); 249 | }).catch(error => { 250 | console.error('Validation failed:', error); 251 | process.exit(1); 252 | }); 253 | -------------------------------------------------------------------------------- /src/engines/WebLLMEngine.js: -------------------------------------------------------------------------------- 1 | /** 2 | * WebLLMEngine.js 3 | * Standalone WebLLM inference engine for Extract2MD 4 | * Handles model initialization, loading, and text generation 5 | */ 6 | 7 | import * as webllm from '@mlc-ai/web-llm'; 8 | 9 | export class WebLLMEngine { 10 | constructor(config = {}) { 11 | this.engine = null; 12 | this.isInitialized = false; 13 | this.currentModelId = null; 14 | this.progressCallback = config.progressCallback || ((progress) => {}); 15 | this.defaultModel = config.defaultModel || 'Qwen3-0.6B-q4f16_1-MLC'; 16 | this.customModelConfig = config.customModelConfig || null; 17 | } 18 | 19 | /** 20 | * Initialize the WebLLM engine with specified model 21 | * @param {string} modelId - Model identifier 22 | * @param {Object} modelConfig - Model configuration options 23 | */ 24 | async initialize(modelId = null, modelConfig = {}) { 25 | const targetModelId = modelId || this.defaultModel; 26 | 27 | // Check if already initialized with the same model 28 | if (this.isInitialized && this.currentModelId === targetModelId) { 29 | this.progressCallback({ 30 | stage: 'webllm_ready', 31 | message: 'WebLLM engine already initialized with the correct model.' 
32 | }); 33 | return; 34 | } 35 | 36 | try { 37 | this.progressCallback({ 38 | stage: 'webllm_init_start', 39 | message: `Initializing WebLLM engine with model: ${targetModelId}...` 40 | }); 41 | 42 | // Clean up existing engine if any 43 | if (this.engine) { 44 | await this.cleanup(); 45 | } 46 | 47 | // Setup progress callback for model loading 48 | const initProgressCallback = (report) => { 49 | this.progressCallback({ 50 | stage: 'webllm_load_progress', 51 | message: `Model Loading: ${report.text}`, 52 | progress: report.progress 53 | }); 54 | }; 55 | 56 | // Configure model 57 | let appConfig = null; 58 | if (this.customModelConfig && this.customModelConfig.modelId === targetModelId) { 59 | // Use custom model configuration 60 | appConfig = { 61 | model_list: [this.customModelConfig] 62 | }; 63 | } 64 | 65 | const engineConfig = { 66 | initProgressCallback, 67 | appConfig, 68 | ...modelConfig 69 | }; 70 | 71 | // Create and initialize engine 72 | this.engine = await webllm.CreateMLCEngine(targetModelId, engineConfig); 73 | this.isInitialized = true; 74 | this.currentModelId = targetModelId; 75 | 76 | this.progressCallback({ 77 | stage: 'webllm_init_complete', 78 | message: 'WebLLM engine initialized successfully.' 79 | }); 80 | 81 | } catch (error) { 82 | this.isInitialized = false; 83 | this.currentModelId = null; 84 | this.progressCallback({ 85 | stage: 'webllm_init_error', 86 | message: `WebLLM initialization failed: ${error.message}`, 87 | error 88 | }); 89 | throw new Error(`WebLLM initialization failed: ${error.message}`); 90 | } 91 | } 92 | 93 | /** 94 | * Generate text using the initialized model 95 | * @param {string} prompt - Input prompt 96 | * @param {Object} options - Generation options 97 | * @returns {Promise} Generated text 98 | */ 99 | async generate(prompt, options = {}) { 100 | if (!this.isInitialized || !this.engine) { 101 | throw new Error('WebLLM engine is not initialized. Call initialize() first.'); 102 | } 103 | 104 | try { 105 | this.progressCallback({ 106 | stage: 'webllm_generate_start', 107 | message: 'Generating response...' 108 | }); 109 | 110 | const messages = [{ role: "user", content: prompt }]; 111 | 112 | const requestOptions = { 113 | messages, 114 | temperature: options.temperature || 0.7, 115 | max_tokens: options.maxTokens || 4096, 116 | ...options 117 | }; 118 | 119 | const chatCompletion = await this.engine.chat.completions.create(requestOptions); 120 | 121 | if (chatCompletion.choices && chatCompletion.choices.length > 0) { 122 | const content = chatCompletion.choices[0].message.content || ''; 123 | 124 | this.progressCallback({ 125 | stage: 'webllm_generate_complete', 126 | message: 'Text generation completed.' 127 | }); 128 | 129 | return content; 130 | } else { 131 | throw new Error('No response generated from the model.'); 132 | } 133 | 134 | } catch (error) { 135 | this.progressCallback({ 136 | stage: 'webllm_generate_error', 137 | message: `Text generation failed: ${error.message}`, 138 | error 139 | }); 140 | throw new Error(`Text generation failed: ${error.message}`); 141 | } 142 | } 143 | 144 | /** 145 | * Generate text with streaming support 146 | * @param {string} prompt - Input prompt 147 | * @param {Object} options - Generation options 148 | * @param {Function} onChunk - Callback for each chunk 149 | * @returns {Promise} Complete generated text 150 | */ 151 | async generateStream(prompt, options = {}, onChunk = null) { 152 | if (!this.isInitialized || !this.engine) { 153 | throw new Error('WebLLM engine is not initialized. 
Call initialize() first.'); 154 | } 155 | 156 | try { 157 | this.progressCallback({ 158 | stage: 'webllm_stream_start', 159 | message: 'Starting streaming generation...' 160 | }); 161 | 162 | const messages = [{ role: "user", content: prompt }]; 163 | 164 | const requestOptions = { 165 | messages, 166 | temperature: options.temperature || 0.7, 167 | max_tokens: options.maxTokens || 4096, 168 | stream: true, 169 | stream_options: { include_usage: true }, 170 | ...options 171 | }; 172 | 173 | const chunks = await this.engine.chat.completions.create(requestOptions); 174 | let fullResponse = ''; 175 | 176 | for await (const chunk of chunks) { 177 | const content = chunk.choices[0]?.delta?.content || ''; 178 | fullResponse += content; 179 | 180 | if (onChunk && content) { 181 | onChunk(content, fullResponse); 182 | } 183 | 184 | if (chunk.usage) { 185 | this.progressCallback({ 186 | stage: 'webllm_stream_usage', 187 | message: 'Stream completed', 188 | usage: chunk.usage 189 | }); 190 | } 191 | } 192 | 193 | this.progressCallback({ 194 | stage: 'webllm_stream_complete', 195 | message: 'Streaming generation completed.' 196 | }); 197 | 198 | return fullResponse; 199 | 200 | } catch (error) { 201 | this.progressCallback({ 202 | stage: 'webllm_stream_error', 203 | message: `Streaming generation failed: ${error.message}`, 204 | error 205 | }); 206 | throw new Error(`Streaming generation failed: ${error.message}`); 207 | } 208 | } 209 | 210 | /** 211 | * Check if the engine is ready for use 212 | * @returns {boolean} Engine readiness status 213 | */ 214 | isReady() { 215 | return this.isInitialized && this.engine !== null; 216 | } 217 | 218 | /** 219 | * Get current model information 220 | * @returns {Object} Model information 221 | */ 222 | getModelInfo() { 223 | return { 224 | isInitialized: this.isInitialized, 225 | currentModelId: this.currentModelId, 226 | isReady: this.isReady() 227 | }; 228 | } 229 | 230 | /** 231 | * Clean up the engine and free resources 232 | */ 233 | async cleanup() { 234 | if (this.engine) { 235 | try { 236 | this.progressCallback({ 237 | stage: 'webllm_cleanup', 238 | message: 'Cleaning up WebLLM engine...' 239 | }); 240 | 241 | // Note: WebLLM's MLCEngine might not have a direct unload method 242 | // But we should clean up references 243 | this.engine = null; 244 | this.isInitialized = false; 245 | this.currentModelId = null; 246 | 247 | this.progressCallback({ 248 | stage: 'webllm_cleanup_complete', 249 | message: 'WebLLM engine cleanup completed.' 
250 | }); 251 | } catch (error) { 252 | this.progressCallback({ 253 | stage: 'webllm_cleanup_error', 254 | message: `WebLLM cleanup failed: ${error.message}`, 255 | error 256 | }); 257 | } 258 | } 259 | } 260 | } 261 | 262 | export default WebLLMEngine; 263 | -------------------------------------------------------------------------------- /examples/usage-examples.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Example usage of Extract2MD with all scenarios 3 | */ 4 | 5 | // Fix import statement to match new API structure 6 | import { Extract2MDConverter, ConfigValidator } from '../src/index.js'; 7 | 8 | // Example configurations for different scenarios 9 | const basicConfig = { 10 | progressCallback: (progress) => { 11 | console.log(`[${progress.stage}] ${progress.message}`); 12 | if (progress.currentPage && progress.totalPages) { 13 | console.log(`Progress: ${progress.currentPage}/${progress.totalPages}`); 14 | } 15 | } 16 | }; 17 | 18 | const advancedConfig = { 19 | webllm: { 20 | model: "Qwen3-0.6B-q4f16_1-MLC", 21 | options: { 22 | temperature: 0.7, 23 | maxTokens: 4096 24 | } 25 | }, 26 | systemPrompts: { 27 | singleExtraction: "Focus on preserving technical accuracy and code examples.", 28 | combinedExtraction: "Create comprehensive documentation by leveraging both extraction methods." 29 | }, 30 | processing: { 31 | splitPascalCase: false, 32 | pdfRenderScale: 2.5, 33 | postProcessRules: [ 34 | { find: /\bAPI\b/g, replace: "API" }, 35 | { find: /\bJSON\b/g, replace: "JSON" }, 36 | { find: /\bHTML\b/g, replace: "HTML" } 37 | ] 38 | }, 39 | progressCallback: (progress) => { 40 | console.log(`[${progress.stage}] ${progress.message}`); 41 | if (progress.progress !== undefined) { 42 | console.log(`Loading: ${Math.round(progress.progress * 100)}%`); 43 | } 44 | if (progress.error) { 45 | console.error('Error:', progress.error); 46 | } 47 | } 48 | }; 49 | 50 | // Example: Using different scenarios 51 | async function demonstrateScenarios(pdfFile) { 52 | console.log('=== Extract2MD Scenario Demonstrations ===\n'); 53 | 54 | try { 55 | // Scenario 1: Quick Convert Only 56 | console.log('1. Quick Convert Only (Fast, basic formatting)'); 57 | console.log('Use case: PDFs with selectable text, need quick results'); 58 | const result1 = await Extract2MDConverter.quickConvertOnly(pdfFile, basicConfig); 59 | console.log('✅ Quick conversion completed'); 60 | console.log(`Output length: ${result1.length} characters\n`); 61 | 62 | // Scenario 2: High Accuracy Convert Only 63 | console.log('2. High Accuracy Convert Only (OCR, slower but comprehensive)'); 64 | console.log('Use case: Scanned PDFs, images, complex layouts'); 65 | const result2 = await Extract2MDConverter.highAccuracyConvertOnly(pdfFile, basicConfig); 66 | console.log('✅ High accuracy conversion completed'); 67 | console.log(`Output length: ${result2.length} characters\n`); 68 | 69 | // Scenario 3: Quick Convert + LLM 70 | console.log('3. Quick Convert + LLM Enhancement'); 71 | console.log('Use case: Fast extraction with AI enhancement'); 72 | const result3 = await Extract2MDConverter.quickConvertWithLLM(pdfFile, advancedConfig); 73 | console.log('✅ Quick conversion with LLM completed'); 74 | console.log(`Output length: ${result3.length} characters\n`); 75 | 76 | // Scenario 4: High Accuracy + LLM 77 | console.log('4. 
High Accuracy + LLM Enhancement'); 78 | console.log('Use case: OCR extraction with AI enhancement'); 79 | const result4 = await Extract2MDConverter.highAccuracyConvertWithLLM(pdfFile, advancedConfig); 80 | console.log('✅ High accuracy conversion with LLM completed'); 81 | console.log(`Output length: ${result4.length} characters\n`); 82 | 83 | // Scenario 5: Combined + LLM (Recommended) 84 | console.log('5. Combined Convert + LLM Enhancement (RECOMMENDED)'); 85 | console.log('Use case: Best possible results using both extraction methods'); 86 | const result5 = await Extract2MDConverter.combinedConvertWithLLM(pdfFile, advancedConfig); 87 | console.log('✅ Combined conversion with LLM completed'); 88 | console.log(`Output length: ${result5.length} characters\n`); 89 | 90 | return { 91 | quickOnly: result1, 92 | ocrOnly: result2, 93 | quickWithLLM: result3, 94 | ocrWithLLM: result4, 95 | combined: result5 96 | }; 97 | 98 | } catch (error) { 99 | console.error('❌ Error during conversion:', error.message); 100 | throw error; 101 | } 102 | } 103 | 104 | // Example: Configuration validation 105 | function demonstrateConfigValidation() { 106 | console.log('=== Configuration Validation Demo ===\n'); 107 | 108 | // Valid configuration 109 | try { 110 | const validConfig = { 111 | webllm: { 112 | model: "Qwen3-0.6B-q4f16_1-MLC", 113 | options: { temperature: 0.8 } 114 | }, 115 | processing: { 116 | splitPascalCase: true, 117 | pdfRenderScale: 3.0 118 | } 119 | }; 120 | 121 | const validated = ConfigValidator.validate(validConfig); 122 | console.log('✅ Configuration validation passed'); 123 | console.log('Validated config keys:', Object.keys(validated)); 124 | } catch (error) { 125 | console.error('❌ Configuration validation failed:', error.message); 126 | } 127 | 128 | // Invalid configuration example 129 | try { 130 | const invalidConfig = { 131 | webllm: { 132 | options: { temperature: 5.0 } // Invalid: temperature > 2 133 | } 134 | }; 135 | 136 | ConfigValidator.validate(invalidConfig); 137 | } catch (error) { 138 | console.log('✅ Invalid configuration correctly rejected:', error.message); 139 | } 140 | 141 | console.log(''); 142 | } 143 | 144 | // Example: Loading configuration from JSON 145 | function demonstrateJSONConfig() { 146 | console.log('=== JSON Configuration Demo ===\n'); 147 | 148 | const configJson = `{ 149 | "webllm": { 150 | "model": "Qwen3-0.6B-q4f16_1-MLC", 151 | "options": { 152 | "temperature": 0.7, 153 | "maxTokens": 2048 154 | } 155 | }, 156 | "systemPrompts": { 157 | "singleExtraction": "Focus on technical accuracy.", 158 | "combinedExtraction": "Create comprehensive documentation." 
159 | }, 160 | "processing": { 161 | "splitPascalCase": false, 162 | "postProcessRules": [ 163 | {"find": "\\\\bAPI\\\\b", "replace": "API"} 164 | ] 165 | } 166 | }`; 167 | 168 | try { 169 | const config = ConfigValidator.fromJSON(configJson); 170 | console.log('✅ JSON configuration loaded successfully'); 171 | console.log('LLM model:', config.webllm.model); 172 | console.log('Temperature:', config.webllm.options.temperature); 173 | console.log('Custom single extraction prompt:', config.systemPrompts.singleExtraction); 174 | console.log(''); 175 | } catch (error) { 176 | console.error('❌ JSON configuration failed:', error.message); 177 | } 178 | } 179 | 180 | // Example: Progress tracking 181 | function createProgressTracker() { 182 | const startTime = Date.now(); 183 | let lastStage = ''; 184 | 185 | return (progress) => { 186 | const elapsed = ((Date.now() - startTime) / 1000).toFixed(1); 187 | 188 | if (progress.stage !== lastStage) { 189 | console.log(`\n[${elapsed}s] === ${progress.stage.toUpperCase()} ===`); 190 | lastStage = progress.stage; 191 | } 192 | 193 | console.log(`[${elapsed}s] ${progress.message}`); 194 | 195 | if (progress.currentPage && progress.totalPages) { 196 | const pageProgress = Math.round((progress.currentPage / progress.totalPages) * 100); 197 | console.log(`[${elapsed}s] Page Progress: ${pageProgress}% (${progress.currentPage}/${progress.totalPages})`); 198 | } 199 | 200 | if (progress.progress !== undefined) { 201 | const loadProgress = Math.round(progress.progress * 100); 202 | console.log(`[${elapsed}s] Loading Progress: ${loadProgress}%`); 203 | } 204 | 205 | if (progress.usage) { 206 | console.log(`[${elapsed}s] Token Usage:`, progress.usage); 207 | } 208 | 209 | if (progress.error) { 210 | console.error(`[${elapsed}s] ❌ Error:`, progress.error.message || progress.error); 211 | } 212 | }; 213 | } 214 | 215 | // Main demo function 216 | async function runDemo() { 217 | console.log('🚀 Extract2MD Enhanced API Demo\n'); 218 | 219 | // Configuration demos 220 | demonstrateConfigValidation(); 221 | demonstrateJSONConfig(); 222 | 223 | // File input simulation (in real usage, this would come from user input) 224 | console.log('📄 To test with actual PDF files:'); 225 | console.log('1. Use an HTML file input: '); 226 | console.log('2. Pass the File object to any scenario method'); 227 | console.log('3. Monitor progress through the callback\n'); 228 | 229 | console.log('Example usage:'); 230 | console.log(` 231 | // HTML 232 | 233 | 234 |
235 |
236 | 237 | // JavaScript 238 | async function convertPDF() { 239 | const fileInput = document.getElementById('pdfInput'); 240 | const pdfFile = fileInput.files[0]; 241 | 242 | if (!pdfFile) { 243 | alert('Please select a PDF file'); 244 | return; 245 | } 246 | 247 | const config = { 248 | progressCallback: (progress) => { 249 | document.getElementById('progress').textContent = progress.message; 250 | } 251 | }; 252 | 253 | try { 254 | // Use the best scenario for comprehensive results 255 | const markdown = await Extract2MDConverter.combinedConvertWithLLM(pdfFile, config); 256 | document.getElementById('output').innerHTML = 257 | '
<pre>' + markdown.replace(/</g, '&lt;') + '</pre>';
258 |   } catch (error) {
259 |     console.error('Conversion failed:', error);
260 |     alert('Conversion failed: ' + error.message);
261 |   }
262 | }
263 |   `);
264 | }
265 | 
266 | // Export for use in other examples
267 | export {
268 |   demonstrateScenarios,
269 |   demonstrateConfigValidation,
270 |   demonstrateJSONConfig,
271 |   createProgressTracker,
272 |   basicConfig,
273 |   advancedConfig
274 | };
275 | 
276 | // Run demo if this file is executed directly
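// Note: when run under Node, runDemo() only exercises the configuration demos and
// prints usage instructions; the actual PDF conversion scenarios need a browser
// environment (File inputs, canvas rendering, and web workers).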
277 | if (typeof window !== 'undefined') {
278 |   // Browser environment
279 |   window.Extract2MDDemo = {
280 |     runDemo,
281 |     demonstrateScenarios,
282 |     createProgressTracker,
283 |     basicConfig,
284 |     advancedConfig
285 |   };
286 | } else {
287 |   // Node.js environment
288 |   runDemo();
289 | }
290 | 


--------------------------------------------------------------------------------
/scripts/postinstall.js:
--------------------------------------------------------------------------------
  1 | import fs from 'fs';
  2 | import https from 'https';
  3 | import zlib from 'zlib';
  4 | import path from 'path';
  5 | import { fileURLToPath } from 'url';
  6 | 
  7 | // Polyfill for __dirname in ES modules
  8 | const __filename = fileURLToPath(import.meta.url);
  9 | const __dirname = path.dirname(__filename);
 10 | 
 11 | const langDataPath = path.resolve(__dirname, '..', 'dist', 'assets', 'lang-data');
 12 | 
 13 | const filesToDownload = [
 14 |   {
 15 |     url: 'https://cdn.jsdelivr.net/npm/@tesseract.js-data/eng@1.0.0/4.0.0_best_int/eng.traineddata.gz',
 16 |     fileName: 'eng.traineddata.gz',
 17 |     destFileName: 'eng.traineddata.gz', // Ensure final file is named .gz but contains uncompressed data
 18 |     gzipped: true,
 19 |   },
 20 |   {
 21 |     url: 'https://github.com/tesseract-ocr/tessdata/raw/4.00/sin.traineddata',
 22 |     fileName: 'sin.traineddata', // Original name from URL (or how we save it initially)
 23 |     destFileName: 'sin.traineddata.gz', // Final name Tesseract.js expects
 24 |     gzipped: false, // Source is not gzipped, so no decompression needed
 25 |   },
 26 | ];
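// Note on the two entries above: the English data is fetched gzipped and later
// decompressed in place (the file keeps its .gz name but ends up holding
// uncompressed data, as the destFileName comment explains), while the Sinhala
// data is fetched uncompressed and only renamed to the .gz name Tesseract.js expects.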
 27 | 
 28 | async function ensureDirExists(dirPath) {
 29 |   try {
 30 |     await fs.promises.mkdir(dirPath, { recursive: true });
 31 |     console.log(`Directory ensured: ${dirPath}`);
 32 |   } catch (error) {
 33 |     if (error.code !== 'EEXIST') {
 34 |       console.error(`Error creating directory ${dirPath}:`, error);
 35 |       throw error;
 36 |     }
 37 |     console.log(`Directory already exists: ${dirPath}`);
 38 |   }
 39 | }
 40 | 
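/**
 * Download a file over HTTPS, following up to MAX_REDIRECTS redirects.
 * The payload is first written to a temporary "_<fileName>" file inside
 * langDataPath and only renamed to destPath once the stream has finished,
 * so a failed download never leaves a partial file at the final path.
 */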
 41 | async function downloadFile(url, destPath, fileName, redirectCount = 0) {
 42 |   const MAX_REDIRECTS = 5;
 43 | 
 44 |   return new Promise((resolve, reject) => {
 45 |     if (redirectCount > MAX_REDIRECTS) {
 46 |       reject(new Error(`Exceeded maximum redirect limit (${MAX_REDIRECTS}) for ${fileName}`));
 47 |       return;
 48 |     }
 49 | 
 50 |     const tempFilePath = path.join(langDataPath, `_${fileName}`); // Download to a temp name
 51 |     // Always download to a temporary file first, even when destPath already has the
 52 |     // final name: this keeps cleanup simple and avoids leaving a partially written
 53 |     // file at destPath. The rename below moves the finished download into place.
 54 | 
 55 |     console.log(`Downloading ${fileName} from ${url} (Attempt: ${redirectCount + 1})...`);
 56 | 
 57 |     const request = https.get(url, (response) => {
 58 |       if (response.statusCode >= 300 && response.statusCode < 400 && response.headers.location) {
 59 |         console.log(`Redirected for ${fileName} to ${response.headers.location}`);
 60 |         // Consume response data to free up memory
 61 |         response.resume();
 62 |         downloadFile(response.headers.location, destPath, fileName, redirectCount + 1)
 63 |           .then(resolve)
 64 |           .catch(reject);
 65 |         return;
 66 |       }
 67 | 
 68 |       if (response.statusCode !== 200) {
 69 |         // fs.unlink(tempFilePath, () => {}); // Don't unlink if it wasn't opened yet or on redirect
 70 |         reject(new Error(`Failed to download ${fileName}. Status Code: ${response.statusCode} from ${url}`));
 71 |         return;
 72 |       }
 73 | 
 74 |       const fileStream = fs.createWriteStream(tempFilePath);
 75 |       response.pipe(fileStream);
 76 | 
 77 |       fileStream.on('finish', () => {
 78 |         fileStream.close(async (err) => {
 79 |           if (err) {
 80 |             fs.promises.unlink(tempFilePath).catch(() => {}); // Clean up temp file, ignore error if it doesn't exist
 81 |             reject(new Error(`Error closing file stream for ${fileName}: ${err.message}`));
 82 |             return;
 83 |           }
 84 |           try {
 85 |             // Ensure target directory exists before renaming
 86 |             await ensureDirExists(path.dirname(destPath));
 87 |             await fs.promises.rename(tempFilePath, destPath);
 88 |             console.log(`Successfully downloaded and saved ${fileName} to ${destPath}`);
 89 |             resolve();
 90 |           } catch (renameError) {
 91 |             fs.promises.unlink(tempFilePath).catch(() => {});
 92 |             reject(new Error(`Error renaming ${tempFilePath} to ${destPath}: ${renameError.message}`));
 93 |           }
 94 |         });
 95 |       });
 96 | 
 97 |       fileStream.on('error', (err) => {
 98 |         fs.promises.unlink(tempFilePath).catch(() => {});
 99 |         reject(new Error(`Error writing file ${fileName}: ${err.message}`));
100 |       });
101 |     });
102 | 
103 |     request.on('error', (err) => {
104 |       // fs.unlink(tempFilePath, () => {}).catch(() => {}); // Temp file might not exist if request itself failed early
105 |       reject(new Error(`Error downloading ${fileName} from ${url}: ${err.message}`));
106 |     });
107 |   });
108 | }
109 | 
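/**
 * Gunzip sourcePath into destPath. When both paths are the same, the output is
 * written to a temporary file and then renamed over the original, so the
 * .gz-named file ends up containing uncompressed data; otherwise the original
 * gzipped source file is removed after a successful decompression.
 */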
110 | async function decompressGzip(sourcePath, destPath) {
111 |   return new Promise((resolve, reject) => {
112 |     const isSameFile = sourcePath === destPath;
113 |     // Use a temporary file for the decompressed output, especially if decompressing in-place.
114 |     const tempOutputPath = isSameFile ? `${destPath}.tmp_decompress_${Date.now()}` : destPath;
115 | 
116 |     console.log(`Decompressing ${sourcePath} to ${tempOutputPath}${isSameFile ? ' (will then replace original)' : ''}...`);
117 | 
118 |     const gzip = zlib.createGunzip();
119 |     const sourceStream = fs.createReadStream(sourcePath);
120 |     const destStream = fs.createWriteStream(tempOutputPath);
121 | 
122 |     sourceStream.pipe(gzip).pipe(destStream);
123 | 
124 |     destStream.on('finish', () => {
125 |       destStream.close(async (closeErr) => {
126 |         if (closeErr) {
127 |           // Attempt to clean up temporary file if it exists
128 |           if (fs.existsSync(tempOutputPath)) {
129 |             await fs.promises.unlink(tempOutputPath).catch(unlinkErr => console.error(`Error unlinking temp file ${tempOutputPath} on close error:`, unlinkErr));
130 |           }
131 |           reject(new Error(`Error closing destination stream for ${tempOutputPath}: ${closeErr.message}`));
132 |           return;
133 |         }
134 |         try {
135 |           if (isSameFile) {
136 |             // If source and dest are the same, rename temp file to replace original sourcePath with decompressed content
137 |             await fs.promises.rename(tempOutputPath, destPath);
138 |             console.log(`Successfully decompressed and replaced ${sourcePath} with uncompressed content.`);
139 |           } else {
140 |             // If source and dest are different, the decompressed file is at destPath (which was tempOutputPath).
141 |             // The original gzipped sourcePath should be removed.
142 |             await fs.promises.unlink(sourcePath);
143 |             console.log(`Successfully decompressed ${sourcePath} to ${destPath}. Original ${sourcePath} removed.`);
144 |           }
145 |           resolve();
146 |         } catch (moveOrUnlinkError) {
147 |           // Attempt to clean up temporary file if it exists and wasn't the final destPath
148 |           if (fs.existsSync(tempOutputPath) && tempOutputPath !== destPath) {
149 |              await fs.promises.unlink(tempOutputPath).catch(unlinkErr => console.error(`Error unlinking temp file ${tempOutputPath} on finalization error:`, unlinkErr));
150 |           }
151 |           reject(new Error(`Error finalizing decompression for ${sourcePath} (to ${destPath}): ${moveOrUnlinkError.message}`));
152 |         }
153 |       });
154 |     });
155 | 
156 |     destStream.on('error', async (streamErr) => {
157 |       if (fs.existsSync(tempOutputPath)) {
158 |         await fs.promises.unlink(tempOutputPath).catch(unlinkErr => console.error(`Error unlinking temp file ${tempOutputPath} on stream error:`, unlinkErr));
159 |       }
160 |       reject(new Error(`Error writing decompressed file ${tempOutputPath}: ${streamErr.message}`));
161 |     });
162 |     gzip.on('error', (gzipErr) => reject(new Error(`Error decompressing ${sourcePath}: ${gzipErr.message}`)));
163 |     sourceStream.on('error', (sourceErr) => reject(new Error(`Error reading ${sourcePath} for decompression: ${sourceErr.message}`)));
164 |   });
165 | }
166 | 
167 | async function main() {
168 |   try {
169 |     console.log('Starting postinstall script for extract2md...');
170 |     await ensureDirExists(langDataPath);
171 | 
172 |     for (const file of filesToDownload) {
173 |       const downloadedFilePath = path.join(langDataPath, file.fileName);
174 |       const finalDestPath = path.join(langDataPath, file.destFileName);
175 | 
176 |       // Check if final decompressed/copied file already exists
177 |       if (fs.existsSync(finalDestPath)) {
178 |         console.log(`${file.destFileName} already exists at ${finalDestPath}. Skipping download.`);
179 |         continue;
180 |       }
181 |       
182 |       // Check if intermediate .gz file exists (for gzipped files)
183 |       if (file.gzipped && fs.existsSync(downloadedFilePath)) {
184 |          console.log(`Intermediate file ${file.fileName} already exists. Attempting decompression.`);
185 |       } else {
186 |         await downloadFile(file.url, downloadedFilePath, file.fileName);
187 |       }
188 | 
189 |       if (file.gzipped) {
190 |         // Ensure downloaded file exists before trying to decompress
191 |         if (!fs.existsSync(downloadedFilePath)) {
192 |             console.error(`Error: Gzipped file ${downloadedFilePath} not found after download attempt. Skipping decompression.`);
193 |             continue;
194 |         }
195 |         await decompressGzip(downloadedFilePath, finalDestPath);
196 |       } else {
197 |         // Handle non-gzipped files: if downloadedFilePath is different from finalDestPath, rename.
198 |         // This applies if we downloaded 'lang.traineddata' but want 'lang.traineddata.gz' (containing uncompressed data).
199 |         if (downloadedFilePath !== finalDestPath) {
200 |           if (fs.existsSync(downloadedFilePath)) {
201 |             console.log(`Renaming non-gzipped file ${downloadedFilePath} to ${finalDestPath}...`);
202 |             await fs.promises.rename(downloadedFilePath, finalDestPath);
203 |             console.log(`Successfully renamed ${downloadedFilePath} to ${finalDestPath}.`);
204 |           } else {
205 |             console.warn(`File ${downloadedFilePath} not found for renaming to ${finalDestPath}; the download may have failed, or the file may already have been saved as ${finalDestPath}.`);
206 |           }
207 |         } else {
208 |           // If downloadedFilePath is the same as finalDestPath, it means the file was already saved with the correct name.
209 |           console.log(`Non-gzipped file ${finalDestPath} is already correctly named. No rename needed.`);
210 |         }
211 |       }
212 |     }
213 | 
214 |     console.log('Postinstall script completed successfully.');
215 |   } catch (error) {
216 |     console.error('Error during postinstall script:', error.message);
217 |     // process.exit(1); // Optionally exit with error, though npm might handle this.
218 |   }
219 | }
220 | 
221 | main();
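// This script is intended to run via npm's postinstall hook; if the language
// data under dist/assets/lang-data ever needs to be re-fetched, running
// `node scripts/postinstall.js` manually should work as well.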


--------------------------------------------------------------------------------
/DEPLOYMENT.md:
--------------------------------------------------------------------------------
  1 | # Deployment Documentation for Extract2MD v2.0.0
  2 | 
  3 | This document outlines the deployment process, distribution methods, and integration guidelines for the Extract2MD package.
  4 | 
  5 | ## Package Structure
  6 | 
  7 | The Extract2MD package is distributed with the following structure:
  8 | 
  9 | ```
 10 | extract2md/
 11 | ├── dist/                           # Built files for distribution
 12 | │   ├── assets/
 13 | │   │   ├── extract2md.umd.js      # Main UMD bundle
 14 | │   │   ├── extract2md.umd.js.map  # Source map
 15 | │   │   ├── tesseract-worker.min.js # Tesseract.js worker
 16 | │   │   └── tesseract-core.wasm.js  # Tesseract WASM core
 17 | │   └── pdf.worker.min.mjs          # PDF.js worker
 18 | ├── src/                            # Source code
 19 | │   ├── types/index.d.ts           # TypeScript definitions
 20 | │   ├── index.js                   # Main entry point
 21 | │   ├── converters/                # Converter modules
 22 | │   ├── engines/                   # Processing engines
 23 | │   └── utils/                     # Utility modules
 24 | ├── examples/                      # Usage examples and demo
 25 | ├── test/                         # Test files
 26 | ├── package.json                  # Package configuration
 27 | ├── config.example.json          # Example configuration
 28 | ├── README.md                 # Main documentation
 29 | └── MIGRATION.md                 # Migration guide
 30 | ```
 31 | 
 32 | ## Distribution Methods
 33 | 
 34 | ### 1. NPM Package Distribution
 35 | 
 36 | The package is designed for npm distribution with full TypeScript support.
 37 | 
 38 | #### Installation
 39 | ```bash
 40 | npm install extract2md
 41 | ```
 42 | 
 43 | #### Package Entry Points
 44 | - **Main (UMD)**: `dist/assets/extract2md.umd.js` - For browser use
 45 | - **Module (ES6)**: `src/index.js` - For modern bundlers
 46 | - **Types**: `src/types/index.d.ts` - TypeScript definitions
 47 | 
 48 | ### 2. CDN Distribution
 49 | 
 50 | The UMD bundle can be served via CDN for direct browser use:
 51 | 
 52 | ```html
 53 | 
 54 | 
 58 | ```
 59 | 
 60 | ### 3. Direct Bundle Integration
 61 | 
 62 | For projects that need to bundle the library:
 63 | 
 64 | ```javascript
 65 | import { Extract2MDConverter } from 'extract2md';
 66 | // Use ES6 modules with tree shaking support
 67 | ```
 68 | 
 69 | ## Build Process
 70 | 
 71 | ### Prerequisites
 72 | - Node.js 14+ 
 73 | - npm 7+
 74 | 
 75 | ### Building the Package
 76 | 
 77 | ```bash
 78 | # Install dependencies
 79 | npm install
 80 | 
 81 | # Build the UMD bundle
 82 | npm run build
 83 | 
 84 | # Run tests
 85 | npm test
 86 | 
 87 | # Prepare for publishing
 88 | npm run prepublishOnly
 89 | ```
 90 | 
 91 | ### Build Outputs
 92 | 
 93 | The build process creates:
 94 | 1. **UMD Bundle**: `dist/assets/extract2md.umd.js` (5.69 MiB)
 95 | 2. **Worker Files**: Required for PDF.js and Tesseract.js
 96 | 3. **Source Maps**: For debugging
 97 | 
 98 | ## Deployment Configurations
 99 | 
100 | ### 1. Web Application Deployment
101 | 
102 | For web applications using the library:
103 | 
104 | ```javascript
105 | // Webpack configuration example (requires the copy-webpack-plugin package: const CopyWebpackPlugin = require('copy-webpack-plugin'))
106 | module.exports = {
107 |     resolve: {
108 |         fallback: {
109 |             "fs": false,
110 |             "path": false
111 |         }
112 |     },
113 |     // Copy worker files to your public directory
114 |     plugins: [
115 |         new CopyWebpackPlugin({
116 |             patterns: [
117 |                 { from: 'node_modules/extract2md/dist/pdf.worker.min.mjs', to: 'public/' },
118 |                 { from: 'node_modules/extract2md/dist/assets/tesseract-worker.min.js', to: 'public/' },
119 |                 { from: 'node_modules/extract2md/dist/assets/tesseract-core.wasm.js', to: 'public/' }
120 |             ]
121 |         })
122 |     ]
123 | };
124 | ```
125 | 
126 | ### 2. Node.js Server Deployment
127 | 
128 | For server-side use (limited functionality due to browser dependencies):
129 | 
130 | ```javascript
131 | // Server-side usage (configuration validation only)
132 | import { ConfigValidator } from 'extract2md/src/utils/ConfigValidator.js';
133 | 
134 | // ConfigValidator exposes static methods, so no instance is needed.
135 | const validatedConfig = ConfigValidator.validate(config); // throws on invalid input
136 | ```
137 | 
138 | ### 3. Static Site Deployment
139 | 
140 | For static sites or demos:
141 | 
142 | ```html
143 | 
144 | 
145 | 
146 |     
147 | 
148 | 
149 |     
150 |     
151 |     
152 | 153 | 168 | 169 | 170 | ``` 171 | 172 | ## Performance Considerations 173 | 174 | ### Bundle Size Optimization 175 | 176 | The package includes large dependencies: 177 | - **PDF.js**: ~951 KB (PDF processing) 178 | - **Tesseract.js**: ~4.5 MB (OCR functionality) 179 | - **WebLLM**: ~Variable (model-dependent) 180 | 181 | #### Optimization Strategies: 182 | 183 | 1. **Lazy Loading**: Load only required modules 184 | ```javascript 185 | // Load only when needed 186 | const { Extract2MDConverter } = await import('extract2md'); 187 | ``` 188 | 189 | 2. **Code Splitting**: Separate scenarios into different chunks 190 | ```javascript 191 | // Webpack code splitting 192 | const quickConvert = () => import('extract2md').then(m => m.Extract2MDConverter.quickConvertOnly); 193 | ``` 194 | 195 | 3. **CDN Caching**: Use CDN for static assets 196 | ```javascript 197 | // Configure worker paths to use CDN 198 | window.EXTRACT2MD_CONFIG = { 199 | workerPaths: { 200 | pdf: 'https://cdn.example.com/pdf.worker.min.mjs', 201 | tesseract: 'https://cdn.example.com/tesseract-worker.min.js' 202 | } 203 | }; 204 | ``` 205 | 206 | ## Security Considerations 207 | 208 | ### Content Security Policy (CSP) 209 | 210 | When deploying, configure CSP headers: 211 | 212 | ``` 213 | Content-Security-Policy: 214 | script-src 'self' 'wasm-unsafe-eval'; 215 | worker-src 'self' blob:; 216 | connect-src 'self' https://huggingface.co; 217 | ``` 218 | 219 | ### File Processing Security 220 | 221 | - Files are processed client-side only 222 | - No data is sent to external servers (except WebLLM model downloads) 223 | - Implement file size limits for production use 224 | 225 | ## Monitoring and Debugging 226 | 227 | ### Error Tracking 228 | 229 | ```javascript 230 | try { 231 | const result = await Extract2MDConverter.quickConvertOnly(file, config); 232 | } catch (error) { 233 | // Log error details for monitoring 234 | console.error('Extract2MD Error:', { 235 | type: error.name, 236 | message: error.message, 237 | scenario: 'quickConvertOnly', 238 | timestamp: new Date().toISOString() 239 | }); 240 | } 241 | ``` 242 | 243 | ### Performance Monitoring 244 | 245 | ```javascript 246 | const startTime = performance.now(); 247 | const result = await Extract2MDConverter.quickConvertOnly(file, config); 248 | const duration = performance.now() - startTime; 249 | 250 | console.log(`Conversion took ${duration}ms`); 251 | ``` 252 | 253 | ## Version Management 254 | 255 | ### Semantic Versioning 256 | 257 | The package follows semantic versioning: 258 | - **Major**: Breaking API changes 259 | - **Minor**: New features, backward compatible 260 | - **Patch**: Bug fixes, backward compatible 261 | 262 | ### Upgrade Path 263 | 264 | 1. **v1.0.x**: Current stable release 265 | 2. **v1.1.x**: Planned features (streaming, progress callbacks) 266 | 3. 
**v2.0.x**: Planned breaking changes (remove legacy API) 267 | 268 | ## Integration Examples 269 | 270 | ### React Integration 271 | 272 | ```jsx 273 | import React, { useState } from 'react'; 274 | import { Extract2MDConverter } from 'extract2md'; 275 | 276 | function PDFConverter() { 277 | const [result, setResult] = useState(''); 278 | const [loading, setLoading] = useState(false); 279 | 280 | const handleConvert = async (file) => { 281 | setLoading(true); 282 | try { 283 | const markdown = await Extract2MDConverter.quickConvertOnly(file, { 284 | tesseract: { language: 'eng', oem: 1, psm: 6 } 285 | }); 286 | setResult(markdown); 287 | } catch (error) { 288 | console.error('Conversion failed:', error); 289 | } finally { 290 | setLoading(false); 291 | } 292 | }; 293 | 294 | return ( 295 |
296 |       <input
297 |         type="file"
298 |         accept=".pdf"
299 |         onChange={(e) => handleConvert(e.target.files[0])}
300 |       />
301 |       {loading && <p>Converting...</p>}
302 |       {result && <pre>{result}</pre>}
303 |     </div>
304 | ); 305 | } 306 | ``` 307 | 308 | ### Vue.js Integration 309 | 310 | ```vue 311 | 318 | 319 | 348 | ``` 349 | 350 | ## Support and Maintenance 351 | 352 | ### Documentation 353 | - **API Documentation**: See `README.md` 354 | - **Migration Guide**: See `MIGRATION.md` 355 | - **Examples**: See `examples/` directory 356 | 357 | ### Community Support 358 | - GitHub Issues for bug reports 359 | - GitHub Discussions for questions 360 | - Stack Overflow for implementation help 361 | 362 | ### Commercial Support 363 | Contact the maintainer for commercial support, custom integrations, or enterprise licensing. 364 | 365 | ## Licensing 366 | 367 | The package is distributed under the MIT License, allowing for both commercial and non-commercial use. See `LICENSE` file for full details. 368 | 369 | --- 370 | 371 | **Note**: This deployment guide is for Extract2MD v2.0.0. Check the latest documentation for updates and changes in newer versions. 372 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Extract2MD - Enhanced PDF to Markdown Converter 2 | 3 | 4 | [![NPM Version](https://img.shields.io/npm/v/extract2md.svg)](https://www.npmjs.com/package/extract2md) 5 | [![License](https://img.shields.io/npm/l/extract2md.svg)](https://github.com/hashangit/Extract2MD/blob/main/LICENSE) 6 | [![Downloads](https://img.shields.io/npm/dt/extract2md.svg)](https://www.npmjs.com/package/extract2md) 7 | 8 | [![Sponsor on Patreon](https://img.shields.io/badge/Sponsor%20on-Patreon-F96854?logo=patreon&style=flat)](https://www.patreon.com/HashanWickramasinghe) 9 | 10 | A powerful client-side JavaScript library for converting PDFs to Markdown with multiple extraction methods and optional LLM enhancement. Now with scenario-specific methods for different use cases. 
11 | 12 | ![Extract2MD](https://github.com/user-attachments/assets/0704e80a-54bc-4449-a495-eb944a318400) 13 | 14 | ## 🚀 Quick Start 15 | 16 | Extract2MD now offers 5 distinct scenarios for different conversion needs: 17 | 18 | ```javascript 19 | import Extract2MDConverter from 'extract2md'; 20 | 21 | // Scenario 1: Quick conversion only 22 | const markdown1 = await Extract2MDConverter.quickConvertOnly(pdfFile); 23 | 24 | // Scenario 2: High accuracy OCR conversion only 25 | const markdown2 = await Extract2MDConverter.highAccuracyConvertOnly(pdfFile); 26 | 27 | // Scenario 3: Quick conversion + LLM enhancement 28 | const markdown3 = await Extract2MDConverter.quickConvertWithLLM(pdfFile); 29 | 30 | // Scenario 4: High accuracy conversion + LLM enhancement 31 | const markdown4 = await Extract2MDConverter.highAccuracyConvertWithLLM(pdfFile); 32 | 33 | // Scenario 5: Combined extraction + LLM enhancement (most comprehensive) 34 | const markdown5 = await Extract2MDConverter.combinedConvertWithLLM(pdfFile); 35 | ``` 36 | 37 | ## 📋 Scenarios Explained 38 | 39 | ### Scenario 1: Quick Convert Only 40 | - **Use case**: Fast conversion when PDF has selectable text 41 | - **Method**: `quickConvertOnly(pdfFile, config?)` 42 | - **Tech**: PDF.js text extraction only 43 | - **Output**: Basic markdown formatting 44 | 45 | ### Scenario 2: High Accuracy Convert Only 46 | - **Use case**: PDFs with images, scanned documents, complex layouts 47 | - **Method**: `highAccuracyConvertOnly(pdfFile, config?)` 48 | - **Tech**: Tesseract.js OCR 49 | - **Output**: Markdown from OCR extraction 50 | 51 | ### Scenario 3: Quick Convert + LLM 52 | - **Use case**: Fast extraction with AI enhancement for better formatting 53 | - **Method**: `quickConvertWithLLM(pdfFile, config?)` 54 | - **Tech**: PDF.js + WebLLM 55 | - **Output**: AI-enhanced markdown with improved structure and clarity 56 | 57 | ### Scenario 4: High Accuracy + LLM 58 | - **Use case**: OCR extraction with AI enhancement 59 | - **Method**: `highAccuracyConvertWithLLM(pdfFile, config?)` 60 | - **Tech**: Tesseract.js OCR + WebLLM 61 | - **Output**: AI-enhanced markdown from OCR 62 | 63 | ### Scenario 5: Combined + LLM (Recommended) 64 | - **Use case**: Most comprehensive conversion using both extraction methods 65 | - **Method**: `combinedConvertWithLLM(pdfFile, config?)` 66 | - **Tech**: PDF.js + Tesseract.js + WebLLM with specialized prompts 67 | - **Output**: Best possible markdown leveraging strengths of both extraction methods 68 | 69 | ## ⚙️ Configuration 70 | 71 | Create a configuration object or JSON file to customize behavior: 72 | 73 | ```javascript 74 | const config = { 75 | // PDF.js Worker 76 | pdfJsWorkerSrc: "../pdf.worker.min.mjs", 77 | 78 | // Tesseract OCR Settings 79 | tesseract: { 80 | workerPath: "./tesseract-worker.min.js", 81 | corePath: "./tesseract-core.wasm.js", 82 | langPath: "./lang-data/", 83 | language: "eng", 84 | options: {} 85 | }, 86 | 87 | // LLM Configuration 88 | webllm: { 89 | model: "Qwen3-0.6B-q4f16_1-MLC", 90 | // Optional: Custom model 91 | customModel: { 92 | model: "https://huggingface.co/mlc-ai/your-model/resolve/main/", 93 | model_id: "YourModel-ID", 94 | model_lib: "https://example.com/your-model.wasm", 95 | required_features: ["shader-f16"], 96 | overrides: { conv_template: "qwen" } 97 | }, 98 | options: { 99 | temperature: 0.7, 100 | maxTokens: 4096 101 | } 102 | }, 103 | 104 | // System Prompt Customizations 105 | systemPrompts: { 106 | singleExtraction: "Focus on preserving code examples exactly.", 107 | 
combinedExtraction: "Pay attention to tables and diagrams from OCR." 108 | }, 109 | 110 | // Processing Options 111 | processing: { 112 | splitPascalCase: false, 113 | pdfRenderScale: 2.5, 114 | postProcessRules: [ 115 | { find: /\bAPI\b/g, replace: "API" } 116 | ] 117 | }, 118 | 119 | // Progress Tracking 120 | progressCallback: (progress) => { 121 | console.log(`${progress.stage}: ${progress.message}`); 122 | if (progress.currentPage) { 123 | console.log(`Page ${progress.currentPage}/${progress.totalPages}`); 124 | } 125 | } 126 | }; 127 | 128 | // Use configuration 129 | const markdown = await Extract2MDConverter.combinedConvertWithLLM(pdfFile, config); 130 | ``` 131 | 132 | ## 🔧 Advanced Usage 133 | 134 | ### Using Individual Components 135 | 136 | ```javascript 137 | import { 138 | WebLLMEngine, 139 | OutputParser, 140 | SystemPrompts, 141 | ConfigValidator 142 | } from 'extract2md'; 143 | 144 | // Validate configuration 145 | const validatedConfig = ConfigValidator.validate(userConfig); 146 | 147 | // Initialize WebLLM engine 148 | const engine = new WebLLMEngine(validatedConfig); 149 | await engine.initialize(); 150 | 151 | // Generate text 152 | const result = await engine.generate("Your prompt here"); 153 | 154 | // Parse output 155 | const parser = new OutputParser(); 156 | const cleanMarkdown = parser.parse(result); 157 | ``` 158 | 159 | ### Custom System Prompts 160 | 161 | The library uses different system prompts for different scenarios: 162 | 163 | ```javascript 164 | // For scenarios 3 & 4 (single extraction) 165 | const singlePrompt = SystemPrompts.getSingleExtractionPrompt( 166 | "Additional instruction: Preserve all technical terms." 167 | ); 168 | 169 | // For scenario 5 (combined extraction) 170 | const combinedPrompt = SystemPrompts.getCombinedExtractionPrompt( 171 | "Focus on creating comprehensive documentation." 
172 | ); 173 | ``` 174 | 175 | ### Configuration from JSON 176 | 177 | ```javascript 178 | import { ConfigValidator } from 'extract2md'; 179 | 180 | // Load from JSON string 181 | const config = ConfigValidator.fromJSON(configJsonString); 182 | 183 | // Use with any scenario 184 | const result = await Extract2MDConverter.quickConvertWithLLM(pdfFile, config); 185 | ``` 186 | 187 | ## 🎯 Error Handling & Progress Tracking 188 | 189 | ```javascript 190 | const config = { 191 | progressCallback: (progress) => { 192 | switch (progress.stage) { 193 | case 'scenario_5_start': 194 | console.log('Starting combined conversion...'); 195 | break; 196 | case 'webllm_load_progress': 197 | console.log(`Loading model: ${progress.progress}%`); 198 | break; 199 | case 'ocr_page_process': 200 | console.log(`OCR: ${progress.currentPage}/${progress.totalPages}`); 201 | break; 202 | case 'webllm_generate_start': 203 | console.log('AI enhancement in progress...'); 204 | break; 205 | case 'scenario_5_complete': 206 | console.log('Conversion completed!'); 207 | break; 208 | default: 209 | console.log(`${progress.stage}: ${progress.message}`); 210 | } 211 | 212 | if (progress.error) { 213 | console.error('Error:', progress.error); 214 | } 215 | } 216 | }; 217 | 218 | try { 219 | const result = await Extract2MDConverter.combinedConvertWithLLM(pdfFile, config); 220 | console.log('Success:', result); 221 | } catch (error) { 222 | console.error('Conversion failed:', error.message); 223 | } 224 | ``` 225 | 226 | ## 🔄 Migration from Legacy API 227 | 228 | If you're using the old API, you can still access it: 229 | 230 | ```javascript 231 | import { LegacyExtract2MDConverter } from 'extract2md'; 232 | 233 | // Old way 234 | const converter = new LegacyExtract2MDConverter(options); 235 | const quick = await converter.quickConvert(pdfFile); 236 | const ocr = await converter.highAccuracyConvert(pdfFile); 237 | const enhanced = await converter.llmRewrite(text); 238 | 239 | // New way (recommended) 240 | const quick = await Extract2MDConverter.quickConvertOnly(pdfFile, config); 241 | const ocr = await Extract2MDConverter.highAccuracyConvertOnly(pdfFile, config); 242 | const enhanced = await Extract2MDConverter.quickConvertWithLLM(pdfFile, config); 243 | ``` 244 | 245 | ## 🌟 Features 246 | 247 | - **5 Scenario-Specific Methods**: Choose the right approach for your use case 248 | - **WebLLM Integration**: Client-side AI enhancement with Qwen models 249 | - **Custom Model Support**: Use your own trained models 250 | - **Advanced Output Parsing**: Automatic removal of thinking tags and formatting 251 | - **Comprehensive Configuration**: Fine-tune every aspect of the conversion 252 | - **Progress Tracking**: Real-time updates for UI integration 253 | - **TypeScript Support**: Full type definitions included 254 | - **Backwards Compatible**: Legacy API still available 255 | 256 | ## 📚 TypeScript Support 257 | 258 | Full TypeScript definitions are included: 259 | 260 | ```typescript 261 | import Extract2MDConverter, { 262 | Extract2MDConfig, 263 | ProgressReport, 264 | CustomModelConfig 265 | } from 'extract2md'; 266 | 267 | const config: Extract2MDConfig = { 268 | webllm: { 269 | model: "Qwen3-0.6B-q4f16_1-MLC", 270 | options: { 271 | temperature: 0.7, 272 | maxTokens: 4096 273 | } 274 | }, 275 | progressCallback: (progress: ProgressReport) => { 276 | console.log(progress.stage, progress.message); 277 | } 278 | }; 279 | 280 | const result: string = await Extract2MDConverter.combinedConvertWithLLM(pdfFile, config); 281 | ``` 282 | 283 
| ## 🏗️ Installation & Deployment 284 | 285 | ### NPM Installation 286 | ```bash 287 | npm install extract2md 288 | ``` 289 | 290 | ### CDN Usage 291 | ```html 292 | 293 | 297 | ``` 298 | 299 | ### Worker Files Configuration 300 | The package requires worker files for PDF.js and Tesseract.js. These are automatically copied during build: 301 | 302 | ```javascript 303 | // Default worker paths (adjust for your deployment) 304 | const config = { 305 | pdfJsWorkerSrc: "/pdf.worker.min.mjs", 306 | tesseract: { 307 | workerPath: "/tesseract-worker.min.js", 308 | corePath: "/tesseract-core.wasm.js" 309 | } 310 | }; 311 | ``` 312 | 313 | ### Bundle Size Considerations 314 | - **Total Size**: ~11 MB (includes OCR and PDF processing) 315 | - **PDF.js**: ~950 KB 316 | - **Tesseract.js**: ~4.5 MB 317 | - **WebLLM**: Variable (model-dependent) 318 | 319 | Use lazy loading and code splitting for production deployments. 320 | 321 | ## 📚 Documentation 322 | 323 | - **[Migration Guide](./MIGRATION.md)** - Upgrade from legacy API 324 | - **[Deployment Guide](./DEPLOYMENT.md)** - Production deployment instructions 325 | - **[Examples](./examples/)** - Complete usage examples 326 | - **[How To Run the Demo](./examples/README.md)** - Instructions on how to run the demo 327 | - **[TypeScript Definitions](./src/types/index.d.ts)** - Full type definitions 328 | 329 | ## 📄 License 330 | 331 | MIT License - see LICENSE file for details. 332 | 333 | ## 🤝 Contributing 334 | 335 | Contributions welcome! Please read the contributing guidelines before submitting PRs. 336 | 337 | ## 🐛 Issues 338 | 339 | Report issues on the [GitHub Issues page](https://github.com/hashangit/Extract2MD/issues). 340 | -------------------------------------------------------------------------------- /src/utils/ConfigValidator.js: -------------------------------------------------------------------------------- 1 | /** 2 | * ConfigValidator.js 3 | * Validates and normalizes configuration objects 4 | */ 5 | 6 | export class ConfigValidator { 7 | /** 8 | * Default configuration values 9 | */ 10 | static getDefaultConfig() { 11 | return { 12 | // PDF.js configuration 13 | pdfJsWorkerSrc: '../pdf.worker.min.mjs', 14 | 15 | // Tesseract configuration 16 | tesseract: { 17 | workerPath: './tesseract-worker.min.js', 18 | corePath: './tesseract-core.wasm.js', 19 | langPath: './lang-data/', 20 | language: 'eng', 21 | options: {} 22 | }, 23 | 24 | // LLM configuration 25 | webllm: { 26 | model: 'Qwen3-0.6B-q4f16_1-MLC', 27 | customModel: null, 28 | options: { 29 | temperature: 0.7, 30 | maxTokens: 4096 31 | } 32 | }, 33 | 34 | // System prompt customizations 35 | systemPrompts: { 36 | singleExtraction: '', 37 | combinedExtraction: '' 38 | }, 39 | 40 | // Processing options 41 | processing: { 42 | splitPascalCase: false, 43 | pdfRenderScale: 2.5, 44 | postProcessRules: [] 45 | }, 46 | 47 | // Progress tracking 48 | progressCallback: null 49 | }; 50 | } 51 | 52 | /** 53 | * Validate and normalize a configuration object 54 | * @param {Object} config - Configuration object to validate 55 | * @returns {Object} Validated and normalized configuration 56 | */ 57 | static validate(config = {}) { 58 | const defaultConfig = this.getDefaultConfig(); 59 | const normalizedConfig = this.deepMerge(defaultConfig, config); 60 | 61 | // Validate required types and values 62 | this.validateTesseractConfig(normalizedConfig.tesseract); 63 | this.validateLLMConfig(normalizedConfig.webllm); 64 | this.validateProcessingConfig(normalizedConfig.processing); 65 | 
this.validateSystemPrompts(normalizedConfig.systemPrompts); 66 | 67 | return normalizedConfig; 68 | } 69 | 70 | /** 71 | * Validate Tesseract configuration 72 | * @param {Object} tesseractConfig - Tesseract configuration 73 | */ 74 | static validateTesseractConfig(tesseractConfig) { 75 | if (!tesseractConfig) { 76 | throw new Error('Tesseract configuration is required'); 77 | } 78 | 79 | // Validate language 80 | if (tesseractConfig.language && typeof tesseractConfig.language !== 'string') { 81 | throw new Error('Tesseract language must be a string'); 82 | } 83 | 84 | // Validate paths 85 | const pathFields = ['workerPath', 'corePath', 'langPath']; 86 | for (const field of pathFields) { 87 | if (tesseractConfig[field] && typeof tesseractConfig[field] !== 'string') { 88 | throw new Error(`Tesseract ${field} must be a string`); 89 | } 90 | } 91 | 92 | // Validate options 93 | if (tesseractConfig.options && typeof tesseractConfig.options !== 'object') { 94 | throw new Error('Tesseract options must be an object'); 95 | } 96 | } 97 | 98 | /** 99 | * Validate LLM configuration 100 | * @param {Object} llmConfig - LLM configuration 101 | */ 102 | static validateLLMConfig(llmConfig) { 103 | if (!llmConfig) { 104 | throw new Error('LLM configuration is required'); 105 | } 106 | 107 | // Validate model 108 | if (llmConfig.model && typeof llmConfig.model !== 'string') { 109 | throw new Error('LLM model must be a string'); 110 | } 111 | 112 | // Validate custom model structure 113 | if (llmConfig.customModel) { 114 | this.validateCustomModel(llmConfig.customModel); 115 | } 116 | 117 | // Validate options 118 | if (llmConfig.options) { 119 | this.validateLLMOptions(llmConfig.options); 120 | } 121 | } 122 | 123 | /** 124 | * Validate custom model configuration 125 | * @param {Object} customModel - Custom model configuration 126 | */ 127 | static validateCustomModel(customModel) { 128 | const requiredFields = ['model', 'model_id', 'model_lib']; 129 | 130 | for (const field of requiredFields) { 131 | if (!customModel[field] || typeof customModel[field] !== 'string') { 132 | throw new Error(`Custom model ${field} is required and must be a string`); 133 | } 134 | } 135 | 136 | // Validate optional fields 137 | if (customModel.required_features && !Array.isArray(customModel.required_features)) { 138 | throw new Error('Custom model required_features must be an array'); 139 | } 140 | 141 | if (customModel.overrides && typeof customModel.overrides !== 'object') { 142 | throw new Error('Custom model overrides must be an object'); 143 | } 144 | } 145 | 146 | /** 147 | * Validate LLM options 148 | * @param {Object} options - LLM options 149 | */ 150 | static validateLLMOptions(options) { 151 | if (typeof options !== 'object') { 152 | throw new Error('LLM options must be an object'); 153 | } 154 | 155 | // Validate temperature 156 | if (options.temperature !== undefined) { 157 | if (typeof options.temperature !== 'number' || options.temperature < 0 || options.temperature > 2) { 158 | throw new Error('LLM temperature must be a number between 0 and 2'); 159 | } 160 | } 161 | 162 | // Validate maxTokens 163 | if (options.maxTokens !== undefined) { 164 | if (!Number.isInteger(options.maxTokens) || options.maxTokens < 1) { 165 | throw new Error('LLM maxTokens must be a positive integer'); 166 | } 167 | } 168 | } 169 | 170 | /** 171 | * Validate processing configuration 172 | * @param {Object} processingConfig - Processing configuration 173 | */ 174 | static validateProcessingConfig(processingConfig) { 175 | if 
(!processingConfig) { 176 | throw new Error('Processing configuration is required'); 177 | } 178 | 179 | // Validate splitPascalCase 180 | if (processingConfig.splitPascalCase !== undefined && typeof processingConfig.splitPascalCase !== 'boolean') { 181 | throw new Error('splitPascalCase must be a boolean'); 182 | } 183 | 184 | // Validate pdfRenderScale 185 | if (processingConfig.pdfRenderScale !== undefined) { 186 | if (typeof processingConfig.pdfRenderScale !== 'number' || processingConfig.pdfRenderScale <= 0) { 187 | throw new Error('pdfRenderScale must be a positive number'); 188 | } 189 | } 190 | 191 | // Validate postProcessRules 192 | if (processingConfig.postProcessRules && !Array.isArray(processingConfig.postProcessRules)) { 193 | throw new Error('postProcessRules must be an array'); 194 | } 195 | 196 | if (processingConfig.postProcessRules) { 197 | for (const rule of processingConfig.postProcessRules) { 198 | if (!rule || typeof rule !== 'object') { 199 | throw new Error('Each postProcessRule must be an object'); 200 | } 201 | if (!rule.find) { 202 | throw new Error('Each postProcessRule must have a "find" property'); 203 | } 204 | if (typeof rule.find !== 'string' && !(rule.find instanceof RegExp)) { 205 | throw new Error('Each postProcessRule "find" property must be a string or RegExp'); 206 | } 207 | if (typeof rule.replace !== 'string') { 208 | throw new Error('Each postProcessRule must have a "replace" string property'); 209 | } 210 | } 211 | } 212 | } 213 | 214 | /** 215 | * Validate system prompts configuration 216 | * @param {Object} systemPrompts - System prompts configuration 217 | */ 218 | static validateSystemPrompts(systemPrompts) { 219 | if (!systemPrompts) { 220 | throw new Error('System prompts configuration is required'); 221 | } 222 | 223 | const promptTypes = ['singleExtraction', 'combinedExtraction']; 224 | for (const promptType of promptTypes) { 225 | if (systemPrompts[promptType] !== undefined && typeof systemPrompts[promptType] !== 'string') { 226 | throw new Error(`System prompt ${promptType} must be a string`); 227 | } 228 | } 229 | } 230 | 231 | /** 232 | * Deep merge two objects 233 | * @param {Object} target - Target object 234 | * @param {Object} source - Source object 235 | * @returns {Object} Merged object 236 | */ 237 | static deepMerge(target, source) { 238 | const result = { ...target }; 239 | 240 | for (const key in source) { 241 | if (Object.prototype.hasOwnProperty.call(source, key)) { 242 | if (this.isObject(source[key]) && this.isObject(target[key])) { 243 | result[key] = this.deepMerge(target[key], source[key]); 244 | } else { 245 | result[key] = source[key]; 246 | } 247 | } 248 | } 249 | 250 | return result; 251 | } 252 | 253 | /** 254 | * Check if value is a plain object 255 | * @param {*} value - Value to check 256 | * @returns {boolean} Whether value is a plain object 257 | */ 258 | static isObject(value) { 259 | return value !== null && typeof value === 'object' && !Array.isArray(value); 260 | } 261 | 262 | /** 263 | * Create a configuration object from a JSON string or file content 264 | * @param {string} jsonString - JSON configuration string 265 | * @returns {Object} Parsed and validated configuration 266 | */ 267 | static fromJSON(jsonString) { 268 | try { 269 | const config = JSON.parse(jsonString); 270 | return this.validate(config); 271 | } catch (error) { 272 | if (error instanceof SyntaxError) { 273 | throw new Error(`Invalid JSON configuration: ${error.message}`); 274 | } 275 | throw error; 276 | } 277 | } 278 | 279 | /** 
280 | * Get configuration schema for documentation 281 | * @returns {Object} Configuration schema 282 | */ 283 | static getSchema() { 284 | return { 285 | type: 'object', 286 | properties: { 287 | pdfJsWorkerSrc: { 288 | type: 'string', 289 | description: 'Path to PDF.js worker file' 290 | }, 291 | tesseract: { 292 | type: 'object', 293 | properties: { 294 | workerPath: { type: 'string', description: 'Path to Tesseract worker' }, 295 | corePath: { type: 'string', description: 'Path to Tesseract core WASM' }, 296 | langPath: { type: 'string', description: 'Path to language data directory' }, 297 | language: { type: 'string', description: 'OCR language code' }, 298 | options: { type: 'object', description: 'Additional Tesseract options' } 299 | } 300 | }, 301 | webllm: { 302 | type: 'object', 303 | properties: { 304 | model: { type: 'string', description: 'Model identifier' }, 305 | customModel: { 306 | type: 'object', 307 | description: 'Custom model configuration', 308 | properties: { 309 | model: { type: 'string', description: 'Model URL' }, 310 | model_id: { type: 'string', description: 'Model identifier' }, 311 | model_lib: { type: 'string', description: 'Model library URL' }, 312 | required_features: { type: 'array', description: 'Required GPU features' }, 313 | overrides: { type: 'object', description: 'Model override settings' } 314 | } 315 | }, 316 | options: { 317 | type: 'object', 318 | properties: { 319 | temperature: { type: 'number', minimum: 0, maximum: 2 }, 320 | maxTokens: { type: 'integer', minimum: 1 } 321 | } 322 | } 323 | } 324 | }, 325 | systemPrompts: { 326 | type: 'object', 327 | properties: { 328 | singleExtraction: { type: 'string', description: 'Custom prompt for single extraction scenarios' }, 329 | combinedExtraction: { type: 'string', description: 'Custom prompt for combined extraction scenario' } 330 | } 331 | }, 332 | processing: { 333 | type: 'object', 334 | properties: { 335 | splitPascalCase: { type: 'boolean', description: 'Split PascalCase words' }, 336 | pdfRenderScale: { type: 'number', minimum: 0, description: 'PDF rendering scale for OCR' }, 337 | postProcessRules: { 338 | type: 'array', 339 | items: { 340 | type: 'object', 341 | properties: { 342 | find: { description: 'RegExp or string to find' }, 343 | replace: { type: 'string', description: 'Replacement string' } 344 | }, 345 | required: ['find', 'replace'] 346 | } 347 | } 348 | } 349 | }, 350 | progressCallback: { description: 'Function to handle progress updates' } 351 | } 352 | }; 353 | } 354 | } 355 | 356 | export default ConfigValidator; 357 | -------------------------------------------------------------------------------- /examples/demo.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Extract2MD Demo 7 | 167 | 168 | 169 |
[demo.html: the markup, inline styles, and demo script were stripped during extraction; the recoverable page text follows]

🚀 Extract2MD Enhanced Demo
Choose your conversion scenario and upload a PDF to see the magic happen!

1. Quick Convert Only - Tech: PDF.js text extraction - Fast conversion for PDFs with selectable text. Basic markdown formatting.
2. High Accuracy OCR Only - Tech: Tesseract.js OCR - OCR extraction for scanned documents, images, and complex layouts.
3. Quick + LLM Enhancement - Tech: PDF.js + WebLLM - Fast extraction with AI enhancement for better structure and clarity.
4. OCR + LLM Enhancement - Tech: Tesseract.js + WebLLM - OCR extraction with AI enhancement for comprehensive results.
5. Combined + LLM (Recommended) - Tech: PDF.js + Tesseract.js + WebLLM - Best results using both extraction methods with specialized AI prompts.

Selected Scenario: 5. Combined + LLM (Recommended)
Note: LLM scenarios require WebGPU support and will download models on first use.

Initializing...
Conversion Result
230 | 231 | 232 | 282 | 283 | 412 | 413 | 414 | 415 | 416 | -------------------------------------------------------------------------------- /src/index.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Extract2MD - Enhanced PDF to Markdown conversion library 3 | * New API with scenario-specific methods for different use cases 4 | */ 5 | 6 | // Import new modular components 7 | import Extract2MDConverter from './converters/Extract2MDConverter.js'; 8 | import WebLLMEngine from './engines/WebLLMEngine.js'; 9 | import OutputParser from './utils/OutputParser.js'; 10 | import SystemPrompts from './utils/SystemPrompts.js'; 11 | import ConfigValidator from './utils/ConfigValidator.js'; 12 | 13 | // Legacy imports for backwards compatibility 14 | import * as pdfjsLib from 'pdfjs-dist/build/pdf.mjs'; 15 | import Tesseract from 'tesseract.js'; 16 | import { Chat as ImportedChat, CreateMLCEngine as ImportedCreateMLCEngine } from '@mlc-ai/web-llm'; 17 | import * as webllm from '@mlc-ai/web-llm'; 18 | 19 | const DEFAULT_PDFJS_WORKER_SRC = '../pdf.worker.min.mjs'; // Relative to dist/assets/ 20 | const DEFAULT_TESSERACT_WORKER_PATH = './tesseract-worker.min.js'; // Relative to dist/assets/ 21 | const DEFAULT_TESSERACT_CORE_PATH = './tesseract-core.wasm.js'; // Relative to dist/assets/ 22 | const DEFAULT_TESSERACT_LANG_PATH = './lang-data/'; // Relative to dist/assets/ 23 | const DEFAULT_LLM_MODEL = 'Qwen3-0.6B-q4f16_1-MLC'; // Updated to match available WASM 24 | const DEFAULT_LLM_MODEL_LIB_URL = 'https://raw.githubusercontent.com/mlc-ai/binary-mlc-llm-libs/main/web-llm-models/v0_2_48/Qwen3-0.6B-q4f16_1-ctx4k_cs1k-webgpu.wasm'; 25 | 26 | // Legacy converter class for backwards compatibility 27 | class LegacyExtract2MDConverter { 28 | constructor(options = {}) { 29 | this.pdfJsWorkerSrc = options.pdfJsWorkerSrc || DEFAULT_PDFJS_WORKER_SRC; 30 | const pdfjsSetupLib = (typeof pdfjsLib !== 'undefined' ? pdfjsLib : (typeof window !== 'undefined' ? window.pdfjsLib : null)); 31 | if (pdfjsSetupLib && pdfjsSetupLib.GlobalWorkerOptions) { 32 | pdfjsSetupLib.GlobalWorkerOptions.workerSrc = this.pdfJsWorkerSrc; 33 | } else { 34 | console.warn('pdfjsLib or pdfjsLib.GlobalWorkerOptions is not defined. 
PDF.js worker may not load correctly if not already configured globally.'); 35 | } 36 | 37 | this.tesseractOptions = { 38 | workerPath: options.tesseractWorkerPath || DEFAULT_TESSERACT_WORKER_PATH, 39 | corePath: options.tesseractCorePath || DEFAULT_TESSERACT_CORE_PATH, 40 | langPath: options.tesseractLangPath || DEFAULT_TESSERACT_LANG_PATH, 41 | ...(options.tesseractOptions || {}) 42 | }; 43 | this.tesseractLanguage = options.tesseractLanguage || 'eng'; // Default to English 44 | this.splitPascalCase = options.splitPascalCase || false; 45 | 46 | this.defaultPostProcessRules = [ 47 | { find: /\uFB00/g, replace: 'ff' }, 48 | { find: /\uFB01/g, replace: 'fi' }, 49 | { find: /\uFB02/g, replace: 'fl' }, 50 | { find: /\uFB03/g, replace: 'ffi' }, 51 | { find: /\uFB04/g, replace: 'ffl' }, 52 | { find: /[\u2018\u2019]/g, replace: "'" }, 53 | { find: /[\u201C\u201D]/g, replace: '"' }, 54 | { find: /[\u2022\u2023\u25E6\u2043\u2219\u25CF\u25CB\u2981\u2619\u2765]/g, replace: '-' }, 55 | { find: /[\u2013\u2014]/g, replace: '-' }, 56 | { find: /\u00AD/g, replace: '' }, 57 | { find: /[\s\u00A0\u2000-\u200A\u202F\u205F\u3000]+/g, replace: ' ' }, 58 | ]; 59 | 60 | if (this.splitPascalCase) { 61 | this.defaultPostProcessRules.push( 62 | { find: /([A-Z][a-z]+)([A-Z][a-z]+)/g, replace: '$1 $2' }, 63 | { find: /([a-z])([A-Z][a-z]+)/g, replace: '$1 $2' }, 64 | { find: /([A-Z][a-z]+)([A-Z][a-z]+)/g, replace: '$1 $2' } 65 | ); 66 | } 67 | this.customPostProcessRules = options.postProcessRules || []; 68 | 69 | this.llmModel = options.llmModel || DEFAULT_LLM_MODEL; 70 | this.llmModelLibUrl = options.llmModelLibUrl || null; // New option for user-specified model_lib 71 | this.chatModule = null; 72 | this.llmInitialized = false; 73 | 74 | this.progressCallback = options.progressCallback || function(progress) { /* console.log(progress) */ }; 75 | 76 | this.WebLLMChatConstructor = null; // For fallback 77 | this.WebLLMCreateEngine = null; 78 | this.webllmModule = null; 79 | 80 | // Try to get the full webllm module for modelLibURLPrefix and modelVersion 81 | if (typeof webllm !== 'undefined' && webllm.CreateMLCEngine) { 82 | this.webllmModule = webllm; 83 | this.WebLLMCreateEngine = webllm.CreateMLCEngine; 84 | this.WebLLMChatConstructor = webllm.Chat; // Also get Chat from the main module 85 | } else if (typeof window !== 'undefined' && window.webLLM && typeof window.webLLM.CreateMLCEngine === 'function') { 86 | this.webllmModule = window.webLLM; 87 | this.WebLLMCreateEngine = window.webLLM.CreateMLCEngine; 88 | this.WebLLMChatConstructor = window.webLLM.Chat; 89 | } else { 90 | // Fallback if full module import didn't work as expected, try individual imports 91 | console.warn('Extract2MD_Debug: Full webllm module not found, relying on individual imports/globals for CreateMLCEngine/Chat.'); 92 | if (typeof ImportedCreateMLCEngine !== 'undefined') { 93 | this.WebLLMCreateEngine = ImportedCreateMLCEngine; 94 | } else if (typeof window !== 'undefined' && window.webLLM && typeof window.webLLM.CreateMLCEngine === 'function') { // Redundant but safe 95 | this.WebLLMCreateEngine = window.webLLM.CreateMLCEngine; 96 | } 97 | // Fallback for Chat constructor 98 | if (typeof ImportedChat !== 'undefined') { 99 | this.WebLLMChatConstructor = ImportedChat; 100 | } else if (typeof window !== 'undefined' && window.webLLM && typeof window.webLLM.Chat === 'function') { // Redundant 101 | this.WebLLMChatConstructor = window.webLLM.Chat; 102 | } 103 | } 104 | } 105 | 106 | _postProcessText(text, additionalRules = []) { 107 | if (!text) 
return ''; 108 | let cleanedText = text; 109 | const allRules = [...this.defaultPostProcessRules, ...this.customPostProcessRules, ...additionalRules]; 110 | 111 | // Optimized rule application - batch similar operations 112 | const unicodeReplacements = []; 113 | const regexReplacements = []; 114 | 115 | for (const rule of allRules) { 116 | if (rule.find && typeof rule.replace === 'string') { 117 | if (rule.find instanceof RegExp) { 118 | regexReplacements.push(rule); 119 | } else { 120 | unicodeReplacements.push(rule); 121 | } 122 | } 123 | } 124 | 125 | // Apply unicode replacements first (typically simpler) 126 | for (const rule of unicodeReplacements) { 127 | cleanedText = cleanedText.replace(rule.find, rule.replace); 128 | } 129 | 130 | // Apply regex replacements 131 | for (const rule of regexReplacements) { 132 | cleanedText = cleanedText.replace(rule.find, rule.replace); 133 | } 134 | 135 | return cleanedText.trim(); 136 | } 137 | 138 | _convertToMarkdownLogic(rawText) { 139 | let markdownOutputLines = []; 140 | const inputLines = rawText.split(/\n/); 141 | 142 | let currentParagraphCollector = []; 143 | let inPotentialTableBlock = false; 144 | let potentialTableBlockLines = []; 145 | 146 | const flushCurrentParagraph = () => { 147 | if (currentParagraphCollector.length > 0) { 148 | markdownOutputLines.push(currentParagraphCollector.join(' ').trim()); 149 | currentParagraphCollector = []; 150 | // Only add empty line if the next content isn't a heading or table block 151 | this._addSeparatorLine(markdownOutputLines); 152 | } 153 | }; 154 | 155 | const flushPotentialTableBlock = () => { 156 | if (potentialTableBlockLines.length > 0) { 157 | if (potentialTableBlockLines.length >= 2) { // Heuristic: at least 2 lines for a table/code block 158 | markdownOutputLines.push('```'); 159 | markdownOutputLines.push(...potentialTableBlockLines.map(l => l.trimEnd())); 160 | markdownOutputLines.push('```'); 161 | } else { 162 | markdownOutputLines.push(potentialTableBlockLines.join(' ').trim()); 163 | } 164 | potentialTableBlockLines = []; 165 | this._addSeparatorLine(markdownOutputLines); 166 | } 167 | inPotentialTableBlock = false; 168 | }; 169 | 170 | for (let i = 0; i < inputLines.length; i++) { 171 | const originalLine = inputLines[i]; 172 | const trimmedLine = originalLine.trim(); 173 | 174 | if (trimmedLine === '') { 175 | if (inPotentialTableBlock) flushPotentialTableBlock(); 176 | flushCurrentParagraph(); 177 | continue; 178 | } 179 | 180 | const isShortLine = trimmedLine.length > 0 && trimmedLine.length < 80; 181 | const noPunctuationEnd = isShortLine && !/[.,;:!?]$/.test(trimmedLine); 182 | const isAllCapsLine = trimmedLine.length > 2 && trimmedLine.length < 80 && /^[A-Z\s\d\W]*[A-Z][A-Z\s\d\W]*$/.test(trimmedLine) && /[A-Z]/.test(trimmedLine) && !/^\d+$/.test(trimmedLine); 183 | const nextLineIsBlankOrEndOfFile = (i + 1 === inputLines.length || inputLines[i + 1].trim() === ''); 184 | 185 | if (isAllCapsLine || (isShortLine && noPunctuationEnd && nextLineIsBlankOrEndOfFile && trimmedLine.length > 1)) { 186 | if (inPotentialTableBlock) flushPotentialTableBlock(); 187 | flushCurrentParagraph(); 188 | markdownOutputLines.push(`# ${trimmedLine}`); 189 | this._addSeparatorLine(markdownOutputLines); 190 | if (nextLineIsBlankOrEndOfFile && inputLines[i+1] && inputLines[i + 1].trim() === '') { 191 | i++; 192 | } 193 | continue; 194 | } 195 | 196 | const hasMultipleSpacesBetweenWords = /\S\s{2,}\S/.test(originalLine); 197 | const hasMultipleColumnsBySpaces = originalLine.split(/\s{2,}/).length 
> 2 && originalLine.length > 10; 198 | 199 | if (hasMultipleSpacesBetweenWords || hasMultipleColumnsBySpaces) { 200 | flushCurrentParagraph(); 201 | if (!inPotentialTableBlock) inPotentialTableBlock = true; 202 | potentialTableBlockLines.push(originalLine); 203 | } else { 204 | if (inPotentialTableBlock) flushPotentialTableBlock(); 205 | if (trimmedLine) currentParagraphCollector.push(trimmedLine); 206 | } 207 | } 208 | 209 | if (inPotentialTableBlock) flushPotentialTableBlock(); 210 | flushCurrentParagraph(); 211 | 212 | // Optimized final cleanup - single pass to normalize excessive newlines 213 | return this._normalizeMarkdownNewlines(markdownOutputLines); 214 | } 215 | 216 | /** 217 | * Helper method to add separator lines only when needed 218 | */ 219 | _addSeparatorLine(outputLines) { 220 | // Only add empty line if the last line isn't already empty 221 | if (outputLines.length > 0 && outputLines[outputLines.length - 1] !== '') { 222 | outputLines.push(''); 223 | } 224 | } 225 | 226 | /** 227 | * Normalize newlines in the final markdown output 228 | */ 229 | _normalizeMarkdownNewlines(lines) { 230 | // Filter out excessive empty lines while preserving structure 231 | const normalizedLines = []; 232 | let consecutiveEmptyLines = 0; 233 | 234 | for (const line of lines) { 235 | if (line.trim() === '') { 236 | consecutiveEmptyLines++; 237 | // Allow maximum of 1 consecutive empty line 238 | if (consecutiveEmptyLines <= 1) { 239 | normalizedLines.push(''); 240 | } 241 | } else { 242 | consecutiveEmptyLines = 0; 243 | normalizedLines.push(line.trimEnd()); 244 | } 245 | } 246 | 247 | // Join and do final cleanup 248 | let finalMarkdown = normalizedLines.join('\n'); 249 | // Remove any remaining triple+ newlines and trim 250 | finalMarkdown = finalMarkdown.replace(/\n{3,}/g, '\n\n').trim(); 251 | return finalMarkdown; 252 | } 253 | 254 | async _extractTextWithPdfJs(fileArrayBuffer) { 255 | const pdfjs = (typeof pdfjsLib !== 'undefined' ? pdfjsLib : (typeof window !== 'undefined' ? window.pdfjsLib : null)); 256 | if (!pdfjs || !pdfjs.getDocument) { 257 | throw new Error('pdf.js library (pdfjsLib) is not loaded or not fully initialized.'); 258 | } 259 | 260 | this.progressCallback({ stage: 'pdfjs_load', message: 'Loading PDF with pdf.js...' 
}); 261 | const pdfDoc = await pdfjs.getDocument({ data: fileArrayBuffer }).promise; 262 | let fullText = ''; 263 | const numPages = pdfDoc.numPages; 264 | 265 | for (let pageNum = 1; pageNum <= numPages; pageNum++) { 266 | this.progressCallback({ stage: 'pdfjs_page', message: `Extracting text from page ${pageNum}/${numPages}...`, currentPage: pageNum, totalPages: numPages }); 267 | const page = await pdfDoc.getPage(pageNum); 268 | const textContent = await page.getTextContent({ 269 | normalizeWhitespace: false, 270 | disableCombineTextItems: true 271 | }); 272 | let pageTextBuffer = ''; 273 | if (textContent.items && textContent.items.length > 0) { 274 | for (let i = 0; i < textContent.items.length; i++) { 275 | const item = textContent.items[i]; 276 | pageTextBuffer += item.str; 277 | if (item.hasEOL) { 278 | if (!pageTextBuffer.endsWith('\n')) pageTextBuffer += '\n'; 279 | } else if (i < textContent.items.length - 1) { 280 | const nextItem = textContent.items[i+1]; 281 | if (item.str && !item.str.endsWith(' ') && nextItem.str && !nextItem.str.startsWith(' ') && Math.abs(item.transform[5] - nextItem.transform[5]) < (item.height * 0.5)) { 282 | const currentItemEndX = item.transform[4] + item.width; 283 | const nextItemStartX = nextItem.transform[4]; 284 | if (nextItemStartX - currentItemEndX > -0.5) { 285 | pageTextBuffer += ' '; 286 | } 287 | } 288 | } 289 | } 290 | } 291 | fullText += pageTextBuffer; 292 | if (pageTextBuffer.trim() !== '' && !pageTextBuffer.endsWith('\n')) fullText += '\n'; 293 | } 294 | this.progressCallback({ stage: 'pdfjs_extract_complete', message: 'pdf.js text extraction complete.' }); 295 | return fullText; 296 | } 297 | 298 | async quickConvert(pdfFile, options = {}) { 299 | if (!(pdfFile instanceof File)) throw new Error('Invalid input: pdfFile must be a File object.'); 300 | this.progressCallback({ stage: 'start_quick', message: 'Starting quick conversion...' }); 301 | const arrayBuffer = await pdfFile.arrayBuffer(); 302 | let rawText = await this._extractTextWithPdfJs(arrayBuffer); 303 | 304 | this.progressCallback({ stage: 'postprocess_quick', message: 'Post-processing extracted text...' }); 305 | let cleanedText = this._postProcessText(rawText, options.postProcessRules); 306 | cleanedText = cleanedText.replace(/\r\n/g, '\n').replace(/\n{2,}/g, '\n\n').trim(); 307 | 308 | this.progressCallback({ stage: 'markdown_quick', message: 'Converting to Markdown...' }); 309 | const markdown = this._convertToMarkdownLogic(cleanedText); 310 | this.progressCallback({ stage: 'complete_quick', message: 'Quick conversion complete.' }); 311 | return markdown; 312 | } 313 | 314 | async highAccuracyConvert(pdfFile, options = {}) { 315 | if (!(pdfFile instanceof File)) throw new Error('Invalid input: pdfFile must be a File object.'); 316 | const pdfjs = (typeof pdfjsLib !== 'undefined' ? pdfjsLib : (typeof window !== 'undefined' ? window.pdfjsLib : null)); 317 | if (!pdfjs || !pdfjs.getDocument) throw new Error('pdf.js library (pdfjsLib) is not loaded or not fully initialized.'); 318 | const Tess = (typeof Tesseract !== 'undefined' ? Tesseract : (typeof window !== 'undefined' ? window.Tesseract : null)); 319 | if (!Tess) throw new Error('Tesseract.js library is not loaded.'); 320 | 321 | this.progressCallback({ stage: 'start_ocr', message: 'Starting high-accuracy OCR conversion...' 
}); 322 | 323 | const tesseractLang = options.tesseractLanguage || this.tesseractLanguage; 324 | const tesseractOpts = { ...this.tesseractOptions, ...(options.tesseractOptions || {}) }; // Merge instance and call options 325 | const pdfRenderScale = options.pdfRenderScale || 2.5; 326 | 327 | let worker; 328 | let workerInitialized = false; 329 | 330 | try { 331 | this.progressCallback({ stage: 'ocr_worker_init', message: 'Initializing Tesseract OCR worker...' }); 332 | 333 | try { 334 | // Set timeout for worker initialization 335 | const workerPromise = Tess.createWorker(tesseractLang, 1, tesseractOpts); 336 | 337 | // Add timeout to prevent hanging 338 | const timeoutPromise = new Promise((_, reject) => { 339 | setTimeout(() => reject(new Error('Worker initialization timed out after 30 seconds')), 30000); 340 | }); 341 | 342 | worker = await Promise.race([workerPromise, timeoutPromise]); 343 | workerInitialized = true; 344 | 345 | this.progressCallback({ stage: 'ocr_worker_ready', message: 'OCR worker initialized successfully.' }); 346 | } catch (err) { 347 | this.progressCallback({ stage: 'ocr_worker_error', message: `Failed to initialize Tesseract worker: ${err.message}`, error: err }); 348 | throw new Error(`Failed to initialize Tesseract worker: ${err.message}. Check if Tesseract.js files are accessible and language data is available.`); 349 | } 350 | 351 | const arrayBuffer = await pdfFile.arrayBuffer(); 352 | const pdfDoc = await pdfjs.getDocument({ data: arrayBuffer }).promise; 353 | let fullTextAccumulator = ''; 354 | const numPages = pdfDoc.numPages; 355 | 356 | for (let pageNum = 1; pageNum <= numPages; pageNum++) { 357 | this.progressCallback({ stage: 'ocr_render_page', message: `Rendering page ${pageNum}/${numPages} for OCR...`, currentPage: pageNum, totalPages: numPages }); 358 | 359 | const page = await pdfDoc.getPage(pageNum); 360 | const viewport = page.getViewport({ scale: pdfRenderScale }); 361 | 362 | const canvas = document.createElement('canvas'); 363 | const context = canvas.getContext('2d'); 364 | canvas.height = viewport.height; 365 | canvas.width = viewport.width; 366 | 367 | try { 368 | await page.render({ canvasContext: context, viewport: viewport }).promise; 369 | 370 | this.progressCallback({ stage: 'ocr_recognize_page', message: `OCR processing page ${pageNum}/${numPages}...`, currentPage: pageNum, totalPages: numPages }); 371 | const recognition = await worker.recognize(canvas); 372 | const ocrPageText = recognition.data?.text || ''; 373 | fullTextAccumulator += ocrPageText + '\n'; 374 | 375 | } catch (pageError) { 376 | this.progressCallback({ stage: 'ocr_page_warning', message: `Warning: Failed to process page ${pageNum}: ${pageError.message}` }); 377 | console.warn(`OCR processing failed for page ${pageNum}:`, pageError); 378 | // Continue with other pages instead of failing completely 379 | } finally { 380 | // Clean up canvas resources 381 | canvas.width = 0; 382 | canvas.height = 0; 383 | } 384 | } 385 | 386 | // Safely terminate worker 387 | if (workerInitialized && worker) { 388 | try { 389 | this.progressCallback({ stage: 'ocr_terminate_worker', message: 'Terminating Tesseract worker...' 
}); 390 | 391 | await Promise.race([ 392 | worker.terminate(), 393 | new Promise((_, reject) => { 394 | setTimeout(() => reject(new Error('Worker termination timed out')), 10000); 395 | }) 396 | ]); 397 | } catch (terminateError) { 398 | console.warn('Warning: Failed to properly terminate Tesseract worker:', terminateError); 399 | // Don't throw error for termination issues 400 | } 401 | } 402 | } catch (error) { 403 | // Enhanced cleanup on error 404 | if (workerInitialized && worker) { 405 | try { 406 | await Promise.race([ 407 | worker.terminate(), 408 | new Promise((resolve) => setTimeout(resolve, 5000)) // Give up after 5 seconds 409 | ]); 410 | } catch (cleanupError) { 411 | console.warn('Failed to cleanup worker after error:', cleanupError); 412 | } 413 | } 414 | throw error; 415 | } 416 | 417 | this.progressCallback({ stage: 'postprocess_ocr', message: 'Post-processing OCR text...' }); 418 | let cleanedText = this._postProcessText(fullTextAccumulator, options.postProcessRules); 419 | cleanedText = cleanedText.replace(/\r\n/g, '\n').replace(/\n{2,}/g, '\n\n').trim(); 420 | 421 | this.progressCallback({ stage: 'markdown_ocr', message: 'Converting to Markdown...' }); 422 | const markdown = this._convertToMarkdownLogic(cleanedText); 423 | this.progressCallback({ stage: 'complete_ocr', message: 'High-accuracy conversion complete.' }); 424 | return markdown; 425 | } 426 | 427 | async _initializeLLM(modelId, chatOpts = {}) { 428 | if (!this.WebLLMCreateEngine && !this.WebLLMChatConstructor) { 429 | throw new Error('WebLLM (CreateMLCEngine or Chat) module is not loaded. Ensure @mlc-ai/web-llm is correctly imported/bundled, or webLLM is globally available.'); 430 | } 431 | 432 | // Check if LLM is already initialized with the same model. 433 | // For CreateMLCEngine, modelId is part of the engine. For Chat, we stored it. 434 | const currentModelId = this.chatModule ? (this.chatModule.modelId || (this.chatModule.config && this.chatModule.config.model_id)) : null; 435 | if (this.llmInitialized && this.chatModule && currentModelId === modelId) { 436 | this.progressCallback({ stage: 'llm_ready', message: 'LLM already initialized with the correct model.' }); 437 | return; 438 | } 439 | 440 | this.progressCallback({ stage: 'llm_init', message: `Initializing LLM with model: ${modelId}... This may take time.` }); 441 | 442 | if (this.chatModule && typeof this.chatModule.unload === 'function') { 443 | await this.chatModule.unload(); 444 | this.chatModule = null; // Ensure it's cleared 445 | } 446 | this.llmInitialized = false; 447 | 448 | 449 | const llmInitProgressCallback = report => { 450 | this.progressCallback({ 451 | stage: 'llm_load_progress', 452 | message: `LLM Loading: ${report.text}`, 453 | progress: report.progress 454 | }); 455 | }; 456 | 457 | try { 458 | if (this.WebLLMCreateEngine) { 459 | let modelLibToUse; 460 | 461 | if (this.llmModelLibUrl) { 462 | // User provided a specific model_lib URL 463 | modelLibToUse = this.llmModelLibUrl; 464 | } else if (modelId === DEFAULT_LLM_MODEL) { 465 | // Use the hardcoded default model_lib URL for the default model 466 | modelLibToUse = DEFAULT_LLM_MODEL_LIB_URL; 467 | } else { 468 | // No specific URL provided by user, and it's not the default model with a known URL 469 | throw new Error( 470 | `Extract2MD Error: 'model_lib' URL not specified for model '${modelId}'. 
` + 471 | `Please provide it via the 'llmModelLibUrl' constructor option, ` + 472 | `or use the default model ('${DEFAULT_LLM_MODEL}').` 473 | ); 474 | } 475 | 476 | const appConfig = { 477 | model_list: [ 478 | { 479 | "model": `https://huggingface.co/mlc-ai/${modelId}/resolve/main/`, 480 | "model_id": modelId, 481 | "model_lib": modelLibToUse, 482 | "required_features": modelId.includes("f16") ? ["shader-f16"] : [], 483 | "overrides": { 484 | "conv_template": "qwen" 485 | } 486 | } 487 | ] 488 | }; 489 | 490 | const engineConfig = { 491 | ...chatOpts, 492 | initProgressCallback: llmInitProgressCallback, 493 | appConfig: appConfig // Pass the constructed appConfig 494 | }; 495 | this.chatModule = await this.WebLLMCreateEngine(modelId, engineConfig); 496 | // CreateMLCEngine loads the model, so no separate reload needed immediately. 497 | // We can store modelId if needed for future checks, though engine usually has it. 498 | if(this.chatModule) this.chatModule.modelId = modelId; // For consistency if checked later 499 | } else if (this.WebLLMChatConstructor) { 500 | // Fallback to Chat constructor - this is the path that had issues 501 | this.chatModule = new this.WebLLMChatConstructor(); 502 | if(this.chatModule) this.chatModule.modelId = modelId; // Store modelId for Chat instances 503 | 504 | const finalChatOpts = { 505 | ...chatOpts, 506 | initProgressCallback: llmInitProgressCallback 507 | }; 508 | if (typeof this.chatModule.reload !== 'function') { 509 | throw new Error('this.chatModule.reload is not a function (Chat fallback path).'); 510 | } 511 | await this.chatModule.reload(modelId, finalChatOpts); 512 | } else { 513 | throw new Error('No valid WebLLM constructor found.'); 514 | } 515 | 516 | this.llmInitialized = true; 517 | this.progressCallback({ stage: 'llm_init_complete', message: 'LLM initialized successfully.' }); 518 | } catch (err) { 519 | this.llmInitialized = false; 520 | this.progressCallback({ stage: 'llm_init_error', message: `LLM initialization failed: ${err.message}`, error: err }); 521 | throw new Error(`LLM initialization failed: ${err.message}`); 522 | } 523 | } 524 | 525 | async llmRewrite(textToRewrite, options = {}) { 526 | const model = options.llmModel || this.llmModel; 527 | const promptTemplate = options.llmPromptTemplate || 528 | ((text) => `Please rewrite the following text, which was extracted from a PDF. Aim to improve its clarity, correct grammatical errors, and enhance its flow and professional tone, while preserving the original meaning, information, details, context and structure. Correct spelling errors in common words (do not change spelling in uncommon words like names, places, brands, etc.). Output only the rewritten text.\n\nOriginal Text:\n${text}\n\nRewritten Text:`); 529 | 530 | const chatOpts = options.chatOpts || {}; 531 | 532 | await this._initializeLLM(model, chatOpts); 533 | if (!this.llmInitialized || !this.chatModule) { 534 | throw new Error('LLM could not be initialized or is not ready.'); 535 | } 536 | 537 | const prompt = promptTemplate(textToRewrite); 538 | this.progressCallback({ stage: 'llm_generate_start', message: 'LLM generating rewritten text...' }); 539 | 540 | try { 541 | // The generate method in newer web-llm might return a ChatCompletion object. 542 | // We need to access the message content. 543 | // For simplicity, assuming it's similar to the previous structure or a direct string. 544 | // If it returns a more complex object, this part might need adjustment based on the exact API of webLLM.Chat. 
545 | let replyContent = ''; 546 | if (this.WebLLMCreateEngine && this.chatModule && this.chatModule.chat && typeof this.chatModule.chat.completions.create === 'function') { 547 | // Using MLCEngine's OpenAI-compatible API 548 | const chatCompletion = await this.chatModule.chat.completions.create({ 549 | messages: [{ role: "user", content: prompt }], 550 | model: model // Ensure 'model' here is the modelId used for the engine 551 | }); 552 | if (chatCompletion.choices && chatCompletion.choices.length > 0 && chatCompletion.choices[0].message) { 553 | replyContent = chatCompletion.choices[0].message.content || ''; 554 | } 555 | } else if (this.chatModule && typeof this.chatModule.generate === 'function') { 556 | // Fallback or direct Chat.generate usage 557 | replyContent = await this.chatModule.generate(prompt, undefined, 0); // progressCb and streamInterval to undefined/0 558 | } else { 559 | throw new Error('LLM module does not support generate or chat.completions.create'); 560 | } 561 | 562 | this.progressCallback({ stage: 'llm_generate_complete', message: 'LLM rewrite complete.' }); 563 | return replyContent; 564 | } catch (err) { 565 | this.progressCallback({ stage: 'llm_generate_error', message: `LLM generation failed: ${err.message}`, error: err }); 566 | throw new Error(`LLM generation failed: ${err.message}`); 567 | } 568 | } 569 | 570 | async unloadLLM() { 571 | if (this.chatModule) { 572 | this.progressCallback({ stage: 'llm_unload', message: 'Unloading LLM model...' }); 573 | await this.chatModule.unload(); 574 | this.chatModule = null; 575 | this.llmInitialized = false; 576 | this.progressCallback({ stage: 'llm_unload_complete', message: 'LLM unloaded.' }); 577 | } 578 | } 579 | } 580 | 581 | // Export new API 582 | export default Extract2MDConverter; 583 | 584 | // Export individual components for advanced usage 585 | export { 586 | Extract2MDConverter, 587 | WebLLMEngine, 588 | OutputParser, 589 | SystemPrompts, 590 | ConfigValidator, 591 | LegacyExtract2MDConverter 592 | }; 593 | 594 | // Export legacy class as default for backwards compatibility 595 | export { LegacyExtract2MDConverter as Extract2MDConverter_Legacy }; -------------------------------------------------------------------------------- /src/converters/Extract2MDConverter.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Extract2MDConverter.js 3 | * Main converter class with scenario-specific methods 4 | */ 5 | 6 | import * as pdfjsLib from 'pdfjs-dist/build/pdf.mjs'; 7 | import Tesseract from 'tesseract.js'; 8 | import WebLLMEngine from '../engines/WebLLMEngine.js'; 9 | import OutputParser from '../utils/OutputParser.js'; 10 | import SystemPrompts from '../utils/SystemPrompts.js'; 11 | import ConfigValidator from '../utils/ConfigValidator.js'; 12 | 13 | export class Extract2MDConverter { 14 | constructor(config = {}) { 15 | // Validate and normalize configuration 16 | this.config = ConfigValidator.validate(config); 17 | 18 | // Initialize components 19 | this.webllmEngine = null; 20 | this.outputParser = new OutputParser(); 21 | 22 | // Setup PDF.js worker 23 | this.setupPdfJsWorker(); 24 | 25 | // Progress callback 26 | this.progressCallback = this.config.progressCallback || ((progress) => {}); 27 | } 28 | 29 | /** 30 | * Setup PDF.js worker 31 | */ 32 | setupPdfJsWorker() { 33 | const pdfjsSetupLib = (typeof pdfjsLib !== 'undefined' ? pdfjsLib : 34 | (typeof window !== 'undefined' ? 
window.pdfjsLib : null)); 35 | 36 | if (pdfjsSetupLib && pdfjsSetupLib.GlobalWorkerOptions) { 37 | pdfjsSetupLib.GlobalWorkerOptions.workerSrc = this.config.pdfJsWorkerSrc; 38 | } else { 39 | console.warn('pdfjsLib or pdfjsLib.GlobalWorkerOptions is not defined. PDF.js worker may not load correctly.'); 40 | } 41 | } 42 | 43 | /** 44 | * Scenario 1: Quick convert only - returns MD output 45 | * @param {File} pdfFile - PDF file to convert 46 | * @param {Object} options - Optional configuration overrides 47 | * @returns {Promise} Markdown output 48 | */ 49 | static async quickConvertOnly(pdfFile, options = {}) { 50 | const converter = new Extract2MDConverter(options); 51 | return await converter._performQuickConvert(pdfFile); 52 | } 53 | 54 | /** 55 | * Scenario 2: High accuracy convert only - returns MD output 56 | * @param {File} pdfFile - PDF file to convert 57 | * @param {Object} options - Optional configuration overrides 58 | * @returns {Promise} Markdown output 59 | */ 60 | static async highAccuracyConvertOnly(pdfFile, options = {}) { 61 | const converter = new Extract2MDConverter(options); 62 | return await converter._performHighAccuracyConvert(pdfFile); 63 | } 64 | 65 | /** 66 | * Scenario 3: Quick convert + LLM rewrite - returns MD output 67 | * @param {File} pdfFile - PDF file to convert 68 | * @param {Object} options - Optional configuration overrides 69 | * @returns {Promise} LLM-rewritten markdown output 70 | */ 71 | static async quickConvertWithLLM(pdfFile, options = {}) { 72 | const converter = new Extract2MDConverter(options); 73 | 74 | try { 75 | // Step 1: Quick extraction 76 | converter.progressCallback({ 77 | stage: 'scenario_3_start', 78 | message: 'Starting quick conversion with LLM rewrite...' 79 | }); 80 | 81 | const extractedText = await converter._performQuickExtraction(pdfFile); 82 | 83 | // Step 2: LLM rewrite 84 | await converter._initializeWebLLM(); 85 | const rewrittenMarkdown = await converter._performLLMRewrite( 86 | extractedText, 87 | 'single', 88 | converter.config.systemPrompts.singleExtraction 89 | ); 90 | 91 | converter.progressCallback({ 92 | stage: 'scenario_3_complete', 93 | message: 'Quick conversion with LLM rewrite completed.' 94 | }); 95 | 96 | return rewrittenMarkdown; 97 | 98 | } finally { 99 | await converter._cleanup(); 100 | } 101 | } 102 | 103 | /** 104 | * Scenario 4: High accuracy convert + LLM rewrite - returns MD output 105 | * @param {File} pdfFile - PDF file to convert 106 | * @param {Object} options - Optional configuration overrides 107 | * @returns {Promise} LLM-rewritten markdown output 108 | */ 109 | static async highAccuracyConvertWithLLM(pdfFile, options = {}) { 110 | const converter = new Extract2MDConverter(options); 111 | 112 | try { 113 | // Step 1: High accuracy extraction 114 | converter.progressCallback({ 115 | stage: 'scenario_4_start', 116 | message: 'Starting high accuracy conversion with LLM rewrite...' 117 | }); 118 | 119 | const extractedText = await converter._performHighAccuracyExtraction(pdfFile); 120 | 121 | // Step 2: LLM rewrite 122 | await converter._initializeWebLLM(); 123 | const rewrittenMarkdown = await converter._performLLMRewrite( 124 | extractedText, 125 | 'single', 126 | converter.config.systemPrompts.singleExtraction 127 | ); 128 | 129 | converter.progressCallback({ 130 | stage: 'scenario_4_complete', 131 | message: 'High accuracy conversion with LLM rewrite completed.' 
132 | }); 133 | 134 | return rewrittenMarkdown; 135 | 136 | } finally { 137 | await converter._cleanup(); 138 | } 139 | } 140 | 141 | /** 142 | * Scenario 5: Combined convert + LLM rewrite - returns comprehensive MD output 143 | * @param {File} pdfFile - PDF file to convert 144 | * @param {Object} options - Optional configuration overrides 145 | * @returns {Promise} Comprehensive LLM-rewritten markdown output 146 | */ 147 | static async combinedConvertWithLLM(pdfFile, options = {}) { 148 | const converter = new Extract2MDConverter(options); 149 | 150 | try { 151 | converter.progressCallback({ 152 | stage: 'scenario_5_start', 153 | message: 'Starting combined conversion with LLM rewrite...' 154 | }); 155 | 156 | // Step 1: Parallel extraction using both methods 157 | const [quickText, ocrText] = await Promise.all([ 158 | converter._performQuickExtraction(pdfFile), 159 | converter._performHighAccuracyExtraction(pdfFile) 160 | ]); 161 | 162 | // Step 2: LLM rewrite with combined context 163 | await converter._initializeWebLLM(); 164 | const rewrittenMarkdown = await converter._performCombinedLLMRewrite( 165 | quickText, 166 | ocrText, 167 | converter.config.systemPrompts.combinedExtraction 168 | ); 169 | 170 | converter.progressCallback({ 171 | stage: 'scenario_5_complete', 172 | message: 'Combined conversion with LLM rewrite completed.' 173 | }); 174 | 175 | return rewrittenMarkdown; 176 | 177 | } finally { 178 | await converter._cleanup(); 179 | } 180 | } 181 | 182 | // Internal methods for extraction and processing 183 | 184 | /** 185 | * Perform quick text extraction using PDF.js 186 | */ 187 | async _performQuickExtraction(pdfFile) { 188 | // Enhanced input validation 189 | if (!(pdfFile instanceof File)) { 190 | throw new Error('Invalid input: pdfFile must be a File object.'); 191 | } 192 | if (pdfFile.size === 0) { 193 | throw new Error('Invalid input: PDF file is empty.'); 194 | } 195 | if (pdfFile.size > 100 * 1024 * 1024) { // 100MB limit 196 | throw new Error('Invalid input: PDF file is too large (max 100MB).'); 197 | } 198 | if (!pdfFile.type || (!pdfFile.type.includes('pdf') && !pdfFile.name.toLowerCase().endsWith('.pdf'))) { 199 | throw new Error('Invalid input: File must be a PDF document.'); 200 | } 201 | 202 | this.progressCallback({ 203 | stage: 'quick_extraction_start', 204 | message: 'Starting quick PDF text extraction...' 205 | }); 206 | 207 | const arrayBuffer = await pdfFile.arrayBuffer(); 208 | const rawText = await this._extractTextWithPdfJs(arrayBuffer); 209 | const cleanedText = this._postProcessText(rawText); 210 | 211 | this.progressCallback({ 212 | stage: 'quick_extraction_complete', 213 | message: 'Quick extraction completed.' 214 | }); 215 | 216 | return cleanedText; 217 | } 218 | 219 | /** 220 | * Perform quick conversion (extraction + markdown formatting) 221 | */ 222 | async _performQuickConvert(pdfFile) { 223 | const extractedText = await this._performQuickExtraction(pdfFile); 224 | 225 | this.progressCallback({ 226 | stage: 'quick_markdown_start', 227 | message: 'Converting to Markdown...' 228 | }); 229 | 230 | const markdown = this._convertToMarkdown(extractedText); 231 | 232 | this.progressCallback({ 233 | stage: 'quick_markdown_complete', 234 | message: 'Quick conversion completed.' 
235 | }); 236 | 237 | return markdown; 238 | } 239 | 240 | /** 241 | * Perform high accuracy text extraction using OCR 242 | */ 243 | async _performHighAccuracyExtraction(pdfFile) { 244 | // Enhanced input validation 245 | if (!(pdfFile instanceof File)) { 246 | throw new Error('Invalid input: pdfFile must be a File object.'); 247 | } 248 | if (pdfFile.size === 0) { 249 | throw new Error('Invalid input: PDF file is empty.'); 250 | } 251 | if (pdfFile.size > 100 * 1024 * 1024) { // 100MB limit 252 | throw new Error('Invalid input: PDF file is too large (max 100MB).'); 253 | } 254 | if (!pdfFile.type || (!pdfFile.type.includes('pdf') && !pdfFile.name.toLowerCase().endsWith('.pdf'))) { 255 | throw new Error('Invalid input: File must be a PDF document.'); 256 | } 257 | 258 | this.progressCallback({ 259 | stage: 'ocr_extraction_start', 260 | message: 'Starting OCR text extraction...' 261 | }); 262 | 263 | const pdfjs = (typeof pdfjsLib !== 'undefined' ? pdfjsLib : 264 | (typeof window !== 'undefined' ? window.pdfjsLib : null)); 265 | if (!pdfjs || !pdfjs.getDocument) { 266 | throw new Error('pdf.js library is not loaded or not fully initialized.'); 267 | } 268 | 269 | const Tess = (typeof Tesseract !== 'undefined' ? Tesseract : 270 | (typeof window !== 'undefined' ? window.Tesseract : null)); 271 | if (!Tess) { 272 | throw new Error('Tesseract.js library is not loaded.'); 273 | } 274 | 275 | let worker; 276 | let workerInitialized = false; 277 | 278 | try { 279 | // Initialize Tesseract worker with enhanced error handling 280 | this.progressCallback({ 281 | stage: 'ocr_worker_init', 282 | message: 'Initializing OCR worker...' 283 | }); 284 | 285 | try { 286 | // Set timeout for worker initialization 287 | const workerPromise = Tess.createWorker( 288 | this.config.tesseract.language, 289 | 1, 290 | { 291 | workerPath: this.config.tesseract.workerPath, 292 | corePath: this.config.tesseract.corePath, 293 | langPath: this.config.tesseract.langPath, 294 | ...this.config.tesseract.options 295 | } 296 | ); 297 | 298 | // Add timeout to prevent hanging 299 | const timeoutPromise = new Promise((_, reject) => { 300 | setTimeout(() => reject(new Error('Worker initialization timed out after 30 seconds')), 30000); 301 | }); 302 | 303 | worker = await Promise.race([workerPromise, timeoutPromise]); 304 | workerInitialized = true; 305 | 306 | this.progressCallback({ 307 | stage: 'ocr_worker_ready', 308 | message: 'OCR worker initialized successfully.' 309 | }); 310 | 311 | } catch (workerError) { 312 | throw new Error(`Failed to initialize Tesseract worker: ${workerError.message}. 
Check if Tesseract.js files are accessible and language data is available.`); 313 | } 314 | 315 | // Process PDF 316 | const arrayBuffer = await pdfFile.arrayBuffer(); 317 | const pdfDoc = await pdfjs.getDocument({ data: arrayBuffer }).promise; 318 | let fullText = ''; 319 | const numPages = pdfDoc.numPages; 320 | 321 | for (let pageNum = 1; pageNum <= numPages; pageNum++) { 322 | this.progressCallback({ 323 | stage: 'ocr_page_process', 324 | message: `Processing page ${pageNum}/${numPages}...`, 325 | currentPage: pageNum, 326 | totalPages: numPages 327 | }); 328 | 329 | const page = await pdfDoc.getPage(pageNum); 330 | const viewport = page.getViewport({ scale: this.config.processing.pdfRenderScale }); 331 | 332 | const canvas = document.createElement('canvas'); 333 | const context = canvas.getContext('2d'); 334 | canvas.height = viewport.height; 335 | canvas.width = viewport.width; 336 | 337 | try { 338 | await page.render({ canvasContext: context, viewport: viewport }).promise; 339 | 340 | // OCR recognition with error handling 341 | const recognition = await worker.recognize(canvas); 342 | const ocrPageText = recognition.data?.text || ''; 343 | fullText += ocrPageText + '\n'; 344 | 345 | } catch (pageError) { 346 | this.progressCallback({ 347 | stage: 'ocr_page_warning', 348 | message: `Warning: Failed to process page ${pageNum}: ${pageError.message}` 349 | }); 350 | console.warn(`OCR processing failed for page ${pageNum}:`, pageError); 351 | // Continue with other pages instead of failing completely 352 | } finally { 353 | // Clean up canvas resources 354 | canvas.width = 0; 355 | canvas.height = 0; 356 | } 357 | } 358 | 359 | // Safely terminate worker 360 | if (workerInitialized && worker) { 361 | try { 362 | this.progressCallback({ 363 | stage: 'ocr_worker_terminate', 364 | message: 'Terminating OCR worker...' 365 | }); 366 | 367 | await Promise.race([ 368 | worker.terminate(), 369 | new Promise((_, reject) => { 370 | setTimeout(() => reject(new Error('Worker termination timed out')), 10000); 371 | }) 372 | ]); 373 | } catch (terminateError) { 374 | console.warn('Warning: Failed to properly terminate Tesseract worker:', terminateError); 375 | // Don't throw error for termination issues 376 | } 377 | } 378 | 379 | const cleanedText = this._postProcessText(fullText); 380 | 381 | this.progressCallback({ 382 | stage: 'ocr_extraction_complete', 383 | message: 'OCR extraction completed.' 384 | }); 385 | 386 | return cleanedText; 387 | 388 | } catch (error) { 389 | // Enhanced cleanup on error 390 | if (workerInitialized && worker) { 391 | try { 392 | await Promise.race([ 393 | worker.terminate(), 394 | new Promise((resolve) => setTimeout(resolve, 5000)) // Give up after 5 seconds 395 | ]); 396 | } catch (cleanupError) { 397 | console.warn('Failed to cleanup worker after error:', cleanupError); 398 | } 399 | } 400 | throw error; 401 | } 402 | } 403 | 404 | /** 405 | * Perform high accuracy conversion (OCR + markdown formatting) 406 | */ 407 | async _performHighAccuracyConvert(pdfFile) { 408 | const extractedText = await this._performHighAccuracyExtraction(pdfFile); 409 | 410 | this.progressCallback({ 411 | stage: 'ocr_markdown_start', 412 | message: 'Converting OCR results to Markdown...' 413 | }); 414 | 415 | const markdown = this._convertToMarkdown(extractedText); 416 | 417 | this.progressCallback({ 418 | stage: 'ocr_markdown_complete', 419 | message: 'High accuracy conversion completed.' 
420 | }); 421 | 422 | return markdown; 423 | } 424 | 425 | /** 426 | * Check WebGPU capability and browser support 427 | * @returns {Promise} WebGPU capability information 428 | */ 429 | static async checkWebGPUCapability() { 430 | const result = { 431 | isSupported: false, 432 | hasShaderF16: false, 433 | error: null, 434 | details: {} 435 | }; 436 | 437 | try { 438 | // Check if WebGPU is available 439 | if (!navigator.gpu) { 440 | result.error = 'WebGPU is not supported in this browser. Please use Chrome 113+ or Edge 113+.'; 441 | return result; 442 | } 443 | 444 | // Request WebGPU adapter 445 | const adapter = await navigator.gpu.requestAdapter(); 446 | if (!adapter) { 447 | result.error = 'No WebGPU adapter found. WebGPU may not be supported on this device.'; 448 | return result; 449 | } 450 | 451 | // Get adapter features 452 | const features = Array.from(adapter.features); 453 | result.hasShaderF16 = features.includes('shader-f16'); 454 | result.isSupported = true; 455 | result.details = { 456 | features, 457 | limits: adapter.limits, 458 | info: adapter.info 459 | }; 460 | 461 | return result; 462 | 463 | } catch (error) { 464 | result.error = `WebGPU capability check failed: ${error.message}`; 465 | return result; 466 | } 467 | } 468 | 469 | /** 470 | * Initialize WebLLM engine with WebGPU capability checks 471 | */ 472 | async _initializeWebLLM() { 473 | // Check WebGPU capability before initializing LLM 474 | this.progressCallback({ 475 | stage: 'webgpu_check_start', 476 | message: 'Checking WebGPU capability for LLM processing...' 477 | }); 478 | 479 | const webgpuCapability = await Extract2MDConverter.checkWebGPUCapability(); 480 | 481 | if (!webgpuCapability.isSupported) { 482 | const errorMessage = `WebGPU capability check failed: ${webgpuCapability.error}`; 483 | this.progressCallback({ 484 | stage: 'webgpu_check_failed', 485 | message: errorMessage, 486 | error: webgpuCapability.error 487 | }); 488 | throw new Error(errorMessage); 489 | } 490 | 491 | this.progressCallback({ 492 | stage: 'webgpu_check_success', 493 | message: `WebGPU is supported. Shader F16: ${webgpuCapability.hasShaderF16 ? 'Yes' : 'No'}` 494 | }); 495 | 496 | // Validate model requirements against WebGPU capabilities 497 | const modelRequiresF16 = this.config.webllm.model && this.config.webllm.model.includes('f16'); 498 | if (modelRequiresF16 && !webgpuCapability.hasShaderF16) { 499 | const warningMessage = `Warning: Model "${this.config.webllm.model}" requires shader-f16 support, but your device doesn't support it. Performance may be reduced.`; 500 | this.progressCallback({ 501 | stage: 'webgpu_compatibility_warning', 502 | message: warningMessage 503 | }); 504 | } 505 | 506 | if (!this.webllmEngine) { 507 | this.webllmEngine = new WebLLMEngine({ 508 | progressCallback: this.progressCallback, 509 | defaultModel: this.config.webllm.model, 510 | customModelConfig: this.config.webllm.customModel 511 | }); 512 | } 513 | 514 | const modelToUse = this.config.webllm.customModel ? 
515 | this.config.webllm.customModel.model_id : 516 | this.config.webllm.model; 517 | 518 | await this.webllmEngine.initialize(modelToUse, this.config.webllm.options); 519 | } 520 | 521 | /** 522 | * Perform LLM rewrite for single extraction 523 | */ 524 | async _performLLMRewrite(extractedText, scenarioType, customPrompt) { 525 | const systemPrompt = SystemPrompts.buildSystemPrompt(scenarioType, customPrompt); 526 | const userPrompt = SystemPrompts.buildUserPrompt(scenarioType, extractedText); 527 | 528 | // For models that support thinking, we could enable it 529 | const fullPrompt = `${systemPrompt}\n\n${userPrompt}`; 530 | 531 | const rawOutput = await this.webllmEngine.generate(fullPrompt, this.config.webllm.options); 532 | const cleanedOutput = this.outputParser.parse(rawOutput); 533 | 534 | return cleanedOutput; 535 | } 536 | 537 | /** 538 | * Perform combined LLM rewrite 539 | */ 540 | async _performCombinedLLMRewrite(quickText, ocrText, customPrompt) { 541 | const systemPrompt = SystemPrompts.buildSystemPrompt('combined', customPrompt); 542 | const userPrompt = SystemPrompts.buildUserPrompt('combined', quickText, ocrText); 543 | 544 | const fullPrompt = `${systemPrompt}\n\n${userPrompt}`; 545 | 546 | const rawOutput = await this.webllmEngine.generate(fullPrompt, this.config.webllm.options); 547 | const cleanedOutput = this.outputParser.parse(rawOutput); 548 | 549 | return cleanedOutput; 550 | } 551 | 552 | /** 553 | * Extract text using PDF.js 554 | */ 555 | async _extractTextWithPdfJs(fileArrayBuffer) { 556 | const pdfjs = (typeof pdfjsLib !== 'undefined' ? pdfjsLib : 557 | (typeof window !== 'undefined' ? window.pdfjsLib : null)); 558 | 559 | if (!pdfjs || !pdfjs.getDocument) { 560 | throw new Error('pdf.js library is not loaded or not fully initialized.'); 561 | } 562 | 563 | this.progressCallback({ 564 | stage: 'pdfjs_load', 565 | message: 'Loading PDF with pdf.js...' 
566 | }); 567 | 568 | const pdfDoc = await pdfjs.getDocument({ data: fileArrayBuffer }).promise; 569 | let fullText = ''; 570 | const numPages = pdfDoc.numPages; 571 | 572 | for (let pageNum = 1; pageNum <= numPages; pageNum++) { 573 | this.progressCallback({ 574 | stage: 'pdfjs_page', 575 | message: `Extracting text from page ${pageNum}/${numPages}...`, 576 | currentPage: pageNum, 577 | totalPages: numPages 578 | }); 579 | 580 | const page = await pdfDoc.getPage(pageNum); 581 | const textContent = await page.getTextContent({ 582 | normalizeWhitespace: false, 583 | disableCombineTextItems: true 584 | }); 585 | 586 | let pageTextBuffer = ''; 587 | if (textContent.items && textContent.items.length > 0) { 588 | for (let i = 0; i < textContent.items.length; i++) { 589 | const item = textContent.items[i]; 590 | pageTextBuffer += item.str; 591 | 592 | if (item.hasEOL) { 593 | if (!pageTextBuffer.endsWith('\n')) pageTextBuffer += '\n'; 594 | } else if (i < textContent.items.length - 1) { 595 | const nextItem = textContent.items[i + 1]; 596 | if (item.str && !item.str.endsWith(' ') && 597 | nextItem.str && !nextItem.str.startsWith(' ') && 598 | Math.abs(item.transform[5] - nextItem.transform[5]) < (item.height * 0.5)) { 599 | 600 | const currentItemEndX = item.transform[4] + item.width; 601 | const nextItemStartX = nextItem.transform[4]; 602 | if (nextItemStartX - currentItemEndX > -0.5) { 603 | pageTextBuffer += ' '; 604 | } 605 | } 606 | } 607 | } 608 | } 609 | 610 | fullText += pageTextBuffer; 611 | if (pageTextBuffer.trim() !== '' && !pageTextBuffer.endsWith('\n')) { 612 | fullText += '\n'; 613 | } 614 | } 615 | 616 | this.progressCallback({ 617 | stage: 'pdfjs_extract_complete', 618 | message: 'PDF.js text extraction complete.' 619 | }); 620 | 621 | return fullText; 622 | } 623 | 624 | /** 625 | * Post-process extracted text with optimized rule application 626 | */ 627 | _postProcessText(text) { 628 | if (!text) return ''; 629 | 630 | let cleanedText = text; 631 | 632 | // Apply default rules 633 | const defaultRules = [ 634 | { find: /\uFB00/g, replace: 'ff' }, 635 | { find: /\uFB01/g, replace: 'fi' }, 636 | { find: /\uFB02/g, replace: 'fl' }, 637 | { find: /\uFB03/g, replace: 'ffi' }, 638 | { find: /\uFB04/g, replace: 'ffl' }, 639 | { find: /[\u2018\u2019]/g, replace: "'" }, 640 | { find: /[\u201C\u201D]/g, replace: '"' }, 641 | { find: /[\u2022\u2023\u25E6\u2043\u2219\u25CF\u25CB\u2981\u2619\u2765]/g, replace: '-' }, 642 | { find: /[\u2013\u2014]/g, replace: '-' }, 643 | { find: /\u00AD/g, replace: '' }, 644 | { find: /[\u00A0\u2000-\u200A\u202F\u205F\u3000]+/g, replace: ' ' } // map non-breaking, thin, and ideographic spaces to a plain space; newlines and multi-space runs are preserved for the line-based markdown pass 645 | ]; 646 | 647 | // Add PascalCase rules if enabled 648 | if (this.config.processing.splitPascalCase) { 649 | defaultRules.push( 650 | { find: /([A-Z][a-z]+)([A-Z][a-z]+)/g, replace: '$1 $2' }, 651 | { find: /([a-z])([A-Z][a-z]+)/g, replace: '$1 $2' } 652 | ); 653 | } 654 | 655 | // Combine all rules for efficient processing 656 | const allRules = [...defaultRules, ...this.config.processing.postProcessRules]; 657 | 658 | // Optimized rule application - batch similar operations 659 | const unicodeReplacements = []; 660 | const regexReplacements = []; 661 | 662 | for (const rule of allRules) { 663 | if (rule.find && typeof rule.replace === 'string') { 664 | if (rule.find instanceof RegExp) { 665 | regexReplacements.push(rule); 666 | } else { 667 | unicodeReplacements.push(rule); 668 | } 669 | } 670 | } 671 | 672 | // Apply unicode replacements first (typically simpler) 673 | for (const rule of
unicodeReplacements) { 674 | cleanedText = cleanedText.replace(rule.find, rule.replace); 675 | } 676 | 677 | // Apply regex replacements 678 | for (const rule of regexReplacements) { 679 | cleanedText = cleanedText.replace(rule.find, rule.replace); 680 | } 681 | 682 | // Final normalization - combine line break handling with newline normalization 683 | return cleanedText.replace(/\r\n/g, '\n').replace(/\n{3,}/g, '\n\n').trim(); 684 | } 685 | 686 | /** 687 | * Convert text to markdown with optimized newline handling 688 | */ 689 | _convertToMarkdown(rawText) { 690 | // Implementation of markdown conversion logic with optimized newline handling 691 | let markdownOutputLines = []; 692 | const inputLines = rawText.split(/\n/); 693 | 694 | let currentParagraphCollector = []; 695 | let inPotentialTableBlock = false; 696 | let potentialTableBlockLines = []; 697 | 698 | const flushCurrentParagraph = () => { 699 | if (currentParagraphCollector.length > 0) { 700 | markdownOutputLines.push(currentParagraphCollector.join(' ').trim()); 701 | currentParagraphCollector = []; 702 | // Only add empty line if the next content isn't a heading or table block 703 | this._addSeparatorLine(markdownOutputLines); 704 | } 705 | }; 706 | 707 | const flushPotentialTableBlock = () => { 708 | if (potentialTableBlockLines.length > 0) { 709 | if (potentialTableBlockLines.length >= 2) { 710 | markdownOutputLines.push('```'); 711 | markdownOutputLines.push(...potentialTableBlockLines.map(l => l.trimEnd())); 712 | markdownOutputLines.push('```'); 713 | } else { 714 | markdownOutputLines.push(potentialTableBlockLines.join(' ').trim()); 715 | } 716 | potentialTableBlockLines = []; 717 | this._addSeparatorLine(markdownOutputLines); 718 | } 719 | inPotentialTableBlock = false; 720 | }; 721 | 722 | for (let i = 0; i < inputLines.length; i++) { 723 | const originalLine = inputLines[i]; 724 | const trimmedLine = originalLine.trim(); 725 | 726 | if (trimmedLine === '') { 727 | if (inPotentialTableBlock) flushPotentialTableBlock(); 728 | flushCurrentParagraph(); 729 | continue; 730 | } 731 | 732 | const isShortLine = trimmedLine.length > 0 && trimmedLine.length < 80; 733 | const noPunctuationEnd = isShortLine && !/[.,;:!?]$/.test(trimmedLine); 734 | const isAllCapsLine = trimmedLine.length > 2 && trimmedLine.length < 80 && 735 | /^[A-Z\s\d\W]*[A-Z][A-Z\s\d\W]*$/.test(trimmedLine) && 736 | /[A-Z]/.test(trimmedLine) && !/^\d+$/.test(trimmedLine); 737 | const nextLineIsBlankOrEndOfFile = (i + 1 === inputLines.length || 738 | inputLines[i + 1].trim() === ''); 739 | 740 | if (isAllCapsLine || (isShortLine && noPunctuationEnd && nextLineIsBlankOrEndOfFile && trimmedLine.length > 1)) { 741 | if (inPotentialTableBlock) flushPotentialTableBlock(); 742 | flushCurrentParagraph(); 743 | markdownOutputLines.push(`# ${trimmedLine}`); 744 | this._addSeparatorLine(markdownOutputLines); 745 | if (nextLineIsBlankOrEndOfFile && inputLines[i + 1] && inputLines[i + 1].trim() === '') { 746 | i++; 747 | } 748 | continue; 749 | } 750 | 751 | const hasMultipleSpacesBetweenWords = /\S\s{2,}\S/.test(originalLine); 752 | const hasMultipleColumnsBySpaces = originalLine.split(/\s{2,}/).length > 2 && originalLine.length > 10; 753 | 754 | if (hasMultipleSpacesBetweenWords || hasMultipleColumnsBySpaces) { 755 | flushCurrentParagraph(); 756 | if (!inPotentialTableBlock) inPotentialTableBlock = true; 757 | potentialTableBlockLines.push(originalLine); 758 | } else { 759 | if (inPotentialTableBlock) flushPotentialTableBlock(); 760 | if (trimmedLine) 
currentParagraphCollector.push(trimmedLine); 761 | } 762 | } 763 | 764 | if (inPotentialTableBlock) flushPotentialTableBlock(); 765 | flushCurrentParagraph(); 766 | 767 | // Optimized final cleanup - single pass to normalize excessive newlines 768 | return this._normalizeMarkdownNewlines(markdownOutputLines); 769 | } 770 | 771 | /** 772 | * Helper method to add separator lines only when needed 773 | */ 774 | _addSeparatorLine(outputLines) { 775 | // Only add empty line if the last line isn't already empty 776 | if (outputLines.length > 0 && outputLines[outputLines.length - 1] !== '') { 777 | outputLines.push(''); 778 | } 779 | } 780 | 781 | /** 782 | * Normalize newlines in the final markdown output 783 | */ 784 | _normalizeMarkdownNewlines(lines) { 785 | // Filter out excessive empty lines while preserving structure 786 | const normalizedLines = []; 787 | let consecutiveEmptyLines = 0; 788 | 789 | for (const line of lines) { 790 | if (line.trim() === '') { 791 | consecutiveEmptyLines++; 792 | // Allow maximum of 1 consecutive empty line 793 | if (consecutiveEmptyLines <= 1) { 794 | normalizedLines.push(''); 795 | } 796 | } else { 797 | consecutiveEmptyLines = 0; 798 | normalizedLines.push(line.trimEnd()); 799 | } 800 | } 801 | 802 | // Join and do final cleanup 803 | let finalMarkdown = normalizedLines.join('\n'); 804 | // Remove any remaining triple+ newlines and trim 805 | finalMarkdown = finalMarkdown.replace(/\n{3,}/g, '\n\n').trim(); 806 | return finalMarkdown; 807 | } 808 | 809 | /** 810 | * Cleanup resources with proper error handling 811 | */ 812 | async _cleanup() { 813 | try { 814 | if (this.webllmEngine) { 815 | this.progressCallback({ 816 | stage: 'cleanup_webllm', 817 | message: 'Cleaning up WebLLM engine...' 818 | }); 819 | await this.webllmEngine.cleanup(); 820 | this.webllmEngine = null; 821 | } 822 | 823 | this.progressCallback({ 824 | stage: 'cleanup_complete', 825 | message: 'Resource cleanup completed successfully.' 826 | }); 827 | } catch (error) { 828 | console.warn('Warning: Error during resource cleanup:', error.message); 829 | this.progressCallback({ 830 | stage: 'cleanup_error', 831 | message: `Resource cleanup warning: ${error.message}`, 832 | error: error 833 | }); 834 | // Don't throw - cleanup errors shouldn't break the application 835 | } 836 | } 837 | } 838 | 839 | export default Extract2MDConverter; 840 | --------------------------------------------------------------------------------