├── test ├── basic.test.js ├── simple.test.js ├── newline-optimization.test.js ├── simple-newline.test.js └── scenarios.test.js ├── examples ├── key.pem ├── cert.pem ├── .dockerignore ├── Dockerfile ├── README.md ├── usage-examples.js └── demo.html ├── .npmignore ├── .gitignore ├── LICENSE ├── config.example.json ├── package.json ├── webpack.config.js ├── src ├── types │ └── index.d.ts ├── utils │ ├── SystemPrompts.js │ ├── OutputParser.js │ └── ConfigValidator.js ├── engines │ └── WebLLMEngine.js ├── index.js └── converters │ └── Extract2MDConverter.js ├── MIGRATION.md ├── scripts ├── validate-deployment.js └── postinstall.js ├── DEPLOYMENT.md └── README.md /test/basic.test.js: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/key.pem: -------------------------------------------------------------------------------- 1 | -----BEGIN PRIVATE KEY----- 2 | 3 | -----END PRIVATE KEY----- 4 | -------------------------------------------------------------------------------- /examples/cert.pem: -------------------------------------------------------------------------------- 1 | -----BEGIN CERTIFICATE----- 2 | 3 | -----END CERTIFICATE----- 4 | -------------------------------------------------------------------------------- /examples/.dockerignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | .git 3 | test 4 | src 5 | scripts 6 | *.log 7 | .DS_Store 8 | *.md 9 | package*.json -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | # Node.js dependencies 2 | node_modules/ 3 | 4 | # Test files 5 | test/ 6 | 7 | # Example files 8 | examples/ 9 | 10 | # Build configuration 11 | webpack.config.js 12 | 13 | # Development files 14 | package-lock.json 15 | config.example.json 16 | 17 | # Documentation 18 | DEPLOYMENT.md 19 | LICENSE 20 | MIGRATION.md 21 | 22 | # Source map files 23 | dist/assets/*.map -------------------------------------------------------------------------------- /examples/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM node:20-alpine 2 | 3 | WORKDIR /app 4 | 5 | # Install extract2md and serve 6 | RUN npm install extract2md serve 7 | 8 | # Copy demo.html into the image 9 | COPY demo.html ./demo.html 10 | 11 | # Copy the dist directory from the installed package to /app/dist 12 | RUN mkdir -p dist && \ 13 | cp -r node_modules/extract2md/dist/* dist/ 14 | 15 | # Expose port for the static server 16 | EXPOSE 8080 17 | 18 | # Serve the current directory (demo.html and dist/) 19 | CMD ["npx", "serve", "--listen", "8080", "."] -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Dependencies 2 | node_modules/ 3 | 4 | # Build output 5 | dist/ 6 | build/ 7 | 8 | # Logs 9 | npm-debug.log* 10 | yarn-debug.log* 11 | yarn-error.log* 12 | *.log 13 | 14 | # OS generated files 15 | .DS_Store 16 | .DS_Store? 17 | ._* 18 | .Spotlight-V100 19 | .Trashes 20 | ehthumbs.db 21 | Thumbs.db 22 | 23 | # Editor directories and files 24 | .idea/ 25 | .vscode/ 26 | *.suo 27 | *.ntvs* 28 | *.njsproj 29 | *.sln 30 | *.sw? 
31 | 32 | # Optional: Environment variables file 33 | .env 34 | .env.local 35 | .env.development.local 36 | .env.test.local 37 | .env.production.local 38 | .pem 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright <2025> 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /config.example.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "Extract2MD Configuration Schema - Complete example with all available options", 3 | 4 | "ocr": { 5 | "language": "eng", 6 | "oem": 1, 7 | "psm": 6, 8 | "workerPath": "./tesseract-worker.min.js", 9 | "corePath": "./tesseract-core.wasm.js", 10 | "langPath": "./lang-data/", 11 | "options": { 12 | "logger": null, 13 | "errorHandler": null 14 | } 15 | }, 16 | 17 | "webllm": { 18 | "modelId": "Llama-3.2-1B-Instruct-q4f16_1-MLC", 19 | "temperature": 0.7, 20 | "maxTokens": 4000, 21 | "streamingEnabled": false, 22 | "customModel": { 23 | "model": "https://huggingface.co/mlc-ai/custom-model/resolve/main/", 24 | "model_id": "Custom-Model-ID", 25 | "model_lib": "https://example.com/path/to/custom-model.wasm", 26 | "required_features": ["shader-f16"], 27 | "overrides": { 28 | "conv_template": "llama" 29 | } 30 | } 31 | }, 32 | 33 | "systemPrompts": { 34 | "singleExtraction": "Focus on technical accuracy and preserve all code examples exactly as they appear.", 35 | "combinedExtraction": "Pay special attention to diagrams and tables that might be better captured in the OCR version." 36 | }, 37 | 38 | "processing": { 39 | "splitPascalCase": false, 40 | "pdfRenderScale": 2.5, 41 | "postProcessRules": [ 42 | { 43 | "find": "\\bAPI\\b", 44 | "replace": "API" 45 | }, 46 | { 47 | "find": "\\bJSON\\b", 48 | "replace": "JSON" 49 | } 50 | ] 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "extract2md", 3 | "version": "2.0.0", 4 | "description": "Client-side PDF to Markdown conversion with OCR and optional LLM rewrite. 
Core dependencies bundled for offline use.", 5 | "main": "dist/assets/extract2md.umd.js", 6 | "module": "dist/assets/extract2md.esm.js", 7 | "type": "module", 8 | "types": "dist/assets/extract2md.d.ts", 9 | "scripts": { 10 | "build": "webpack", 11 | "prepublishOnly": "npm run build", 12 | "postinstall": "node scripts/postinstall.js", 13 | "test": "node test/simple.test.js" 14 | }, 15 | "keywords": [ 16 | "pdf", 17 | "markdown", 18 | "ocr", 19 | "tesseract.js", 20 | "pdf.js", 21 | "webllm", 22 | "llm", 23 | "client-side", 24 | "text-extraction", 25 | "pdf to markdown", 26 | "offline" 27 | ], 28 | "author": "Hashan Wickramasinghe ", 29 | "license": "MIT", 30 | "dependencies": { 31 | "@mlc-ai/web-llm": "^0.2.79", 32 | "pdfjs-dist": "^5.2.133", 33 | "tesseract.js": "^5.0.5" 34 | }, 35 | "devDependencies": { 36 | "@babel/core": "^7.24.0", 37 | "@babel/preset-env": "^7.24.0", 38 | "babel-loader": "^9.1.3", 39 | "copy-webpack-plugin": "^12.0.2", 40 | "webpack": "^5.90.3", 41 | "webpack-cli": "^5.1.4" 42 | }, 43 | "files": [ 44 | "dist", 45 | "scripts", 46 | "README.md", 47 | "MIGRATION.md" 48 | ], 49 | "homepage": "https://github.com/hashangit/Extract2MD#readme", 50 | "repository": { 51 | "type": "git", 52 | "url": "git+https://github.com/hashangit/Extract2MD.git" 53 | }, 54 | "bugs": { 55 | "url": "https://github.com/hashangit/Extract2MD/issues" 56 | } 57 | } -------------------------------------------------------------------------------- /test/simple.test.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Simple test to validate package structure 3 | */ 4 | 5 | import fs from 'fs'; 6 | import path from 'path'; 7 | import { fileURLToPath } from 'url'; 8 | 9 | const __filename = fileURLToPath(import.meta.url); 10 | const __dirname = path.dirname(__filename); 11 | 12 | console.log('Testing Extract2MD package structure...'); 13 | 14 | // Test 1: Check TypeScript definitions exist 15 | const typesPath = path.resolve(__dirname, '../src/types/index.d.ts'); 16 | if (fs.existsSync(typesPath)) { 17 | console.log('✅ TypeScript definitions found'); 18 | } else { 19 | console.log('❌ TypeScript definitions missing'); 20 | process.exit(1); 21 | } 22 | 23 | // Test 2: Check configuration example exists 24 | const configPath = path.resolve(__dirname, '../config.example.json'); 25 | if (fs.existsSync(configPath)) { 26 | console.log('✅ Configuration example found'); 27 | } else { 28 | console.log('❌ Configuration example missing'); 29 | process.exit(1); 30 | } 31 | 32 | // Test 3: Check core files exist 33 | const coreFiles = [ 34 | '../src/converters/Extract2MDConverter.js', 35 | '../src/engines/WebLLMEngine.js', 36 | '../src/utils/ConfigValidator.js', 37 | '../src/utils/OutputParser.js', 38 | '../src/utils/SystemPrompts.js' 39 | ]; 40 | 41 | for (const file of coreFiles) { 42 | const filePath = path.resolve(__dirname, file); 43 | if (fs.existsSync(filePath)) { 44 | console.log(`✅ ${file.split('/').pop()} found`); 45 | } else { 46 | console.log(`❌ ${file.split('/').pop()} missing`); 47 | process.exit(1); 48 | } 49 | } 50 | 51 | console.log('\n🎉 All basic structure tests passed!'); 52 | process.exit(0); 53 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Extract2MD Demo Docker Instructions 2 | 3 | This guide explains how to build and run a Docker container to serve the `demo.html` file for Extract2MD, including all required 
assets. 4 | 5 | ## Prerequisites 6 | 7 | - [Docker](https://www.docker.com/get-started) installed on your system 8 | - Internet connection (to pull the Node.js image and npm packages) 9 | 10 | ## Steps 11 | 12 | ### 1. Build the Docker Image 13 | 14 | From the project root (where the `examples/` folder is located), run: 15 | 16 | ```sh 17 | docker build -t extract2md-demo ./examples 18 | ``` 19 | 20 | This will: 21 | - Use the provided `Dockerfile` in the `examples/` folder 22 | - Install `extract2md` and `serve` via npm 23 | - Copy `demo.html` and all required assets into the image 24 | 25 | ### 2. Run the Docker Container 26 | 27 | Run the following command to start the server (mapping container port 8080 to your local port 8081): 28 | 29 | ```sh 30 | docker run -p 8081:8080 extract2md-demo 31 | ``` 32 | 33 | - The server will be accessible at [http://localhost:8081/demo.html](http://localhost:8081/demo.html) 34 | 35 | ### 3. Using the Demo 36 | 37 | - Open your browser and go to [http://localhost:8081/demo.html](http://localhost:8081/demo.html) 38 | - Upload a PDF and select a scenario to test the Extract2MD conversion features 39 | 40 | ### 4. Troubleshooting 41 | 42 | - If you get a 404 error, make sure you are visiting `/demo.html` (not `/demo` or `/`). 43 | - If you see errors about missing assets, ensure the Docker build completed successfully and that the `dist` directory is present in the container (it should be automatically copied from the npm package). 44 | - For WebLLM scenarios, ensure your browser supports WebGPU. 45 | 46 | ### 5. Stopping the Container 47 | 48 | Press `Ctrl+C` in the terminal where the container is running, or run: 49 | 50 | ```sh 51 | docker ps # Find the container ID 52 | # Then: 53 | docker stop 54 | ``` 55 | 56 | --- 57 | 58 | **For development or advanced usage, you can modify `demo.html` and rebuild the image to see your changes.** -------------------------------------------------------------------------------- /webpack.config.js: -------------------------------------------------------------------------------- 1 | import path from 'path'; 2 | import CopyWebpackPlugin from 'copy-webpack-plugin'; 3 | import { fileURLToPath } from 'url'; 4 | 5 | const __filename = fileURLToPath(import.meta.url); 6 | const __dirname = path.dirname(__filename); 7 | 8 | const commonConfig = { 9 | mode: 'production', // or 'development' 10 | entry: './src/index.js', 11 | module: { 12 | rules: [ 13 | { 14 | test: /\.js$/, 15 | exclude: /node_modules/, 16 | use: { 17 | loader: 'babel-loader', 18 | options: { 19 | presets: ['@babel/preset-env'] 20 | } 21 | } 22 | } 23 | ] 24 | }, 25 | resolve: { 26 | extensions: ['.js'] 27 | }, 28 | devtool: 'source-map', 29 | // externals: { // Keep externals commented unless specifically needed 30 | // 'pdfjs-dist/build/pdf.js': 'pdfjsLib', 31 | // 'tesseract.js': 'Tesseract', 32 | // '@mlc-ai/web-llm': 'webLLM' 33 | // } 34 | }; 35 | 36 | const umdConfig = { 37 | ...commonConfig, 38 | output: { 39 | path: path.resolve(__dirname, 'dist/assets'), 40 | filename: 'extract2md.umd.js', 41 | library: { 42 | name: 'Extract2MD', 43 | type: 'umd', 44 | }, 45 | globalObject: 'this', 46 | }, 47 | plugins: [ 48 | new CopyWebpackPlugin({ 49 | patterns: [ 50 | { 51 | from: path.resolve(__dirname, 'node_modules/pdfjs-dist/build/pdf.worker.min.mjs'), 52 | to: path.resolve(__dirname, 'dist/pdf.worker.min.mjs') 53 | }, 54 | { 55 | from: path.resolve(__dirname, 'node_modules/tesseract.js/dist/worker.min.js'), 56 | to: path.resolve(__dirname, 
'dist/assets/tesseract-worker.min.js') 57 | }, 58 | { 59 | from: path.resolve(__dirname, 'node_modules/tesseract.js-core/tesseract-core.wasm.js'), 60 | to: path.resolve(__dirname, 'dist/assets/tesseract-core.wasm.js') 61 | }, 62 | // Copy the main type definition file 63 | { 64 | from: path.resolve(__dirname, 'src/types/index.d.ts'), 65 | to: path.resolve(__dirname, 'dist/assets/extract2md.d.ts') 66 | } 67 | ] 68 | }) 69 | ], 70 | }; 71 | 72 | const esmConfig = { 73 | ...commonConfig, 74 | output: { 75 | path: path.resolve(__dirname, 'dist/assets'), 76 | filename: 'extract2md.esm.js', 77 | library: { 78 | type: 'module', 79 | }, 80 | }, 81 | experiments: { 82 | outputModule: true, 83 | }, 84 | // ESM build typically doesn't need to run CopyWebpackPlugin again if UMD build handles it. 85 | // If you run builds separately or want to ensure assets are copied for both, include it. 86 | // For simplicity, assuming UMD build's CopyWebpackPlugin handles all asset copying. 87 | // If you have a build script that runs webpack once with an array of configs, 88 | // the plugins from one of them (e.g., UMD) will handle the copying. 89 | }; 90 | 91 | export default [umdConfig, esmConfig]; 92 | -------------------------------------------------------------------------------- /test/newline-optimization.test.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Test script to verify newline optimization improvements 3 | */ 4 | 5 | // Import the converter classes 6 | import { Extract2MDConverter } from '../src/converters/Extract2MDConverter.js'; 7 | import { LegacyExtract2MDConverter } from '../src/index.js'; 8 | 9 | // Test data with excessive newlines 10 | const testText = ` 11 | Title Here 12 | 13 | 14 | Some text with multiple spaces. 15 | 16 | 17 | 18 | Another paragraph with lots of newlines. 19 | 20 | 21 | 22 | 23 | And more text. 
24 | 25 | 26 | `; 27 | 28 | // Mock progress callback 29 | const mockProgressCallback = () => {}; 30 | 31 | console.log('Testing newline optimization improvements...\n'); 32 | 33 | // Test new converter 34 | console.log('Testing new Extract2MDConverter...'); 35 | try { 36 | const newConverter = new Extract2MDConverter({ progressCallback: mockProgressCallback }); 37 | const newResult = newConverter._convertToMarkdown(testText); 38 | 39 | console.log('✅ New converter executed successfully'); 40 | console.log('Result length:', newResult.length); 41 | console.log('Number of consecutive newlines (should be minimal):'); 42 | 43 | const tripleNewlines = (newResult.match(/\n{3,}/g) || []).length; 44 | const doubleNewlines = (newResult.match(/\n{2}/g) || []).length; 45 | 46 | console.log(` - Triple+ newlines: ${tripleNewlines} (should be 0)`); 47 | console.log(` - Double newlines: ${doubleNewlines}`); 48 | 49 | if (tripleNewlines === 0) { 50 | console.log('✅ Newline optimization working correctly for new converter'); 51 | } else { 52 | console.log('❌ Newline optimization needs improvement for new converter'); 53 | } 54 | } catch (error) { 55 | console.log('❌ New converter failed:', error.message); 56 | } 57 | 58 | console.log('\n' + '='.repeat(50) + '\n'); 59 | 60 | // Test legacy converter 61 | console.log('Testing legacy LegacyExtract2MDConverter...'); 62 | try { 63 | const legacyConverter = new LegacyExtract2MDConverter({ progressCallback: mockProgressCallback }); 64 | const legacyResult = legacyConverter._convertToMarkdownLogic(testText); 65 | 66 | console.log('✅ Legacy converter executed successfully'); 67 | console.log('Result length:', legacyResult.length); 68 | console.log('Number of consecutive newlines (should be minimal):'); 69 | 70 | const tripleNewlines = (legacyResult.match(/\n{3,}/g) || []).length; 71 | const doubleNewlines = (legacyResult.match(/\n{2}/g) || []).length; 72 | 73 | console.log(` - Triple+ newlines: ${tripleNewlines} (should be 0)`); 74 | console.log(` - Double newlines: ${doubleNewlines}`); 75 | 76 | if (tripleNewlines === 0) { 77 | console.log('✅ Newline optimization working correctly for legacy converter'); 78 | } else { 79 | console.log('❌ Newline optimization needs improvement for legacy converter'); 80 | } 81 | } catch (error) { 82 | console.log('❌ Legacy converter failed:', error.message); 83 | } 84 | 85 | console.log('\n' + '='.repeat(50) + '\n'); 86 | 87 | // Test post-processing optimization 88 | console.log('Testing post-processing optimization...'); 89 | try { 90 | const newConverter = new Extract2MDConverter({ progressCallback: mockProgressCallback }); 91 | const testPostProcessText = 'Text with filigature and unicode\u2018quotes\u2019 and bullets\u2022'; 92 | 93 | const processedText = newConverter._postProcessText(testPostProcessText); 94 | console.log('Original:', testPostProcessText); 95 | console.log('Processed:', processedText); 96 | 97 | if (processedText.includes('filigature') && processedText.includes("'quotes'") && processedText.includes('-')) { 98 | console.log('✅ Post-processing optimization working correctly'); 99 | } else { 100 | console.log('❌ Post-processing optimization needs review'); 101 | } 102 | } catch (error) { 103 | console.log('❌ Post-processing test failed:', error.message); 104 | } 105 | 106 | console.log('\n🎉 Newline optimization test completed!'); 107 | -------------------------------------------------------------------------------- /test/simple-newline.test.js: 
-------------------------------------------------------------------------------- 1 | /** 2 | * Simple test to verify newline optimization improvements 3 | * This test focuses on the core markdown conversion logic 4 | */ 5 | 6 | // Test data with excessive newlines 7 | const testText = ` 8 | Title Here 9 | 10 | 11 | Some text with multiple spaces. 12 | 13 | 14 | 15 | Another paragraph with lots of newlines. 16 | 17 | 18 | 19 | 20 | And more text. 21 | 22 | 23 | `; 24 | 25 | // Mock helper functions similar to what's in the converters 26 | function addSeparatorLine(outputLines) { 27 | if (outputLines.length > 0 && outputLines[outputLines.length - 1] !== '') { 28 | outputLines.push(''); 29 | } 30 | } 31 | 32 | function normalizeMarkdownNewlines(lines) { 33 | const normalizedLines = []; 34 | let consecutiveEmptyLines = 0; 35 | 36 | for (const line of lines) { 37 | if (line.trim() === '') { 38 | consecutiveEmptyLines++; 39 | // Allow maximum of 1 consecutive empty line 40 | if (consecutiveEmptyLines <= 1) { 41 | normalizedLines.push(''); 42 | } 43 | } else { 44 | consecutiveEmptyLines = 0; 45 | normalizedLines.push(line.trimEnd()); 46 | } 47 | } 48 | 49 | // Join and do final cleanup 50 | let finalMarkdown = normalizedLines.join('\n'); 51 | // Remove any remaining triple+ newlines and trim 52 | finalMarkdown = finalMarkdown.replace(/\n{3,}/g, '\n\n').trim(); 53 | return finalMarkdown; 54 | } 55 | 56 | // Test the newline optimization 57 | function testNewlineOptimization() { 58 | console.log('Testing newline optimization improvements...\n'); 59 | 60 | // Simulate the basic markdown conversion with newline optimization 61 | let markdownOutputLines = []; 62 | const inputLines = testText.split(/\n/); 63 | 64 | let currentParagraphCollector = []; 65 | 66 | const flushCurrentParagraph = () => { 67 | if (currentParagraphCollector.length > 0) { 68 | markdownOutputLines.push(currentParagraphCollector.join(' ').trim()); 69 | currentParagraphCollector = []; 70 | addSeparatorLine(markdownOutputLines); 71 | } 72 | }; 73 | 74 | for (let i = 0; i < inputLines.length; i++) { 75 | const trimmedLine = inputLines[i].trim(); 76 | 77 | if (trimmedLine === '') { 78 | flushCurrentParagraph(); 79 | continue; 80 | } 81 | 82 | // Simple header detection 83 | if (trimmedLine === 'Title Here') { 84 | flushCurrentParagraph(); 85 | markdownOutputLines.push(`# ${trimmedLine}`); 86 | addSeparatorLine(markdownOutputLines); 87 | continue; 88 | } 89 | 90 | // Regular text 91 | if (trimmedLine) { 92 | currentParagraphCollector.push(trimmedLine); 93 | } 94 | } 95 | 96 | flushCurrentParagraph(); 97 | 98 | // Apply the optimization 99 | const optimizedResult = normalizeMarkdownNewlines(markdownOutputLines); 100 | 101 | console.log('Original text length:', testText.length); 102 | console.log('Optimized result length:', optimizedResult.length); 103 | console.log('\nOptimized result:'); 104 | console.log('---'); 105 | console.log(optimizedResult); 106 | console.log('---'); 107 | 108 | // Count newline patterns 109 | const tripleNewlines = (optimizedResult.match(/\n{3,}/g) || []).length; 110 | const doubleNewlines = (optimizedResult.match(/\n{2}/g) || []).length; 111 | 112 | console.log('\nNewline analysis:'); 113 | console.log(` - Triple+ newlines: ${tripleNewlines} (should be 0)`); 114 | console.log(` - Double newlines: ${doubleNewlines}`); 115 | 116 | if (tripleNewlines === 0) { 117 | console.log('✅ Newline optimization working correctly'); 118 | } else { 119 | console.log('❌ Newline optimization needs improvement'); 120 | } 121 | 
122 | // Test post-processing simulation 123 | console.log('\n' + '='.repeat(50)); 124 | console.log('Testing post-processing optimization...'); 125 | 126 | const testPostProcessText = 'Text with filigature and unicode\u2018quotes\u2019 and bullets\u2022'; 127 | 128 | // Simulate the optimized post-processing 129 | const rules = [ 130 | { find: /[\u2018\u2019]/g, replace: "'" }, 131 | { find: /[\u2022\u2023\u25E6\u2043\u2219\u25CF\u25CB\u2981\u2619\u2765]/g, replace: '-' } 132 | ]; 133 | 134 | let processedText = testPostProcessText; 135 | for (const rule of rules) { 136 | processedText = processedText.replace(rule.find, rule.replace); 137 | } 138 | 139 | console.log('Original:', testPostProcessText); 140 | console.log('Processed:', processedText); 141 | 142 | if (processedText.includes("'quotes'") && processedText.includes('-')) { 143 | console.log('✅ Post-processing optimization working correctly'); 144 | } else { 145 | console.log('❌ Post-processing optimization needs review'); 146 | } 147 | 148 | return tripleNewlines === 0; 149 | } 150 | 151 | // Run the test 152 | const success = testNewlineOptimization(); 153 | console.log('\n' + '='.repeat(50)); 154 | console.log('🎉 Newline optimization test completed!'); 155 | console.log(success ? '✅ All tests passed' : '❌ Some tests failed'); 156 | 157 | process.exit(success ? 0 : 1); 158 | -------------------------------------------------------------------------------- /src/types/index.d.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * TypeScript definitions for Extract2MD 3 | */ 4 | 5 | // Core configuration interfaces 6 | export interface OCRConfig { 7 | language?: string; 8 | oem?: number; 9 | psm?: number; 10 | workerPath?: string; 11 | corePath?: string; 12 | langPath?: string; 13 | options?: any; 14 | } 15 | 16 | export interface WebLLMConfig { 17 | modelId?: string; 18 | temperature?: number; 19 | maxTokens?: number; 20 | streamingEnabled?: boolean; 21 | customModel?: CustomModelConfig; 22 | options?: any; 23 | } 24 | 25 | export interface PostProcessRule { 26 | find: RegExp | string; 27 | replace: string; 28 | } 29 | 30 | export interface ProgressReport { 31 | stage: string; 32 | message: string; 33 | currentPage?: number; 34 | totalPages?: number; 35 | progress?: number; 36 | usage?: any; 37 | error?: any; 38 | } 39 | 40 | export interface TesseractConfig { 41 | workerPath?: string; 42 | corePath?: string; 43 | langPath?: string; 44 | language?: string; 45 | options?: any; 46 | } 47 | 48 | export interface CustomModelConfig { 49 | model: string; 50 | model_id: string; 51 | model_lib: string; 52 | required_features?: string[]; 53 | overrides?: any; 54 | } 55 | 56 | export interface LLMConfig { 57 | model?: string; 58 | customModel?: CustomModelConfig; 59 | options?: { 60 | temperature?: number; 61 | maxTokens?: number; 62 | [key: string]: any; 63 | }; 64 | } 65 | 66 | export interface SystemPromptsConfig { 67 | singleExtraction?: string; 68 | combinedExtraction?: string; 69 | } 70 | 71 | export interface ProcessingConfig { 72 | splitPascalCase?: boolean; 73 | pdfRenderScale?: number; 74 | postProcessRules?: PostProcessRule[]; 75 | } 76 | 77 | export interface Extract2MDConfig { 78 | pdfJsWorkerSrc?: string; 79 | tesseract?: TesseractConfig; 80 | llm?: LLMConfig; 81 | systemPrompts?: SystemPromptsConfig; 82 | processing?: ProcessingConfig; 83 | progressCallback?: (report: ProgressReport) => void; 84 | } 85 | 86 | export interface WebLLMEngineConfig { 87 | progressCallback?: (report: 
ProgressReport) => void;
88 |     defaultModel?: string;
89 |     customModelConfig?: CustomModelConfig;
90 | }
91 |
92 | export interface GenerationOptions {
93 |     temperature?: number;
94 |     maxTokens?: number;
95 |     [key: string]: any;
96 | }
97 |
98 | export interface ModelInfo {
99 |     isInitialized: boolean;
100 |     currentModelId: string | null;
101 |     isReady: boolean;
102 | }
103 |
104 | export interface ValidationResult {
105 |     isValid: boolean;
106 |     issues: string[];
107 | }
108 |
109 | export class WebLLMEngine {
110 |     constructor(config?: WebLLMEngineConfig);
111 |
112 |     initialize(modelId?: string | null, modelConfig?: any): Promise<void>;
113 |     generate(prompt: string, options?: GenerationOptions): Promise<string>;
114 |     generateStream(
115 |         prompt: string,
116 |         options?: GenerationOptions,
117 |         onChunk?: (chunk: string, fullResponse: string) => void
118 |     ): Promise<string>;
119 |     isReady(): boolean;
120 |     getModelInfo(): ModelInfo;
121 |     cleanup(): Promise<void>;
122 | }
123 |
124 | export class OutputParser {
125 |     constructor();
126 |
127 |     parse(rawOutput: string): string;
128 |     removeThinkingBlocks(text: string): string;
129 |     applyCleanupPatterns(text: string): string;
130 |     ensureMarkdownStructure(text: string): string;
131 |     extractMarkdownContent(text: string): string;
132 |     validateMarkdown(text: string): ValidationResult;
133 |     applyCustomRules(text: string, customRules?: PostProcessRule[]): string;
134 | }
135 |
136 | export class SystemPrompts {
137 |     static getSingleExtractionPrompt(customization?: string): string;
138 |     static getCombinedExtractionPrompt(customization?: string): string;
139 |     static getSingleExtractionUserPrompt(extractedText: string): string;
140 |     static getCombinedExtractionUserPrompt(quickExtraction: string, ocrExtraction: string): string;
141 |     static buildSystemPrompt(scenarioType: 'single' | 'combined', customization?: string): string;
142 |     static buildUserPrompt(scenarioType: 'single' | 'combined', ...extractionResults: string[]): string;
143 |     static getThinkingEnabledPrompt(basePrompt: string): string;
144 | }
145 |
146 | export class ConfigValidator {
147 |     static getDefaultConfig(): Extract2MDConfig;
148 |     static validate(config?: any): Extract2MDConfig;
149 |     static validateTesseractConfig(tesseractConfig: any): void;
150 |     static validateLLMConfig(llmConfig: any): void;
151 |     static validateCustomModel(customModel: any): void;
152 |     static validateLLMOptions(options: any): void;
153 |     static validateProcessingConfig(processingConfig: any): void;
154 |     static validateSystemPrompts(systemPrompts: any): void;
155 |     static deepMerge(target: any, source: any): any;
156 |     static isObject(value: any): boolean;
157 |     static fromJSON(jsonString: string): Extract2MDConfig;
158 |     static getSchema(): any;
159 | }
160 |
161 | export class Extract2MDConverter {
162 |     constructor(config?: Extract2MDConfig);
163 |
164 |     // Scenario-specific static methods
165 |     static quickConvertOnly(pdfFile: File, options?: Extract2MDConfig): Promise<string>;
166 |     static highAccuracyConvertOnly(pdfFile: File, options?: Extract2MDConfig): Promise<string>;
167 |     static quickConvertWithLLM(pdfFile: File, options?: Extract2MDConfig): Promise<string>;
168 |     static highAccuracyConvertWithLLM(pdfFile: File, options?: Extract2MDConfig): Promise<string>;
169 |     static combinedConvertWithLLM(pdfFile: File, options?: Extract2MDConfig): Promise<string>;
170 | }
171 |
172 | // Legacy support - keeping the old interface available
173 | export interface Extract2MDOptions extends Extract2MDConfig {}
174 |
175 | export interface ConvertOptions {
176 | 
postProcessRules?: PostProcessRule[]; 177 | } 178 | 179 | export interface HighAccuracyConvertOptions extends ConvertOptions { 180 | tesseractLanguage?: string; 181 | tesseractOptions?: any; 182 | pdfRenderScale?: number; 183 | } 184 | 185 | export interface LLMRewriteOptions { 186 | llmModel?: string; 187 | llmPromptTemplate?: (text: string) => string; 188 | chatOpts?: any; 189 | } 190 | 191 | // Legacy class for backwards compatibility 192 | export class LegacyExtract2MDConverter { 193 | constructor(options?: Extract2MDOptions); 194 | 195 | quickConvert(pdfFile: File, options?: ConvertOptions): Promise; 196 | highAccuracyConvert(pdfFile: File, options?: HighAccuracyConvertOptions): Promise; 197 | llmRewrite(textToRewrite: string, options?: LLMRewriteOptions): Promise; 198 | unloadLLM(): Promise; 199 | } 200 | 201 | // Default export 202 | export default Extract2MDConverter; 203 | -------------------------------------------------------------------------------- /src/utils/SystemPrompts.js: -------------------------------------------------------------------------------- 1 | /** 2 | * SystemPrompts.js 3 | * System prompts for different LLM rewrite scenarios 4 | */ 5 | 6 | export class SystemPrompts { 7 | /** 8 | * Base system prompt for single extraction method scenarios (3 & 4) 9 | */ 10 | static getSingleExtractionPrompt(customization = '') { 11 | const basePrompt = `You are an expert text editor specializing in converting extracted PDF content into clean, well-formatted Markdown. Your task is to: 12 | 13 | 1. **Preserve Original Content**: Maintain all original information, context, and meaning 14 | 2. **Improve Clarity**: Enhance readability and flow while keeping the professional tone 15 | 3. **Fix Errors**: Correct grammatical errors, spelling mistakes in common words (preserve proper nouns, names, places, brands) 16 | 4. **Structure Enhancement**: Organize content with appropriate Markdown formatting (headers, lists, emphasis, code blocks, etc.) 17 | 5. **Remove Artifacts**: Clean up PDF extraction artifacts like weird spacing, broken words, or formatting issues 18 | 19 | **Important Guidelines:** 20 | - Do not change technical terms, names, places, or brand names 21 | - Maintain the original document structure and hierarchy 22 | - Use proper Markdown syntax for formatting 23 | - Do not add information that wasn't in the original text 24 | - Output ONLY the improved Markdown content 25 | 26 | The text you receive was extracted from a PDF and may contain formatting issues or extraction artifacts.`; 27 | 28 | return customization ? `${basePrompt}\n\n**Additional Instructions:**\n${customization}` : basePrompt; 29 | } 30 | 31 | /** 32 | * Specialized system prompt for combined extraction scenarios (scenario 5) 33 | */ 34 | static getCombinedExtractionPrompt(customization = '') { 35 | const basePrompt = `You are an expert text editor specializing in creating a single, comprehensive Markdown document by intelligently combining content from two different PDF extraction sources. Your goal is to produce the most complete, accurate, and well-formatted Markdown output possible. 36 | 37 | You will receive content extracted using two methods. Your task is to: 38 | 39 | 1. **Synthesize Information**: Intelligently merge the content from both extraction sources. Prioritize completeness and accuracy, ensuring no critical information, context, or meaning is lost from either source. 40 | 2. 
**Preserve Entities and Relationships**: Pay special attention to accurately retaining all names, places, dates, objects, technical terms, brand names, and the relationships between them. 41 | 3. **Enhance Clarity and Structure**: Improve readability and flow. Organize the combined content with appropriate Markdown formatting (headers, lists, emphasis, code blocks, etc.) to create a unified and coherent document. 42 | 4. **Correct Errors and Artifacts**: Fix grammatical errors and spelling mistakes in common words. Preserve proper nouns and specialized terms. Clean up PDF extraction artifacts (e.g., weird spacing, broken words, formatting issues) from both sources. 43 | 5. **Avoid Over-Summarization**: The primary goal is comprehensive extraction and combination, not summarization. Retain all details unless they are clear duplicates. 44 | 45 | **Important Guidelines:** 46 | - Do not change technical terms, names, places, or brand names unless correcting an obvious extraction error. 47 | - Create a single, unified Markdown document. 48 | - Use proper Markdown syntax for all formatting. 49 | - Do not add any information that was not present in the original texts. 50 | - Output ONLY the combined and improved Markdown content. Do not include any explanations, categorizations, or section titles like "Combined Results" or similar. 51 | 52 | The two sets of extracted text will be provided. Your task is to process them and return a single block of clean Markdown.`; 53 | 54 | return customization ? `${basePrompt}\n\n**Additional Instructions:**\n${customization}` : basePrompt; 55 | } 56 | 57 | /** 58 | * Get user prompt for single extraction scenarios 59 | */ 60 | static getSingleExtractionUserPrompt(extractedText) { 61 | return `Please improve and format the following extracted PDF content into clean Markdown: 62 | 63 | **Extracted Content:** 64 | ${extractedText} 65 | 66 | **Improved Markdown:**`; 67 | } 68 | 69 | /** 70 | * Get user prompt for combined extraction scenarios 71 | */ 72 | static getCombinedExtractionUserPrompt(quickExtraction, ocrExtraction) { 73 | return `Please combine, improve, and format the following two sets of extracted PDF content into a single, clean Markdown document: 74 | 75 | **Source 1 Extracted Content:** 76 | ${quickExtraction} 77 | 78 | **Source 2 Extracted Content:** 79 | ${ocrExtraction} 80 | 81 | **Combined and Improved Markdown:**`; 82 | } 83 | 84 | /** 85 | * Build complete system prompt with customizations 86 | */ 87 | static buildSystemPrompt(scenarioType, customization = '') { 88 | switch (scenarioType) { 89 | case 'single': 90 | return this.getSingleExtractionPrompt(customization); 91 | case 'combined': 92 | return this.getCombinedExtractionPrompt(customization); 93 | default: 94 | throw new Error(`Unknown scenario type: ${scenarioType}`); 95 | } 96 | } 97 | 98 | /** 99 | * Build complete user prompt based on scenario 100 | */ 101 | static buildUserPrompt(scenarioType, ...extractionResults) { 102 | switch (scenarioType) { 103 | case 'single': 104 | if (extractionResults.length !== 1) { 105 | throw new Error('Single extraction scenario requires exactly one extraction result'); 106 | } 107 | return this.getSingleExtractionUserPrompt(extractionResults[0]); 108 | case 'combined': 109 | if (extractionResults.length !== 2) { 110 | throw new Error('Combined extraction scenario requires exactly two extraction results'); 111 | } 112 | return this.getCombinedExtractionUserPrompt(extractionResults[0], extractionResults[1]); 113 | default: 114 | throw new 
Error(`Unknown scenario type: ${scenarioType}`);
115 |         }
116 |     }
117 |
118 |     /**
119 |      * Get thinking-enabled prompt variations for models that support it
120 |      */
121 |     static getThinkingEnabledPrompt(basePrompt) {
122 |         return `${basePrompt}
123 |
124 | Take time to think through your approach before providing the final output. Consider the extraction quality, potential issues, and the best way to structure the content.`;
125 |     }
126 | }
127 |
128 | export default SystemPrompts;
129 |
--------------------------------------------------------------------------------
/src/utils/OutputParser.js:
--------------------------------------------------------------------------------
1 | /**
2 |  * OutputParser.js
3 |  * Utility for parsing and cleaning LLM output
4 |  * Removes thinking tags and ensures proper markdown formatting
5 |  */
6 |
7 | export class OutputParser {
8 |     constructor() {
9 |         // Regex pattern to match <think>...</think> blocks with optional line breaks
10 |         this.thinkRegex = /<think>.*?<\/think>\n?\n?/gs;
11 |
12 |         // Patterns for cleaning up common LLM output issues
13 |         this.cleanupPatterns = [
14 |             // Remove excessive whitespace
15 |             { find: /\n{3,}/g, replace: '\n\n' },
16 |             // Fix spacing around headers
17 |             { find: /^(#{1,6})\s*(.+)$/gm, replace: '$1 $2' },
18 |             // Ensure proper list formatting
19 |             { find: /^(\s*[-*+])\s+/gm, replace: '$1 ' },
20 |             // Clean up numbered lists
21 |             { find: /^(\s*\d+\.)\s+/gm, replace: '$1 ' },
22 |             // Remove trailing spaces
23 |             { find: /[ \t]+$/gm, replace: '' },
24 |             // Normalize line endings
25 |             { find: /\r\n/g, replace: '\n' },
26 |             // Clean up code block formatting
27 |             { find: /```\s*\n\s*\n/g, replace: '```\n' },
28 |             { find: /\n\s*\n\s*```/g, replace: '\n```' }
29 |         ];
30 |     }
31 |
32 |     /**
33 |      * Parse LLM output by removing thinking blocks and cleaning formatting
34 |      * @param {string} rawOutput - Raw output from LLM
35 |      * @returns {string} Cleaned and formatted output
36 |      */
37 |     parse(rawOutput) {
38 |         if (!rawOutput || typeof rawOutput !== 'string') {
39 |             return '';
40 |         }
41 |
42 |         let cleanedOutput = rawOutput;
43 |
44 |         // Step 1: Remove <think>...</think> blocks
45 |         cleanedOutput = this.removeThinkingBlocks(cleanedOutput);
46 |
47 |         // Step 2: Apply general cleanup patterns
48 |         cleanedOutput = this.applyCleanupPatterns(cleanedOutput);
49 |
50 |         // Step 3: Ensure proper markdown structure
51 |         cleanedOutput = this.ensureMarkdownStructure(cleanedOutput);
52 |
53 |         return cleanedOutput.trim();
54 |     }
55 |
56 |     /**
57 |      * Remove <think>...</think> blocks from the output
58 |      * @param {string} text - Input text
59 |      * @returns {string} Text with thinking blocks removed
60 |      */
61 |     removeThinkingBlocks(text) {
62 |         // Remove all <think>...</think> blocks including optional line breaks after
63 |         let cleaned = text.replace(this.thinkRegex, '');
64 |
65 |         // Clean up any remaining empty lines at the start
66 |         cleaned = cleaned.replace(/^\s*\n+/, '');
67 |
68 |         return cleaned;
69 |     }
70 |
71 |     /**
72 |      * Apply general cleanup patterns to improve formatting
73 |      * @param {string} text - Input text
74 |      * @returns {string} Cleaned text
75 |      */
76 |     applyCleanupPatterns(text) {
77 |         let cleaned = text;
78 |
79 |         for (const pattern of this.cleanupPatterns) {
80 |             cleaned = cleaned.replace(pattern.find, pattern.replace);
81 |         }
82 |
83 |         return cleaned;
84 |     }
85 |
86 |     /**
87 |      * Ensure proper markdown structure and formatting
88 |      * @param {string} text - Input text
89 |      * @returns {string} Properly structured markdown
90 |      */
91 |     ensureMarkdownStructure(text) {
92 |         let structured = text;
93 |
94 |         // Ensure headers have proper spacing
95 |         structured = structured.replace(/^(#{1,6}\s.+)$/gm, (match, header) => {
96 |             return `\n${header}\n`;
97 |         });
98 |
99 |         // Ensure code blocks have proper spacing
100 |         structured = structured.replace(/^```/gm, '\n```');
101 |         structured = structured.replace(/```$/gm, '```\n');
102 |
103 |         // Ensure lists have proper spacing
104 |         structured = structured.replace(/^(\s*[-*+]\s.+)$/gm, (match, listItem, offset, string) => {
105 |             const prevChar = string[offset - 1];
106 |             return prevChar && prevChar !== '\n' ? `\n${listItem}` : listItem;
107 |         });
108 |
109 |         // Final cleanup of excessive line breaks
110 |         structured = structured.replace(/\n{3,}/g, '\n\n');
111 |
112 |         return structured;
113 |     }
114 |
115 |     /**
116 |      * Extract only the markdown content, removing any prefacing text
117 |      * @param {string} text - Input text
118 |      * @returns {string} Pure markdown content
119 |      */
120 |     extractMarkdownContent(text) {
121 |         // Look for common LLM response patterns and extract just the markdown
122 |         const patterns = [
123 |             // "Here's the rewritten text:" followed by content
124 |             /(?:here'?s?\s+(?:the\s+)?(?:rewritten|improved|cleaned|formatted)\s+(?:text|content|version)[:.]?\s*\n)(.*)/is,
125 |             // "Rewritten text:" followed by content
126 |             /(?:rewritten\s+text[:.]?\s*\n)(.*)/is,
127 |             // Just return the whole thing if no pattern matches
128 |             /(.*)/s
129 |         ];
130 |
131 |         for (const pattern of patterns) {
132 |             const match = text.match(pattern);
133 |             if (match && match[1]) {
134 |                 return match[1].trim();
135 |             }
136 |         }
137 |
138 |         return text.trim();
139 |     }
140 |
141 |     /**
142 |      * Validate if the output is properly formatted markdown
143 |      * @param {string} text - Text to validate
144 |      * @returns {Object} Validation result with status and issues
145 |      */
146 |     validateMarkdown(text) {
147 |         const issues = [];
148 |
149 |         // Check for common markdown issues
150 |         if (text.includes('<think>')) {
151 |             issues.push('Contains thinking blocks that should be removed');
152 |         }
153 |
154 |         // Check for malformed headers
155 |         const malformedHeaders = text.match(/^#{7,}/gm);
156 |         if (malformedHeaders) {
157 |             issues.push('Contains headers with too many # symbols');
158 |         }
159 |
160 |         // Check for unclosed code blocks
161 |         const codeBlockCount = (text.match(/```/g) || []).length;
162 |         if (codeBlockCount % 2 !== 0) {
163 |             issues.push('Contains unclosed code blocks');
164 |         }
165 |
166 |         // Check for excessive line breaks
167 |         if (text.includes('\n\n\n\n')) {
168 |             issues.push('Contains excessive line breaks');
169 |         }
170 |
171 |         return {
172 |             isValid: issues.length === 0,
173 |             issues: issues
174 |         };
175 | 
} 176 | 177 | /** 178 | * Apply custom parsing rules 179 | * @param {string} text - Input text 180 | * @param {Array} customRules - Array of custom parsing rules 181 | * @returns {string} Text with custom rules applied 182 | */ 183 | applyCustomRules(text, customRules = []) { 184 | if (!Array.isArray(customRules) || customRules.length === 0) { 185 | return text; 186 | } 187 | 188 | let processed = text; 189 | 190 | for (const rule of customRules) { 191 | if (rule.find && typeof rule.replace === 'string') { 192 | processed = processed.replace(rule.find, rule.replace); 193 | } 194 | } 195 | 196 | return processed; 197 | } 198 | } 199 | 200 | export default OutputParser; 201 | -------------------------------------------------------------------------------- /MIGRATION.md: -------------------------------------------------------------------------------- 1 | # Migration Guide: Extract2MD v1.0.6 2 | 3 | This guide helps you migrate from the legacy Extract2MD API to the new scenario-based API introduced in v1.0.6. 4 | 5 | ## Overview of Changes 6 | 7 | The Extract2MD package has been restructured to provide clear, scenario-specific methods instead of a single class with multiple configuration options. This makes the API more intuitive and provides better TypeScript support. 8 | 9 | ## Key Changes 10 | 11 | ### Before (Legacy API) 12 | ```javascript 13 | import Extract2MD from 'extract2md'; 14 | 15 | const converter = new Extract2MD(); 16 | const result = await converter.convertPDFToMarkdown(pdfFile, { 17 | useOCR: true, 18 | useLLM: false, 19 | ocrLanguage: 'eng' 20 | }); 21 | ``` 22 | 23 | ### After (New API) 24 | ```javascript 25 | import { Extract2MDConverter } from 'extract2md'; 26 | 27 | const result = await Extract2MDConverter.quickConvertOnly(pdfFile, { 28 | ocr: { 29 | language: 'eng', 30 | oem: 1, 31 | psm: 6 32 | } 33 | }); 34 | ``` 35 | 36 | ## Migration by Use Case 37 | 38 | ### 1. Basic PDF Text Extraction (No OCR, No LLM) 39 | 40 | **Legacy:** 41 | ```javascript 42 | const converter = new Extract2MD(); 43 | const result = await converter.convertPDFToMarkdown(pdfFile, { 44 | useOCR: false, 45 | useLLM: false 46 | }); 47 | ``` 48 | 49 | **New:** 50 | ```javascript 51 | const result = await Extract2MDConverter.quickConvertOnly(pdfFile, { 52 | // OCR config is optional - will use PDF text extraction only 53 | }); 54 | ``` 55 | 56 | ### 2. PDF with OCR (No LLM) 57 | 58 | **Legacy:** 59 | ```javascript 60 | const converter = new Extract2MD(); 61 | const result = await converter.convertPDFToMarkdown(pdfFile, { 62 | useOCR: true, 63 | useLLM: false, 64 | ocrLanguage: 'eng', 65 | ocrPSM: 6 66 | }); 67 | ``` 68 | 69 | **New - Quick OCR:** 70 | ```javascript 71 | const result = await Extract2MDConverter.quickConvertOnly(pdfFile, { 72 | ocr: { 73 | language: 'eng', 74 | oem: 1, 75 | psm: 6 76 | } 77 | }); 78 | ``` 79 | 80 | **New - High Accuracy OCR:** 81 | ```javascript 82 | const result = await Extract2MDConverter.highAccuracyConvertOnly(pdfFile, { 83 | ocr: { 84 | language: 'eng', 85 | oem: 1, 86 | psm: 8 87 | } 88 | }); 89 | ``` 90 | 91 | ### 3. 
PDF with OCR and LLM Rewrite 92 | 93 | **Legacy:** 94 | ```javascript 95 | const converter = new Extract2MD(); 96 | const result = await converter.convertPDFToMarkdown(pdfFile, { 97 | useOCR: true, 98 | useLLM: true, 99 | ocrLanguage: 'eng', 100 | llmModel: 'Llama-3.2-1B-Instruct-q4f16_1-MLC', 101 | llmTemperature: 0.7 102 | }); 103 | ``` 104 | 105 | **New - Quick OCR + LLM:** 106 | ```javascript 107 | const result = await Extract2MDConverter.quickConvertWithLLM(pdfFile, { 108 | ocr: { 109 | language: 'eng', 110 | oem: 1, 111 | psm: 6 112 | }, 113 | webllm: { 114 | modelId: 'Llama-3.2-1B-Instruct-q4f16_1-MLC', 115 | temperature: 0.7, 116 | maxTokens: 4000, 117 | streamingEnabled: false 118 | } 119 | }); 120 | ``` 121 | 122 | **New - High Accuracy OCR + LLM:** 123 | ```javascript 124 | const result = await Extract2MDConverter.highAccuracyConvertWithLLM(pdfFile, { 125 | ocr: { 126 | language: 'eng', 127 | oem: 1, 128 | psm: 8 129 | }, 130 | webllm: { 131 | modelId: 'Llama-3.2-1B-Instruct-q4f16_1-MLC', 132 | temperature: 0.7, 133 | maxTokens: 4000, 134 | streamingEnabled: false 135 | } 136 | }); 137 | ``` 138 | 139 | ### 4. Combined Extraction Methods with LLM 140 | 141 | This is a new feature not available in the legacy API: 142 | 143 | ```javascript 144 | const result = await Extract2MDConverter.combinedConvertWithLLM(pdfFile, { 145 | ocr: { 146 | language: 'eng', 147 | oem: 1, 148 | psm: 6 149 | }, 150 | webllm: { 151 | modelId: 'Llama-3.2-1B-Instruct-q4f16_1-MLC', 152 | temperature: 0.7, 153 | maxTokens: 4000, 154 | streamingEnabled: false 155 | } 156 | }); 157 | ``` 158 | 159 | ## Configuration Changes 160 | 161 | ### OCR Configuration 162 | 163 | **Legacy:** 164 | ```javascript 165 | { 166 | ocrLanguage: 'eng', 167 | ocrPSM: 6, 168 | ocrOEM: 1 169 | } 170 | ``` 171 | 172 | **New:** 173 | ```javascript 174 | { 175 | ocr: { 176 | language: 'eng', 177 | psm: 6, 178 | oem: 1 179 | } 180 | } 181 | ``` 182 | 183 | ### LLM Configuration 184 | 185 | **Legacy:** 186 | ```javascript 187 | { 188 | llmModel: 'Llama-3.2-1B-Instruct-q4f16_1-MLC', 189 | llmTemperature: 0.7, 190 | llmMaxTokens: 4000 191 | } 192 | ``` 193 | 194 | **New:** 195 | ```javascript 196 | { 197 | webllm: { 198 | modelId: 'Llama-3.2-1B-Instruct-q4f16_1-MLC', 199 | temperature: 0.7, 200 | maxTokens: 4000, 201 | streamingEnabled: false 202 | } 203 | } 204 | ``` 205 | 206 | ## Backwards Compatibility 207 | 208 | The legacy API is still available for backwards compatibility: 209 | 210 | ```javascript 211 | import { LegacyExtract2MDConverter } from 'extract2md'; 212 | 213 | const converter = new LegacyExtract2MDConverter(); 214 | // Use old API as before 215 | ``` 216 | 217 | **Note:** The legacy API is deprecated and will be removed in v2.0.0. Please migrate to the new API. 218 | 219 | ## Benefits of the New API 220 | 221 | 1. **Clear Scenarios**: Each method has a specific purpose, making it easier to choose the right approach 222 | 2. **Better TypeScript Support**: Full type definitions for all configurations and return types 223 | 3. **Modular Architecture**: Better code organization and maintainability 224 | 4. **Configuration Validation**: Built-in validation for all configuration options 225 | 5. **Improved Error Handling**: More specific error messages and better error recovery 226 | 6. 
**Better Documentation**: Each scenario method is well-documented with examples 227 | 228 | ## Configuration Files 229 | 230 | You can now use external configuration files: 231 | 232 | ```javascript 233 | // config.json 234 | { 235 | "ocr": { 236 | "language": "eng", 237 | "oem": 1, 238 | "psm": 6 239 | }, 240 | "webllm": { 241 | "modelId": "Llama-3.2-1B-Instruct-q4f16_1-MLC", 242 | "temperature": 0.7, 243 | "maxTokens": 4000, 244 | "streamingEnabled": false 245 | } 246 | } 247 | ``` 248 | 249 | ```javascript 250 | // In your code 251 | const result = await Extract2MDConverter.quickConvertWithLLM(pdfFile, 'config.json'); 252 | ``` 253 | 254 | ## Error Handling 255 | 256 | The new API provides more specific error types: 257 | 258 | ```javascript 259 | try { 260 | const result = await Extract2MDConverter.quickConvertOnly(pdfFile, config); 261 | } catch (error) { 262 | if (error.name === 'ConfigurationError') { 263 | console.error('Configuration issue:', error.message); 264 | } else if (error.name === 'OCRError') { 265 | console.error('OCR processing failed:', error.message); 266 | } else if (error.name === 'WebLLMError') { 267 | console.error('LLM processing failed:', error.message); 268 | } else { 269 | console.error('General error:', error.message); 270 | } 271 | } 272 | ``` 273 | 274 | ## Testing Your Migration 275 | 276 | Use the provided test files to validate your migration: 277 | 278 | ```javascript 279 | import { Extract2MDTests } from 'extract2md/test/scenarios.test.js'; 280 | 281 | const tests = new Extract2MDTests(); 282 | await tests.runBasicTests(); 283 | 284 | // With a PDF file 285 | await tests.runFullTests(pdfFile); 286 | ``` 287 | 288 | ## Need Help? 289 | 290 | - Check the [examples](./examples/) folder for complete usage examples 291 | - See the [README.md](./README.md) for full API documentation 292 | - Open an issue on GitHub if you encounter migration problems 293 | 294 | ## Timeline 295 | 296 | - **v1.0.6**: New API introduced, legacy API deprecated 297 | - **v1.1.0**: Legacy API will show deprecation warnings 298 | - **v2.0.0**: Legacy API will be removed (planned for 6 months after v1.0.6) 299 | 300 | Migrate to the new API as soon as possible to take advantage of the improved features and ensure compatibility with future versions. 
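
## Progress Reporting

Both the legacy and the new API accept a `progressCallback` for surfacing conversion progress in your UI. The sketch below is a minimal example based on the `ProgressReport` shape in `src/types/index.d.ts`; which fields are populated at each stage (and the stage names themselves) depend on the scenario you run, so treat it as a starting point rather than a reference implementation:

```javascript
import { Extract2MDConverter } from 'extract2md';

const result = await Extract2MDConverter.quickConvertWithLLM(pdfFile, {
  ocr: { language: 'eng', oem: 1, psm: 6 },
  webllm: { modelId: 'Llama-3.2-1B-Instruct-q4f16_1-MLC', temperature: 0.7 },
  // Invoked throughout PDF extraction, OCR, and the LLM rewrite.
  progressCallback: (report) => {
    const pages = report.totalPages ? ` (page ${report.currentPage}/${report.totalPages})` : '';
    console.log(`[${report.stage}] ${report.message}${pages}`);
  }
});
```
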
301 | -------------------------------------------------------------------------------- /test/scenarios.test.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Test file for validating Extract2MD scenarios 3 | * This is a basic validation test - for production, consider using a testing framework like Jest 4 | */ 5 | 6 | import { Extract2MDConverter } from '../src/index.js'; 7 | 8 | class Extract2MDTests { 9 | constructor() { 10 | this.testResults = []; 11 | this.testPdf = null; // Will be set via file input in demo 12 | } 13 | 14 | log(message, type = 'info') { 15 | console.log(`[${type.toUpperCase()}] ${message}`); 16 | this.testResults.push({ message, type, timestamp: new Date().toISOString() }); 17 | } 18 | 19 | async runBasicTests() { 20 | this.log('Starting Extract2MD basic tests...'); 21 | 22 | try { 23 | // Test 1: Check if all scenario methods exist 24 | this.testScenarioMethodsExist(); 25 | 26 | // Test 2: Check configuration validation 27 | await this.testConfigurationValidation(); 28 | 29 | this.log('Basic tests completed successfully!', 'success'); 30 | } catch (error) { 31 | this.log(`Basic tests failed: ${error.message}`, 'error'); 32 | throw error; 33 | } 34 | } 35 | 36 | testScenarioMethodsExist() { 37 | this.log('Testing scenario methods existence...'); 38 | 39 | const requiredMethods = [ 40 | 'quickConvertOnly', 41 | 'highAccuracyConvertOnly', 42 | 'quickConvertWithLLM', 43 | 'highAccuracyConvertWithLLM', 44 | 'combinedConvertWithLLM' 45 | ]; 46 | 47 | for (const method of requiredMethods) { 48 | if (typeof Extract2MDConverter[method] !== 'function') { 49 | throw new Error(`Method ${method} does not exist or is not a function`); 50 | } 51 | } 52 | 53 | this.log(`All ${requiredMethods.length} scenario methods exist`, 'success'); 54 | } 55 | 56 | async testConfigurationValidation() { 57 | this.log('Testing configuration validation...'); 58 | 59 | // Test valid configuration 60 | const validConfig = { 61 | ocr: { 62 | language: 'eng', 63 | oem: 1, 64 | psm: 6 65 | }, 66 | webllm: { 67 | modelId: 'Llama-3.2-1B-Instruct-q4f16_1-MLC', 68 | temperature: 0.7, 69 | maxTokens: 4000, 70 | streamingEnabled: false 71 | } 72 | }; 73 | 74 | try { 75 | // This should not throw 76 | const result = await Extract2MDConverter.quickConvertOnly( 77 | null, // No actual PDF for basic test 78 | validConfig, 79 | true // dry run mode (if implemented) 80 | ); 81 | this.log('Configuration validation passed', 'success'); 82 | } catch (error) { 83 | if (error.message.includes('PDF file is required')) { 84 | this.log('Configuration validation passed (expected PDF error)', 'success'); 85 | } else { 86 | throw error; 87 | } 88 | } 89 | 90 | // Test invalid configuration 91 | try { 92 | await Extract2MDConverter.quickConvertOnly(null, { invalid: 'config' }); 93 | throw new Error('Should have thrown validation error'); 94 | } catch (error) { 95 | if (error.message.includes('validation') || error.message.includes('PDF file is required')) { 96 | this.log('Invalid configuration properly rejected', 'success'); 97 | } else { 98 | throw error; 99 | } 100 | } 101 | } 102 | 103 | async runFullTests(pdfFile) { 104 | if (!pdfFile) { 105 | this.log('No PDF file provided for full tests', 'warning'); 106 | return; 107 | } 108 | 109 | this.log('Starting full Extract2MD tests with PDF file...'); 110 | this.testPdf = pdfFile; 111 | 112 | try { 113 | // Test Scenario 1: Quick Convert Only 114 | await this.testScenario1(); 115 | 116 | // Test Scenario 2: High Accuracy Convert 
Only 117 | await this.testScenario2(); 118 | 119 | // Note: LLM scenarios would require actual model loading which takes time 120 | // For now, we'll just test that they don't throw immediate errors 121 | await this.testLLMScenariosBasic(); 122 | 123 | this.log('Full tests completed successfully!', 'success'); 124 | } catch (error) { 125 | this.log(`Full tests failed: ${error.message}`, 'error'); 126 | throw error; 127 | } 128 | } 129 | 130 | async testScenario1() { 131 | this.log('Testing Scenario 1: Quick Convert Only...'); 132 | 133 | const config = { 134 | ocr: { 135 | language: 'eng', 136 | oem: 1, 137 | psm: 6 138 | } 139 | }; 140 | 141 | const result = await Extract2MDConverter.quickConvertOnly(this.testPdf, config); 142 | 143 | if (!result || typeof result !== 'string') { 144 | throw new Error('Scenario 1 should return a string result'); 145 | } 146 | 147 | this.log(`Scenario 1 completed. Result length: ${result.length} characters`, 'success'); 148 | } 149 | 150 | async testScenario2() { 151 | this.log('Testing Scenario 2: High Accuracy Convert Only...'); 152 | 153 | const config = { 154 | ocr: { 155 | language: 'eng', 156 | oem: 1, 157 | psm: 8 // Different PSM for high accuracy 158 | } 159 | }; 160 | 161 | const result = await Extract2MDConverter.highAccuracyConvertOnly(this.testPdf, config); 162 | 163 | if (!result || typeof result !== 'string') { 164 | throw new Error('Scenario 2 should return a string result'); 165 | } 166 | 167 | this.log(`Scenario 2 completed. Result length: ${result.length} characters`, 'success'); 168 | } 169 | 170 | async testLLMScenariosBasic() { 171 | this.log('Testing LLM scenarios (basic validation only)...'); 172 | 173 | const config = { 174 | ocr: { 175 | language: 'eng', 176 | oem: 1, 177 | psm: 6 178 | }, 179 | webllm: { 180 | modelId: 'Llama-3.2-1B-Instruct-q4f16_1-MLC', 181 | temperature: 0.7, 182 | maxTokens: 1000, 183 | streamingEnabled: false 184 | } 185 | }; 186 | 187 | // Test that LLM scenarios at least start without immediate errors 188 | try { 189 | // These will likely fail at model loading, but should not have immediate syntax errors 190 | const scenarios = [ 191 | 'quickConvertWithLLM', 192 | 'highAccuracyConvertWithLLM', 193 | 'combinedConvertWithLLM' 194 | ]; 195 | 196 | for (const scenario of scenarios) { 197 | this.log(`Testing ${scenario} initialization...`); 198 | 199 | try { 200 | // Start the process but don't wait for completion (model loading takes time) 201 | const promise = Extract2MDConverter[scenario](this.testPdf, config); 202 | 203 | // Give it a moment to start, then we'll assume it's working if no immediate error 204 | setTimeout(() => { 205 | this.log(`${scenario} started successfully`, 'success'); 206 | }, 100); 207 | 208 | // Don't await full completion for basic test 209 | break; // Only test one scenario to avoid model loading overhead 210 | } catch (error) { 211 | if (error.message.includes('model') || error.message.includes('WebLLM')) { 212 | this.log(`${scenario} - model loading issue (expected): ${error.message}`, 'warning'); 213 | } else { 214 | throw error; 215 | } 216 | } 217 | } 218 | } catch (error) { 219 | this.log(`LLM scenarios basic test error: ${error.message}`, 'warning'); 220 | } 221 | } 222 | 223 | getTestResults() { 224 | return this.testResults; 225 | } 226 | 227 | clearResults() { 228 | this.testResults = []; 229 | } 230 | } 231 | 232 | // Export for use in demo 233 | export { Extract2MDTests }; 234 | 235 | // Auto-run basic tests if this file is run directly 236 | if (typeof window === 
'undefined') { 237 | const tests = new Extract2MDTests(); 238 | tests.runBasicTests().catch(console.error); 239 | } 240 | -------------------------------------------------------------------------------- /scripts/validate-deployment.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | /** 4 | * Deployment validation script for Extract2MD 5 | * This script validates the package is ready for deployment 6 | */ 7 | 8 | import fs from 'fs'; 9 | import path from 'path'; 10 | import { fileURLToPath } from 'url'; 11 | 12 | const __filename = fileURLToPath(import.meta.url); 13 | const __dirname = path.dirname(__filename); 14 | 15 | class DeploymentValidator { 16 | constructor() { 17 | this.errors = []; 18 | this.warnings = []; 19 | this.success = []; 20 | } 21 | 22 | log(message, type = 'info') { 23 | const timestamp = new Date().toISOString(); 24 | const prefix = type === 'error' ? '❌' : type === 'warning' ? '⚠️' : '✅'; 25 | console.log(`${prefix} [${timestamp}] ${message}`); 26 | 27 | if (type === 'error') this.errors.push(message); 28 | else if (type === 'warning') this.warnings.push(message); 29 | else this.success.push(message); 30 | } 31 | 32 | async validate() { 33 | console.log('🚀 Starting Extract2MD Deployment Validation...\n'); 34 | 35 | // Check package structure 36 | this.validatePackageStructure(); 37 | 38 | // Check build outputs 39 | this.validateBuildOutputs(); 40 | 41 | // Check documentation 42 | this.validateDocumentation(); 43 | 44 | // Check configuration files 45 | this.validateConfiguration(); 46 | 47 | // Check TypeScript definitions 48 | this.validateTypeDefinitions(); 49 | 50 | // Summary 51 | this.printSummary(); 52 | 53 | return this.errors.length === 0; 54 | } 55 | 56 | validatePackageStructure() { 57 | this.log('Validating package structure...'); 58 | 59 | const requiredFiles = [ 60 | 'package.json', 61 | 'src/index.js', 62 | 'src/types/index.d.ts', 63 | 'dist/assets/extract2md.umd.js', 64 | 'README.md', 65 | 'MIGRATION.md', 66 | 'DEPLOYMENT.md', 67 | 'config.example.json' 68 | ]; 69 | 70 | const requiredDirs = [ 71 | 'src/converters', 72 | 'src/engines', 73 | 'src/utils', 74 | 'examples', 75 | 'test' 76 | ]; 77 | 78 | for (const file of requiredFiles) { 79 | const filePath = path.resolve(__dirname, '..', file); 80 | if (fs.existsSync(filePath)) { 81 | this.log(`Required file found: ${file}`); 82 | } else { 83 | this.log(`Missing required file: ${file}`, 'error'); 84 | } 85 | } 86 | 87 | for (const dir of requiredDirs) { 88 | const dirPath = path.resolve(__dirname, '..', dir); 89 | if (fs.existsSync(dirPath) && fs.statSync(dirPath).isDirectory()) { 90 | this.log(`Required directory found: ${dir}`); 91 | } else { 92 | this.log(`Missing required directory: ${dir}`, 'error'); 93 | } 94 | } 95 | } 96 | 97 | validateBuildOutputs() { 98 | this.log('Validating build outputs...'); 99 | 100 | const buildFiles = [ 101 | 'dist/assets/extract2md.umd.js', 102 | 'dist/assets/tesseract-worker.min.js', 103 | 'dist/assets/tesseract-core.wasm.js', 104 | 'dist/pdf.worker.min.mjs' 105 | ]; 106 | 107 | for (const file of buildFiles) { 108 | const filePath = path.resolve(__dirname, '..', file); 109 | if (fs.existsSync(filePath)) { 110 | const stats = fs.statSync(filePath); 111 | const sizeInMB = (stats.size / (1024 * 1024)).toFixed(2); 112 | this.log(`Build file found: ${file} (${sizeInMB} MB)`); 113 | 114 | if (stats.size === 0) { 115 | this.log(`Build file is empty: ${file}`, 'error'); 116 | } 117 | } else { 118 | 
this.log(`Missing build file: ${file}`, 'error'); 119 | } 120 | } 121 | } 122 | 123 | validateDocumentation() { 124 | this.log('Validating documentation...'); 125 | 126 | const docFiles = { 127 | 'README.md': ['Installation', 'Scenarios', 'Configuration'], 128 | 'MIGRATION.md': ['Legacy API', 'Migration', 'Backwards Compatibility'], 129 | 'DEPLOYMENT.md': ['Distribution', 'Performance', 'Security'] 130 | }; 131 | 132 | for (const [file, keywords] of Object.entries(docFiles)) { 133 | const filePath = path.resolve(__dirname, '..', file); 134 | if (fs.existsSync(filePath)) { 135 | const content = fs.readFileSync(filePath, 'utf8'); 136 | 137 | for (const keyword of keywords) { 138 | if (content.toLowerCase().includes(keyword.toLowerCase())) { 139 | this.log(`Documentation section found in ${file}: ${keyword}`); 140 | } else { 141 | this.log(`Missing documentation section in ${file}: ${keyword}`, 'warning'); 142 | } 143 | } 144 | } else { 145 | this.log(`Missing documentation file: ${file}`, 'error'); 146 | } 147 | } 148 | } 149 | 150 | validateConfiguration() { 151 | this.log('Validating configuration files...'); 152 | 153 | // Check package.json 154 | const packagePath = path.resolve(__dirname, '..', 'package.json'); 155 | if (fs.existsSync(packagePath)) { 156 | try { 157 | const pkg = JSON.parse(fs.readFileSync(packagePath, 'utf8')); 158 | 159 | const requiredFields = ['name', 'version', 'main', 'module', 'types']; 160 | for (const field of requiredFields) { 161 | if (pkg[field]) { 162 | this.log(`package.json has required field: ${field}`); 163 | } else { 164 | this.log(`package.json missing field: ${field}`, 'error'); 165 | } 166 | } 167 | 168 | // Check if main file exists 169 | if (pkg.main && fs.existsSync(path.resolve(__dirname, '..', pkg.main))) { 170 | this.log(`Main entry point exists: ${pkg.main}`); 171 | } else { 172 | this.log(`Main entry point missing: ${pkg.main}`, 'error'); 173 | } 174 | 175 | } catch (error) { 176 | this.log(`Invalid JSON in package.json: ${error.message}`, 'error'); 177 | } 178 | } 179 | 180 | // Check example config 181 | const configPath = path.resolve(__dirname, '..', 'config.example.json'); 182 | if (fs.existsSync(configPath)) { 183 | try { 184 | const config = JSON.parse(fs.readFileSync(configPath, 'utf8')); 185 | if (config.ocr && config.webllm) { 186 | this.log('Example configuration is valid'); 187 | } else { 188 | this.log('Example configuration missing required sections', 'warning'); 189 | } 190 | } catch (error) { 191 | this.log(`Invalid example configuration: ${error.message}`, 'error'); 192 | } 193 | } 194 | } 195 | 196 | validateTypeDefinitions() { 197 | this.log('Validating TypeScript definitions...'); 198 | 199 | const typesPath = path.resolve(__dirname, '..', 'src/types/index.d.ts'); 200 | if (fs.existsSync(typesPath)) { 201 | const content = fs.readFileSync(typesPath, 'utf8'); 202 | 203 | const requiredInterfaces = [ 204 | 'OCRConfig', 205 | 'WebLLMConfig', 206 | 'Extract2MDConfig', 207 | 'Extract2MDConverter' 208 | ]; 209 | 210 | for (const interfaceName of requiredInterfaces) { 211 | if (content.includes(`interface ${interfaceName}`) || 212 | content.includes(`declare class ${interfaceName}`) || 213 | content.includes(`export class ${interfaceName}`)) { 214 | this.log(`TypeScript interface found: ${interfaceName}`); 215 | } else { 216 | this.log(`Missing TypeScript interface: ${interfaceName}`, 'error'); 217 | } 218 | } 219 | } else { 220 | this.log('TypeScript definitions file not found', 'error'); 221 | } 222 | } 223 | 224 | 
printSummary() { 225 | console.log('\n📊 Deployment Validation Summary'); 226 | console.log('====================================='); 227 | console.log(`✅ Successful checks: ${this.success.length}`); 228 | console.log(`⚠️ Warnings: ${this.warnings.length}`); 229 | console.log(`❌ Errors: ${this.errors.length}`); 230 | 231 | if (this.errors.length > 0) { 232 | console.log('\n🚨 Critical Issues Found:'); 233 | this.errors.forEach(error => console.log(` - ${error}`)); 234 | } 235 | 236 | if (this.warnings.length > 0) { 237 | console.log('\n⚠️ Warnings:'); 238 | this.warnings.forEach(warning => console.log(` - ${warning}`)); 239 | } 240 | 241 | console.log('\n' + (this.errors.length === 0 ? '🎉 Package is ready for deployment!' : '🔧 Please fix the errors before deployment.')); 242 | } 243 | } 244 | 245 | // Run validation 246 | const validator = new DeploymentValidator(); 247 | validator.validate().then(isValid => { 248 | process.exit(isValid ? 0 : 1); 249 | }).catch(error => { 250 | console.error('Validation failed:', error); 251 | process.exit(1); 252 | }); 253 | -------------------------------------------------------------------------------- /src/engines/WebLLMEngine.js: -------------------------------------------------------------------------------- 1 | /** 2 | * WebLLMEngine.js 3 | * Standalone WebLLM inference engine for Extract2MD 4 | * Handles model initialization, loading, and text generation 5 | */ 6 | 7 | import * as webllm from '@mlc-ai/web-llm'; 8 | 9 | export class WebLLMEngine { 10 | constructor(config = {}) { 11 | this.engine = null; 12 | this.isInitialized = false; 13 | this.currentModelId = null; 14 | this.progressCallback = config.progressCallback || ((progress) => {}); 15 | this.defaultModel = config.defaultModel || 'Qwen3-0.6B-q4f16_1-MLC'; 16 | this.customModelConfig = config.customModelConfig || null; 17 | } 18 | 19 | /** 20 | * Initialize the WebLLM engine with specified model 21 | * @param {string} modelId - Model identifier 22 | * @param {Object} modelConfig - Model configuration options 23 | */ 24 | async initialize(modelId = null, modelConfig = {}) { 25 | const targetModelId = modelId || this.defaultModel; 26 | 27 | // Check if already initialized with the same model 28 | if (this.isInitialized && this.currentModelId === targetModelId) { 29 | this.progressCallback({ 30 | stage: 'webllm_ready', 31 | message: 'WebLLM engine already initialized with the correct model.' 
32 | }); 33 | return; 34 | } 35 | 36 | try { 37 | this.progressCallback({ 38 | stage: 'webllm_init_start', 39 | message: `Initializing WebLLM engine with model: ${targetModelId}...` 40 | }); 41 | 42 | // Clean up existing engine if any 43 | if (this.engine) { 44 | await this.cleanup(); 45 | } 46 | 47 | // Setup progress callback for model loading 48 | const initProgressCallback = (report) => { 49 | this.progressCallback({ 50 | stage: 'webllm_load_progress', 51 | message: `Model Loading: ${report.text}`, 52 | progress: report.progress 53 | }); 54 | }; 55 | 56 | // Configure model 57 | let appConfig = null; 58 | if (this.customModelConfig && this.customModelConfig.modelId === targetModelId) { 59 | // Use custom model configuration 60 | appConfig = { 61 | model_list: [this.customModelConfig] 62 | }; 63 | } 64 | 65 | const engineConfig = { 66 | initProgressCallback, 67 | appConfig, 68 | ...modelConfig 69 | }; 70 | 71 | // Create and initialize engine 72 | this.engine = await webllm.CreateMLCEngine(targetModelId, engineConfig); 73 | this.isInitialized = true; 74 | this.currentModelId = targetModelId; 75 | 76 | this.progressCallback({ 77 | stage: 'webllm_init_complete', 78 | message: 'WebLLM engine initialized successfully.' 79 | }); 80 | 81 | } catch (error) { 82 | this.isInitialized = false; 83 | this.currentModelId = null; 84 | this.progressCallback({ 85 | stage: 'webllm_init_error', 86 | message: `WebLLM initialization failed: ${error.message}`, 87 | error 88 | }); 89 | throw new Error(`WebLLM initialization failed: ${error.message}`); 90 | } 91 | } 92 | 93 | /** 94 | * Generate text using the initialized model 95 | * @param {string} prompt - Input prompt 96 | * @param {Object} options - Generation options 97 | * @returns {Promise} Generated text 98 | */ 99 | async generate(prompt, options = {}) { 100 | if (!this.isInitialized || !this.engine) { 101 | throw new Error('WebLLM engine is not initialized. Call initialize() first.'); 102 | } 103 | 104 | try { 105 | this.progressCallback({ 106 | stage: 'webllm_generate_start', 107 | message: 'Generating response...' 108 | }); 109 | 110 | const messages = [{ role: "user", content: prompt }]; 111 | 112 | const requestOptions = { 113 | messages, 114 | temperature: options.temperature || 0.7, 115 | max_tokens: options.maxTokens || 4096, 116 | ...options 117 | }; 118 | 119 | const chatCompletion = await this.engine.chat.completions.create(requestOptions); 120 | 121 | if (chatCompletion.choices && chatCompletion.choices.length > 0) { 122 | const content = chatCompletion.choices[0].message.content || ''; 123 | 124 | this.progressCallback({ 125 | stage: 'webllm_generate_complete', 126 | message: 'Text generation completed.' 127 | }); 128 | 129 | return content; 130 | } else { 131 | throw new Error('No response generated from the model.'); 132 | } 133 | 134 | } catch (error) { 135 | this.progressCallback({ 136 | stage: 'webllm_generate_error', 137 | message: `Text generation failed: ${error.message}`, 138 | error 139 | }); 140 | throw new Error(`Text generation failed: ${error.message}`); 141 | } 142 | } 143 | 144 | /** 145 | * Generate text with streaming support 146 | * @param {string} prompt - Input prompt 147 | * @param {Object} options - Generation options 148 | * @param {Function} onChunk - Callback for each chunk 149 | * @returns {Promise} Complete generated text 150 | */ 151 | async generateStream(prompt, options = {}, onChunk = null) { 152 | if (!this.isInitialized || !this.engine) { 153 | throw new Error('WebLLM engine is not initialized. 
Call initialize() first.'); 154 | } 155 | 156 | try { 157 | this.progressCallback({ 158 | stage: 'webllm_stream_start', 159 | message: 'Starting streaming generation...' 160 | }); 161 | 162 | const messages = [{ role: "user", content: prompt }]; 163 | 164 | const requestOptions = { 165 | messages, 166 | temperature: options.temperature || 0.7, 167 | max_tokens: options.maxTokens || 4096, 168 | stream: true, 169 | stream_options: { include_usage: true }, 170 | ...options 171 | }; 172 | 173 | const chunks = await this.engine.chat.completions.create(requestOptions); 174 | let fullResponse = ''; 175 | 176 | for await (const chunk of chunks) { 177 | const content = chunk.choices[0]?.delta?.content || ''; 178 | fullResponse += content; 179 | 180 | if (onChunk && content) { 181 | onChunk(content, fullResponse); 182 | } 183 | 184 | if (chunk.usage) { 185 | this.progressCallback({ 186 | stage: 'webllm_stream_usage', 187 | message: 'Stream completed', 188 | usage: chunk.usage 189 | }); 190 | } 191 | } 192 | 193 | this.progressCallback({ 194 | stage: 'webllm_stream_complete', 195 | message: 'Streaming generation completed.' 196 | }); 197 | 198 | return fullResponse; 199 | 200 | } catch (error) { 201 | this.progressCallback({ 202 | stage: 'webllm_stream_error', 203 | message: `Streaming generation failed: ${error.message}`, 204 | error 205 | }); 206 | throw new Error(`Streaming generation failed: ${error.message}`); 207 | } 208 | } 209 | 210 | /** 211 | * Check if the engine is ready for use 212 | * @returns {boolean} Engine readiness status 213 | */ 214 | isReady() { 215 | return this.isInitialized && this.engine !== null; 216 | } 217 | 218 | /** 219 | * Get current model information 220 | * @returns {Object} Model information 221 | */ 222 | getModelInfo() { 223 | return { 224 | isInitialized: this.isInitialized, 225 | currentModelId: this.currentModelId, 226 | isReady: this.isReady() 227 | }; 228 | } 229 | 230 | /** 231 | * Clean up the engine and free resources 232 | */ 233 | async cleanup() { 234 | if (this.engine) { 235 | try { 236 | this.progressCallback({ 237 | stage: 'webllm_cleanup', 238 | message: 'Cleaning up WebLLM engine...' 239 | }); 240 | 241 | // Note: WebLLM's MLCEngine might not have a direct unload method 242 | // But we should clean up references 243 | this.engine = null; 244 | this.isInitialized = false; 245 | this.currentModelId = null; 246 | 247 | this.progressCallback({ 248 | stage: 'webllm_cleanup_complete', 249 | message: 'WebLLM engine cleanup completed.' 
250 | }); 251 | } catch (error) { 252 | this.progressCallback({ 253 | stage: 'webllm_cleanup_error', 254 | message: `WebLLM cleanup failed: ${error.message}`, 255 | error 256 | }); 257 | } 258 | } 259 | } 260 | } 261 | 262 | export default WebLLMEngine; 263 | -------------------------------------------------------------------------------- /examples/usage-examples.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Example usage of Extract2MD with all scenarios 3 | */ 4 | 5 | // Fix import statement to match new API structure 6 | import { Extract2MDConverter, ConfigValidator } from '../src/index.js'; 7 | 8 | // Example configurations for different scenarios 9 | const basicConfig = { 10 | progressCallback: (progress) => { 11 | console.log(`[${progress.stage}] ${progress.message}`); 12 | if (progress.currentPage && progress.totalPages) { 13 | console.log(`Progress: ${progress.currentPage}/${progress.totalPages}`); 14 | } 15 | } 16 | }; 17 | 18 | const advancedConfig = { 19 | webllm: { 20 | model: "Qwen3-0.6B-q4f16_1-MLC", 21 | options: { 22 | temperature: 0.7, 23 | maxTokens: 4096 24 | } 25 | }, 26 | systemPrompts: { 27 | singleExtraction: "Focus on preserving technical accuracy and code examples.", 28 | combinedExtraction: "Create comprehensive documentation by leveraging both extraction methods." 29 | }, 30 | processing: { 31 | splitPascalCase: false, 32 | pdfRenderScale: 2.5, 33 | postProcessRules: [ 34 | { find: /\bAPI\b/g, replace: "API" }, 35 | { find: /\bJSON\b/g, replace: "JSON" }, 36 | { find: /\bHTML\b/g, replace: "HTML" } 37 | ] 38 | }, 39 | progressCallback: (progress) => { 40 | console.log(`[${progress.stage}] ${progress.message}`); 41 | if (progress.progress !== undefined) { 42 | console.log(`Loading: ${Math.round(progress.progress * 100)}%`); 43 | } 44 | if (progress.error) { 45 | console.error('Error:', progress.error); 46 | } 47 | } 48 | }; 49 | 50 | // Example: Using different scenarios 51 | async function demonstrateScenarios(pdfFile) { 52 | console.log('=== Extract2MD Scenario Demonstrations ===\n'); 53 | 54 | try { 55 | // Scenario 1: Quick Convert Only 56 | console.log('1. Quick Convert Only (Fast, basic formatting)'); 57 | console.log('Use case: PDFs with selectable text, need quick results'); 58 | const result1 = await Extract2MDConverter.quickConvertOnly(pdfFile, basicConfig); 59 | console.log('✅ Quick conversion completed'); 60 | console.log(`Output length: ${result1.length} characters\n`); 61 | 62 | // Scenario 2: High Accuracy Convert Only 63 | console.log('2. High Accuracy Convert Only (OCR, slower but comprehensive)'); 64 | console.log('Use case: Scanned PDFs, images, complex layouts'); 65 | const result2 = await Extract2MDConverter.highAccuracyConvertOnly(pdfFile, basicConfig); 66 | console.log('✅ High accuracy conversion completed'); 67 | console.log(`Output length: ${result2.length} characters\n`); 68 | 69 | // Scenario 3: Quick Convert + LLM 70 | console.log('3. Quick Convert + LLM Enhancement'); 71 | console.log('Use case: Fast extraction with AI enhancement'); 72 | const result3 = await Extract2MDConverter.quickConvertWithLLM(pdfFile, advancedConfig); 73 | console.log('✅ Quick conversion with LLM completed'); 74 | console.log(`Output length: ${result3.length} characters\n`); 75 | 76 | // Scenario 4: High Accuracy + LLM 77 | console.log('4. 
High Accuracy + LLM Enhancement'); 78 | console.log('Use case: OCR extraction with AI enhancement'); 79 | const result4 = await Extract2MDConverter.highAccuracyConvertWithLLM(pdfFile, advancedConfig); 80 | console.log('✅ High accuracy conversion with LLM completed'); 81 | console.log(`Output length: ${result4.length} characters\n`); 82 | 83 | // Scenario 5: Combined + LLM (Recommended) 84 | console.log('5. Combined Convert + LLM Enhancement (RECOMMENDED)'); 85 | console.log('Use case: Best possible results using both extraction methods'); 86 | const result5 = await Extract2MDConverter.combinedConvertWithLLM(pdfFile, advancedConfig); 87 | console.log('✅ Combined conversion with LLM completed'); 88 | console.log(`Output length: ${result5.length} characters\n`); 89 | 90 | return { 91 | quickOnly: result1, 92 | ocrOnly: result2, 93 | quickWithLLM: result3, 94 | ocrWithLLM: result4, 95 | combined: result5 96 | }; 97 | 98 | } catch (error) { 99 | console.error('❌ Error during conversion:', error.message); 100 | throw error; 101 | } 102 | } 103 | 104 | // Example: Configuration validation 105 | function demonstrateConfigValidation() { 106 | console.log('=== Configuration Validation Demo ===\n'); 107 | 108 | // Valid configuration 109 | try { 110 | const validConfig = { 111 | webllm: { 112 | model: "Qwen3-0.6B-q4f16_1-MLC", 113 | options: { temperature: 0.8 } 114 | }, 115 | processing: { 116 | splitPascalCase: true, 117 | pdfRenderScale: 3.0 118 | } 119 | }; 120 | 121 | const validated = ConfigValidator.validate(validConfig); 122 | console.log('✅ Configuration validation passed'); 123 | console.log('Validated config keys:', Object.keys(validated)); 124 | } catch (error) { 125 | console.error('❌ Configuration validation failed:', error.message); 126 | } 127 | 128 | // Invalid configuration example 129 | try { 130 | const invalidConfig = { 131 | webllm: { 132 | options: { temperature: 5.0 } // Invalid: temperature > 2 133 | } 134 | }; 135 | 136 | ConfigValidator.validate(invalidConfig); 137 | } catch (error) { 138 | console.log('✅ Invalid configuration correctly rejected:', error.message); 139 | } 140 | 141 | console.log(''); 142 | } 143 | 144 | // Example: Loading configuration from JSON 145 | function demonstrateJSONConfig() { 146 | console.log('=== JSON Configuration Demo ===\n'); 147 | 148 | const configJson = `{ 149 | "webllm": { 150 | "model": "Qwen3-0.6B-q4f16_1-MLC", 151 | "options": { 152 | "temperature": 0.7, 153 | "maxTokens": 2048 154 | } 155 | }, 156 | "systemPrompts": { 157 | "singleExtraction": "Focus on technical accuracy.", 158 | "combinedExtraction": "Create comprehensive documentation." 
159 | }, 160 | "processing": { 161 | "splitPascalCase": false, 162 | "postProcessRules": [ 163 | {"find": "\\\\bAPI\\\\b", "replace": "API"} 164 | ] 165 | } 166 | }`; 167 | 168 | try { 169 | const config = ConfigValidator.fromJSON(configJson); 170 | console.log('✅ JSON configuration loaded successfully'); 171 | console.log('LLM model:', config.webllm.model); 172 | console.log('Temperature:', config.webllm.options.temperature); 173 | console.log('Custom single extraction prompt:', config.systemPrompts.singleExtraction); 174 | console.log(''); 175 | } catch (error) { 176 | console.error('❌ JSON configuration failed:', error.message); 177 | } 178 | } 179 | 180 | // Example: Progress tracking 181 | function createProgressTracker() { 182 | const startTime = Date.now(); 183 | let lastStage = ''; 184 | 185 | return (progress) => { 186 | const elapsed = ((Date.now() - startTime) / 1000).toFixed(1); 187 | 188 | if (progress.stage !== lastStage) { 189 | console.log(`\n[${elapsed}s] === ${progress.stage.toUpperCase()} ===`); 190 | lastStage = progress.stage; 191 | } 192 | 193 | console.log(`[${elapsed}s] ${progress.message}`); 194 | 195 | if (progress.currentPage && progress.totalPages) { 196 | const pageProgress = Math.round((progress.currentPage / progress.totalPages) * 100); 197 | console.log(`[${elapsed}s] Page Progress: ${pageProgress}% (${progress.currentPage}/${progress.totalPages})`); 198 | } 199 | 200 | if (progress.progress !== undefined) { 201 | const loadProgress = Math.round(progress.progress * 100); 202 | console.log(`[${elapsed}s] Loading Progress: ${loadProgress}%`); 203 | } 204 | 205 | if (progress.usage) { 206 | console.log(`[${elapsed}s] Token Usage:`, progress.usage); 207 | } 208 | 209 | if (progress.error) { 210 | console.error(`[${elapsed}s] ❌ Error:`, progress.error.message || progress.error); 211 | } 212 | }; 213 | } 214 | 215 | // Main demo function 216 | async function runDemo() { 217 | console.log('🚀 Extract2MD Enhanced API Demo\n'); 218 | 219 | // Configuration demos 220 | demonstrateConfigValidation(); 221 | demonstrateJSONConfig(); 222 | 223 | // File input simulation (in real usage, this would come from user input) 224 | console.log('📄 To test with actual PDF files:'); 225 | console.log('1. Use an HTML file input: '); 226 | console.log('2. Pass the File object to any scenario method'); 227 | console.log('3. Monitor progress through the callback\n'); 228 | 229 | console.log('Example usage:'); 230 | console.log(` 231 | // HTML 232 | 233 | 234 |
235 |
236 | 237 | // JavaScript 238 | async function convertPDF() { 239 | const fileInput = document.getElementById('pdfInput'); 240 | const pdfFile = fileInput.files[0]; 241 | 242 | if (!pdfFile) { 243 | alert('Please select a PDF file'); 244 | return; 245 | } 246 | 247 | const config = { 248 | progressCallback: (progress) => { 249 | document.getElementById('progress').textContent = progress.message; 250 | } 251 | }; 252 | 253 | try { 254 | // Use the best scenario for comprehensive results 255 | const markdown = await Extract2MDConverter.combinedConvertWithLLM(pdfFile, config); 256 | document.getElementById('output').innerHTML = 257 | '
<pre>' + markdown.replace(/</g, '&lt;') + '</pre>';
258 |   } catch (error) {
259 |     console.error('Conversion failed:', error);
260 |     alert('Conversion failed: ' + error.message);
261 |   }
262 | }
263 |   `);
264 | }
265 | 
266 | // Export for use in other examples
267 | export {
268 |   demonstrateScenarios,
269 |   demonstrateConfigValidation,
270 |   demonstrateJSONConfig,
271 |   createProgressTracker,
272 |   basicConfig,
273 |   advancedConfig
274 | };
275 | 
276 | // Run demo if this file is executed directly
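// Note: when run under Node, runDemo() only exercises the configuration demos and
// prints usage instructions; the actual PDF conversion scenarios need a browser
// environment (File inputs, canvas rendering, and web workers).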
277 | if (typeof window !== 'undefined') {
278 |   // Browser environment
279 |   window.Extract2MDDemo = {
280 |     runDemo,
281 |     demonstrateScenarios,
282 |     createProgressTracker,
283 |     basicConfig,
284 |     advancedConfig
285 |   };
286 | } else {
287 |   // Node.js environment
288 |   runDemo();
289 | }
290 | 


--------------------------------------------------------------------------------
/scripts/postinstall.js:
--------------------------------------------------------------------------------
  1 | import fs from 'fs';
  2 | import https from 'https';
  3 | import zlib from 'zlib';
  4 | import path from 'path';
  5 | import { fileURLToPath } from 'url';
  6 | 
  7 | // Polyfill for __dirname in ES modules
  8 | const __filename = fileURLToPath(import.meta.url);
  9 | const __dirname = path.dirname(__filename);
 10 | 
 11 | const langDataPath = path.resolve(__dirname, '..', 'dist', 'assets', 'lang-data');
 12 | 
 13 | const filesToDownload = [
 14 |   {
 15 |     url: 'https://cdn.jsdelivr.net/npm/@tesseract.js-data/eng@1.0.0/4.0.0_best_int/eng.traineddata.gz',
 16 |     fileName: 'eng.traineddata.gz',
 17 |     destFileName: 'eng.traineddata.gz', // Ensure final file is named .gz but contains uncompressed data
 18 |     gzipped: true,
 19 |   },
 20 |   {
 21 |     url: 'https://github.com/tesseract-ocr/tessdata/raw/4.00/sin.traineddata',
 22 |     fileName: 'sin.traineddata', // Original name from URL (or how we save it initially)
 23 |     destFileName: 'sin.traineddata.gz', // Final name Tesseract.js expects
 24 |     gzipped: false, // Source is not gzipped, so no decompression needed
 25 |   },
 26 | ];
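// Note on the two entries above: the English data is fetched gzipped and later
// decompressed in place (the file keeps its .gz name but ends up holding
// uncompressed data, as the destFileName comment explains), while the Sinhala
// data is fetched uncompressed and only renamed to the .gz name Tesseract.js expects.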
 27 | 
 28 | async function ensureDirExists(dirPath) {
 29 |   try {
 30 |     await fs.promises.mkdir(dirPath, { recursive: true });
 31 |     console.log(`Directory ensured: ${dirPath}`);
 32 |   } catch (error) {
 33 |     if (error.code !== 'EEXIST') {
 34 |       console.error(`Error creating directory ${dirPath}:`, error);
 35 |       throw error;
 36 |     }
 37 |     console.log(`Directory already exists: ${dirPath}`);
 38 |   }
 39 | }
 40 | 
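/**
 * Download a file over HTTPS, following up to MAX_REDIRECTS redirects.
 * The payload is first written to a temporary "_<fileName>" file inside
 * langDataPath and only renamed to destPath once the stream has finished,
 * so a failed download never leaves a partial file at the final path.
 */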
 41 | async function downloadFile(url, destPath, fileName, redirectCount = 0) {
 42 |   const MAX_REDIRECTS = 5;
 43 | 
 44 |   return new Promise((resolve, reject) => {
 45 |     if (redirectCount > MAX_REDIRECTS) {
 46 |       reject(new Error(`Exceeded maximum redirect limit (${MAX_REDIRECTS}) for ${fileName}`));
 47 |       return;
 48 |     }
 49 | 
 50 |     const tempFilePath = path.join(langDataPath, `_${fileName}`); // Download to a temp name
 51 |     // Always download to a temporary file first, even when destPath already has the
 52 |     // final name: this keeps cleanup simple and avoids leaving a partially written
 53 |     // file at destPath. The rename below moves the finished download into place.
 54 | 
 55 |     console.log(`Downloading ${fileName} from ${url} (Attempt: ${redirectCount + 1})...`);
 56 | 
 57 |     const request = https.get(url, (response) => {
 58 |       if (response.statusCode >= 300 && response.statusCode < 400 && response.headers.location) {
 59 |         console.log(`Redirected for ${fileName} to ${response.headers.location}`);
 60 |         // Consume response data to free up memory
 61 |         response.resume();
 62 |         downloadFile(response.headers.location, destPath, fileName, redirectCount + 1)
 63 |           .then(resolve)
 64 |           .catch(reject);
 65 |         return;
 66 |       }
 67 | 
 68 |       if (response.statusCode !== 200) {
 69 |         // fs.unlink(tempFilePath, () => {}); // Don't unlink if it wasn't opened yet or on redirect
 70 |         reject(new Error(`Failed to download ${fileName}. Status Code: ${response.statusCode} from ${url}`));
 71 |         return;
 72 |       }
 73 | 
 74 |       const fileStream = fs.createWriteStream(tempFilePath);
 75 |       response.pipe(fileStream);
 76 | 
 77 |       fileStream.on('finish', () => {
 78 |         fileStream.close(async (err) => {
 79 |           if (err) {
 80 |             fs.promises.unlink(tempFilePath).catch(() => {}); // Clean up temp file, ignore error if it doesn't exist
 81 |             reject(new Error(`Error closing file stream for ${fileName}: ${err.message}`));
 82 |             return;
 83 |           }
 84 |           try {
 85 |             // Ensure target directory exists before renaming
 86 |             await ensureDirExists(path.dirname(destPath));
 87 |             await fs.promises.rename(tempFilePath, destPath);
 88 |             console.log(`Successfully downloaded and saved ${fileName} to ${destPath}`);
 89 |             resolve();
 90 |           } catch (renameError) {
 91 |             fs.promises.unlink(tempFilePath).catch(() => {});
 92 |             reject(new Error(`Error renaming ${tempFilePath} to ${destPath}: ${renameError.message}`));
 93 |           }
 94 |         });
 95 |       });
 96 | 
 97 |       fileStream.on('error', (err) => {
 98 |         fs.promises.unlink(tempFilePath).catch(() => {});
 99 |         reject(new Error(`Error writing file ${fileName}: ${err.message}`));
100 |       });
101 |     });
102 | 
103 |     request.on('error', (err) => {
104 |       // fs.unlink(tempFilePath, () => {}).catch(() => {}); // Temp file might not exist if request itself failed early
105 |       reject(new Error(`Error downloading ${fileName} from ${url}: ${err.message}`));
106 |     });
107 |   });
108 | }
109 | 
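/**
 * Gunzip sourcePath into destPath. When both paths are the same, the output is
 * written to a temporary file and then renamed over the original, so the
 * .gz-named file ends up containing uncompressed data; otherwise the original
 * gzipped source file is removed after a successful decompression.
 */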
110 | async function decompressGzip(sourcePath, destPath) {
111 |   return new Promise((resolve, reject) => {
112 |     const isSameFile = sourcePath === destPath;
113 |     // Use a temporary file for the decompressed output, especially if decompressing in-place.
114 |     const tempOutputPath = isSameFile ? `${destPath}.tmp_decompress_${Date.now()}` : destPath;
115 | 
116 |     console.log(`Decompressing ${sourcePath} to ${tempOutputPath}${isSameFile ? ' (will then replace original)' : ''}...`);
117 | 
118 |     const gzip = zlib.createGunzip();
119 |     const sourceStream = fs.createReadStream(sourcePath);
120 |     const destStream = fs.createWriteStream(tempOutputPath);
121 | 
122 |     sourceStream.pipe(gzip).pipe(destStream);
123 | 
124 |     destStream.on('finish', () => {
125 |       destStream.close(async (closeErr) => {
126 |         if (closeErr) {
127 |           // Attempt to clean up temporary file if it exists
128 |           if (fs.existsSync(tempOutputPath)) {
129 |             await fs.promises.unlink(tempOutputPath).catch(unlinkErr => console.error(`Error unlinking temp file ${tempOutputPath} on close error:`, unlinkErr));
130 |           }
131 |           reject(new Error(`Error closing destination stream for ${tempOutputPath}: ${closeErr.message}`));
132 |           return;
133 |         }
134 |         try {
135 |           if (isSameFile) {
136 |             // If source and dest are the same, rename temp file to replace original sourcePath with decompressed content
137 |             await fs.promises.rename(tempOutputPath, destPath);
138 |             console.log(`Successfully decompressed and replaced ${sourcePath} with uncompressed content.`);
139 |           } else {
140 |             // If source and dest are different, the decompressed file is at destPath (which was tempOutputPath).
141 |             // The original gzipped sourcePath should be removed.
142 |             await fs.promises.unlink(sourcePath);
143 |             console.log(`Successfully decompressed ${sourcePath} to ${destPath}. Original ${sourcePath} removed.`);
144 |           }
145 |           resolve();
146 |         } catch (moveOrUnlinkError) {
147 |           // Attempt to clean up temporary file if it exists and wasn't the final destPath
148 |           if (fs.existsSync(tempOutputPath) && tempOutputPath !== destPath) {
149 |              await fs.promises.unlink(tempOutputPath).catch(unlinkErr => console.error(`Error unlinking temp file ${tempOutputPath} on finalization error:`, unlinkErr));
150 |           }
151 |           reject(new Error(`Error finalizing decompression for ${sourcePath} (to ${destPath}): ${moveOrUnlinkError.message}`));
152 |         }
153 |       });
154 |     });
155 | 
156 |     destStream.on('error', async (streamErr) => {
157 |       if (fs.existsSync(tempOutputPath)) {
158 |         await fs.promises.unlink(tempOutputPath).catch(unlinkErr => console.error(`Error unlinking temp file ${tempOutputPath} on stream error:`, unlinkErr));
159 |       }
160 |       reject(new Error(`Error writing decompressed file ${tempOutputPath}: ${streamErr.message}`));
161 |     });
162 |     gzip.on('error', (gzipErr) => reject(new Error(`Error decompressing ${sourcePath}: ${gzipErr.message}`)));
163 |     sourceStream.on('error', (sourceErr) => reject(new Error(`Error reading ${sourcePath} for decompression: ${sourceErr.message}`)));
164 |   });
165 | }
166 | 
167 | async function main() {
168 |   try {
169 |     console.log('Starting postinstall script for extract2md...');
170 |     await ensureDirExists(langDataPath);
171 | 
172 |     for (const file of filesToDownload) {
173 |       const downloadedFilePath = path.join(langDataPath, file.fileName);
174 |       const finalDestPath = path.join(langDataPath, file.destFileName);
175 | 
176 |       // Check if final decompressed/copied file already exists
177 |       if (fs.existsSync(finalDestPath)) {
178 |         console.log(`${file.destFileName} already exists at ${finalDestPath}. Skipping download.`);
179 |         continue;
180 |       }
181 |       
182 |       // Check if intermediate .gz file exists (for gzipped files)
183 |       if (file.gzipped && fs.existsSync(downloadedFilePath)) {
184 |          console.log(`Intermediate file ${file.fileName} already exists. Attempting decompression.`);
185 |       } else {
186 |         await downloadFile(file.url, downloadedFilePath, file.fileName);
187 |       }
188 | 
189 |       if (file.gzipped) {
190 |         // Ensure downloaded file exists before trying to decompress
191 |         if (!fs.existsSync(downloadedFilePath)) {
192 |             console.error(`Error: Gzipped file ${downloadedFilePath} not found after download attempt. Skipping decompression.`);
193 |             continue;
194 |         }
195 |         await decompressGzip(downloadedFilePath, finalDestPath);
196 |       } else {
197 |         // Handle non-gzipped files: if downloadedFilePath is different from finalDestPath, rename.
198 |         // This applies if we downloaded 'lang.traineddata' but want 'lang.traineddata.gz' (containing uncompressed data).
199 |         if (downloadedFilePath !== finalDestPath) {
200 |           if (fs.existsSync(downloadedFilePath)) {
201 |             console.log(`Renaming non-gzipped file ${downloadedFilePath} to ${finalDestPath}...`);
202 |             await fs.promises.rename(downloadedFilePath, finalDestPath);
203 |             console.log(`Successfully renamed ${downloadedFilePath} to ${finalDestPath}.`);
204 |           } else {
205 |             console.warn(`File ${downloadedFilePath} not found for renaming to ${finalDestPath}; the download may have failed, or the file may already have been saved as ${finalDestPath}.`);
206 |           }
207 |         } else {
208 |           // If downloadedFilePath is the same as finalDestPath, it means the file was already saved with the correct name.
209 |           console.log(`Non-gzipped file ${finalDestPath} is already correctly named. No rename needed.`);
210 |         }
211 |       }
212 |     }
213 | 
214 |     console.log('Postinstall script completed successfully.');
215 |   } catch (error) {
216 |     console.error('Error during postinstall script:', error.message);
217 |     // process.exit(1); // Optionally exit with error, though npm might handle this.
218 |   }
219 | }
220 | 
221 | main();
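// This script is intended to run via npm's postinstall hook; if the language
// data under dist/assets/lang-data ever needs to be re-fetched, running
// `node scripts/postinstall.js` manually should work as well.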


--------------------------------------------------------------------------------
/DEPLOYMENT.md:
--------------------------------------------------------------------------------
  1 | # Deployment Documentation for Extract2MD v2.0.0
  2 | 
  3 | This document outlines the deployment process, distribution methods, and integration guidelines for the Extract2MD package.
  4 | 
  5 | ## Package Structure
  6 | 
  7 | The Extract2MD package is distributed with the following structure:
  8 | 
  9 | ```
 10 | extract2md/
 11 | ├── dist/                           # Built files for distribution
 12 | │   ├── assets/
 13 | │   │   ├── extract2md.umd.js      # Main UMD bundle
 14 | │   │   ├── extract2md.umd.js.map  # Source map
 15 | │   │   ├── tesseract-worker.min.js # Tesseract.js worker
 16 | │   │   └── tesseract-core.wasm.js  # Tesseract WASM core
 17 | │   └── pdf.worker.min.mjs          # PDF.js worker
 18 | ├── src/                            # Source code
 19 | │   ├── types/index.d.ts           # TypeScript definitions
 20 | │   ├── index.js                   # Main entry point
 21 | │   ├── converters/                # Converter modules
 22 | │   ├── engines/                   # Processing engines
 23 | │   └── utils/                     # Utility modules
 24 | ├── examples/                      # Usage examples and demo
 25 | ├── test/                         # Test files
 26 | ├── package.json                  # Package configuration
 27 | ├── config.example.json          # Example configuration
 28 | ├── README.md                 # Main documentation
 29 | └── MIGRATION.md                 # Migration guide
 30 | ```
 31 | 
 32 | ## Distribution Methods
 33 | 
 34 | ### 1. NPM Package Distribution
 35 | 
 36 | The package is designed for npm distribution with full TypeScript support.
 37 | 
 38 | #### Installation
 39 | ```bash
 40 | npm install extract2md
 41 | ```
 42 | 
 43 | #### Package Entry Points
 44 | - **Main (UMD)**: `dist/assets/extract2md.umd.js` - For browser use
 45 | - **Module (ES6)**: `src/index.js` - For modern bundlers
 46 | - **Types**: `src/types/index.d.ts` - TypeScript definitions
 47 | 
 48 | ### 2. CDN Distribution
 49 | 
 50 | The UMD bundle can be served via CDN for direct browser use:
 51 | 
 52 | ```html
 53 | 
 54 | 
 58 | ```
 59 | 
 60 | ### 3. Direct Bundle Integration
 61 | 
 62 | For projects that need to bundle the library:
 63 | 
 64 | ```javascript
 65 | import { Extract2MDConverter } from 'extract2md';
 66 | // Use ES6 modules with tree shaking support
 67 | ```
 68 | 
 69 | ## Build Process
 70 | 
 71 | ### Prerequisites
 72 | - Node.js 14+ 
 73 | - npm 7+
 74 | 
 75 | ### Building the Package
 76 | 
 77 | ```bash
 78 | # Install dependencies
 79 | npm install
 80 | 
 81 | # Build the UMD bundle
 82 | npm run build
 83 | 
 84 | # Run tests
 85 | npm test
 86 | 
 87 | # Prepare for publishing
 88 | npm run prepublishOnly
 89 | ```
 90 | 
 91 | ### Build Outputs
 92 | 
 93 | The build process creates:
 94 | 1. **UMD Bundle**: `dist/assets/extract2md.umd.js` (5.69 MiB)
 95 | 2. **Worker Files**: Required for PDF.js and Tesseract.js
 96 | 3. **Source Maps**: For debugging
 97 | 
 98 | ## Deployment Configurations
 99 | 
100 | ### 1. Web Application Deployment
101 | 
102 | For web applications using the library:
103 | 
104 | ```javascript
105 | // Webpack configuration example (requires the copy-webpack-plugin package: const CopyWebpackPlugin = require('copy-webpack-plugin'))
106 | module.exports = {
107 |     resolve: {
108 |         fallback: {
109 |             "fs": false,
110 |             "path": false
111 |         }
112 |     },
113 |     // Copy worker files to your public directory
114 |     plugins: [
115 |         new CopyWebpackPlugin({
116 |             patterns: [
117 |                 { from: 'node_modules/extract2md/dist/pdf.worker.min.mjs', to: 'public/' },
118 |                 { from: 'node_modules/extract2md/dist/assets/tesseract-worker.min.js', to: 'public/' },
119 |                 { from: 'node_modules/extract2md/dist/assets/tesseract-core.wasm.js', to: 'public/' }
120 |             ]
121 |         })
122 |     ]
123 | };
124 | ```
125 | 
126 | ### 2. Node.js Server Deployment
127 | 
128 | For server-side use (limited functionality due to browser dependencies):
129 | 
130 | ```javascript
131 | // Server-side usage (configuration validation only)
132 | import { ConfigValidator } from 'extract2md/src/utils/ConfigValidator.js';
133 | 
134 | // ConfigValidator exposes static methods, so no instance is needed.
135 | const validatedConfig = ConfigValidator.validate(config); // throws on invalid input
136 | ```
137 | 
138 | ### 3. Static Site Deployment
139 | 
140 | For static sites or demos:
141 | 
142 | ```html
143 | 
144 | 
145 | 
146 |     
147 | 
148 | 
149 |     
150 |     
151 |     
152 | 153 | 168 | 169 | 170 | ``` 171 | 172 | ## Performance Considerations 173 | 174 | ### Bundle Size Optimization 175 | 176 | The package includes large dependencies: 177 | - **PDF.js**: ~951 KB (PDF processing) 178 | - **Tesseract.js**: ~4.5 MB (OCR functionality) 179 | - **WebLLM**: ~Variable (model-dependent) 180 | 181 | #### Optimization Strategies: 182 | 183 | 1. **Lazy Loading**: Load only required modules 184 | ```javascript 185 | // Load only when needed 186 | const { Extract2MDConverter } = await import('extract2md'); 187 | ``` 188 | 189 | 2. **Code Splitting**: Separate scenarios into different chunks 190 | ```javascript 191 | // Webpack code splitting 192 | const quickConvert = () => import('extract2md').then(m => m.Extract2MDConverter.quickConvertOnly); 193 | ``` 194 | 195 | 3. **CDN Caching**: Use CDN for static assets 196 | ```javascript 197 | // Configure worker paths to use CDN 198 | window.EXTRACT2MD_CONFIG = { 199 | workerPaths: { 200 | pdf: 'https://cdn.example.com/pdf.worker.min.mjs', 201 | tesseract: 'https://cdn.example.com/tesseract-worker.min.js' 202 | } 203 | }; 204 | ``` 205 | 206 | ## Security Considerations 207 | 208 | ### Content Security Policy (CSP) 209 | 210 | When deploying, configure CSP headers: 211 | 212 | ``` 213 | Content-Security-Policy: 214 | script-src 'self' 'wasm-unsafe-eval'; 215 | worker-src 'self' blob:; 216 | connect-src 'self' https://huggingface.co; 217 | ``` 218 | 219 | ### File Processing Security 220 | 221 | - Files are processed client-side only 222 | - No data is sent to external servers (except WebLLM model downloads) 223 | - Implement file size limits for production use 224 | 225 | ## Monitoring and Debugging 226 | 227 | ### Error Tracking 228 | 229 | ```javascript 230 | try { 231 | const result = await Extract2MDConverter.quickConvertOnly(file, config); 232 | } catch (error) { 233 | // Log error details for monitoring 234 | console.error('Extract2MD Error:', { 235 | type: error.name, 236 | message: error.message, 237 | scenario: 'quickConvertOnly', 238 | timestamp: new Date().toISOString() 239 | }); 240 | } 241 | ``` 242 | 243 | ### Performance Monitoring 244 | 245 | ```javascript 246 | const startTime = performance.now(); 247 | const result = await Extract2MDConverter.quickConvertOnly(file, config); 248 | const duration = performance.now() - startTime; 249 | 250 | console.log(`Conversion took ${duration}ms`); 251 | ``` 252 | 253 | ## Version Management 254 | 255 | ### Semantic Versioning 256 | 257 | The package follows semantic versioning: 258 | - **Major**: Breaking API changes 259 | - **Minor**: New features, backward compatible 260 | - **Patch**: Bug fixes, backward compatible 261 | 262 | ### Upgrade Path 263 | 264 | 1. **v1.0.x**: Current stable release 265 | 2. **v1.1.x**: Planned features (streaming, progress callbacks) 266 | 3. 
**v2.0.x**: Planned breaking changes (remove legacy API) 267 | 268 | ## Integration Examples 269 | 270 | ### React Integration 271 | 272 | ```jsx 273 | import React, { useState } from 'react'; 274 | import { Extract2MDConverter } from 'extract2md'; 275 | 276 | function PDFConverter() { 277 | const [result, setResult] = useState(''); 278 | const [loading, setLoading] = useState(false); 279 | 280 | const handleConvert = async (file) => { 281 | setLoading(true); 282 | try { 283 | const markdown = await Extract2MDConverter.quickConvertOnly(file, { 284 | tesseract: { language: 'eng', oem: 1, psm: 6 } 285 | }); 286 | setResult(markdown); 287 | } catch (error) { 288 | console.error('Conversion failed:', error); 289 | } finally { 290 | setLoading(false); 291 | } 292 | }; 293 | 294 | return ( 295 |
296 |       <input
297 |         type="file"
298 |         accept=".pdf"
299 |         onChange={(e) => handleConvert(e.target.files[0])}
300 |       />
301 |       {loading && <p>Converting...</p>}
302 |       {result && <pre>{result}</pre>}
303 |     </div>
304 | ); 305 | } 306 | ``` 307 | 308 | ### Vue.js Integration 309 | 310 | ```vue 311 | 318 | 319 | 348 | ``` 349 | 350 | ## Support and Maintenance 351 | 352 | ### Documentation 353 | - **API Documentation**: See `README.md` 354 | - **Migration Guide**: See `MIGRATION.md` 355 | - **Examples**: See `examples/` directory 356 | 357 | ### Community Support 358 | - GitHub Issues for bug reports 359 | - GitHub Discussions for questions 360 | - Stack Overflow for implementation help 361 | 362 | ### Commercial Support 363 | Contact the maintainer for commercial support, custom integrations, or enterprise licensing. 364 | 365 | ## Licensing 366 | 367 | The package is distributed under the MIT License, allowing for both commercial and non-commercial use. See `LICENSE` file for full details. 368 | 369 | --- 370 | 371 | **Note**: This deployment guide is for Extract2MD v2.0.0. Check the latest documentation for updates and changes in newer versions. 372 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Extract2MD - Enhanced PDF to Markdown Converter 2 | 3 | 4 | [![NPM Version](https://img.shields.io/npm/v/extract2md.svg)](https://www.npmjs.com/package/extract2md) 5 | [![License](https://img.shields.io/npm/l/extract2md.svg)](https://github.com/hashangit/Extract2MD/blob/main/LICENSE) 6 | [![Downloads](https://img.shields.io/npm/dt/extract2md.svg)](https://www.npmjs.com/package/extract2md) 7 | 8 | [![Sponsor on Patreon](https://img.shields.io/badge/Sponsor%20on-Patreon-F96854?logo=patreon&style=flat)](https://www.patreon.com/HashanWickramasinghe) 9 | 10 | A powerful client-side JavaScript library for converting PDFs to Markdown with multiple extraction methods and optional LLM enhancement. Now with scenario-specific methods for different use cases. 
11 | 12 | ![Extract2MD](https://github.com/user-attachments/assets/0704e80a-54bc-4449-a495-eb944a318400) 13 | 14 | ## 🚀 Quick Start 15 | 16 | Extract2MD now offers 5 distinct scenarios for different conversion needs: 17 | 18 | ```javascript 19 | import Extract2MDConverter from 'extract2md'; 20 | 21 | // Scenario 1: Quick conversion only 22 | const markdown1 = await Extract2MDConverter.quickConvertOnly(pdfFile); 23 | 24 | // Scenario 2: High accuracy OCR conversion only 25 | const markdown2 = await Extract2MDConverter.highAccuracyConvertOnly(pdfFile); 26 | 27 | // Scenario 3: Quick conversion + LLM enhancement 28 | const markdown3 = await Extract2MDConverter.quickConvertWithLLM(pdfFile); 29 | 30 | // Scenario 4: High accuracy conversion + LLM enhancement 31 | const markdown4 = await Extract2MDConverter.highAccuracyConvertWithLLM(pdfFile); 32 | 33 | // Scenario 5: Combined extraction + LLM enhancement (most comprehensive) 34 | const markdown5 = await Extract2MDConverter.combinedConvertWithLLM(pdfFile); 35 | ``` 36 | 37 | ## 📋 Scenarios Explained 38 | 39 | ### Scenario 1: Quick Convert Only 40 | - **Use case**: Fast conversion when PDF has selectable text 41 | - **Method**: `quickConvertOnly(pdfFile, config?)` 42 | - **Tech**: PDF.js text extraction only 43 | - **Output**: Basic markdown formatting 44 | 45 | ### Scenario 2: High Accuracy Convert Only 46 | - **Use case**: PDFs with images, scanned documents, complex layouts 47 | - **Method**: `highAccuracyConvertOnly(pdfFile, config?)` 48 | - **Tech**: Tesseract.js OCR 49 | - **Output**: Markdown from OCR extraction 50 | 51 | ### Scenario 3: Quick Convert + LLM 52 | - **Use case**: Fast extraction with AI enhancement for better formatting 53 | - **Method**: `quickConvertWithLLM(pdfFile, config?)` 54 | - **Tech**: PDF.js + WebLLM 55 | - **Output**: AI-enhanced markdown with improved structure and clarity 56 | 57 | ### Scenario 4: High Accuracy + LLM 58 | - **Use case**: OCR extraction with AI enhancement 59 | - **Method**: `highAccuracyConvertWithLLM(pdfFile, config?)` 60 | - **Tech**: Tesseract.js OCR + WebLLM 61 | - **Output**: AI-enhanced markdown from OCR 62 | 63 | ### Scenario 5: Combined + LLM (Recommended) 64 | - **Use case**: Most comprehensive conversion using both extraction methods 65 | - **Method**: `combinedConvertWithLLM(pdfFile, config?)` 66 | - **Tech**: PDF.js + Tesseract.js + WebLLM with specialized prompts 67 | - **Output**: Best possible markdown leveraging strengths of both extraction methods 68 | 69 | ## ⚙️ Configuration 70 | 71 | Create a configuration object or JSON file to customize behavior: 72 | 73 | ```javascript 74 | const config = { 75 | // PDF.js Worker 76 | pdfJsWorkerSrc: "../pdf.worker.min.mjs", 77 | 78 | // Tesseract OCR Settings 79 | tesseract: { 80 | workerPath: "./tesseract-worker.min.js", 81 | corePath: "./tesseract-core.wasm.js", 82 | langPath: "./lang-data/", 83 | language: "eng", 84 | options: {} 85 | }, 86 | 87 | // LLM Configuration 88 | webllm: { 89 | model: "Qwen3-0.6B-q4f16_1-MLC", 90 | // Optional: Custom model 91 | customModel: { 92 | model: "https://huggingface.co/mlc-ai/your-model/resolve/main/", 93 | model_id: "YourModel-ID", 94 | model_lib: "https://example.com/your-model.wasm", 95 | required_features: ["shader-f16"], 96 | overrides: { conv_template: "qwen" } 97 | }, 98 | options: { 99 | temperature: 0.7, 100 | maxTokens: 4096 101 | } 102 | }, 103 | 104 | // System Prompt Customizations 105 | systemPrompts: { 106 | singleExtraction: "Focus on preserving code examples exactly.", 107 | 
combinedExtraction: "Pay attention to tables and diagrams from OCR." 108 | }, 109 | 110 | // Processing Options 111 | processing: { 112 | splitPascalCase: false, 113 | pdfRenderScale: 2.5, 114 | postProcessRules: [ 115 | { find: /\bAPI\b/g, replace: "API" } 116 | ] 117 | }, 118 | 119 | // Progress Tracking 120 | progressCallback: (progress) => { 121 | console.log(`${progress.stage}: ${progress.message}`); 122 | if (progress.currentPage) { 123 | console.log(`Page ${progress.currentPage}/${progress.totalPages}`); 124 | } 125 | } 126 | }; 127 | 128 | // Use configuration 129 | const markdown = await Extract2MDConverter.combinedConvertWithLLM(pdfFile, config); 130 | ``` 131 | 132 | ## 🔧 Advanced Usage 133 | 134 | ### Using Individual Components 135 | 136 | ```javascript 137 | import { 138 | WebLLMEngine, 139 | OutputParser, 140 | SystemPrompts, 141 | ConfigValidator 142 | } from 'extract2md'; 143 | 144 | // Validate configuration 145 | const validatedConfig = ConfigValidator.validate(userConfig); 146 | 147 | // Initialize WebLLM engine 148 | const engine = new WebLLMEngine(validatedConfig); 149 | await engine.initialize(); 150 | 151 | // Generate text 152 | const result = await engine.generate("Your prompt here"); 153 | 154 | // Parse output 155 | const parser = new OutputParser(); 156 | const cleanMarkdown = parser.parse(result); 157 | ``` 158 | 159 | ### Custom System Prompts 160 | 161 | The library uses different system prompts for different scenarios: 162 | 163 | ```javascript 164 | // For scenarios 3 & 4 (single extraction) 165 | const singlePrompt = SystemPrompts.getSingleExtractionPrompt( 166 | "Additional instruction: Preserve all technical terms." 167 | ); 168 | 169 | // For scenario 5 (combined extraction) 170 | const combinedPrompt = SystemPrompts.getCombinedExtractionPrompt( 171 | "Focus on creating comprehensive documentation." 
172 | ); 173 | ``` 174 | 175 | ### Configuration from JSON 176 | 177 | ```javascript 178 | import { ConfigValidator } from 'extract2md'; 179 | 180 | // Load from JSON string 181 | const config = ConfigValidator.fromJSON(configJsonString); 182 | 183 | // Use with any scenario 184 | const result = await Extract2MDConverter.quickConvertWithLLM(pdfFile, config); 185 | ``` 186 | 187 | ## 🎯 Error Handling & Progress Tracking 188 | 189 | ```javascript 190 | const config = { 191 | progressCallback: (progress) => { 192 | switch (progress.stage) { 193 | case 'scenario_5_start': 194 | console.log('Starting combined conversion...'); 195 | break; 196 | case 'webllm_load_progress': 197 | console.log(`Loading model: ${progress.progress}%`); 198 | break; 199 | case 'ocr_page_process': 200 | console.log(`OCR: ${progress.currentPage}/${progress.totalPages}`); 201 | break; 202 | case 'webllm_generate_start': 203 | console.log('AI enhancement in progress...'); 204 | break; 205 | case 'scenario_5_complete': 206 | console.log('Conversion completed!'); 207 | break; 208 | default: 209 | console.log(`${progress.stage}: ${progress.message}`); 210 | } 211 | 212 | if (progress.error) { 213 | console.error('Error:', progress.error); 214 | } 215 | } 216 | }; 217 | 218 | try { 219 | const result = await Extract2MDConverter.combinedConvertWithLLM(pdfFile, config); 220 | console.log('Success:', result); 221 | } catch (error) { 222 | console.error('Conversion failed:', error.message); 223 | } 224 | ``` 225 | 226 | ## 🔄 Migration from Legacy API 227 | 228 | If you're using the old API, you can still access it: 229 | 230 | ```javascript 231 | import { LegacyExtract2MDConverter } from 'extract2md'; 232 | 233 | // Old way 234 | const converter = new LegacyExtract2MDConverter(options); 235 | const quick = await converter.quickConvert(pdfFile); 236 | const ocr = await converter.highAccuracyConvert(pdfFile); 237 | const enhanced = await converter.llmRewrite(text); 238 | 239 | // New way (recommended) 240 | const quick = await Extract2MDConverter.quickConvertOnly(pdfFile, config); 241 | const ocr = await Extract2MDConverter.highAccuracyConvertOnly(pdfFile, config); 242 | const enhanced = await Extract2MDConverter.quickConvertWithLLM(pdfFile, config); 243 | ``` 244 | 245 | ## 🌟 Features 246 | 247 | - **5 Scenario-Specific Methods**: Choose the right approach for your use case 248 | - **WebLLM Integration**: Client-side AI enhancement with Qwen models 249 | - **Custom Model Support**: Use your own trained models 250 | - **Advanced Output Parsing**: Automatic removal of thinking tags and formatting 251 | - **Comprehensive Configuration**: Fine-tune every aspect of the conversion 252 | - **Progress Tracking**: Real-time updates for UI integration 253 | - **TypeScript Support**: Full type definitions included 254 | - **Backwards Compatible**: Legacy API still available 255 | 256 | ## 📚 TypeScript Support 257 | 258 | Full TypeScript definitions are included: 259 | 260 | ```typescript 261 | import Extract2MDConverter, { 262 | Extract2MDConfig, 263 | ProgressReport, 264 | CustomModelConfig 265 | } from 'extract2md'; 266 | 267 | const config: Extract2MDConfig = { 268 | webllm: { 269 | model: "Qwen3-0.6B-q4f16_1-MLC", 270 | options: { 271 | temperature: 0.7, 272 | maxTokens: 4096 273 | } 274 | }, 275 | progressCallback: (progress: ProgressReport) => { 276 | console.log(progress.stage, progress.message); 277 | } 278 | }; 279 | 280 | const result: string = await Extract2MDConverter.combinedConvertWithLLM(pdfFile, config); 281 | ``` 282 | 283 
| ## 🏗️ Installation & Deployment 284 | 285 | ### NPM Installation 286 | ```bash 287 | npm install extract2md 288 | ``` 289 | 290 | ### CDN Usage 291 | ```html 292 | 293 | 297 | ``` 298 | 299 | ### Worker Files Configuration 300 | The package requires worker files for PDF.js and Tesseract.js. These are automatically copied during build: 301 | 302 | ```javascript 303 | // Default worker paths (adjust for your deployment) 304 | const config = { 305 | pdfJsWorkerSrc: "/pdf.worker.min.mjs", 306 | tesseract: { 307 | workerPath: "/tesseract-worker.min.js", 308 | corePath: "/tesseract-core.wasm.js" 309 | } 310 | }; 311 | ``` 312 | 313 | ### Bundle Size Considerations 314 | - **Total Size**: ~11 MB (includes OCR and PDF processing) 315 | - **PDF.js**: ~950 KB 316 | - **Tesseract.js**: ~4.5 MB 317 | - **WebLLM**: Variable (model-dependent) 318 | 319 | Use lazy loading and code splitting for production deployments. 320 | 321 | ## 📚 Documentation 322 | 323 | - **[Migration Guide](./MIGRATION.md)** - Upgrade from legacy API 324 | - **[Deployment Guide](./DEPLOYMENT.md)** - Production deployment instructions 325 | - **[Examples](./examples/)** - Complete usage examples 326 | - **[How To Run the Demo](./examples/README.md)** - Instructions on how to run the demo 327 | - **[TypeScript Definitions](./src/types/index.d.ts)** - Full type definitions 328 | 329 | ## 📄 License 330 | 331 | MIT License - see LICENSE file for details. 332 | 333 | ## 🤝 Contributing 334 | 335 | Contributions welcome! Please read the contributing guidelines before submitting PRs. 336 | 337 | ## 🐛 Issues 338 | 339 | Report issues on the [GitHub Issues page](https://github.com/hashangit/Extract2MD/issues). 340 | -------------------------------------------------------------------------------- /src/utils/ConfigValidator.js: -------------------------------------------------------------------------------- 1 | /** 2 | * ConfigValidator.js 3 | * Validates and normalizes configuration objects 4 | */ 5 | 6 | export class ConfigValidator { 7 | /** 8 | * Default configuration values 9 | */ 10 | static getDefaultConfig() { 11 | return { 12 | // PDF.js configuration 13 | pdfJsWorkerSrc: '../pdf.worker.min.mjs', 14 | 15 | // Tesseract configuration 16 | tesseract: { 17 | workerPath: './tesseract-worker.min.js', 18 | corePath: './tesseract-core.wasm.js', 19 | langPath: './lang-data/', 20 | language: 'eng', 21 | options: {} 22 | }, 23 | 24 | // LLM configuration 25 | webllm: { 26 | model: 'Qwen3-0.6B-q4f16_1-MLC', 27 | customModel: null, 28 | options: { 29 | temperature: 0.7, 30 | maxTokens: 4096 31 | } 32 | }, 33 | 34 | // System prompt customizations 35 | systemPrompts: { 36 | singleExtraction: '', 37 | combinedExtraction: '' 38 | }, 39 | 40 | // Processing options 41 | processing: { 42 | splitPascalCase: false, 43 | pdfRenderScale: 2.5, 44 | postProcessRules: [] 45 | }, 46 | 47 | // Progress tracking 48 | progressCallback: null 49 | }; 50 | } 51 | 52 | /** 53 | * Validate and normalize a configuration object 54 | * @param {Object} config - Configuration object to validate 55 | * @returns {Object} Validated and normalized configuration 56 | */ 57 | static validate(config = {}) { 58 | const defaultConfig = this.getDefaultConfig(); 59 | const normalizedConfig = this.deepMerge(defaultConfig, config); 60 | 61 | // Validate required types and values 62 | this.validateTesseractConfig(normalizedConfig.tesseract); 63 | this.validateLLMConfig(normalizedConfig.webllm); 64 | this.validateProcessingConfig(normalizedConfig.processing); 65 | 
this.validateSystemPrompts(normalizedConfig.systemPrompts); 66 | 67 | return normalizedConfig; 68 | } 69 | 70 | /** 71 | * Validate Tesseract configuration 72 | * @param {Object} tesseractConfig - Tesseract configuration 73 | */ 74 | static validateTesseractConfig(tesseractConfig) { 75 | if (!tesseractConfig) { 76 | throw new Error('Tesseract configuration is required'); 77 | } 78 | 79 | // Validate language 80 | if (tesseractConfig.language && typeof tesseractConfig.language !== 'string') { 81 | throw new Error('Tesseract language must be a string'); 82 | } 83 | 84 | // Validate paths 85 | const pathFields = ['workerPath', 'corePath', 'langPath']; 86 | for (const field of pathFields) { 87 | if (tesseractConfig[field] && typeof tesseractConfig[field] !== 'string') { 88 | throw new Error(`Tesseract ${field} must be a string`); 89 | } 90 | } 91 | 92 | // Validate options 93 | if (tesseractConfig.options && typeof tesseractConfig.options !== 'object') { 94 | throw new Error('Tesseract options must be an object'); 95 | } 96 | } 97 | 98 | /** 99 | * Validate LLM configuration 100 | * @param {Object} llmConfig - LLM configuration 101 | */ 102 | static validateLLMConfig(llmConfig) { 103 | if (!llmConfig) { 104 | throw new Error('LLM configuration is required'); 105 | } 106 | 107 | // Validate model 108 | if (llmConfig.model && typeof llmConfig.model !== 'string') { 109 | throw new Error('LLM model must be a string'); 110 | } 111 | 112 | // Validate custom model structure 113 | if (llmConfig.customModel) { 114 | this.validateCustomModel(llmConfig.customModel); 115 | } 116 | 117 | // Validate options 118 | if (llmConfig.options) { 119 | this.validateLLMOptions(llmConfig.options); 120 | } 121 | } 122 | 123 | /** 124 | * Validate custom model configuration 125 | * @param {Object} customModel - Custom model configuration 126 | */ 127 | static validateCustomModel(customModel) { 128 | const requiredFields = ['model', 'model_id', 'model_lib']; 129 | 130 | for (const field of requiredFields) { 131 | if (!customModel[field] || typeof customModel[field] !== 'string') { 132 | throw new Error(`Custom model ${field} is required and must be a string`); 133 | } 134 | } 135 | 136 | // Validate optional fields 137 | if (customModel.required_features && !Array.isArray(customModel.required_features)) { 138 | throw new Error('Custom model required_features must be an array'); 139 | } 140 | 141 | if (customModel.overrides && typeof customModel.overrides !== 'object') { 142 | throw new Error('Custom model overrides must be an object'); 143 | } 144 | } 145 | 146 | /** 147 | * Validate LLM options 148 | * @param {Object} options - LLM options 149 | */ 150 | static validateLLMOptions(options) { 151 | if (typeof options !== 'object') { 152 | throw new Error('LLM options must be an object'); 153 | } 154 | 155 | // Validate temperature 156 | if (options.temperature !== undefined) { 157 | if (typeof options.temperature !== 'number' || options.temperature < 0 || options.temperature > 2) { 158 | throw new Error('LLM temperature must be a number between 0 and 2'); 159 | } 160 | } 161 | 162 | // Validate maxTokens 163 | if (options.maxTokens !== undefined) { 164 | if (!Number.isInteger(options.maxTokens) || options.maxTokens < 1) { 165 | throw new Error('LLM maxTokens must be a positive integer'); 166 | } 167 | } 168 | } 169 | 170 | /** 171 | * Validate processing configuration 172 | * @param {Object} processingConfig - Processing configuration 173 | */ 174 | static validateProcessingConfig(processingConfig) { 175 | if 
(!processingConfig) { 176 | throw new Error('Processing configuration is required'); 177 | } 178 | 179 | // Validate splitPascalCase 180 | if (processingConfig.splitPascalCase !== undefined && typeof processingConfig.splitPascalCase !== 'boolean') { 181 | throw new Error('splitPascalCase must be a boolean'); 182 | } 183 | 184 | // Validate pdfRenderScale 185 | if (processingConfig.pdfRenderScale !== undefined) { 186 | if (typeof processingConfig.pdfRenderScale !== 'number' || processingConfig.pdfRenderScale <= 0) { 187 | throw new Error('pdfRenderScale must be a positive number'); 188 | } 189 | } 190 | 191 | // Validate postProcessRules 192 | if (processingConfig.postProcessRules && !Array.isArray(processingConfig.postProcessRules)) { 193 | throw new Error('postProcessRules must be an array'); 194 | } 195 | 196 | if (processingConfig.postProcessRules) { 197 | for (const rule of processingConfig.postProcessRules) { 198 | if (!rule || typeof rule !== 'object') { 199 | throw new Error('Each postProcessRule must be an object'); 200 | } 201 | if (!rule.find) { 202 | throw new Error('Each postProcessRule must have a "find" property'); 203 | } 204 | if (typeof rule.find !== 'string' && !(rule.find instanceof RegExp)) { 205 | throw new Error('Each postProcessRule "find" property must be a string or RegExp'); 206 | } 207 | if (typeof rule.replace !== 'string') { 208 | throw new Error('Each postProcessRule must have a "replace" string property'); 209 | } 210 | } 211 | } 212 | } 213 | 214 | /** 215 | * Validate system prompts configuration 216 | * @param {Object} systemPrompts - System prompts configuration 217 | */ 218 | static validateSystemPrompts(systemPrompts) { 219 | if (!systemPrompts) { 220 | throw new Error('System prompts configuration is required'); 221 | } 222 | 223 | const promptTypes = ['singleExtraction', 'combinedExtraction']; 224 | for (const promptType of promptTypes) { 225 | if (systemPrompts[promptType] !== undefined && typeof systemPrompts[promptType] !== 'string') { 226 | throw new Error(`System prompt ${promptType} must be a string`); 227 | } 228 | } 229 | } 230 | 231 | /** 232 | * Deep merge two objects 233 | * @param {Object} target - Target object 234 | * @param {Object} source - Source object 235 | * @returns {Object} Merged object 236 | */ 237 | static deepMerge(target, source) { 238 | const result = { ...target }; 239 | 240 | for (const key in source) { 241 | if (Object.prototype.hasOwnProperty.call(source, key)) { 242 | if (this.isObject(source[key]) && this.isObject(target[key])) { 243 | result[key] = this.deepMerge(target[key], source[key]); 244 | } else { 245 | result[key] = source[key]; 246 | } 247 | } 248 | } 249 | 250 | return result; 251 | } 252 | 253 | /** 254 | * Check if value is a plain object 255 | * @param {*} value - Value to check 256 | * @returns {boolean} Whether value is a plain object 257 | */ 258 | static isObject(value) { 259 | return value !== null && typeof value === 'object' && !Array.isArray(value); 260 | } 261 | 262 | /** 263 | * Create a configuration object from a JSON string or file content 264 | * @param {string} jsonString - JSON configuration string 265 | * @returns {Object} Parsed and validated configuration 266 | */ 267 | static fromJSON(jsonString) { 268 | try { 269 | const config = JSON.parse(jsonString); 270 | return this.validate(config); 271 | } catch (error) { 272 | if (error instanceof SyntaxError) { 273 | throw new Error(`Invalid JSON configuration: ${error.message}`); 274 | } 275 | throw error; 276 | } 277 | } 278 | 279 | /** 
280 | * Get configuration schema for documentation 281 | * @returns {Object} Configuration schema 282 | */ 283 | static getSchema() { 284 | return { 285 | type: 'object', 286 | properties: { 287 | pdfJsWorkerSrc: { 288 | type: 'string', 289 | description: 'Path to PDF.js worker file' 290 | }, 291 | tesseract: { 292 | type: 'object', 293 | properties: { 294 | workerPath: { type: 'string', description: 'Path to Tesseract worker' }, 295 | corePath: { type: 'string', description: 'Path to Tesseract core WASM' }, 296 | langPath: { type: 'string', description: 'Path to language data directory' }, 297 | language: { type: 'string', description: 'OCR language code' }, 298 | options: { type: 'object', description: 'Additional Tesseract options' } 299 | } 300 | }, 301 | webllm: { 302 | type: 'object', 303 | properties: { 304 | model: { type: 'string', description: 'Model identifier' }, 305 | customModel: { 306 | type: 'object', 307 | description: 'Custom model configuration', 308 | properties: { 309 | model: { type: 'string', description: 'Model URL' }, 310 | model_id: { type: 'string', description: 'Model identifier' }, 311 | model_lib: { type: 'string', description: 'Model library URL' }, 312 | required_features: { type: 'array', description: 'Required GPU features' }, 313 | overrides: { type: 'object', description: 'Model override settings' } 314 | } 315 | }, 316 | options: { 317 | type: 'object', 318 | properties: { 319 | temperature: { type: 'number', minimum: 0, maximum: 2 }, 320 | maxTokens: { type: 'integer', minimum: 1 } 321 | } 322 | } 323 | } 324 | }, 325 | systemPrompts: { 326 | type: 'object', 327 | properties: { 328 | singleExtraction: { type: 'string', description: 'Custom prompt for single extraction scenarios' }, 329 | combinedExtraction: { type: 'string', description: 'Custom prompt for combined extraction scenario' } 330 | } 331 | }, 332 | processing: { 333 | type: 'object', 334 | properties: { 335 | splitPascalCase: { type: 'boolean', description: 'Split PascalCase words' }, 336 | pdfRenderScale: { type: 'number', minimum: 0, description: 'PDF rendering scale for OCR' }, 337 | postProcessRules: { 338 | type: 'array', 339 | items: { 340 | type: 'object', 341 | properties: { 342 | find: { description: 'RegExp or string to find' }, 343 | replace: { type: 'string', description: 'Replacement string' } 344 | }, 345 | required: ['find', 'replace'] 346 | } 347 | } 348 | } 349 | }, 350 | progressCallback: { description: 'Function to handle progress updates' } 351 | } 352 | }; 353 | } 354 | } 355 | 356 | export default ConfigValidator; 357 | -------------------------------------------------------------------------------- /examples/demo.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Extract2MD Demo 7 | 167 | 168 | 169 |
[demo.html: the markup, inline styles, and demo script were stripped during extraction; the recoverable page text follows]

🚀 Extract2MD Enhanced Demo
Choose your conversion scenario and upload a PDF to see the magic happen!

1. Quick Convert Only - Tech: PDF.js text extraction - Fast conversion for PDFs with selectable text. Basic markdown formatting.
2. High Accuracy OCR Only - Tech: Tesseract.js OCR - OCR extraction for scanned documents, images, and complex layouts.
3. Quick + LLM Enhancement - Tech: PDF.js + WebLLM - Fast extraction with AI enhancement for better structure and clarity.
4. OCR + LLM Enhancement - Tech: Tesseract.js + WebLLM - OCR extraction with AI enhancement for comprehensive results.
5. Combined + LLM (Recommended) - Tech: PDF.js + Tesseract.js + WebLLM - Best results using both extraction methods with specialized AI prompts.

Selected Scenario: 5. Combined + LLM (Recommended)
Note: LLM scenarios require WebGPU support and will download models on first use.

Initializing...
Conversion Result
230 | 231 | 232 | 282 | 283 | 412 | 413 | 414 | 415 | 416 | -------------------------------------------------------------------------------- /src/index.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Extract2MD - Enhanced PDF to Markdown conversion library 3 | * New API with scenario-specific methods for different use cases 4 | */ 5 | 6 | // Import new modular components 7 | import Extract2MDConverter from './converters/Extract2MDConverter.js'; 8 | import WebLLMEngine from './engines/WebLLMEngine.js'; 9 | import OutputParser from './utils/OutputParser.js'; 10 | import SystemPrompts from './utils/SystemPrompts.js'; 11 | import ConfigValidator from './utils/ConfigValidator.js'; 12 | 13 | // Legacy imports for backwards compatibility 14 | import * as pdfjsLib from 'pdfjs-dist/build/pdf.mjs'; 15 | import Tesseract from 'tesseract.js'; 16 | import { Chat as ImportedChat, CreateMLCEngine as ImportedCreateMLCEngine } from '@mlc-ai/web-llm'; 17 | import * as webllm from '@mlc-ai/web-llm'; 18 | 19 | const DEFAULT_PDFJS_WORKER_SRC = '../pdf.worker.min.mjs'; // Relative to dist/assets/ 20 | const DEFAULT_TESSERACT_WORKER_PATH = './tesseract-worker.min.js'; // Relative to dist/assets/ 21 | const DEFAULT_TESSERACT_CORE_PATH = './tesseract-core.wasm.js'; // Relative to dist/assets/ 22 | const DEFAULT_TESSERACT_LANG_PATH = './lang-data/'; // Relative to dist/assets/ 23 | const DEFAULT_LLM_MODEL = 'Qwen3-0.6B-q4f16_1-MLC'; // Updated to match available WASM 24 | const DEFAULT_LLM_MODEL_LIB_URL = 'https://raw.githubusercontent.com/mlc-ai/binary-mlc-llm-libs/main/web-llm-models/v0_2_48/Qwen3-0.6B-q4f16_1-ctx4k_cs1k-webgpu.wasm'; 25 | 26 | // Legacy converter class for backwards compatibility 27 | class LegacyExtract2MDConverter { 28 | constructor(options = {}) { 29 | this.pdfJsWorkerSrc = options.pdfJsWorkerSrc || DEFAULT_PDFJS_WORKER_SRC; 30 | const pdfjsSetupLib = (typeof pdfjsLib !== 'undefined' ? pdfjsLib : (typeof window !== 'undefined' ? window.pdfjsLib : null)); 31 | if (pdfjsSetupLib && pdfjsSetupLib.GlobalWorkerOptions) { 32 | pdfjsSetupLib.GlobalWorkerOptions.workerSrc = this.pdfJsWorkerSrc; 33 | } else { 34 | console.warn('pdfjsLib or pdfjsLib.GlobalWorkerOptions is not defined. 
PDF.js worker may not load correctly if not already configured globally.'); 35 | } 36 | 37 | this.tesseractOptions = { 38 | workerPath: options.tesseractWorkerPath || DEFAULT_TESSERACT_WORKER_PATH, 39 | corePath: options.tesseractCorePath || DEFAULT_TESSERACT_CORE_PATH, 40 | langPath: options.tesseractLangPath || DEFAULT_TESSERACT_LANG_PATH, 41 | ...(options.tesseractOptions || {}) 42 | }; 43 | this.tesseractLanguage = options.tesseractLanguage || 'eng'; // Default to English 44 | this.splitPascalCase = options.splitPascalCase || false; 45 | 46 | this.defaultPostProcessRules = [ 47 | { find: /\uFB00/g, replace: 'ff' }, 48 | { find: /\uFB01/g, replace: 'fi' }, 49 | { find: /\uFB02/g, replace: 'fl' }, 50 | { find: /\uFB03/g, replace: 'ffi' }, 51 | { find: /\uFB04/g, replace: 'ffl' }, 52 | { find: /[\u2018\u2019]/g, replace: "'" }, 53 | { find: /[\u201C\u201D]/g, replace: '"' }, 54 | { find: /[\u2022\u2023\u25E6\u2043\u2219\u25CF\u25CB\u2981\u2619\u2765]/g, replace: '-' }, 55 | { find: /[\u2013\u2014]/g, replace: '-' }, 56 | { find: /\u00AD/g, replace: '' }, 57 | { find: /[\s\u00A0\u2000-\u200A\u202F\u205F\u3000]+/g, replace: ' ' }, 58 | ]; 59 | 60 | if (this.splitPascalCase) { 61 | this.defaultPostProcessRules.push( 62 | { find: /([A-Z][a-z]+)([A-Z][a-z]+)/g, replace: '$1 $2' }, 63 | { find: /([a-z])([A-Z][a-z]+)/g, replace: '$1 $2' }, 64 | { find: /([A-Z][a-z]+)([A-Z][a-z]+)/g, replace: '$1 $2' } 65 | ); 66 | } 67 | this.customPostProcessRules = options.postProcessRules || []; 68 | 69 | this.llmModel = options.llmModel || DEFAULT_LLM_MODEL; 70 | this.llmModelLibUrl = options.llmModelLibUrl || null; // New option for user-specified model_lib 71 | this.chatModule = null; 72 | this.llmInitialized = false; 73 | 74 | this.progressCallback = options.progressCallback || function(progress) { /* console.log(progress) */ }; 75 | 76 | this.WebLLMChatConstructor = null; // For fallback 77 | this.WebLLMCreateEngine = null; 78 | this.webllmModule = null; 79 | 80 | // Try to get the full webllm module for modelLibURLPrefix and modelVersion 81 | if (typeof webllm !== 'undefined' && webllm.CreateMLCEngine) { 82 | this.webllmModule = webllm; 83 | this.WebLLMCreateEngine = webllm.CreateMLCEngine; 84 | this.WebLLMChatConstructor = webllm.Chat; // Also get Chat from the main module 85 | } else if (typeof window !== 'undefined' && window.webLLM && typeof window.webLLM.CreateMLCEngine === 'function') { 86 | this.webllmModule = window.webLLM; 87 | this.WebLLMCreateEngine = window.webLLM.CreateMLCEngine; 88 | this.WebLLMChatConstructor = window.webLLM.Chat; 89 | } else { 90 | // Fallback if full module import didn't work as expected, try individual imports 91 | console.warn('Extract2MD_Debug: Full webllm module not found, relying on individual imports/globals for CreateMLCEngine/Chat.'); 92 | if (typeof ImportedCreateMLCEngine !== 'undefined') { 93 | this.WebLLMCreateEngine = ImportedCreateMLCEngine; 94 | } else if (typeof window !== 'undefined' && window.webLLM && typeof window.webLLM.CreateMLCEngine === 'function') { // Redundant but safe 95 | this.WebLLMCreateEngine = window.webLLM.CreateMLCEngine; 96 | } 97 | // Fallback for Chat constructor 98 | if (typeof ImportedChat !== 'undefined') { 99 | this.WebLLMChatConstructor = ImportedChat; 100 | } else if (typeof window !== 'undefined' && window.webLLM && typeof window.webLLM.Chat === 'function') { // Redundant 101 | this.WebLLMChatConstructor = window.webLLM.Chat; 102 | } 103 | } 104 | } 105 | 106 | _postProcessText(text, additionalRules = []) { 107 | if (!text) 
return ''; 108 | let cleanedText = text; 109 | const allRules = [...this.defaultPostProcessRules, ...this.customPostProcessRules, ...additionalRules]; 110 | 111 | // Optimized rule application - batch similar operations 112 | const unicodeReplacements = []; 113 | const regexReplacements = []; 114 | 115 | for (const rule of allRules) { 116 | if (rule.find && typeof rule.replace === 'string') { 117 | if (rule.find instanceof RegExp) { 118 | regexReplacements.push(rule); 119 | } else { 120 | unicodeReplacements.push(rule); 121 | } 122 | } 123 | } 124 | 125 | // Apply unicode replacements first (typically simpler) 126 | for (const rule of unicodeReplacements) { 127 | cleanedText = cleanedText.replace(rule.find, rule.replace); 128 | } 129 | 130 | // Apply regex replacements 131 | for (const rule of regexReplacements) { 132 | cleanedText = cleanedText.replace(rule.find, rule.replace); 133 | } 134 | 135 | return cleanedText.trim(); 136 | } 137 | 138 | _convertToMarkdownLogic(rawText) { 139 | let markdownOutputLines = []; 140 | const inputLines = rawText.split(/\n/); 141 | 142 | let currentParagraphCollector = []; 143 | let inPotentialTableBlock = false; 144 | let potentialTableBlockLines = []; 145 | 146 | const flushCurrentParagraph = () => { 147 | if (currentParagraphCollector.length > 0) { 148 | markdownOutputLines.push(currentParagraphCollector.join(' ').trim()); 149 | currentParagraphCollector = []; 150 | // Only add empty line if the next content isn't a heading or table block 151 | this._addSeparatorLine(markdownOutputLines); 152 | } 153 | }; 154 | 155 | const flushPotentialTableBlock = () => { 156 | if (potentialTableBlockLines.length > 0) { 157 | if (potentialTableBlockLines.length >= 2) { // Heuristic: at least 2 lines for a table/code block 158 | markdownOutputLines.push('```'); 159 | markdownOutputLines.push(...potentialTableBlockLines.map(l => l.trimEnd())); 160 | markdownOutputLines.push('```'); 161 | } else { 162 | markdownOutputLines.push(potentialTableBlockLines.join(' ').trim()); 163 | } 164 | potentialTableBlockLines = []; 165 | this._addSeparatorLine(markdownOutputLines); 166 | } 167 | inPotentialTableBlock = false; 168 | }; 169 | 170 | for (let i = 0; i < inputLines.length; i++) { 171 | const originalLine = inputLines[i]; 172 | const trimmedLine = originalLine.trim(); 173 | 174 | if (trimmedLine === '') { 175 | if (inPotentialTableBlock) flushPotentialTableBlock(); 176 | flushCurrentParagraph(); 177 | continue; 178 | } 179 | 180 | const isShortLine = trimmedLine.length > 0 && trimmedLine.length < 80; 181 | const noPunctuationEnd = isShortLine && !/[.,;:!?]$/.test(trimmedLine); 182 | const isAllCapsLine = trimmedLine.length > 2 && trimmedLine.length < 80 && /^[A-Z\s\d\W]*[A-Z][A-Z\s\d\W]*$/.test(trimmedLine) && /[A-Z]/.test(trimmedLine) && !/^\d+$/.test(trimmedLine); 183 | const nextLineIsBlankOrEndOfFile = (i + 1 === inputLines.length || inputLines[i + 1].trim() === ''); 184 | 185 | if (isAllCapsLine || (isShortLine && noPunctuationEnd && nextLineIsBlankOrEndOfFile && trimmedLine.length > 1)) { 186 | if (inPotentialTableBlock) flushPotentialTableBlock(); 187 | flushCurrentParagraph(); 188 | markdownOutputLines.push(`# ${trimmedLine}`); 189 | this._addSeparatorLine(markdownOutputLines); 190 | if (nextLineIsBlankOrEndOfFile && inputLines[i+1] && inputLines[i + 1].trim() === '') { 191 | i++; 192 | } 193 | continue; 194 | } 195 | 196 | const hasMultipleSpacesBetweenWords = /\S\s{2,}\S/.test(originalLine); 197 | const hasMultipleColumnsBySpaces = originalLine.split(/\s{2,}/).length 
> 2 && originalLine.length > 10; 198 | 199 | if (hasMultipleSpacesBetweenWords || hasMultipleColumnsBySpaces) { 200 | flushCurrentParagraph(); 201 | if (!inPotentialTableBlock) inPotentialTableBlock = true; 202 | potentialTableBlockLines.push(originalLine); 203 | } else { 204 | if (inPotentialTableBlock) flushPotentialTableBlock(); 205 | if (trimmedLine) currentParagraphCollector.push(trimmedLine); 206 | } 207 | } 208 | 209 | if (inPotentialTableBlock) flushPotentialTableBlock(); 210 | flushCurrentParagraph(); 211 | 212 | // Optimized final cleanup - single pass to normalize excessive newlines 213 | return this._normalizeMarkdownNewlines(markdownOutputLines); 214 | } 215 | 216 | /** 217 | * Helper method to add separator lines only when needed 218 | */ 219 | _addSeparatorLine(outputLines) { 220 | // Only add empty line if the last line isn't already empty 221 | if (outputLines.length > 0 && outputLines[outputLines.length - 1] !== '') { 222 | outputLines.push(''); 223 | } 224 | } 225 | 226 | /** 227 | * Normalize newlines in the final markdown output 228 | */ 229 | _normalizeMarkdownNewlines(lines) { 230 | // Filter out excessive empty lines while preserving structure 231 | const normalizedLines = []; 232 | let consecutiveEmptyLines = 0; 233 | 234 | for (const line of lines) { 235 | if (line.trim() === '') { 236 | consecutiveEmptyLines++; 237 | // Allow maximum of 1 consecutive empty line 238 | if (consecutiveEmptyLines <= 1) { 239 | normalizedLines.push(''); 240 | } 241 | } else { 242 | consecutiveEmptyLines = 0; 243 | normalizedLines.push(line.trimEnd()); 244 | } 245 | } 246 | 247 | // Join and do final cleanup 248 | let finalMarkdown = normalizedLines.join('\n'); 249 | // Remove any remaining triple+ newlines and trim 250 | finalMarkdown = finalMarkdown.replace(/\n{3,}/g, '\n\n').trim(); 251 | return finalMarkdown; 252 | } 253 | 254 | async _extractTextWithPdfJs(fileArrayBuffer) { 255 | const pdfjs = (typeof pdfjsLib !== 'undefined' ? pdfjsLib : (typeof window !== 'undefined' ? window.pdfjsLib : null)); 256 | if (!pdfjs || !pdfjs.getDocument) { 257 | throw new Error('pdf.js library (pdfjsLib) is not loaded or not fully initialized.'); 258 | } 259 | 260 | this.progressCallback({ stage: 'pdfjs_load', message: 'Loading PDF with pdf.js...' 
}); 261 | const pdfDoc = await pdfjs.getDocument({ data: fileArrayBuffer }).promise; 262 | let fullText = ''; 263 | const numPages = pdfDoc.numPages; 264 | 265 | for (let pageNum = 1; pageNum <= numPages; pageNum++) { 266 | this.progressCallback({ stage: 'pdfjs_page', message: `Extracting text from page ${pageNum}/${numPages}...`, currentPage: pageNum, totalPages: numPages }); 267 | const page = await pdfDoc.getPage(pageNum); 268 | const textContent = await page.getTextContent({ 269 | normalizeWhitespace: false, 270 | disableCombineTextItems: true 271 | }); 272 | let pageTextBuffer = ''; 273 | if (textContent.items && textContent.items.length > 0) { 274 | for (let i = 0; i < textContent.items.length; i++) { 275 | const item = textContent.items[i]; 276 | pageTextBuffer += item.str; 277 | if (item.hasEOL) { 278 | if (!pageTextBuffer.endsWith('\n')) pageTextBuffer += '\n'; 279 | } else if (i < textContent.items.length - 1) { 280 | const nextItem = textContent.items[i+1]; 281 | if (item.str && !item.str.endsWith(' ') && nextItem.str && !nextItem.str.startsWith(' ') && Math.abs(item.transform[5] - nextItem.transform[5]) < (item.height * 0.5)) { 282 | const currentItemEndX = item.transform[4] + item.width; 283 | const nextItemStartX = nextItem.transform[4]; 284 | if (nextItemStartX - currentItemEndX > -0.5) { 285 | pageTextBuffer += ' '; 286 | } 287 | } 288 | } 289 | } 290 | } 291 | fullText += pageTextBuffer; 292 | if (pageTextBuffer.trim() !== '' && !pageTextBuffer.endsWith('\n')) fullText += '\n'; 293 | } 294 | this.progressCallback({ stage: 'pdfjs_extract_complete', message: 'pdf.js text extraction complete.' }); 295 | return fullText; 296 | } 297 | 298 | async quickConvert(pdfFile, options = {}) { 299 | if (!(pdfFile instanceof File)) throw new Error('Invalid input: pdfFile must be a File object.'); 300 | this.progressCallback({ stage: 'start_quick', message: 'Starting quick conversion...' }); 301 | const arrayBuffer = await pdfFile.arrayBuffer(); 302 | let rawText = await this._extractTextWithPdfJs(arrayBuffer); 303 | 304 | this.progressCallback({ stage: 'postprocess_quick', message: 'Post-processing extracted text...' }); 305 | let cleanedText = this._postProcessText(rawText, options.postProcessRules); 306 | cleanedText = cleanedText.replace(/\r\n/g, '\n').replace(/\n{2,}/g, '\n\n').trim(); 307 | 308 | this.progressCallback({ stage: 'markdown_quick', message: 'Converting to Markdown...' }); 309 | const markdown = this._convertToMarkdownLogic(cleanedText); 310 | this.progressCallback({ stage: 'complete_quick', message: 'Quick conversion complete.' }); 311 | return markdown; 312 | } 313 | 314 | async highAccuracyConvert(pdfFile, options = {}) { 315 | if (!(pdfFile instanceof File)) throw new Error('Invalid input: pdfFile must be a File object.'); 316 | const pdfjs = (typeof pdfjsLib !== 'undefined' ? pdfjsLib : (typeof window !== 'undefined' ? window.pdfjsLib : null)); 317 | if (!pdfjs || !pdfjs.getDocument) throw new Error('pdf.js library (pdfjsLib) is not loaded or not fully initialized.'); 318 | const Tess = (typeof Tesseract !== 'undefined' ? Tesseract : (typeof window !== 'undefined' ? window.Tesseract : null)); 319 | if (!Tess) throw new Error('Tesseract.js library is not loaded.'); 320 | 321 | this.progressCallback({ stage: 'start_ocr', message: 'Starting high-accuracy OCR conversion...' 
}); 322 | 323 | const tesseractLang = options.tesseractLanguage || this.tesseractLanguage; 324 | const tesseractOpts = { ...this.tesseractOptions, ...(options.tesseractOptions || {}) }; // Merge instance and call options 325 | const pdfRenderScale = options.pdfRenderScale || 2.5; 326 | 327 | let worker; 328 | let workerInitialized = false; 329 | 330 | try { 331 | this.progressCallback({ stage: 'ocr_worker_init', message: 'Initializing Tesseract OCR worker...' }); 332 | 333 | try { 334 | // Set timeout for worker initialization 335 | const workerPromise = Tess.createWorker(tesseractLang, 1, tesseractOpts); 336 | 337 | // Add timeout to prevent hanging 338 | const timeoutPromise = new Promise((_, reject) => { 339 | setTimeout(() => reject(new Error('Worker initialization timed out after 30 seconds')), 30000); 340 | }); 341 | 342 | worker = await Promise.race([workerPromise, timeoutPromise]); 343 | workerInitialized = true; 344 | 345 | this.progressCallback({ stage: 'ocr_worker_ready', message: 'OCR worker initialized successfully.' }); 346 | } catch (err) { 347 | this.progressCallback({ stage: 'ocr_worker_error', message: `Failed to initialize Tesseract worker: ${err.message}`, error: err }); 348 | throw new Error(`Failed to initialize Tesseract worker: ${err.message}. Check if Tesseract.js files are accessible and language data is available.`); 349 | } 350 | 351 | const arrayBuffer = await pdfFile.arrayBuffer(); 352 | const pdfDoc = await pdfjs.getDocument({ data: arrayBuffer }).promise; 353 | let fullTextAccumulator = ''; 354 | const numPages = pdfDoc.numPages; 355 | 356 | for (let pageNum = 1; pageNum <= numPages; pageNum++) { 357 | this.progressCallback({ stage: 'ocr_render_page', message: `Rendering page ${pageNum}/${numPages} for OCR...`, currentPage: pageNum, totalPages: numPages }); 358 | 359 | const page = await pdfDoc.getPage(pageNum); 360 | const viewport = page.getViewport({ scale: pdfRenderScale }); 361 | 362 | const canvas = document.createElement('canvas'); 363 | const context = canvas.getContext('2d'); 364 | canvas.height = viewport.height; 365 | canvas.width = viewport.width; 366 | 367 | try { 368 | await page.render({ canvasContext: context, viewport: viewport }).promise; 369 | 370 | this.progressCallback({ stage: 'ocr_recognize_page', message: `OCR processing page ${pageNum}/${numPages}...`, currentPage: pageNum, totalPages: numPages }); 371 | const recognition = await worker.recognize(canvas); 372 | const ocrPageText = recognition.data?.text || ''; 373 | fullTextAccumulator += ocrPageText + '\n'; 374 | 375 | } catch (pageError) { 376 | this.progressCallback({ stage: 'ocr_page_warning', message: `Warning: Failed to process page ${pageNum}: ${pageError.message}` }); 377 | console.warn(`OCR processing failed for page ${pageNum}:`, pageError); 378 | // Continue with other pages instead of failing completely 379 | } finally { 380 | // Clean up canvas resources 381 | canvas.width = 0; 382 | canvas.height = 0; 383 | } 384 | } 385 | 386 | // Safely terminate worker 387 | if (workerInitialized && worker) { 388 | try { 389 | this.progressCallback({ stage: 'ocr_terminate_worker', message: 'Terminating Tesseract worker...' 
}); 390 | 391 | await Promise.race([ 392 | worker.terminate(), 393 | new Promise((_, reject) => { 394 | setTimeout(() => reject(new Error('Worker termination timed out')), 10000); 395 | }) 396 | ]); 397 | } catch (terminateError) { 398 | console.warn('Warning: Failed to properly terminate Tesseract worker:', terminateError); 399 | // Don't throw error for termination issues 400 | } 401 | } 402 | } catch (error) { 403 | // Enhanced cleanup on error 404 | if (workerInitialized && worker) { 405 | try { 406 | await Promise.race([ 407 | worker.terminate(), 408 | new Promise((resolve) => setTimeout(resolve, 5000)) // Give up after 5 seconds 409 | ]); 410 | } catch (cleanupError) { 411 | console.warn('Failed to cleanup worker after error:', cleanupError); 412 | } 413 | } 414 | throw error; 415 | } 416 | 417 | this.progressCallback({ stage: 'postprocess_ocr', message: 'Post-processing OCR text...' }); 418 | let cleanedText = this._postProcessText(fullTextAccumulator, options.postProcessRules); 419 | cleanedText = cleanedText.replace(/\r\n/g, '\n').replace(/\n{2,}/g, '\n\n').trim(); 420 | 421 | this.progressCallback({ stage: 'markdown_ocr', message: 'Converting to Markdown...' }); 422 | const markdown = this._convertToMarkdownLogic(cleanedText); 423 | this.progressCallback({ stage: 'complete_ocr', message: 'High-accuracy conversion complete.' }); 424 | return markdown; 425 | } 426 | 427 | async _initializeLLM(modelId, chatOpts = {}) { 428 | if (!this.WebLLMCreateEngine && !this.WebLLMChatConstructor) { 429 | throw new Error('WebLLM (CreateMLCEngine or Chat) module is not loaded. Ensure @mlc-ai/web-llm is correctly imported/bundled, or webLLM is globally available.'); 430 | } 431 | 432 | // Check if LLM is already initialized with the same model. 433 | // For CreateMLCEngine, modelId is part of the engine. For Chat, we stored it. 434 | const currentModelId = this.chatModule ? (this.chatModule.modelId || (this.chatModule.config && this.chatModule.config.model_id)) : null; 435 | if (this.llmInitialized && this.chatModule && currentModelId === modelId) { 436 | this.progressCallback({ stage: 'llm_ready', message: 'LLM already initialized with the correct model.' }); 437 | return; 438 | } 439 | 440 | this.progressCallback({ stage: 'llm_init', message: `Initializing LLM with model: ${modelId}... This may take time.` }); 441 | 442 | if (this.chatModule && typeof this.chatModule.unload === 'function') { 443 | await this.chatModule.unload(); 444 | this.chatModule = null; // Ensure it's cleared 445 | } 446 | this.llmInitialized = false; 447 | 448 | 449 | const llmInitProgressCallback = report => { 450 | this.progressCallback({ 451 | stage: 'llm_load_progress', 452 | message: `LLM Loading: ${report.text}`, 453 | progress: report.progress 454 | }); 455 | }; 456 | 457 | try { 458 | if (this.WebLLMCreateEngine) { 459 | let modelLibToUse; 460 | 461 | if (this.llmModelLibUrl) { 462 | // User provided a specific model_lib URL 463 | modelLibToUse = this.llmModelLibUrl; 464 | } else if (modelId === DEFAULT_LLM_MODEL) { 465 | // Use the hardcoded default model_lib URL for the default model 466 | modelLibToUse = DEFAULT_LLM_MODEL_LIB_URL; 467 | } else { 468 | // No specific URL provided by user, and it's not the default model with a known URL 469 | throw new Error( 470 | `Extract2MD Error: 'model_lib' URL not specified for model '${modelId}'. 
` + 471 | `Please provide it via the 'llmModelLibUrl' constructor option, ` + 472 | `or use the default model ('${DEFAULT_LLM_MODEL}').` 473 | ); 474 | } 475 | 476 | const appConfig = { 477 | model_list: [ 478 | { 479 | "model": `https://huggingface.co/mlc-ai/${modelId}/resolve/main/`, 480 | "model_id": modelId, 481 | "model_lib": modelLibToUse, 482 | "required_features": modelId.includes("f16") ? ["shader-f16"] : [], 483 | "overrides": { 484 | "conv_template": "qwen" 485 | } 486 | } 487 | ] 488 | }; 489 | 490 | const engineConfig = { 491 | ...chatOpts, 492 | initProgressCallback: llmInitProgressCallback, 493 | appConfig: appConfig // Pass the constructed appConfig 494 | }; 495 | this.chatModule = await this.WebLLMCreateEngine(modelId, engineConfig); 496 | // CreateMLCEngine loads the model, so no separate reload needed immediately. 497 | // We can store modelId if needed for future checks, though engine usually has it. 498 | if(this.chatModule) this.chatModule.modelId = modelId; // For consistency if checked later 499 | } else if (this.WebLLMChatConstructor) { 500 | // Fallback to Chat constructor - this is the path that had issues 501 | this.chatModule = new this.WebLLMChatConstructor(); 502 | if(this.chatModule) this.chatModule.modelId = modelId; // Store modelId for Chat instances 503 | 504 | const finalChatOpts = { 505 | ...chatOpts, 506 | initProgressCallback: llmInitProgressCallback 507 | }; 508 | if (typeof this.chatModule.reload !== 'function') { 509 | throw new Error('this.chatModule.reload is not a function (Chat fallback path).'); 510 | } 511 | await this.chatModule.reload(modelId, finalChatOpts); 512 | } else { 513 | throw new Error('No valid WebLLM constructor found.'); 514 | } 515 | 516 | this.llmInitialized = true; 517 | this.progressCallback({ stage: 'llm_init_complete', message: 'LLM initialized successfully.' }); 518 | } catch (err) { 519 | this.llmInitialized = false; 520 | this.progressCallback({ stage: 'llm_init_error', message: `LLM initialization failed: ${err.message}`, error: err }); 521 | throw new Error(`LLM initialization failed: ${err.message}`); 522 | } 523 | } 524 | 525 | async llmRewrite(textToRewrite, options = {}) { 526 | const model = options.llmModel || this.llmModel; 527 | const promptTemplate = options.llmPromptTemplate || 528 | ((text) => `Please rewrite the following text, which was extracted from a PDF. Aim to improve its clarity, correct grammatical errors, and enhance its flow and professional tone, while preserving the original meaning, information, details, context and structure. Correct spelling errors in common words (do not change spelling in uncommon words like names, places, brands, etc.). Output only the rewritten text.\n\nOriginal Text:\n${text}\n\nRewritten Text:`); 529 | 530 | const chatOpts = options.chatOpts || {}; 531 | 532 | await this._initializeLLM(model, chatOpts); 533 | if (!this.llmInitialized || !this.chatModule) { 534 | throw new Error('LLM could not be initialized or is not ready.'); 535 | } 536 | 537 | const prompt = promptTemplate(textToRewrite); 538 | this.progressCallback({ stage: 'llm_generate_start', message: 'LLM generating rewritten text...' }); 539 | 540 | try { 541 | // The generate method in newer web-llm might return a ChatCompletion object. 542 | // We need to access the message content. 543 | // For simplicity, assuming it's similar to the previous structure or a direct string. 544 | // If it returns a more complex object, this part might need adjustment based on the exact API of webLLM.Chat. 
545 | let replyContent = ''; 546 | if (this.WebLLMCreateEngine && this.chatModule && this.chatModule.chat && typeof this.chatModule.chat.completions.create === 'function') { 547 | // Using MLCEngine's OpenAI-compatible API 548 | const chatCompletion = await this.chatModule.chat.completions.create({ 549 | messages: [{ role: "user", content: prompt }], 550 | model: model // Ensure 'model' here is the modelId used for the engine 551 | }); 552 | if (chatCompletion.choices && chatCompletion.choices.length > 0 && chatCompletion.choices[0].message) { 553 | replyContent = chatCompletion.choices[0].message.content || ''; 554 | } 555 | } else if (this.chatModule && typeof this.chatModule.generate === 'function') { 556 | // Fallback or direct Chat.generate usage 557 | replyContent = await this.chatModule.generate(prompt, undefined, 0); // progressCb and streamInterval to undefined/0 558 | } else { 559 | throw new Error('LLM module does not support generate or chat.completions.create'); 560 | } 561 | 562 | this.progressCallback({ stage: 'llm_generate_complete', message: 'LLM rewrite complete.' }); 563 | return replyContent; 564 | } catch (err) { 565 | this.progressCallback({ stage: 'llm_generate_error', message: `LLM generation failed: ${err.message}`, error: err }); 566 | throw new Error(`LLM generation failed: ${err.message}`); 567 | } 568 | } 569 | 570 | async unloadLLM() { 571 | if (this.chatModule) { 572 | this.progressCallback({ stage: 'llm_unload', message: 'Unloading LLM model...' }); 573 | await this.chatModule.unload(); 574 | this.chatModule = null; 575 | this.llmInitialized = false; 576 | this.progressCallback({ stage: 'llm_unload_complete', message: 'LLM unloaded.' }); 577 | } 578 | } 579 | } 580 | 581 | // Export new API 582 | export default Extract2MDConverter; 583 | 584 | // Export individual components for advanced usage 585 | export { 586 | Extract2MDConverter, 587 | WebLLMEngine, 588 | OutputParser, 589 | SystemPrompts, 590 | ConfigValidator, 591 | LegacyExtract2MDConverter 592 | }; 593 | 594 | // Export legacy class as default for backwards compatibility 595 | export { LegacyExtract2MDConverter as Extract2MDConverter_Legacy }; -------------------------------------------------------------------------------- /src/converters/Extract2MDConverter.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Extract2MDConverter.js 3 | * Main converter class with scenario-specific methods 4 | */ 5 | 6 | import * as pdfjsLib from 'pdfjs-dist/build/pdf.mjs'; 7 | import Tesseract from 'tesseract.js'; 8 | import WebLLMEngine from '../engines/WebLLMEngine.js'; 9 | import OutputParser from '../utils/OutputParser.js'; 10 | import SystemPrompts from '../utils/SystemPrompts.js'; 11 | import ConfigValidator from '../utils/ConfigValidator.js'; 12 | 13 | export class Extract2MDConverter { 14 | constructor(config = {}) { 15 | // Validate and normalize configuration 16 | this.config = ConfigValidator.validate(config); 17 | 18 | // Initialize components 19 | this.webllmEngine = null; 20 | this.outputParser = new OutputParser(); 21 | 22 | // Setup PDF.js worker 23 | this.setupPdfJsWorker(); 24 | 25 | // Progress callback 26 | this.progressCallback = this.config.progressCallback || ((progress) => {}); 27 | } 28 | 29 | /** 30 | * Setup PDF.js worker 31 | */ 32 | setupPdfJsWorker() { 33 | const pdfjsSetupLib = (typeof pdfjsLib !== 'undefined' ? pdfjsLib : 34 | (typeof window !== 'undefined' ? 
window.pdfjsLib : null)); 35 | 36 | if (pdfjsSetupLib && pdfjsSetupLib.GlobalWorkerOptions) { 37 | pdfjsSetupLib.GlobalWorkerOptions.workerSrc = this.config.pdfJsWorkerSrc; 38 | } else { 39 | console.warn('pdfjsLib or pdfjsLib.GlobalWorkerOptions is not defined. PDF.js worker may not load correctly.'); 40 | } 41 | } 42 | 43 | /** 44 | * Scenario 1: Quick convert only - returns MD output 45 | * @param {File} pdfFile - PDF file to convert 46 | * @param {Object} options - Optional configuration overrides 47 | * @returns {Promise} Markdown output 48 | */ 49 | static async quickConvertOnly(pdfFile, options = {}) { 50 | const converter = new Extract2MDConverter(options); 51 | return await converter._performQuickConvert(pdfFile); 52 | } 53 | 54 | /** 55 | * Scenario 2: High accuracy convert only - returns MD output 56 | * @param {File} pdfFile - PDF file to convert 57 | * @param {Object} options - Optional configuration overrides 58 | * @returns {Promise} Markdown output 59 | */ 60 | static async highAccuracyConvertOnly(pdfFile, options = {}) { 61 | const converter = new Extract2MDConverter(options); 62 | return await converter._performHighAccuracyConvert(pdfFile); 63 | } 64 | 65 | /** 66 | * Scenario 3: Quick convert + LLM rewrite - returns MD output 67 | * @param {File} pdfFile - PDF file to convert 68 | * @param {Object} options - Optional configuration overrides 69 | * @returns {Promise} LLM-rewritten markdown output 70 | */ 71 | static async quickConvertWithLLM(pdfFile, options = {}) { 72 | const converter = new Extract2MDConverter(options); 73 | 74 | try { 75 | // Step 1: Quick extraction 76 | converter.progressCallback({ 77 | stage: 'scenario_3_start', 78 | message: 'Starting quick conversion with LLM rewrite...' 79 | }); 80 | 81 | const extractedText = await converter._performQuickExtraction(pdfFile); 82 | 83 | // Step 2: LLM rewrite 84 | await converter._initializeWebLLM(); 85 | const rewrittenMarkdown = await converter._performLLMRewrite( 86 | extractedText, 87 | 'single', 88 | converter.config.systemPrompts.singleExtraction 89 | ); 90 | 91 | converter.progressCallback({ 92 | stage: 'scenario_3_complete', 93 | message: 'Quick conversion with LLM rewrite completed.' 94 | }); 95 | 96 | return rewrittenMarkdown; 97 | 98 | } finally { 99 | await converter._cleanup(); 100 | } 101 | } 102 | 103 | /** 104 | * Scenario 4: High accuracy convert + LLM rewrite - returns MD output 105 | * @param {File} pdfFile - PDF file to convert 106 | * @param {Object} options - Optional configuration overrides 107 | * @returns {Promise} LLM-rewritten markdown output 108 | */ 109 | static async highAccuracyConvertWithLLM(pdfFile, options = {}) { 110 | const converter = new Extract2MDConverter(options); 111 | 112 | try { 113 | // Step 1: High accuracy extraction 114 | converter.progressCallback({ 115 | stage: 'scenario_4_start', 116 | message: 'Starting high accuracy conversion with LLM rewrite...' 117 | }); 118 | 119 | const extractedText = await converter._performHighAccuracyExtraction(pdfFile); 120 | 121 | // Step 2: LLM rewrite 122 | await converter._initializeWebLLM(); 123 | const rewrittenMarkdown = await converter._performLLMRewrite( 124 | extractedText, 125 | 'single', 126 | converter.config.systemPrompts.singleExtraction 127 | ); 128 | 129 | converter.progressCallback({ 130 | stage: 'scenario_4_complete', 131 | message: 'High accuracy conversion with LLM rewrite completed.' 
132 | }); 133 | 134 | return rewrittenMarkdown; 135 | 136 | } finally { 137 | await converter._cleanup(); 138 | } 139 | } 140 | 141 | /** 142 | * Scenario 5: Combined convert + LLM rewrite - returns comprehensive MD output 143 | * @param {File} pdfFile - PDF file to convert 144 | * @param {Object} options - Optional configuration overrides 145 | * @returns {Promise} Comprehensive LLM-rewritten markdown output 146 | */ 147 | static async combinedConvertWithLLM(pdfFile, options = {}) { 148 | const converter = new Extract2MDConverter(options); 149 | 150 | try { 151 | converter.progressCallback({ 152 | stage: 'scenario_5_start', 153 | message: 'Starting combined conversion with LLM rewrite...' 154 | }); 155 | 156 | // Step 1: Parallel extraction using both methods 157 | const [quickText, ocrText] = await Promise.all([ 158 | converter._performQuickExtraction(pdfFile), 159 | converter._performHighAccuracyExtraction(pdfFile) 160 | ]); 161 | 162 | // Step 2: LLM rewrite with combined context 163 | await converter._initializeWebLLM(); 164 | const rewrittenMarkdown = await converter._performCombinedLLMRewrite( 165 | quickText, 166 | ocrText, 167 | converter.config.systemPrompts.combinedExtraction 168 | ); 169 | 170 | converter.progressCallback({ 171 | stage: 'scenario_5_complete', 172 | message: 'Combined conversion with LLM rewrite completed.' 173 | }); 174 | 175 | return rewrittenMarkdown; 176 | 177 | } finally { 178 | await converter._cleanup(); 179 | } 180 | } 181 | 182 | // Internal methods for extraction and processing 183 | 184 | /** 185 | * Perform quick text extraction using PDF.js 186 | */ 187 | async _performQuickExtraction(pdfFile) { 188 | // Enhanced input validation 189 | if (!(pdfFile instanceof File)) { 190 | throw new Error('Invalid input: pdfFile must be a File object.'); 191 | } 192 | if (pdfFile.size === 0) { 193 | throw new Error('Invalid input: PDF file is empty.'); 194 | } 195 | if (pdfFile.size > 100 * 1024 * 1024) { // 100MB limit 196 | throw new Error('Invalid input: PDF file is too large (max 100MB).'); 197 | } 198 | if (!pdfFile.type || (!pdfFile.type.includes('pdf') && !pdfFile.name.toLowerCase().endsWith('.pdf'))) { 199 | throw new Error('Invalid input: File must be a PDF document.'); 200 | } 201 | 202 | this.progressCallback({ 203 | stage: 'quick_extraction_start', 204 | message: 'Starting quick PDF text extraction...' 205 | }); 206 | 207 | const arrayBuffer = await pdfFile.arrayBuffer(); 208 | const rawText = await this._extractTextWithPdfJs(arrayBuffer); 209 | const cleanedText = this._postProcessText(rawText); 210 | 211 | this.progressCallback({ 212 | stage: 'quick_extraction_complete', 213 | message: 'Quick extraction completed.' 214 | }); 215 | 216 | return cleanedText; 217 | } 218 | 219 | /** 220 | * Perform quick conversion (extraction + markdown formatting) 221 | */ 222 | async _performQuickConvert(pdfFile) { 223 | const extractedText = await this._performQuickExtraction(pdfFile); 224 | 225 | this.progressCallback({ 226 | stage: 'quick_markdown_start', 227 | message: 'Converting to Markdown...' 228 | }); 229 | 230 | const markdown = this._convertToMarkdown(extractedText); 231 | 232 | this.progressCallback({ 233 | stage: 'quick_markdown_complete', 234 | message: 'Quick conversion completed.' 
235 | }); 236 | 237 | return markdown; 238 | } 239 | 240 | /** 241 | * Perform high accuracy text extraction using OCR 242 | */ 243 | async _performHighAccuracyExtraction(pdfFile) { 244 | // Enhanced input validation 245 | if (!(pdfFile instanceof File)) { 246 | throw new Error('Invalid input: pdfFile must be a File object.'); 247 | } 248 | if (pdfFile.size === 0) { 249 | throw new Error('Invalid input: PDF file is empty.'); 250 | } 251 | if (pdfFile.size > 100 * 1024 * 1024) { // 100MB limit 252 | throw new Error('Invalid input: PDF file is too large (max 100MB).'); 253 | } 254 | if (!pdfFile.type || (!pdfFile.type.includes('pdf') && !pdfFile.name.toLowerCase().endsWith('.pdf'))) { 255 | throw new Error('Invalid input: File must be a PDF document.'); 256 | } 257 | 258 | this.progressCallback({ 259 | stage: 'ocr_extraction_start', 260 | message: 'Starting OCR text extraction...' 261 | }); 262 | 263 | const pdfjs = (typeof pdfjsLib !== 'undefined' ? pdfjsLib : 264 | (typeof window !== 'undefined' ? window.pdfjsLib : null)); 265 | if (!pdfjs || !pdfjs.getDocument) { 266 | throw new Error('pdf.js library is not loaded or not fully initialized.'); 267 | } 268 | 269 | const Tess = (typeof Tesseract !== 'undefined' ? Tesseract : 270 | (typeof window !== 'undefined' ? window.Tesseract : null)); 271 | if (!Tess) { 272 | throw new Error('Tesseract.js library is not loaded.'); 273 | } 274 | 275 | let worker; 276 | let workerInitialized = false; 277 | 278 | try { 279 | // Initialize Tesseract worker with enhanced error handling 280 | this.progressCallback({ 281 | stage: 'ocr_worker_init', 282 | message: 'Initializing OCR worker...' 283 | }); 284 | 285 | try { 286 | // Set timeout for worker initialization 287 | const workerPromise = Tess.createWorker( 288 | this.config.tesseract.language, 289 | 1, 290 | { 291 | workerPath: this.config.tesseract.workerPath, 292 | corePath: this.config.tesseract.corePath, 293 | langPath: this.config.tesseract.langPath, 294 | ...this.config.tesseract.options 295 | } 296 | ); 297 | 298 | // Add timeout to prevent hanging 299 | const timeoutPromise = new Promise((_, reject) => { 300 | setTimeout(() => reject(new Error('Worker initialization timed out after 30 seconds')), 30000); 301 | }); 302 | 303 | worker = await Promise.race([workerPromise, timeoutPromise]); 304 | workerInitialized = true; 305 | 306 | this.progressCallback({ 307 | stage: 'ocr_worker_ready', 308 | message: 'OCR worker initialized successfully.' 309 | }); 310 | 311 | } catch (workerError) { 312 | throw new Error(`Failed to initialize Tesseract worker: ${workerError.message}. 
Check if Tesseract.js files are accessible and language data is available.`); 313 | } 314 | 315 | // Process PDF 316 | const arrayBuffer = await pdfFile.arrayBuffer(); 317 | const pdfDoc = await pdfjs.getDocument({ data: arrayBuffer }).promise; 318 | let fullText = ''; 319 | const numPages = pdfDoc.numPages; 320 | 321 | for (let pageNum = 1; pageNum <= numPages; pageNum++) { 322 | this.progressCallback({ 323 | stage: 'ocr_page_process', 324 | message: `Processing page ${pageNum}/${numPages}...`, 325 | currentPage: pageNum, 326 | totalPages: numPages 327 | }); 328 | 329 | const page = await pdfDoc.getPage(pageNum); 330 | const viewport = page.getViewport({ scale: this.config.processing.pdfRenderScale }); 331 | 332 | const canvas = document.createElement('canvas'); 333 | const context = canvas.getContext('2d'); 334 | canvas.height = viewport.height; 335 | canvas.width = viewport.width; 336 | 337 | try { 338 | await page.render({ canvasContext: context, viewport: viewport }).promise; 339 | 340 | // OCR recognition with error handling 341 | const recognition = await worker.recognize(canvas); 342 | const ocrPageText = recognition.data?.text || ''; 343 | fullText += ocrPageText + '\n'; 344 | 345 | } catch (pageError) { 346 | this.progressCallback({ 347 | stage: 'ocr_page_warning', 348 | message: `Warning: Failed to process page ${pageNum}: ${pageError.message}` 349 | }); 350 | console.warn(`OCR processing failed for page ${pageNum}:`, pageError); 351 | // Continue with other pages instead of failing completely 352 | } finally { 353 | // Clean up canvas resources 354 | canvas.width = 0; 355 | canvas.height = 0; 356 | } 357 | } 358 | 359 | // Safely terminate worker 360 | if (workerInitialized && worker) { 361 | try { 362 | this.progressCallback({ 363 | stage: 'ocr_worker_terminate', 364 | message: 'Terminating OCR worker...' 365 | }); 366 | 367 | await Promise.race([ 368 | worker.terminate(), 369 | new Promise((_, reject) => { 370 | setTimeout(() => reject(new Error('Worker termination timed out')), 10000); 371 | }) 372 | ]); 373 | } catch (terminateError) { 374 | console.warn('Warning: Failed to properly terminate Tesseract worker:', terminateError); 375 | // Don't throw error for termination issues 376 | } 377 | } 378 | 379 | const cleanedText = this._postProcessText(fullText); 380 | 381 | this.progressCallback({ 382 | stage: 'ocr_extraction_complete', 383 | message: 'OCR extraction completed.' 384 | }); 385 | 386 | return cleanedText; 387 | 388 | } catch (error) { 389 | // Enhanced cleanup on error 390 | if (workerInitialized && worker) { 391 | try { 392 | await Promise.race([ 393 | worker.terminate(), 394 | new Promise((resolve) => setTimeout(resolve, 5000)) // Give up after 5 seconds 395 | ]); 396 | } catch (cleanupError) { 397 | console.warn('Failed to cleanup worker after error:', cleanupError); 398 | } 399 | } 400 | throw error; 401 | } 402 | } 403 | 404 | /** 405 | * Perform high accuracy conversion (OCR + markdown formatting) 406 | */ 407 | async _performHighAccuracyConvert(pdfFile) { 408 | const extractedText = await this._performHighAccuracyExtraction(pdfFile); 409 | 410 | this.progressCallback({ 411 | stage: 'ocr_markdown_start', 412 | message: 'Converting OCR results to Markdown...' 413 | }); 414 | 415 | const markdown = this._convertToMarkdown(extractedText); 416 | 417 | this.progressCallback({ 418 | stage: 'ocr_markdown_complete', 419 | message: 'High accuracy conversion completed.' 
420 | }); 421 | 422 | return markdown; 423 | } 424 | 425 | /** 426 | * Check WebGPU capability and browser support 427 | * @returns {Promise} WebGPU capability information 428 | */ 429 | static async checkWebGPUCapability() { 430 | const result = { 431 | isSupported: false, 432 | hasShaderF16: false, 433 | error: null, 434 | details: {} 435 | }; 436 | 437 | try { 438 | // Check if WebGPU is available 439 | if (!navigator.gpu) { 440 | result.error = 'WebGPU is not supported in this browser. Please use Chrome 113+ or Edge 113+.'; 441 | return result; 442 | } 443 | 444 | // Request WebGPU adapter 445 | const adapter = await navigator.gpu.requestAdapter(); 446 | if (!adapter) { 447 | result.error = 'No WebGPU adapter found. WebGPU may not be supported on this device.'; 448 | return result; 449 | } 450 | 451 | // Get adapter features 452 | const features = Array.from(adapter.features); 453 | result.hasShaderF16 = features.includes('shader-f16'); 454 | result.isSupported = true; 455 | result.details = { 456 | features, 457 | limits: adapter.limits, 458 | info: adapter.info 459 | }; 460 | 461 | return result; 462 | 463 | } catch (error) { 464 | result.error = `WebGPU capability check failed: ${error.message}`; 465 | return result; 466 | } 467 | } 468 | 469 | /** 470 | * Initialize WebLLM engine with WebGPU capability checks 471 | */ 472 | async _initializeWebLLM() { 473 | // Check WebGPU capability before initializing LLM 474 | this.progressCallback({ 475 | stage: 'webgpu_check_start', 476 | message: 'Checking WebGPU capability for LLM processing...' 477 | }); 478 | 479 | const webgpuCapability = await Extract2MDConverter.checkWebGPUCapability(); 480 | 481 | if (!webgpuCapability.isSupported) { 482 | const errorMessage = `WebGPU capability check failed: ${webgpuCapability.error}`; 483 | this.progressCallback({ 484 | stage: 'webgpu_check_failed', 485 | message: errorMessage, 486 | error: webgpuCapability.error 487 | }); 488 | throw new Error(errorMessage); 489 | } 490 | 491 | this.progressCallback({ 492 | stage: 'webgpu_check_success', 493 | message: `WebGPU is supported. Shader F16: ${webgpuCapability.hasShaderF16 ? 'Yes' : 'No'}` 494 | }); 495 | 496 | // Validate model requirements against WebGPU capabilities 497 | const modelRequiresF16 = this.config.webllm.model && this.config.webllm.model.includes('f16'); 498 | if (modelRequiresF16 && !webgpuCapability.hasShaderF16) { 499 | const warningMessage = `Warning: Model "${this.config.webllm.model}" requires shader-f16 support, but your device doesn't support it. Performance may be reduced.`; 500 | this.progressCallback({ 501 | stage: 'webgpu_compatibility_warning', 502 | message: warningMessage 503 | }); 504 | } 505 | 506 | if (!this.webllmEngine) { 507 | this.webllmEngine = new WebLLMEngine({ 508 | progressCallback: this.progressCallback, 509 | defaultModel: this.config.webllm.model, 510 | customModelConfig: this.config.webllm.customModel 511 | }); 512 | } 513 | 514 | const modelToUse = this.config.webllm.customModel ? 
515 | this.config.webllm.customModel.model_id : 516 | this.config.webllm.model; 517 | 518 | await this.webllmEngine.initialize(modelToUse, this.config.webllm.options); 519 | } 520 | 521 | /** 522 | * Perform LLM rewrite for single extraction 523 | */ 524 | async _performLLMRewrite(extractedText, scenarioType, customPrompt) { 525 | const systemPrompt = SystemPrompts.buildSystemPrompt(scenarioType, customPrompt); 526 | const userPrompt = SystemPrompts.buildUserPrompt(scenarioType, extractedText); 527 | 528 | // For models that support thinking, we could enable it 529 | const fullPrompt = `${systemPrompt}\n\n${userPrompt}`; 530 | 531 | const rawOutput = await this.webllmEngine.generate(fullPrompt, this.config.webllm.options); 532 | const cleanedOutput = this.outputParser.parse(rawOutput); 533 | 534 | return cleanedOutput; 535 | } 536 | 537 | /** 538 | * Perform combined LLM rewrite 539 | */ 540 | async _performCombinedLLMRewrite(quickText, ocrText, customPrompt) { 541 | const systemPrompt = SystemPrompts.buildSystemPrompt('combined', customPrompt); 542 | const userPrompt = SystemPrompts.buildUserPrompt('combined', quickText, ocrText); 543 | 544 | const fullPrompt = `${systemPrompt}\n\n${userPrompt}`; 545 | 546 | const rawOutput = await this.webllmEngine.generate(fullPrompt, this.config.webllm.options); 547 | const cleanedOutput = this.outputParser.parse(rawOutput); 548 | 549 | return cleanedOutput; 550 | } 551 | 552 | /** 553 | * Extract text using PDF.js 554 | */ 555 | async _extractTextWithPdfJs(fileArrayBuffer) { 556 | const pdfjs = (typeof pdfjsLib !== 'undefined' ? pdfjsLib : 557 | (typeof window !== 'undefined' ? window.pdfjsLib : null)); 558 | 559 | if (!pdfjs || !pdfjs.getDocument) { 560 | throw new Error('pdf.js library is not loaded or not fully initialized.'); 561 | } 562 | 563 | this.progressCallback({ 564 | stage: 'pdfjs_load', 565 | message: 'Loading PDF with pdf.js...' 
566 | }); 567 | 568 | const pdfDoc = await pdfjs.getDocument({ data: fileArrayBuffer }).promise; 569 | let fullText = ''; 570 | const numPages = pdfDoc.numPages; 571 | 572 | for (let pageNum = 1; pageNum <= numPages; pageNum++) { 573 | this.progressCallback({ 574 | stage: 'pdfjs_page', 575 | message: `Extracting text from page ${pageNum}/${numPages}...`, 576 | currentPage: pageNum, 577 | totalPages: numPages 578 | }); 579 | 580 | const page = await pdfDoc.getPage(pageNum); 581 | const textContent = await page.getTextContent({ 582 | normalizeWhitespace: false, 583 | disableCombineTextItems: true 584 | }); 585 | 586 | let pageTextBuffer = ''; 587 | if (textContent.items && textContent.items.length > 0) { 588 | for (let i = 0; i < textContent.items.length; i++) { 589 | const item = textContent.items[i]; 590 | pageTextBuffer += item.str; 591 | 592 | if (item.hasEOL) { 593 | if (!pageTextBuffer.endsWith('\n')) pageTextBuffer += '\n'; 594 | } else if (i < textContent.items.length - 1) { 595 | const nextItem = textContent.items[i + 1]; 596 | if (item.str && !item.str.endsWith(' ') && 597 | nextItem.str && !nextItem.str.startsWith(' ') && 598 | Math.abs(item.transform[5] - nextItem.transform[5]) < (item.height * 0.5)) { 599 | 600 | const currentItemEndX = item.transform[4] + item.width; 601 | const nextItemStartX = nextItem.transform[4]; 602 | if (nextItemStartX - currentItemEndX > -0.5) { 603 | pageTextBuffer += ' '; 604 | } 605 | } 606 | } 607 | } 608 | } 609 | 610 | fullText += pageTextBuffer; 611 | if (pageTextBuffer.trim() !== '' && !pageTextBuffer.endsWith('\n')) { 612 | fullText += '\n'; 613 | } 614 | } 615 | 616 | this.progressCallback({ 617 | stage: 'pdfjs_extract_complete', 618 | message: 'PDF.js text extraction complete.' 619 | }); 620 | 621 | return fullText; 622 | } 623 | 624 | /** 625 | * Post-process extracted text with optimized rule application 626 | */ 627 | _postProcessText(text) { 628 | if (!text) return ''; 629 | 630 | let cleanedText = text; 631 | 632 | // Apply default rules 633 | const defaultRules = [ 634 | { find: /\uFB00/g, replace: 'ff' }, 635 | { find: /\uFB01/g, replace: 'fi' }, 636 | { find: /\uFB02/g, replace: 'fl' }, 637 | { find: /\uFB03/g, replace: 'ffi' }, 638 | { find: /\uFB04/g, replace: 'ffl' }, 639 | { find: /[\u2018\u2019]/g, replace: "'" }, 640 | { find: /[\u201C\u201D]/g, replace: '"' }, 641 | { find: /[\u2022\u2023\u25E6\u2043\u2219\u25CF\u25CB\u2981\u2619\u2765]/g, replace: '-' }, 642 | { find: /[\u2013\u2014]/g, replace: '-' }, 643 | { find: /\u00AD/g, replace: '' }, 644 | { find: /[\u00A0\u2000-\u200A\u202F\u205F\u3000]+/g, replace: ' ' } // map non-breaking, thin, and ideographic spaces to a plain space; newlines and multi-space runs are preserved for the line-based markdown pass 645 | ]; 646 | 647 | // Add PascalCase rules if enabled 648 | if (this.config.processing.splitPascalCase) { 649 | defaultRules.push( 650 | { find: /([A-Z][a-z]+)([A-Z][a-z]+)/g, replace: '$1 $2' }, 651 | { find: /([a-z])([A-Z][a-z]+)/g, replace: '$1 $2' } 652 | ); 653 | } 654 | 655 | // Combine all rules for efficient processing 656 | const allRules = [...defaultRules, ...this.config.processing.postProcessRules]; 657 | 658 | // Optimized rule application - batch similar operations 659 | const unicodeReplacements = []; 660 | const regexReplacements = []; 661 | 662 | for (const rule of allRules) { 663 | if (rule.find && typeof rule.replace === 'string') { 664 | if (rule.find instanceof RegExp) { 665 | regexReplacements.push(rule); 666 | } else { 667 | unicodeReplacements.push(rule); 668 | } 669 | } 670 | } 671 | 672 | // Apply unicode replacements first (typically simpler) 673 | for (const rule of
unicodeReplacements) { 674 | cleanedText = cleanedText.replace(rule.find, rule.replace); 675 | } 676 | 677 | // Apply regex replacements 678 | for (const rule of regexReplacements) { 679 | cleanedText = cleanedText.replace(rule.find, rule.replace); 680 | } 681 | 682 | // Final normalization - combine line break handling with newline normalization 683 | return cleanedText.replace(/\r\n/g, '\n').replace(/\n{3,}/g, '\n\n').trim(); 684 | } 685 | 686 | /** 687 | * Convert text to markdown with optimized newline handling 688 | */ 689 | _convertToMarkdown(rawText) { 690 | // Implementation of markdown conversion logic with optimized newline handling 691 | let markdownOutputLines = []; 692 | const inputLines = rawText.split(/\n/); 693 | 694 | let currentParagraphCollector = []; 695 | let inPotentialTableBlock = false; 696 | let potentialTableBlockLines = []; 697 | 698 | const flushCurrentParagraph = () => { 699 | if (currentParagraphCollector.length > 0) { 700 | markdownOutputLines.push(currentParagraphCollector.join(' ').trim()); 701 | currentParagraphCollector = []; 702 | // Only add empty line if the next content isn't a heading or table block 703 | this._addSeparatorLine(markdownOutputLines); 704 | } 705 | }; 706 | 707 | const flushPotentialTableBlock = () => { 708 | if (potentialTableBlockLines.length > 0) { 709 | if (potentialTableBlockLines.length >= 2) { 710 | markdownOutputLines.push('```'); 711 | markdownOutputLines.push(...potentialTableBlockLines.map(l => l.trimEnd())); 712 | markdownOutputLines.push('```'); 713 | } else { 714 | markdownOutputLines.push(potentialTableBlockLines.join(' ').trim()); 715 | } 716 | potentialTableBlockLines = []; 717 | this._addSeparatorLine(markdownOutputLines); 718 | } 719 | inPotentialTableBlock = false; 720 | }; 721 | 722 | for (let i = 0; i < inputLines.length; i++) { 723 | const originalLine = inputLines[i]; 724 | const trimmedLine = originalLine.trim(); 725 | 726 | if (trimmedLine === '') { 727 | if (inPotentialTableBlock) flushPotentialTableBlock(); 728 | flushCurrentParagraph(); 729 | continue; 730 | } 731 | 732 | const isShortLine = trimmedLine.length > 0 && trimmedLine.length < 80; 733 | const noPunctuationEnd = isShortLine && !/[.,;:!?]$/.test(trimmedLine); 734 | const isAllCapsLine = trimmedLine.length > 2 && trimmedLine.length < 80 && 735 | /^[A-Z\s\d\W]*[A-Z][A-Z\s\d\W]*$/.test(trimmedLine) && 736 | /[A-Z]/.test(trimmedLine) && !/^\d+$/.test(trimmedLine); 737 | const nextLineIsBlankOrEndOfFile = (i + 1 === inputLines.length || 738 | inputLines[i + 1].trim() === ''); 739 | 740 | if (isAllCapsLine || (isShortLine && noPunctuationEnd && nextLineIsBlankOrEndOfFile && trimmedLine.length > 1)) { 741 | if (inPotentialTableBlock) flushPotentialTableBlock(); 742 | flushCurrentParagraph(); 743 | markdownOutputLines.push(`# ${trimmedLine}`); 744 | this._addSeparatorLine(markdownOutputLines); 745 | if (nextLineIsBlankOrEndOfFile && inputLines[i + 1] && inputLines[i + 1].trim() === '') { 746 | i++; 747 | } 748 | continue; 749 | } 750 | 751 | const hasMultipleSpacesBetweenWords = /\S\s{2,}\S/.test(originalLine); 752 | const hasMultipleColumnsBySpaces = originalLine.split(/\s{2,}/).length > 2 && originalLine.length > 10; 753 | 754 | if (hasMultipleSpacesBetweenWords || hasMultipleColumnsBySpaces) { 755 | flushCurrentParagraph(); 756 | if (!inPotentialTableBlock) inPotentialTableBlock = true; 757 | potentialTableBlockLines.push(originalLine); 758 | } else { 759 | if (inPotentialTableBlock) flushPotentialTableBlock(); 760 | if (trimmedLine) 
currentParagraphCollector.push(trimmedLine); 761 | } 762 | } 763 | 764 | if (inPotentialTableBlock) flushPotentialTableBlock(); 765 | flushCurrentParagraph(); 766 | 767 | // Optimized final cleanup - single pass to normalize excessive newlines 768 | return this._normalizeMarkdownNewlines(markdownOutputLines); 769 | } 770 | 771 | /** 772 | * Helper method to add separator lines only when needed 773 | */ 774 | _addSeparatorLine(outputLines) { 775 | // Only add empty line if the last line isn't already empty 776 | if (outputLines.length > 0 && outputLines[outputLines.length - 1] !== '') { 777 | outputLines.push(''); 778 | } 779 | } 780 | 781 | /** 782 | * Normalize newlines in the final markdown output 783 | */ 784 | _normalizeMarkdownNewlines(lines) { 785 | // Filter out excessive empty lines while preserving structure 786 | const normalizedLines = []; 787 | let consecutiveEmptyLines = 0; 788 | 789 | for (const line of lines) { 790 | if (line.trim() === '') { 791 | consecutiveEmptyLines++; 792 | // Allow maximum of 1 consecutive empty line 793 | if (consecutiveEmptyLines <= 1) { 794 | normalizedLines.push(''); 795 | } 796 | } else { 797 | consecutiveEmptyLines = 0; 798 | normalizedLines.push(line.trimEnd()); 799 | } 800 | } 801 | 802 | // Join and do final cleanup 803 | let finalMarkdown = normalizedLines.join('\n'); 804 | // Remove any remaining triple+ newlines and trim 805 | finalMarkdown = finalMarkdown.replace(/\n{3,}/g, '\n\n').trim(); 806 | return finalMarkdown; 807 | } 808 | 809 | /** 810 | * Cleanup resources with proper error handling 811 | */ 812 | async _cleanup() { 813 | try { 814 | if (this.webllmEngine) { 815 | this.progressCallback({ 816 | stage: 'cleanup_webllm', 817 | message: 'Cleaning up WebLLM engine...' 818 | }); 819 | await this.webllmEngine.cleanup(); 820 | this.webllmEngine = null; 821 | } 822 | 823 | this.progressCallback({ 824 | stage: 'cleanup_complete', 825 | message: 'Resource cleanup completed successfully.' 826 | }); 827 | } catch (error) { 828 | console.warn('Warning: Error during resource cleanup:', error.message); 829 | this.progressCallback({ 830 | stage: 'cleanup_error', 831 | message: `Resource cleanup warning: ${error.message}`, 832 | error: error 833 | }); 834 | // Don't throw - cleanup errors shouldn't break the application 835 | } 836 | } 837 | } 838 | 839 | export default Extract2MDConverter; 840 | --------------------------------------------------------------------------------