├── .gitignore ├── .coverage ├── settings.json ├── Screenshot.png ├── requirements.txt ├── LICENSE ├── cleanup.sh ├── tests ├── test_epub_processor.py └── test_document_processor.py ├── README.md ├── epub_processor.py ├── templates ├── job_status.html └── index.html ├── document_processor.py └── document_gui.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | venv -------------------------------------------------------------------------------- /.coverage: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidpeterpan/gemini-document-processor/HEAD/.coverage -------------------------------------------------------------------------------- /settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "obsidian_vault_path": "/Users/kidpeterpan/Documents/Pan's Vault" 3 | } -------------------------------------------------------------------------------- /Screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidpeterpan/gemini-document-processor/HEAD/Screenshot.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Core dependencies 2 | Flask==2.3.3 3 | PyMuPDF # For PDF processing (fitz) 4 | ebooklib==0.18.0 # For EPUB processing 5 | beautifulsoup4==4.12.2 # For parsing HTML content in EPUBs 6 | html2text==2020.1.16 # For converting HTML to markdown text 7 | requests==2.31.0 # For API calls to Gemini 8 | pathlib==1.0.1 # For path manipulations 9 | pypdf==5.4.0 10 | google-generativeai 11 | 12 | # Optional dependencies (comment out if not needed) 13 | # PIL or Pillow might be needed for more advanced image processing 14 | Pillow 15 | 16 | # Testing dependencies 17 | pytest 18 | pytest-cov -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 kidpeterpan@github.com 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /cleanup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Script to clean up results, uploads, and log files 4 | # Usage: ./cleanup.sh 5 | 6 | # Set the base directory - default to the current directory 7 | BASE_DIR="$(pwd)" 8 | 9 | # File paths 10 | RESULTS_DIR="$BASE_DIR/results" 11 | UPLOADS_DIR="$BASE_DIR/uploads" 12 | 13 | echo "========================================" 14 | echo "Starting cleanup process..." 15 | echo "========================================" 16 | 17 | # Clean results directory 18 | if [ -d "$RESULTS_DIR" ]; then 19 | echo "Cleaning results directory..." 20 | rm -rf "$RESULTS_DIR"/* 21 | echo "✓ Results directory emptied" 22 | else 23 | echo "! Results directory not found at $RESULTS_DIR" 24 | fi 25 | 26 | # Clean uploads directory 27 | if [ -d "$UPLOADS_DIR" ]; then 28 | echo "Cleaning uploads directory..." 29 | rm -rf "$UPLOADS_DIR"/* 30 | echo "✓ Uploads directory emptied" 31 | else 32 | echo "! Uploads directory not found at $UPLOADS_DIR" 33 | fi 34 | 35 | # Remove log files 36 | echo "Removing log files..." 37 | find "$BASE_DIR" -name "*.log" -type f -delete 38 | echo "✓ Log files removed" 39 | 40 | # Done 41 | echo "========================================" 42 | echo "Cleanup completed!" 43 | echo "========================================" -------------------------------------------------------------------------------- /tests/test_epub_processor.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 4 | import pytest 5 | from epub_processor import SimpleEpubProcessor 6 | 7 | class DummyBook: 8 | def get_metadata(self, *args, **kwargs): 9 | return {'title': [('Test Book',)]} 10 | def get_items(self): 11 | class DummyItem: 12 | def get_type(self): return 9 # ebooklib.ITEM_DOCUMENT 13 | def get_content(self): return b'

<html><body><h1>Chapter</h1><p>Content</p></body></html>
' 14 | return [DummyItem()] 15 | 16 | def test_process_epub(monkeypatch, tmp_path): 17 | epub_path = tmp_path / "test.epub" 18 | with open(epub_path, "wb") as f: 19 | f.write(b"dummy epub content") 20 | monkeypatch.setattr('ebooklib.epub.read_epub', lambda path: DummyBook()) 21 | processor = SimpleEpubProcessor(extract_images=True) 22 | chapters, images_by_chapter, name, meta = processor.process_epub(str(epub_path), image_output_dir=str(tmp_path)) 23 | assert isinstance(chapters, dict) 24 | assert isinstance(images_by_chapter, dict) 25 | assert name == "test" 26 | assert meta['title'] == 'Test Book' 27 | 28 | def test_process_epub_missing_file(tmp_path): 29 | processor = SimpleEpubProcessor() 30 | chapters, images_by_chapter, name, meta = processor.process_epub(str(tmp_path / "no_file.epub")) 31 | assert "Error" in chapters or "Failed" in list(chapters.values())[0] 32 | 33 | def test_process_epub_no_chapters(monkeypatch, tmp_path): 34 | class DummyBook: 35 | def get_metadata(self, *args, **kwargs): 36 | return {'title': [('Test Book',)]} 37 | def get_items(self): 38 | return [] 39 | epub_path = tmp_path / "test.epub" 40 | with open(epub_path, "wb") as f: 41 | f.write(b"dummy epub content") 42 | monkeypatch.setattr('ebooklib.epub.read_epub', lambda path: DummyBook()) 43 | processor = SimpleEpubProcessor() 44 | chapters, images_by_chapter, name, meta = processor.process_epub(str(epub_path), image_output_dir=str(tmp_path)) 45 | assert "No Content" in chapters 46 | 47 | def test_has_content(): 48 | processor = SimpleEpubProcessor() 49 | assert not processor._has_content("") 50 | assert not processor._has_content("\n\t ") 51 | assert processor._has_content("a" * 101) 52 | 53 | def test_extract_images_basic(tmp_path): 54 | processor = SimpleEpubProcessor() 55 | class DummyBook: pass 56 | class DummyItem: pass 57 | from bs4 import BeautifulSoup 58 | soup = BeautifulSoup('A', 'html.parser') 59 | images = processor._extract_images_basic(DummyBook(), DummyItem(), soup, str(tmp_path), "book", 1) 60 | assert len(images) == 2 61 | assert images[0]['filename'].endswith('.png') 62 | assert images[0]['alt'] == 'A' 63 | 64 | def test_extract_images_basic_malformed_html(tmp_path): 65 | processor = SimpleEpubProcessor() 66 | class DummyBook: pass 67 | class DummyItem: pass 68 | from bs4 import BeautifulSoup 69 | soup = BeautifulSoup('
', 'html.parser') 70 | images = processor._extract_images_basic(DummyBook(), DummyItem(), soup, str(tmp_path), "book", 1) 71 | assert isinstance(images, list) 72 | 73 | def test_has_content_special_chars(): 74 | processor = SimpleEpubProcessor() 75 | assert not processor._has_content("!@#$%^&*()_+-=\n\t ") 76 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Gemini Document Processor 2 | 3 | A powerful document processing tool that uses Google's Gemini AI to generate high-quality Thai language summaries from PDF and EPUB files, with image extraction and Obsidian integration. 4 | 5 | ![Screenshot.png](Screenshot.png) 6 | 7 | ## Features 8 | 9 | ### Core Functionality 10 | - **AI-Powered Summarization**: Uses Google's latest Gemini models (gemini-2.0-flash, gemini-2.5-flash-preview, gemini-1.5-pro) 11 | - **Multiple Document Formats**: Processes both PDF and EPUB files 12 | - **Thai-Focused Summaries**: Optimized for creating comprehensive Thai language summaries 13 | 14 | ### Advanced Processing 15 | - **Smart Chunking**: Processes documents in manageable chunks for better AI performance 16 | - **Image Extraction**: Extracts and filters images from documents with size thresholds 17 | - **Robust Error Handling**: Includes intelligent retry mechanisms with model fallbacks 18 | - **Timeout Management**: Configurable timeouts for both API calls and chunk processing 19 | 20 | ### User Experience 21 | - **Web Interface**: Clean, tabbed web application for document processing 22 | - **Real-time Progress Tracking**: Live updates during processing 23 | - **Job Status Monitoring**: Track failed chunks and retry problematic sections 24 | - **Parallel Processing**: Multi-threaded image extraction for improved performance 25 | 26 | ### Obsidian Integration 27 | - **Direct Export**: Create markdown files directly in your Obsidian vault 28 | - **Metadata Support**: Includes YAML frontmatter with tags and other metadata 29 | - **Customizable Tags**: Define your own Obsidian tags for processed documents 30 | 31 | ## Installation 32 | 33 | 1. Clone this repository: 34 | ```bash 35 | git clone https://github.com/kidpeterpan/gemini-document-processor.git 36 | cd gemini-document-processor 37 | ``` 38 | 39 | 2. Install the required dependencies: 40 | ```bash 41 | pip install -r requirements.txt 42 | ``` 43 | 44 | 3. Get a Google Gemini API key from [Google AI Studio](https://aistudio.google.com/) 45 | 46 | ## Usage 47 | 48 | ### Starting the Web Interface 49 | 50 | Run the web server: 51 | 52 | ```bash 53 | python document_gui.py 54 | ``` 55 | 56 | Then open your web browser and navigate to: http://127.0.0.1:8081/ 57 | 58 | ### Web Interface Features 59 | 60 | The interface is organized into three tabs: 61 | 62 | 1. **Basic Settings**: 63 | - Upload PDF or EPUB files 64 | - Select Gemini model: 65 | - gemini-2.0-flash (Faster) 66 | - gemini-2.5-flash-preview (More accurate) 67 | - gemini-1.5-pro (Backup option) 68 | - Adjust chunk size (pages per processing unit) 69 | - Enter your Gemini API key 70 | - Toggle image extraction 71 | 72 | 2. **Obsidian Integration**: 73 | - Enable automatic export to Obsidian 74 | - Verify and set Obsidian vault path 75 | - Configure tags, author, cover URL, and review ratings 76 | - Automatic path validation 77 | 78 | 3. 
**Advanced Settings**: 79 | - Configure timeout settings: 80 | - Chunk processing timeout (60-1800 seconds) 81 | - API request timeout (30-300 seconds) 82 | - Set retry attempts for API calls 83 | - Configure image size thresholds 84 | - Select image format (PNG/JPG) 85 | - Adjust worker thread count (1-16) 86 | 87 | ### Job Status and Monitoring 88 | 89 | - **Real-time Progress**: View detailed progress during processing 90 | - **Log Viewer**: See all processing events as they happen 91 | - **Failed Chunks**: Identify and retry problematic sections 92 | - **Result Management**: Download or view generated summaries 93 | - **Obsidian Export**: Track files exported to your Obsidian vault 94 | 95 | ## How It Works 96 | 97 | 1. **Document Loading**: The application loads PDF or EPUB files and extracts text content 98 | 2. **Chunking**: Content is divided into manageable chunks (by page for PDFs, by chapter for EPUBs) 99 | 3. **Image Extraction**: Images are extracted with size filtering and saved separately 100 | 4. **AI Processing**: Each chunk is sent to Gemini API with timeout handling and retries 101 | 5. **Error Recovery**: Failed chunks are tracked and can be retried with more robust settings 102 | 6. **Summary Creation**: Results are compiled into a well-formatted Markdown document 103 | 7. **Integration**: Summary and images are saved locally and (optionally) to Obsidian 104 | 105 | ## Troubleshooting 106 | 107 | ### Common Issues 108 | 109 | - **API Errors**: Check your API key and internet connection 110 | - **Processing Timeouts**: Increase the chunk and API timeout values in Advanced Settings 111 | - **Failed Chunks**: Use the "Retry Failed Chunks" button on the job status page 112 | - **Obsidian Integration**: Ensure your Obsidian vault path is correct and contains a .obsidian folder 113 | 114 | ### Error Logs 115 | 116 | For detailed error information, check the application logs in your terminal or command prompt. 117 | 118 | ## Project Structure 119 | 120 | - `document_gui.py` - Web interface and job management 121 | - `document_processor.py` - Core processing logic for documents 122 | - `epub_processor.py` - EPUB-specific processing functionality 123 | - `templates/` - HTML templates for web interface 124 | - `uploads/` - Temporary storage for uploaded files and processing results 125 | 126 | ## License 127 | 128 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. 
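
## Programmatic Use

The web UI is a thin wrapper around `GeminiDocumentProcessor` in `document_processor.py`, so the same pipeline can also be scripted. The example below is an illustrative sketch only: the API key and input file name are placeholders, and the constructor arguments simply mirror the defaults defined in the class.

```bash
# nothing to install beyond requirements.txt
```

```python
from document_processor import GeminiDocumentProcessor

# Placeholder key and file path -- replace with your own values.
processor = GeminiDocumentProcessor(
    api_key="YOUR_GEMINI_API_KEY",
    model_name="gemini-2.0-flash",
    chunk_size=7,            # pages per chunk for PDFs
    extract_images=True,
    request_timeout=60,      # seconds allowed per API call
)

# Optional progress reporting.
processor.progress_callback = lambda current, total: print(f"chunk {current}/{total}")

# Returns (summaries, images_by_chunk, doc_name, doc_type, metadata).
summaries, images, name, doc_type, meta = processor.process_document("book.pdf")

# Retry any chunks that failed, then write the combined Markdown summary.
if processor.failed_chunks:
    summaries = processor.retry_failed_chunks("book.pdf", summaries)

output_path = processor.save_summaries(summaries, images, f"{name}_summary.md",
                                       name, doc_type, meta)
print(f"Summary written to {output_path}")
```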
129 | 130 | ## Credits 131 | 132 | This project uses the following technologies: 133 | - [Google Generative AI API](https://ai.google.dev/) 134 | - [Flask](https://flask.palletsprojects.com/) 135 | - [PyPDF](https://pypdf.readthedocs.io/en/latest/) 136 | - [ebooklib](https://github.com/aerkalov/ebooklib) 137 | - [Bootstrap](https://getbootstrap.com/) for the web interface 138 | -------------------------------------------------------------------------------- /tests/test_document_processor.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 4 | 5 | import pytest 6 | from document_processor import GeminiDocumentProcessor, ChunkTimeoutError 7 | 8 | class DummyModel: 9 | def generate_content(self, prompt, generation_config=None): 10 | class Response: 11 | text = "Summary" 12 | return Response() 13 | 14 | def test_initialize_api(monkeypatch): 15 | monkeypatch.setattr('google.generativeai.GenerativeModel', lambda name: DummyModel()) 16 | monkeypatch.setattr('google.generativeai.configure', lambda api_key: None) 17 | processor = GeminiDocumentProcessor(api_key='dummy', model_name='gemini-2.0-flash') 18 | assert processor.model_name == 'gemini-2.0-flash' 19 | 20 | def test_get_total_pages(tmp_path, monkeypatch): 21 | # Create a dummy PDF file 22 | pdf_path = tmp_path / "test.pdf" 23 | with open(pdf_path, "wb") as f: 24 | f.write(b"%PDF-1.4\n1 0 obj\n<< /Type /Catalog >>\nendobj\ntrailer\n<< /Root 1 0 R >>\n%%EOF") 25 | monkeypatch.setattr('pypdf.PdfReader', lambda f: type('R', (), {'pages': [1, 2, 3]})()) 26 | monkeypatch.setattr('google.generativeai.GenerativeModel', lambda name: DummyModel()) 27 | monkeypatch.setattr('google.generativeai.configure', lambda api_key: None) 28 | processor = GeminiDocumentProcessor(api_key='dummy') 29 | assert processor.get_total_pages(str(pdf_path)) == 3 30 | 31 | def test_get_total_pages_nonexistent(): 32 | processor = GeminiDocumentProcessor(api_key='dummy') 33 | assert processor.get_total_pages('nonexistent.pdf') == 0 34 | 35 | def test_extract_metadata_pdf(monkeypatch, tmp_path): 36 | pdf_path = tmp_path / "test.pdf" 37 | with open(pdf_path, "wb") as f: 38 | f.write(b"%PDF-1.4\n1 0 obj\n<< /Type /Catalog >>\nendobj\ntrailer\n<< /Root 1 0 R >>\n%%EOF") 39 | monkeypatch.setattr('pypdf.PdfReader', lambda f: type('R', (), {'metadata': {'/Title': 'Test'}})()) 40 | monkeypatch.setattr('google.generativeai.GenerativeModel', lambda name: DummyModel()) 41 | monkeypatch.setattr('google.generativeai.configure', lambda api_key: None) 42 | processor = GeminiDocumentProcessor(api_key='dummy') 43 | meta = processor._extract_pdf_metadata(str(pdf_path)) 44 | assert meta['title'] == 'Test' 45 | 46 | def test_extract_text_from_pdf_pages(monkeypatch, tmp_path): 47 | pdf_path = tmp_path / "test.pdf" 48 | with open(pdf_path, "wb") as f: 49 | f.write(b"%PDF-1.4\n1 0 obj\n<< /Type /Catalog >>\nendobj\ntrailer\n<< /Root 1 0 R >>\n%%EOF") 50 | class DummyPage: 51 | def extract_text(self): 52 | return "Hello" 53 | class DummyReader: 54 | pages = [DummyPage(), DummyPage()] 55 | monkeypatch.setattr('pypdf.PdfReader', lambda f: DummyReader()) 56 | monkeypatch.setattr('google.generativeai.GenerativeModel', lambda name: DummyModel()) 57 | monkeypatch.setattr('google.generativeai.configure', lambda api_key: None) 58 | processor = GeminiDocumentProcessor(api_key='dummy') 59 | text = processor._extract_text_from_pdf_pages(str(pdf_path), 1, 2) 60 | 
assert "Hello" in text 61 | 62 | def test_extract_text_from_pdf_pages_corrupted(monkeypatch, tmp_path): 63 | pdf_path = tmp_path / "corrupt.pdf" 64 | with open(pdf_path, "wb") as f: 65 | f.write(b"not a real pdf") 66 | monkeypatch.setattr('google.generativeai.GenerativeModel', lambda name: DummyModel()) 67 | monkeypatch.setattr('google.generativeai.configure', lambda api_key: None) 68 | processor = GeminiDocumentProcessor(api_key='dummy') 69 | with pytest.raises(Exception): 70 | processor._extract_text_from_pdf_pages(str(pdf_path), 1, 1) 71 | 72 | def test_summarize_text_with_timeout(monkeypatch): 73 | monkeypatch.setattr('google.generativeai.GenerativeModel', lambda name: DummyModel()) 74 | monkeypatch.setattr('google.generativeai.configure', lambda api_key: None) 75 | processor = GeminiDocumentProcessor(api_key='dummy') 76 | summary = processor._summarize_text_with_timeout("test", 1, 1, 1) 77 | assert summary == "Summary" 78 | 79 | def test_save_summaries(tmp_path, monkeypatch): 80 | monkeypatch.setattr('google.generativeai.GenerativeModel', lambda name: DummyModel()) 81 | monkeypatch.setattr('google.generativeai.configure', lambda api_key: None) 82 | processor = GeminiDocumentProcessor(api_key='dummy') 83 | output = tmp_path / "out.md" 84 | summaries = {"Chunk 1": "Summary text"} 85 | images = {"Chunk 1": [{"path": "img.png", "alt": "img"}]} 86 | meta = {"title": "Test"} 87 | path = processor.save_summaries(summaries, images, str(output), "Test", "pdf", meta) 88 | assert os.path.exists(path) 89 | 90 | def test_process_chunk(monkeypatch, tmp_path): 91 | pdf_path = tmp_path / "test.pdf" 92 | with open(pdf_path, "wb") as f: 93 | f.write(b"%PDF-1.4\n1 0 obj\n<< /Type /Catalog >>\nendobj\ntrailer\n<< /Root 1 0 R >>\n%%EOF") 94 | class DummyPage: 95 | def extract_text(self): 96 | return "Hello" 97 | class DummyReader: 98 | pages = [DummyPage(), DummyPage()] 99 | monkeypatch.setattr('pypdf.PdfReader', lambda f: DummyReader()) 100 | monkeypatch.setattr('google.generativeai.GenerativeModel', lambda name: DummyModel()) 101 | monkeypatch.setattr('google.generativeai.configure', lambda api_key: None) 102 | processor = GeminiDocumentProcessor(api_key='dummy') 103 | summary, images = processor.process_chunk(str(pdf_path), 1, 2) 104 | assert isinstance(summary, str) 105 | assert isinstance(images, list) 106 | 107 | def test_process_chunk_no_images(monkeypatch, tmp_path): 108 | pdf_path = tmp_path / "test.pdf" 109 | with open(pdf_path, "wb") as f: 110 | f.write(b"%PDF-1.4\n1 0 obj\n<< /Type /Catalog >>\nendobj\ntrailer\n<< /Root 1 0 R >>\n%%EOF") 111 | class DummyPage: 112 | def extract_text(self): 113 | return "Hello" 114 | class DummyReader: 115 | pages = [DummyPage(), DummyPage()] 116 | monkeypatch.setattr('pypdf.PdfReader', lambda f: DummyReader()) 117 | monkeypatch.setattr('google.generativeai.GenerativeModel', lambda name: DummyModel()) 118 | monkeypatch.setattr('google.generativeai.configure', lambda api_key: None) 119 | processor = GeminiDocumentProcessor(api_key='dummy', extract_images=False) 120 | summary, images = processor.process_chunk(str(pdf_path), 1, 2) 121 | assert isinstance(summary, str) 122 | assert images == [] 123 | 124 | def test_retry_failed_chunks_no_failed(monkeypatch): 125 | monkeypatch.setattr('google.generativeai.GenerativeModel', lambda name: DummyModel()) 126 | monkeypatch.setattr('google.generativeai.configure', lambda api_key: None) 127 | processor = GeminiDocumentProcessor(api_key='dummy') 128 | summaries = {"Chunk 1": "Summary"} 129 | updated = 
processor.retry_failed_chunks('dummy.pdf', summaries) 130 | assert updated == summaries 131 | -------------------------------------------------------------------------------- /epub_processor.py: -------------------------------------------------------------------------------- 1 | import ebooklib 2 | from ebooklib import epub 3 | from bs4 import BeautifulSoup 4 | import os 5 | import logging 6 | import re 7 | import html2text 8 | 9 | # Set up logging 10 | logger = logging.getLogger("GeminiEbookProcessor") 11 | 12 | 13 | class SimpleEpubProcessor: 14 | """ 15 | A simplified EPUB processor that doesn't require PIL for image processing. 16 | """ 17 | 18 | def __init__(self, extract_images=True, img_format="png"): 19 | self.extract_images = extract_images 20 | self.img_format = img_format.lower() 21 | self.html_converter = html2text.HTML2Text() 22 | self.html_converter.ignore_links = False 23 | self.html_converter.ignore_images = False 24 | self.html_converter.ignore_tables = False 25 | self.html_converter.body_width = 0 # No wrapping 26 | 27 | def process_epub(self, epub_path, image_output_dir=None): 28 | """ 29 | Process an EPUB file and extract text and images. 30 | 31 | Args: 32 | epub_path (str): Path to the EPUB file. 33 | image_output_dir (str, optional): Directory to save extracted images. 34 | If None, a temp directory will be created. 35 | 36 | Returns: 37 | tuple: (chapters, images_by_chapter, epub_name_without_ext, metadata) 38 | chapters: Dictionary of chapter number to text content 39 | images_by_chapter: Dictionary of chapter number to list of image information 40 | epub_name_without_ext: Name of the EPUB file without extension 41 | metadata: Dictionary of metadata extracted from the EPUB 42 | """ 43 | # Always set epub_name_without_ext early for error handling 44 | epub_filename = os.path.basename(epub_path) 45 | epub_name_without_ext = os.path.splitext(epub_filename)[0] 46 | try: 47 | if not os.path.exists(epub_path): 48 | raise FileNotFoundError(f"EPUB file not found: {epub_path}") 49 | 50 | # Create image output directory if extracting images 51 | if self.extract_images and image_output_dir: 52 | os.makedirs(image_output_dir, exist_ok=True) 53 | logger.info(f"Images will be saved to: {image_output_dir}") 54 | 55 | # Read the EPUB file 56 | book = epub.read_epub(epub_path) 57 | 58 | # Extract metadata 59 | metadata = {} 60 | try: 61 | for key, value in book.get_metadata('DC', 'http://purl.org/dc/elements/1.1/').items(): 62 | if value: 63 | # Strip namespace from key 64 | key = key.split("}")[-1] 65 | metadata[key] = value[0][0] 66 | except Exception as e: 67 | logger.error(f"Error extracting metadata: {str(e)}") 68 | metadata = {"title": epub_name_without_ext} 69 | 70 | # Get chapters and images 71 | chapters = {} # Always initialize as a dictionary 72 | images_by_chapter = {} 73 | chapter_index = 1 74 | 75 | try: 76 | for item in book.get_items(): 77 | if item.get_type() == ebooklib.ITEM_DOCUMENT: 78 | try: 79 | # Extract text from HTML content 80 | content = item.get_content().decode('utf-8') 81 | soup = BeautifulSoup(content, 'html.parser') 82 | 83 | # Get text content 84 | text_content = self.html_converter.handle(content) 85 | 86 | # Skip if there's no meaningful content 87 | if not self._has_content(text_content): 88 | continue 89 | 90 | # Add chapter 91 | chapter_key = f"Chapter {chapter_index}" 92 | chapters[chapter_key] = text_content 93 | 94 | # Basic image extraction that doesn't depend on PIL 95 | if self.extract_images and image_output_dir: 96 | chapter_images = 
self._extract_images_basic( 97 | book, item, soup, image_output_dir, 98 | epub_name_without_ext, chapter_index 99 | ) 100 | 101 | if chapter_images: 102 | images_by_chapter[chapter_key] = chapter_images 103 | logger.info(f"Found {len(chapter_images)} image references in {chapter_key}") 104 | 105 | chapter_index += 1 106 | except Exception as e: 107 | logger.error(f"Error processing chapter {chapter_index}: {str(e)}") 108 | # Continue with next chapter 109 | chapter_index += 1 110 | except Exception as e: 111 | logger.error(f"Error processing chapters: {str(e)}") 112 | # If we hit an error in the chapter processing loop, add an error chapter 113 | chapters = {"Error": f"Failed to process chapters: {str(e)}"} 114 | 115 | # If no chapters were found, add a placeholder 116 | if not chapters: 117 | chapters = {"No Content": "No readable content found in the EPUB file."} 118 | 119 | return chapters, images_by_chapter, epub_name_without_ext, metadata 120 | 121 | except Exception as e: 122 | logger.error(f"Error processing EPUB file: {str(e)}") 123 | # Return minimal data to not break the pipeline - ensure everything is a dictionary 124 | return {"Error": f"Failed to process EPUB: {str(e)}"}, {}, epub_name_without_ext, { 125 | "title": epub_name_without_ext} 126 | 127 | def _has_content(self, text): 128 | """Check if there's substantial content in the text.""" 129 | # Remove whitespace, special characters, and common EPUB navigation elements 130 | if not text: 131 | return False 132 | clean_text = re.sub(r'\s+|[^\w]', '', text) 133 | return len(clean_text) > 100 # Arbitrary threshold 134 | 135 | def _extract_images_basic(self, book, item, soup, output_dir, epub_name, chapter_index): 136 | """Extract basic image references without using PIL.""" 137 | extracted_images = [] 138 | img_elements = soup.find_all('img') 139 | 140 | for img_index, img in enumerate(img_elements): 141 | try: 142 | # Get image source 143 | src = img.get('src') 144 | if not src: 145 | continue 146 | 147 | # Just record the image reference without saving it 148 | # This way we don't depend on PIL but still have image info 149 | img_filename = f"{epub_name}_chapter{chapter_index:03d}_img{img_index + 1:03d}.{self.img_format}" 150 | img_path = os.path.join(output_dir, img_filename) 151 | 152 | # Store image information 153 | extracted_images.append({ 154 | 'filename': img_filename, 155 | 'path': img_path, 156 | 'chapter': chapter_index, 157 | 'alt': img.get('alt', ''), 158 | 'src': src 159 | }) 160 | 161 | except Exception as e: 162 | logger.error(f"Error extracting image {img_index} reference from chapter {chapter_index}: {str(e)}") 163 | continue 164 | 165 | return extracted_images -------------------------------------------------------------------------------- /templates/job_status.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Job Status - Gemini Document Processor 8 | 9 | 34 | 35 | 36 |
[templates/job_status.html — only the page text survives here: the "Job Status - Gemini Document Processor" page shows a "Document Processing Status" header, a "Job Status" card with a "Back to Home" link, and panels for Status ("Loading..."), Progress (progress bar, "0%", "Processing chunk 0 of 0"), Current Task ("Initializing..."), Log, and Results; the page's markup and inline script are not recoverable here.]
-------------------------------------------------------------------------------- /templates/index.html: --------------------------------------------------------------------------------
[templates/index.html — only the form text survives here. The "Gemini Document Processor" upload page is organized into Basic Settings, Obsidian Integration, and Advanced Options sections, with these field notes:
- File upload: "You can upload either a PDF or EPUB file for processing"
- Chunk size: "For PDFs: Number of pages to process in each API call. For EPUBs: Controls how much content is processed at once."
- Obsidian vault path: "Path to your Obsidian vault. The summarized document will be saved to a 'books' folder in this location."
- Author: "If not specified, will use author from document metadata if available"
- Review rating: Example: "20/33" or "5/5"
- Retry attempts: "Maximum number of retry attempts for API calls"
- Chunk timeout: "Maximum time allowed for processing a single chunk before it's considered failed (5-30 minutes)"
- API timeout: "Maximum time allowed for a single API call (30-300 seconds)"
- Minimum image width/height: "Images smaller than this width/height will be ignored"
- Worker threads: "Number of parallel threads for image extraction"
The page's markup and inline script are not recoverable here.]
181 | 182 | 183 | 264 | 265 | 266 | -------------------------------------------------------------------------------- /document_processor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import time 5 | import logging 6 | import tempfile 7 | import threading 8 | from concurrent.futures import ThreadPoolExecutor 9 | from typing import Dict, List, Tuple, Any, Optional 10 | 11 | import pypdf 12 | import google.generativeai as genai 13 | from PIL import Image 14 | 15 | # Import the epub processor 16 | from epub_processor import SimpleEpubProcessor 17 | 18 | # Set up logging 19 | logging.basicConfig( 20 | level=logging.INFO, 21 | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' 22 | ) 23 | logger = logging.getLogger("GeminiDocumentProcessor") 24 | 25 | 26 | class ChunkTimeoutError(Exception): 27 | """Exception raised when a chunk processing times out.""" 28 | pass 29 | 30 | 31 | class GeminiDocumentProcessor: 32 | """ 33 | Enhanced processor for PDF and EPUB documents using Gemini API with improved 34 | timeout handling and chunk-level processing. 35 | """ 36 | 37 | def __init__( 38 | self, 39 | api_key: str = None, 40 | model_name: str = "gemini-2.0-flash", 41 | language: str = "thai", 42 | chunk_size: int = 7, 43 | max_retries: int = 3, 44 | retry_delay: int = 5, 45 | extract_images: bool = True, 46 | min_img_width: int = 100, 47 | min_img_height: int = 100, 48 | img_format: str = "png", 49 | max_workers: int = 4, 50 | request_timeout: int = 60 51 | ): 52 | self.api_key = api_key 53 | self.model_name = model_name 54 | self.language = language 55 | self.chunk_size = chunk_size 56 | self.max_retries = max_retries 57 | self.retry_delay = retry_delay 58 | self.extract_images = extract_images 59 | self.min_img_width = min_img_width 60 | self.min_img_height = min_img_height 61 | self.img_format = img_format.lower() 62 | self.max_workers = max_workers 63 | self.request_timeout = request_timeout 64 | self.failed_chunks = [] 65 | self.progress_callback = None # Added progress callback 66 | # Do NOT initialize Gemini API here 67 | self.model = None 68 | 69 | def _initialize_api(self): 70 | """Initialize the Gemini API client.""" 71 | if self.model is not None: 72 | return # Already initialized 73 | try: 74 | if self.api_key: 75 | logger.info("Using provided API key") 76 | genai.configure(api_key=self.api_key) 77 | self.model = genai.GenerativeModel(self.model_name) 78 | # Test the API connection 79 | response = self.model.generate_content("Hello, this is a test.") 80 | if response: 81 | logger.info(f"Successfully connected to Gemini API using model {self.model_name}") 82 | except Exception as e: 83 | logger.error(f"Error initializing Gemini API: {str(e)}") 84 | raise RuntimeError(f"Failed to initialize Gemini API: {str(e)}") 85 | 86 | def get_total_pages(self, pdf_path: str) -> int: 87 | """Get the total number of pages in a PDF document.""" 88 | try: 89 | with open(pdf_path, 'rb') as file: 90 | pdf_reader = pypdf.PdfReader(file) 91 | return len(pdf_reader.pages) 92 | except Exception as e: 93 | logger.error(f"Error counting PDF pages: {e}") 94 | return 0 95 | 96 | def process_document(self, file_path: str) -> Tuple[Dict[str, str], Dict[str, List[Dict]], str, str, Dict]: 97 | """ 98 | Process a document (PDF or EPUB) and return summaries and images. 99 | 100 | This is a wrapper around the chunk-level processing functions that ensures 101 | all chunks are processed with proper error handling. 
102 | 103 | Args: 104 | file_path: Path to the document 105 | 106 | Returns: 107 | Tuple of: 108 | - Dictionary of chunk/chapter summaries 109 | - Dictionary of images by chunk/chapter 110 | - Document name 111 | - Document type ("pdf" or "epub") 112 | - Document metadata 113 | """ 114 | # Determine document type 115 | file_extension = os.path.splitext(file_path)[1].lower() 116 | doc_type = "epub" if file_extension == ".epub" else "pdf" 117 | doc_name = os.path.splitext(os.path.basename(file_path))[0] 118 | 119 | # Reset failed chunks list 120 | self.failed_chunks = [] 121 | 122 | # Create images directory if extracting images 123 | if self.extract_images: 124 | images_dir = os.path.join(os.path.dirname(file_path), f"{doc_name}_images") 125 | os.makedirs(images_dir, exist_ok=True) 126 | logger.info(f"Images will be saved to: {images_dir}") 127 | 128 | # Process based on document type 129 | if doc_type == "pdf": 130 | return self._process_pdf(file_path) 131 | else: 132 | return self._process_epub(file_path) 133 | 134 | def _process_pdf(self, pdf_path: str) -> Tuple[Dict[str, str], Dict[str, List[Dict]], str, str, Dict]: 135 | """ 136 | Process a PDF document chunk by chunk with immediate retry on failure. 137 | 138 | Args: 139 | pdf_path: Path to the PDF file 140 | 141 | Returns: 142 | Tuple containing: 143 | - Dictionary of summaries by chunk 144 | - Dictionary of images by chunk 145 | - Document name without extension 146 | - Document type ("pdf") 147 | - Document metadata 148 | """ 149 | # Get document name 150 | pdf_name = os.path.splitext(os.path.basename(pdf_path))[0] 151 | 152 | # Count total pages 153 | total_pages = self.get_total_pages(pdf_path) 154 | logger.info(f"Total pages in PDF: {total_pages}") 155 | 156 | # Calculate total chunks 157 | total_chunks = (total_pages + self.chunk_size - 1) // self.chunk_size 158 | 159 | # Extract metadata 160 | metadata = self.extract_metadata(pdf_path) 161 | 162 | # Process each chunk 163 | summaries = {} 164 | images_by_chunk = {} 165 | 166 | # Process chunks sequentially to avoid overwhelming the API 167 | for chunk_num in range(1, total_chunks + 1): 168 | page_start = (chunk_num - 1) * self.chunk_size + 1 169 | page_end = min(chunk_num * self.chunk_size, total_pages) 170 | 171 | # Update progress via callback if it exists 172 | if self.progress_callback: 173 | self.progress_callback(chunk_num, total_chunks) 174 | 175 | # Immediate retry loop for each chunk 176 | max_immediate_retries = 2 # Try up to 2 additional times (total 3 attempts) 177 | immediate_retry_count = 0 178 | chunk_success = False 179 | last_error = None 180 | 181 | while immediate_retry_count <= max_immediate_retries and not chunk_success: 182 | try: 183 | if immediate_retry_count > 0: 184 | # Log that we're retrying this chunk immediately 185 | logger.info( 186 | f"Immediately retrying chunk {chunk_num} (attempt {immediate_retry_count + 1}/{max_immediate_retries + 1})...") 187 | 188 | # Adjust model for retry - use a more capable model on retry 189 | if immediate_retry_count == 1 and self.model_name == "gemini-2.0-flash": 190 | original_model = self.model_name 191 | self.model_name = "gemini-2.5-flash-preview-04-17" 192 | logger.info(f" Switching from {original_model} to {self.model_name} for retry") 193 | elif immediate_retry_count == 2: 194 | # On second retry, increase timeout 195 | original_timeout = self.request_timeout 196 | self.request_timeout = original_timeout * 1.5 197 | logger.info(f" Increasing timeout to {self.request_timeout} seconds for final retry") 198 | 
else: 199 | # First attempt 200 | logger.info(f"Summarizing chunk {chunk_num} (pages {page_start}-{page_end})...") 201 | 202 | # Extract text and images from this chunk 203 | chunk_text = self._extract_text_from_pdf_pages(pdf_path, page_start, page_end) 204 | chunk_images = [] 205 | 206 | if self.extract_images: 207 | chunk_images = self._extract_images_from_pdf_pages( 208 | pdf_path, page_start, page_end, 209 | os.path.join(os.path.dirname(pdf_path), f"{pdf_name}_images"), 210 | chunk_num 211 | ) 212 | 213 | # Summarize the chunk text 214 | chunk_summary = self._summarize_text_with_timeout( 215 | chunk_text, chunk_num, page_start, page_end, 216 | doc_type="pdf", doc_filename=pdf_name 217 | ) 218 | 219 | # Save temporary summary 220 | temp_file = f"temp_{pdf_name}_chunk_{chunk_num}.md" 221 | with open(temp_file, "w", encoding="utf-8") as f: 222 | f.write(chunk_summary) 223 | logger.info(f" Saved temporary summary to {temp_file}") 224 | 225 | # Store results 226 | summaries[f"Chunk {chunk_num}"] = chunk_summary 227 | 228 | if chunk_images: 229 | images_by_chunk[f"Chunk {chunk_num}"] = chunk_images 230 | logger.info(f" Associated {len(chunk_images)} images with chunk {chunk_num}") 231 | 232 | # If we get here, the chunk was processed successfully 233 | chunk_success = True 234 | 235 | # Reset any modified parameters 236 | if immediate_retry_count > 0: 237 | if immediate_retry_count == 1 and 'original_model' in locals(): 238 | logger.info(f" Resetting model back to {original_model}") 239 | self.model_name = original_model 240 | if immediate_retry_count == 2 and 'original_timeout' in locals(): 241 | logger.info(f" Resetting timeout back to {original_timeout}") 242 | self.request_timeout = original_timeout 243 | 244 | except Exception as e: 245 | last_error = e 246 | logger.error(f"Error processing chunk {chunk_num} (pages {page_start}-{page_end}): {str(e)}") 247 | immediate_retry_count += 1 248 | 249 | # Reset any modified parameters before next retry or moving on 250 | if 'original_model' in locals() and self.model_name != original_model: 251 | logger.info(f" Resetting model back to {original_model}") 252 | self.model_name = original_model 253 | if 'original_timeout' in locals() and self.request_timeout != original_timeout: 254 | logger.info(f" Resetting timeout back to {original_timeout}") 255 | 256 | # After all retry attempts, if still not successful, add to failed chunks 257 | if not chunk_success: 258 | self.failed_chunks.append({ 259 | "chunk_number": chunk_num, 260 | "page_start": page_start, 261 | "page_end": page_end, 262 | "error": str(last_error) 263 | }) 264 | 265 | # Create a placeholder summary for the failed chunk 266 | summaries[ 267 | f"Chunk {chunk_num}"] = f"**Error processing pages {page_start}-{page_end} after {max_immediate_retries + 1} attempts:** {str(last_error)}" 268 | 269 | # Final progress update 270 | if self.progress_callback: 271 | self.progress_callback(total_chunks, total_chunks) 272 | 273 | return summaries, images_by_chunk, pdf_name, "pdf", metadata 274 | 275 | def _extract_text_from_pdf_pages(self, pdf_path: str, start_page: int, end_page: int) -> str: 276 | """ 277 | Extract text from a range of PDF pages. 
278 | 279 | Args: 280 | pdf_path: Path to the PDF file 281 | start_page: Start page number (1-based index) 282 | end_page: End page number (inclusive) 283 | 284 | Returns: 285 | Extracted text from the specified pages 286 | """ 287 | try: 288 | with open(pdf_path, 'rb') as file: 289 | pdf_reader = pypdf.PdfReader(file) 290 | 291 | # Adjust for 0-based indexing 292 | start_idx = start_page - 1 293 | end_idx = min(end_page, len(pdf_reader.pages)) - 1 294 | 295 | text_parts = [] 296 | for i in range(start_idx, end_idx + 1): 297 | try: 298 | page = pdf_reader.pages[i] 299 | page_text = page.extract_text() or "" 300 | text_parts.append(f"--- Page {i + 1} ---\n{page_text}") 301 | except Exception as e: 302 | logger.error(f"Error extracting text from page {i + 1}: {str(e)}") 303 | text_parts.append(f"--- Page {i + 1} ---\n[Error extracting text: {str(e)}]") 304 | 305 | return "\n\n".join(text_parts) 306 | 307 | except Exception as e: 308 | logger.error(f"Error opening PDF file: {str(e)}") 309 | raise 310 | 311 | def _extract_images_from_pdf_pages( 312 | self, pdf_path: str, start_page: int, end_page: int, 313 | output_dir: str, chunk_num: int = None) -> List[Dict]: 314 | """ 315 | Extract images from a range of PDF pages using PyPDF. 316 | 317 | Args: 318 | pdf_path: Path to the PDF file 319 | start_page: Start page number (1-based index) 320 | end_page: End page number (inclusive) 321 | output_dir: Directory to save extracted images 322 | chunk_num: Current chunk number (for logging) 323 | 324 | Returns: 325 | List of dictionaries with image information 326 | """ 327 | # Store information about extracted images 328 | extracted_images = [] 329 | 330 | try: 331 | # Make sure output directory exists 332 | os.makedirs(output_dir, exist_ok=True) 333 | 334 | # Let's try with PyPDF first - it's more widely available 335 | # For each page in range, extract images if any 336 | with open(pdf_path, "rb") as pdf_file: 337 | pdf = pypdf.PdfReader(pdf_file) 338 | 339 | # Adjust for 0-based indexing 340 | start_idx = start_page - 1 341 | end_idx = min(end_page, len(pdf.pages)) - 1 342 | 343 | # Process each page in range 344 | for page_idx in range(start_idx, end_idx + 1): 345 | page_num = page_idx + 1 # 1-based page number for display 346 | page = pdf.pages[page_idx] 347 | 348 | # Try to get images from the page (if any) 349 | try: 350 | # Extract image data through page.images property 351 | if hasattr(page, "images") and page.images: 352 | images_extracted = 0 353 | 354 | for i, image in enumerate(page.images): 355 | try: 356 | # Skip small images 357 | if hasattr(image, "width") and hasattr(image, "height"): 358 | if image.width < self.min_img_width or image.height < self.min_img_height: 359 | continue 360 | 361 | # Create a unique filename for this image 362 | img_filename = f"page{page_num:03d}_img{i + 1:03d}.{self.img_format}" 363 | img_path = os.path.join(output_dir, img_filename) 364 | 365 | # Try to save the image 366 | with open(img_path, "wb") as img_file: 367 | img_file.write(image.data) 368 | 369 | # Check if image was saved successfully 370 | if os.path.exists(img_path) and os.path.getsize(img_path) > 0: 371 | # Store image information 372 | extracted_images.append({ 373 | "filename": img_filename, 374 | "path": img_path, 375 | "page": page_num, 376 | "width": getattr(image, "width", 0), 377 | "height": getattr(image, "height", 0), 378 | "alt": f"Image {i + 1} from page {page_num}" 379 | }) 380 | images_extracted += 1 381 | 382 | except Exception as e: 383 | logger.error(f"Error extracting image 
{i + 1} from page {page_num}: {str(e)}") 384 | 385 | if images_extracted > 0: 386 | logger.info(f"Extracted {images_extracted} images from page {page_num}") 387 | 388 | except Exception as e: 389 | logger.error(f"Error accessing images on page {page_num}: {str(e)}") 390 | continue 391 | 392 | except Exception as e: 393 | logger.error(f"Error in image extraction process: {str(e)}") 394 | 395 | # Return information about extracted images 396 | return extracted_images 397 | 398 | def _summarize_text_with_timeout( 399 | self, text: str, chunk_num: int, page_start: int, page_end: int, 400 | doc_type: str = "pdf", doc_filename: str = "") -> str: 401 | """ 402 | Summarize text using Gemini API with timeout handling. 403 | 404 | Args: 405 | text: Text to summarize 406 | chunk_num: Chunk number for error reporting 407 | page_start: Start page of this chunk 408 | page_end: End page of this chunk 409 | doc_type: Document type ('pdf' or 'epub') 410 | doc_filename: Name of the document file 411 | 412 | Returns: 413 | Summarized text 414 | 415 | Raises: 416 | ChunkTimeoutError: If the API call times out 417 | """ 418 | start_time = time.time() 419 | max_time = self.request_timeout # seconds 420 | 421 | # Ensure Gemini API is initialized before summarization 422 | self._initialize_api() 423 | 424 | # Prepare the original Thai prompt 425 | page_or_chapter = "หน้า" if doc_type == 'pdf' else "บท" 426 | prompt = ( 427 | "คุณคือ expert ด้าน summarizer analyzing\n\n" 428 | f"ช่วยสรุปเนื้อหา**จาก{doc_type}** ({doc_filename}) **เป็น ภาษาไทย** โดย:\n" 429 | "1. ใช้หัวข้อเดิมตามไฟล์ได้เลย (ไม่ต้องแปลหัวข้อ)\n" 430 | "2. ไม่อยากให้ตกหล่นแม้แต่เรื่องเดียว (ขอแบบละเอียดจนไม่ต้องกลับไปอ่านต้นฉบับเลย)\n" 431 | f"3. บอกด้วยว่ากำลังสรุป{page_or_chapter}ไหนของไฟล์ เช่น , , , (use markdown comment)\n" 432 | "4. ไม่จำเป็นต้องกระชับ และรักษาความถูกต้องของข้อมูลสำคัญ เนื้อหาสำคัญไม่ตกหล่น\n" 433 | "5. ไม่ต้องแปล technical terminology จากภาษาอังกฤษให้เป็นภาษาไทย\n" 434 | "6. ถ้าใน file มีตัวอย่าง code ก็ใส่มาให้ด้วย\n" 435 | "7. 
output ใน format ที่ดีที่สุด\n\n" 436 | f"เนื้อหาต่อไปนี้มาจาก{page_or_chapter} {page_start} ถึง{page_or_chapter} {page_end}:\n\n{text}" 437 | ) 438 | 439 | # Define a timeout function using a separate thread 440 | def call_with_timeout(func, *args, **kwargs): 441 | result = [None] 442 | error = [None] 443 | 444 | def target(): 445 | try: 446 | result[0] = func(*args, **kwargs) 447 | except Exception as e: 448 | error[0] = e 449 | 450 | thread = threading.Thread(target=target) 451 | thread.daemon = True 452 | thread.start() 453 | thread.join(max_time) 454 | 455 | if thread.is_alive(): 456 | # Thread is still running after timeout 457 | raise ChunkTimeoutError(f"API call timed out after {max_time} seconds") 458 | 459 | if error[0]: 460 | raise error[0] 461 | 462 | return result[0] 463 | 464 | # Try multiple times with backoff 465 | for attempt in range(1, self.max_retries + 1): 466 | try: 467 | # Check if we've already exceeded our timeout 468 | elapsed = time.time() - start_time 469 | if elapsed > max_time: 470 | raise ChunkTimeoutError(f"Processing timeout after {elapsed:.1f} seconds") 471 | 472 | # Set up the generation config (without timeout parameter) 473 | generation_config = { 474 | "temperature": 0.1, 475 | "top_p": 0.95, 476 | "top_k": 50, 477 | "max_output_tokens": 65000, 478 | "response_mime_type": "text/plain" 479 | } 480 | 481 | # Make the API call with our own timeout handling 482 | # Using only valid roles (user) 483 | response = call_with_timeout( 484 | self.model.generate_content, 485 | prompt, # Send the full prompt as a user message 486 | generation_config=generation_config 487 | ) 488 | 489 | # Successfully got a response 490 | return response.text 491 | 492 | except ChunkTimeoutError: 493 | # Re-raise timeout errors directly 494 | raise 495 | 496 | except Exception as e: 497 | error_msg = str(e) 498 | logger.warning(f"Attempt {attempt} failed for chunk {chunk_num}: {error_msg}") 499 | 500 | if attempt < self.max_retries: 501 | # Wait before retrying with exponential backoff 502 | wait_time = self.retry_delay * (2 ** (attempt - 1)) 503 | logger.info(f"Waiting {wait_time} seconds before retry...") 504 | time.sleep(wait_time) 505 | else: 506 | # Failed all attempts 507 | logger.error(f"All {self.max_retries} attempts failed for chunk {chunk_num}") 508 | raise RuntimeError(f"Failed to summarize after {self.max_retries} attempts: {error_msg}") 509 | 510 | def _process_epub(self, epub_path: str) -> Tuple[Dict[str, str], Dict[str, List[Dict]], str, str, Dict]: 511 | """ 512 | Process an EPUB document chapter by chapter. 
513 | 514 | Args: 515 | epub_path: Path to the EPUB file 516 | 517 | Returns: 518 | Tuple containing: 519 | - Dictionary of summaries by chapter 520 | - Dictionary of images by chapter 521 | - Document name without extension 522 | - Document type ("epub") 523 | - Document metadata 524 | """ 525 | # Create EPUB processor 526 | epub_processor = SimpleEpubProcessor(extract_images=self.extract_images, img_format=self.img_format) 527 | 528 | # Extract chapters and metadata 529 | chapters, images_by_chapter, epub_name, metadata = epub_processor.process_epub( 530 | epub_path, 531 | image_output_dir=os.path.join(os.path.dirname(epub_path), 532 | f"{os.path.splitext(os.path.basename(epub_path))[0]}_images") 533 | ) 534 | 535 | # Process each chapter with timeout handling 536 | summaries = {} 537 | processed_images = {} 538 | 539 | # Calculate total chapters 540 | total_chapters = len(chapters) 541 | 542 | # Initialize progress 543 | chapter_count = 0 544 | 545 | for chapter_key, chapter_text in chapters.items(): 546 | chapter_count += 1 547 | 548 | # Update progress via callback if it exists 549 | if self.progress_callback: 550 | self.progress_callback(chapter_count, total_chapters) 551 | 552 | try: 553 | logger.info(f"Summarizing {chapter_key}...") 554 | 555 | # Extract chapter number if possible 556 | chapter_num = chapter_key.replace("Chapter ", "") 557 | 558 | # Summarize this chapter with timeout handling 559 | chapter_summary = self._summarize_text_with_timeout( 560 | chapter_text, 561 | chapter_num, 562 | chapter_num, chapter_num, # Using chapter number for both start and end 563 | doc_type="epub", 564 | doc_filename=epub_name 565 | ) 566 | 567 | # Save temporary summary 568 | temp_file = f"temp_{epub_name}_{chapter_key.lower().replace(' ', '_')}.md" 569 | with open(temp_file, "w", encoding="utf-8") as f: 570 | f.write(chapter_summary) 571 | logger.info(f" Saved temporary summary to {temp_file}") 572 | 573 | # Store results 574 | summaries[chapter_key] = chapter_summary 575 | 576 | # Store images for this chapter if any 577 | if chapter_key in images_by_chapter: 578 | processed_images[chapter_key] = images_by_chapter[chapter_key] 579 | logger.info(f" Associated {len(images_by_chapter[chapter_key])} images with {chapter_key}") 580 | 581 | except Exception as e: 582 | logger.error(f"Error processing {chapter_key}: {str(e)}") 583 | 584 | # Record the failure 585 | self.failed_chunks.append({ 586 | "chapter": chapter_key, 587 | "error": str(e) 588 | }) 589 | 590 | # Create a placeholder summary for the failed chapter 591 | summaries[chapter_key] = f"**Error processing {chapter_key}:** {str(e)}" 592 | 593 | # Continue with the next chapter 594 | 595 | # Final progress update 596 | if self.progress_callback: 597 | self.progress_callback(total_chapters, total_chapters) 598 | 599 | return summaries, processed_images, epub_name, "epub", metadata 600 | def extract_metadata(self, file_path: str) -> Dict[str, str]: 601 | """ 602 | Extract metadata from a document. 
603 | 604 | Args: 605 | file_path: Path to the document 606 | 607 | Returns: 608 | Dictionary of metadata 609 | """ 610 | file_extension = os.path.splitext(file_path)[1].lower() 611 | 612 | if file_extension == ".pdf": 613 | return self._extract_pdf_metadata(file_path) 614 | elif file_extension == ".epub": 615 | return self._extract_epub_metadata(file_path) 616 | else: 617 | return {"title": os.path.splitext(os.path.basename(file_path))[0]} 618 | 619 | def _extract_pdf_metadata(self, pdf_path: str) -> Dict[str, str]: 620 | """Extract metadata from a PDF file.""" 621 | try: 622 | with open(pdf_path, 'rb') as file: 623 | pdf_reader = pypdf.PdfReader(file) 624 | metadata = {} 625 | 626 | if pdf_reader.metadata: 627 | for key in pdf_reader.metadata: 628 | # Clean up the key name 629 | clean_key = key.lower().replace('/', '').strip() 630 | if pdf_reader.metadata[key]: 631 | metadata[clean_key] = str(pdf_reader.metadata[key]) 632 | 633 | # If no title is found, use the filename 634 | if 'title' not in metadata or not metadata['title']: 635 | metadata['title'] = os.path.splitext(os.path.basename(pdf_path))[0] 636 | 637 | return metadata 638 | 639 | except Exception as e: 640 | logger.error(f"Error extracting PDF metadata: {str(e)}") 641 | return {"title": os.path.splitext(os.path.basename(pdf_path))[0]} 642 | 643 | def _extract_epub_metadata(self, epub_path: str) -> Dict[str, str]: 644 | """Extract metadata from an EPUB file.""" 645 | # This would ideally call a method from your epub_processor 646 | # For now, returning basic info 647 | return {"title": os.path.splitext(os.path.basename(epub_path))[0]} 648 | 649 | def save_summaries( 650 | self, 651 | summaries: Dict[str, str], 652 | images_by_chunk: Dict[str, List[Dict]], 653 | output_path: str, 654 | doc_name: str, 655 | doc_type: str, 656 | metadata: Dict, 657 | obsidian_metadata: Optional[Dict] = None 658 | ) -> str: 659 | """ 660 | Save summaries to a Markdown file. 
661 | 662 | Args: 663 | summaries: Dictionary of summaries by chunk/chapter 664 | images_by_chunk: Dictionary of images by chunk/chapter 665 | output_path: Path to save the output Markdown file 666 | doc_name: Document name 667 | doc_type: Document type ("pdf" or "epub") 668 | metadata: Document metadata 669 | obsidian_metadata: Optional metadata for Obsidian 670 | 671 | Returns: 672 | Path to the saved file 673 | """ 674 | try: 675 | with open(output_path, "w", encoding="utf-8") as f: 676 | # Add YAML frontmatter for Obsidian if provided 677 | if obsidian_metadata: 678 | f.write("---\n") 679 | 680 | # Add tags 681 | if 'tags' in obsidian_metadata and obsidian_metadata['tags']: 682 | tags = obsidian_metadata['tags'].split(',') 683 | f.write("tags:\n") 684 | for tag in tags: 685 | f.write(f" - {tag.strip()}\n") 686 | 687 | # Add other metadata fields 688 | for key, value in obsidian_metadata.items(): 689 | if key != 'tags' and value: # Skip tags as we handled them above 690 | f.write(f"{key}: {value}\n") 691 | 692 | f.write("---\n\n") 693 | 694 | # Add title and metadata 695 | title = metadata.get('title', doc_name) 696 | f.write(f"# {title}\n\n") 697 | 698 | if 'author' in metadata and metadata['author']: 699 | f.write(f"**Author:** {metadata['author']}\n\n") 700 | 701 | # Add other metadata 702 | f.write("## Document Information\n\n") 703 | f.write(f"- **Type:** {doc_type.upper()}\n") 704 | 705 | for key, value in metadata.items(): 706 | if key not in ['title', 'author'] and value: 707 | key_display = key.replace('_', ' ').title() 708 | f.write(f"- **{key_display}:** {value}\n") 709 | 710 | f.write("\n") 711 | 712 | # Add table of contents 713 | f.write("## Table of Contents\n\n") 714 | 715 | for idx, chunk_key in enumerate(summaries.keys()): 716 | f.write(f"{idx + 1}. [{chunk_key}](#{chunk_key.lower().replace(' ', '-')})\n") 717 | 718 | f.write("\n") 719 | 720 | # Add summaries 721 | f.write("## Summary\n\n") 722 | 723 | for chunk_key, summary in summaries.items(): 724 | # Add chunk header 725 | f.write(f"### {chunk_key}\n\n") 726 | 727 | # Add summary content 728 | f.write(f"{summary}\n\n") 729 | 730 | # Add images for this chunk if any 731 | if chunk_key in images_by_chunk and images_by_chunk[chunk_key]: 732 | f.write("#### Images\n\n") 733 | 734 | for img_info in images_by_chunk[chunk_key]: 735 | img_path = img_info.get('path', '') 736 | alt_text = img_info.get('alt', 'Image') or f"Image from {chunk_key}" 737 | 738 | # Create relative path for embedding 739 | rel_path = os.path.relpath( 740 | img_path, 741 | os.path.dirname(output_path) 742 | ) 743 | 744 | # Add image with markdown 745 | f.write(f"![{alt_text}]({rel_path})\n\n") 746 | 747 | # Add footer with generation info 748 | f.write("---\n") 749 | f.write(f"*Summary generated using Gemini {self.model_name}*\n") 750 | 751 | return output_path 752 | 753 | except Exception as e: 754 | logger.error(f"Error saving summaries: {str(e)}") 755 | raise 756 | 757 | def process_chunk(self, file_path: str, page_start: int, page_end: int, timeout: int = 60) -> Tuple[ 758 | str, List[Dict]]: 759 | """ 760 | Process a single chunk of a document with timeout. 
761 | 762 | Args: 763 | file_path: Path to the document 764 | page_start: Start page number (1-based index) 765 | page_end: End page number (inclusive) 766 | timeout: Timeout in seconds 767 | 768 | Returns: 769 | Tuple containing: 770 | - Summarized text 771 | - List of image information dictionaries 772 | """ 773 | # Save the current timeout setting 774 | original_timeout = self.request_timeout 775 | 776 | try: 777 | # Update timeout for this call 778 | self.request_timeout = timeout 779 | 780 | # Get document info 781 | doc_type = "pdf" if file_path.lower().endswith('.pdf') else "epub" 782 | doc_filename = os.path.basename(file_path) 783 | 784 | # Extract text from pages 785 | text = self._extract_text_from_pdf_pages(file_path, page_start, page_end) 786 | 787 | # Summarize text with timeout 788 | summary = self._summarize_text_with_timeout( 789 | text, 790 | 0, # Not using chunk number here 791 | page_start, 792 | page_end, 793 | doc_type=doc_type, 794 | doc_filename=doc_filename 795 | ) 796 | 797 | # Extract images if enabled 798 | images = [] 799 | if self.extract_images: 800 | doc_name = os.path.splitext(os.path.basename(file_path))[0] 801 | images_dir = os.path.join(os.path.dirname(file_path), f"{doc_name}_images") 802 | 803 | images = self._extract_images_from_pdf_pages( 804 | file_path, page_start, page_end, images_dir 805 | ) 806 | 807 | return summary, images 808 | 809 | finally: 810 | # Restore original timeout 811 | self.request_timeout = original_timeout 812 | 813 | def retry_failed_chunks(self, file_path: str, existing_summaries: Dict[str, str]) -> Dict[str, str]: 814 | """ 815 | Retry processing previously failed chunks. 816 | 817 | Args: 818 | file_path: Path to the document 819 | existing_summaries: Dictionary of existing summaries 820 | 821 | Returns: 822 | Updated dictionary of summaries 823 | """ 824 | # Copy existing summaries 825 | updated_summaries = existing_summaries.copy() 826 | 827 | # Nothing to do if no failed chunks 828 | if not self.failed_chunks: 829 | return updated_summaries 830 | 831 | logger.info(f"Retrying {len(self.failed_chunks)} failed chunks with increased timeout") 832 | 833 | # Save original settings 834 | original_timeout = self.request_timeout 835 | original_retries = self.max_retries 836 | 837 | try: 838 | # Use more aggressive settings for retries 839 | self.request_timeout = original_timeout * 2 840 | self.max_retries = original_retries + 2 841 | 842 | # Process each failed chunk 843 | for failed_chunk in self.failed_chunks: 844 | chunk_num = failed_chunk.get("chunk_number") 845 | if not chunk_num: 846 | continue 847 | 848 | page_start = failed_chunk.get("page_start", 0) 849 | page_end = failed_chunk.get("page_end", 0) 850 | 851 | if page_start <= 0 or page_end <= 0: 852 | continue 853 | 854 | try: 855 | logger.info(f"Retrying chunk {chunk_num} (pages {page_start}-{page_end})...") 856 | 857 | # Process this chunk 858 | summary, _ = self.process_chunk(file_path, page_start, page_end) 859 | 860 | # Update the summaries 861 | chunk_key = f"Chunk {chunk_num}" 862 | updated_summaries[chunk_key] = summary 863 | 864 | # Remove from failed chunks 865 | logger.info(f"Successfully reprocessed chunk {chunk_num}") 866 | 867 | except Exception as e: 868 | logger.error(f"Failed to reprocess chunk {chunk_num}: {str(e)}") 869 | # Keep the existing summary or error message 870 | 871 | return updated_summaries 872 | 873 | finally: 874 | # Restore original settings 875 | self.request_timeout = original_timeout 876 | self.max_retries = original_retries 
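The module above records failures in `failed_chunks` and exposes `process_chunk` for single-chunk reprocessing, and a caller such as the web layer that follows can combine the two. A minimal, illustrative sketch; the helper name `retry_individually`, the file path, and the doubled timeout are assumptions, not part of the module:

```python
from document_processor import GeminiDocumentProcessor

def retry_individually(processor: GeminiDocumentProcessor, pdf_path: str,
                       summaries: dict) -> dict:
    """Re-run each recorded failed PDF chunk one at a time with a doubled timeout."""
    for failed in list(processor.failed_chunks):
        start, end = failed.get("page_start"), failed.get("page_end")
        if not start or not end:
            continue  # EPUB failures record a chapter key instead of page numbers
        try:
            summary, _images = processor.process_chunk(
                pdf_path, start, end, timeout=processor.request_timeout * 2
            )
            summaries[f"Chunk {failed['chunk_number']}"] = summary
        except Exception as exc:
            # Keep the existing placeholder error summary if the chunk still fails.
            print(f"Chunk {failed['chunk_number']} still failing: {exc}")
    return summaries
```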
-------------------------------------------------------------------------------- /document_gui.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import threading 5 | import logging 6 | import shutil 7 | import time 8 | import uuid 9 | import signal 10 | from pathlib import Path 11 | from threading import Timer 12 | from flask import Flask, render_template, request, redirect, url_for, jsonify, send_from_directory 13 | 14 | # Import the enhanced document processor 15 | from document_processor import GeminiDocumentProcessor, ChunkTimeoutError 16 | 17 | # Set up logging 18 | logging.basicConfig( 19 | level=logging.INFO, 20 | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', 21 | handlers=[ 22 | logging.FileHandler("document_processor_web.log"), 23 | logging.StreamHandler() 24 | ] 25 | ) 26 | logger = logging.getLogger("GeminiDocumentProcessorWeb") 27 | 28 | app = Flask(__name__) 29 | 30 | # Store processing jobs 31 | jobs = {} 32 | uploads_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'uploads') 33 | results_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'results') 34 | obsidian_dir = None # Will be set from settings or form 35 | 36 | # Create directories if they don't exist 37 | os.makedirs(uploads_dir, exist_ok=True) 38 | os.makedirs(results_dir, exist_ok=True) 39 | 40 | # Try to load obsidian_dir from settings 41 | try: 42 | settings_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'settings.json') 43 | if os.path.exists(settings_file): 44 | with open(settings_file, 'r') as f: 45 | settings = json.load(f) 46 | obsidian_dir = settings.get('obsidian_vault_path', None) 47 | if obsidian_dir and os.path.exists(obsidian_dir): 48 | logger.info(f"Loaded Obsidian vault path from settings: {obsidian_dir}") 49 | else: 50 | obsidian_dir = None 51 | logger.warning("Obsidian vault path not found or invalid in settings") 52 | except Exception as e: 53 | logger.error(f"Error loading settings: {e}") 54 | 55 | 56 | class ProcessingJob: 57 | def __init__(self, job_id, file_path, output_path, settings): 58 | self.job_id = job_id 59 | self.file_path = file_path 60 | self.output_path = output_path 61 | self.settings = settings 62 | self.status = "pending" 63 | self.progress = 0 64 | self.message = "Waiting to start..." 
65 | self.log = [] 66 | self.result_files = [] 67 | self.error = None 68 | self.thread = None 69 | self.document_type = None 70 | self.document_metadata = None 71 | self.current_chunk = None 72 | self.total_chunks = None 73 | self.chunk_start_time = None 74 | self.chunk_timer = None 75 | self.processing_paused = False 76 | self.failed_chunks = [] 77 | self.retry_for_job_id = None # For retry jobs, reference to original job 78 | 79 | def add_log(self, message): 80 | timestamp = time.strftime('%H:%M:%S') 81 | log_entry = f"[{timestamp}] {message}" 82 | self.log.append(log_entry) 83 | self.message = message 84 | logger.info(f"Job {self.job_id}: {message}") 85 | 86 | def start_chunk_timer(self, chunk_number, timeout_seconds=300): # 5-minute default timeout 87 | """Start a timer for the current chunk being processed.""" 88 | self.current_chunk = chunk_number 89 | self.chunk_start_time = time.time() 90 | 91 | # Cancel any existing timer 92 | if self.chunk_timer: 93 | self.chunk_timer.cancel() 94 | 95 | # Set a new timer 96 | self.chunk_timer = Timer(timeout_seconds, self.handle_chunk_timeout) 97 | self.chunk_timer.daemon = True 98 | self.chunk_timer.start() 99 | 100 | def handle_chunk_timeout(self): 101 | """Handle a chunk processing timeout by logging and adding to failed chunks.""" 102 | elapsed = time.time() - self.chunk_start_time 103 | self.add_log(f"⚠️ WARNING: Chunk {self.current_chunk} processing timed out after {elapsed:.1f} seconds") 104 | 105 | # Add to failed chunks list 106 | if self.current_chunk is not None: 107 | self.failed_chunks.append({ 108 | "chunk_number": self.current_chunk, 109 | "reason": "timeout" 110 | }) 111 | 112 | # Set the job as paused so we can try to recover 113 | self.processing_paused = True 114 | 115 | def clear_chunk_timer(self): 116 | """Clear the chunk timer after successful completion.""" 117 | if self.chunk_timer: 118 | self.chunk_timer.cancel() 119 | self.chunk_timer = None 120 | self.chunk_start_time = None 121 | 122 | def update_progress(self): 123 | """Update progress percentage based on chunks processed.""" 124 | if self.total_chunks and self.current_chunk is not None: 125 | # Ensure current_chunk is not greater than total_chunks 126 | current = min(self.current_chunk, self.total_chunks) 127 | # Calculate progress as percentage of chunks completed 128 | self.progress = min(round((current / self.total_chunks) * 100), 99) 129 | # Only set to 100% when fully complete 130 | if self.status == "completed": 131 | self.progress = 100 132 | 133 | # Log progress updates for debugging 134 | logger.info( 135 | f"Job {self.job_id}: Progress updated to {self.progress}% (chunk {current}/{self.total_chunks})") 136 | 137 | def to_dict(self): 138 | return { 139 | "job_id": self.job_id, 140 | "status": self.status, 141 | "progress": self.progress, 142 | "message": self.message, 143 | "log": self.log[-15:] if len(self.log) > 15 else self.log, # Return last 15 log entries 144 | "result_files": self.result_files, 145 | "error": self.error, 146 | "document_type": self.document_type, 147 | "document_metadata": self.document_metadata, 148 | "current_chunk": self.current_chunk, 149 | "total_chunks": self.total_chunks, 150 | "failed_chunks": len(self.failed_chunks) if self.failed_chunks else 0 151 | } 152 | 153 | 154 | def capture_output(job): 155 | """Redirect stdout to capture logs for a job""" 156 | 157 | class OutputCapture: 158 | def __init__(self, job): 159 | self.job = job 160 | self.original_stdout = sys.stdout 161 | self.buffer = "" 162 | 163 | def write(self, 
message): 164 | self.original_stdout.write(message) 165 | 166 | # Add to buffer 167 | self.buffer += message 168 | 169 | # Check if we have a complete line 170 | if "\n" in self.buffer: 171 | lines = self.buffer.split("\n") 172 | # Keep the last incomplete line in the buffer 173 | self.buffer = lines[-1] 174 | 175 | # Log complete lines 176 | for line in lines[:-1]: 177 | if line.strip(): # Skip empty lines 178 | self.job.add_log(line.strip()) 179 | 180 | def flush(self): 181 | self.original_stdout.flush() 182 | 183 | # Flush any remaining content in the buffer 184 | if self.buffer.strip(): 185 | self.job.add_log(self.buffer.strip()) 186 | self.buffer = "" 187 | 188 | return OutputCapture(job) 189 | 190 | 191 | def save_to_obsidian(file_path, obsidian_dir, job, settings=None): 192 | """ 193 | Save a file to the Obsidian vault. 194 | 195 | Args: 196 | file_path (str): Path to the source file 197 | obsidian_dir (str): Path to the Obsidian vault 198 | job (ProcessingJob): The job object for logging 199 | settings (dict, optional): Additional settings 200 | 201 | Returns: 202 | str: Path to the file in the Obsidian vault 203 | """ 204 | if not obsidian_dir or not os.path.exists(obsidian_dir): 205 | job.add_log("❌ Invalid Obsidian vault path") 206 | return None 207 | 208 | try: 209 | # Create target directory (books folder in Obsidian vault) 210 | file_name = os.path.basename(file_path) 211 | books_dir = os.path.join(obsidian_dir, "books") 212 | os.makedirs(books_dir, exist_ok=True) 213 | 214 | # Target path in Obsidian vault 215 | target_path = os.path.join(books_dir, file_name) 216 | 217 | # Copy file to Obsidian vault 218 | shutil.copy2(file_path, target_path) 219 | 220 | # Copy associated images directory if it exists 221 | if job.document_type: 222 | base_name = os.path.splitext(os.path.basename(job.file_path))[0] 223 | images_dir = os.path.join(os.path.dirname(job.file_path), f"{base_name}_images") 224 | if os.path.exists(images_dir): 225 | target_images_dir = os.path.join(books_dir, f"{base_name}_images") 226 | 227 | # Remove existing images directory if it exists 228 | if os.path.exists(target_images_dir): 229 | shutil.rmtree(target_images_dir) 230 | 231 | # Copy images directory 232 | shutil.copytree(images_dir, target_images_dir) 233 | job.add_log(f"✅ Copied images to Obsidian: {target_images_dir}") 234 | 235 | job.add_log(f"✅ File saved to Obsidian: {target_path}") 236 | return target_path 237 | 238 | except Exception as e: 239 | job.add_log(f"❌ Error saving to Obsidian: {str(e)}") 240 | return None 241 | 242 | 243 | def process_document_job(job_id): 244 | """Process document in a separate thread with improved error handling and recovery""" 245 | job = jobs[job_id] 246 | job.status = "processing" 247 | file_extension = os.path.splitext(job.file_path)[1].lower() 248 | file_type = "EPUB" if file_extension == ".epub" else "PDF" 249 | job.add_log(f"Starting {file_type} processing for {os.path.basename(job.file_path)}") 250 | 251 | # Redirect stdout to capture logs 252 | original_stdout = sys.stdout 253 | sys.stdout = capture_output(job) 254 | 255 | try: 256 | # Apply settings 257 | settings = job.settings 258 | chunk_size = int(settings.get('chunk_size', 7)) 259 | chunk_timeout = int(settings.get('chunk_timeout', 300)) # Default 5 minutes 260 | api_timeout = int(settings.get('api_timeout', 60)) # Default 1 minute 261 | 262 | # Initialize processor 263 | processor = GeminiDocumentProcessor( 264 | chunk_size=chunk_size, 265 | api_key=settings.get('api_key', ""), 266 | 
model_name=settings.get('model_name', "gemini-2.0-flash"), 267 | language="thai", # Always Thai for this tool 268 | max_retries=int(settings.get('max_retries', 3)), 269 | extract_images=settings.get('extract_images', True), 270 | min_img_width=int(settings.get('min_img_width', 100)), 271 | min_img_height=int(settings.get('min_img_height', 100)), 272 | img_format=settings.get('img_format', "png"), 273 | max_workers=int(settings.get('max_workers', 4)), 274 | request_timeout=api_timeout 275 | ) 276 | 277 | # Define a progress callback function 278 | def progress_callback(chunk_num, total_chunks): 279 | job.current_chunk = chunk_num 280 | job.total_chunks = total_chunks 281 | job.update_progress() 282 | 283 | # Add the progress callback to the processor 284 | processor.progress_callback = progress_callback 285 | 286 | # Process the document with improved chunk handling 287 | job.add_log(f"Processing {file_type} with {settings.get('model_name')} model") 288 | 289 | # First, analyze the document to get total pages/chunks 290 | if file_type == "PDF": 291 | try: 292 | # For PDF, we can estimate total chunks before processing 293 | total_pages = processor.get_total_pages(job.file_path) 294 | job.total_chunks = (total_pages + chunk_size - 1) // chunk_size # Ceiling division 295 | job.add_log( 296 | f"Document has {total_pages} pages, will process in approximately {job.total_chunks} chunks") 297 | # Initialize progress 298 | job.current_chunk = 0 299 | job.update_progress() 300 | except Exception as e: 301 | job.add_log(f"Error estimating document size: {str(e)}") 302 | # Continue anyway, we'll handle progress differently 303 | 304 | # Now process the document 305 | try: 306 | # Process document with standard approach and let the processor handle chunk-by-chunk processing 307 | summaries, images_by_chunk, doc_name, doc_type, doc_metadata = processor.process_document(job.file_path) 308 | 309 | # Store document type and metadata 310 | job.document_type = doc_type 311 | job.document_metadata = doc_metadata 312 | 313 | # Save any info about failed chunks 314 | job.failed_chunks = processor.failed_chunks.copy() if hasattr(processor, 'failed_chunks') else [] 315 | except Exception as e: 316 | job.add_log(f"❌ Error during document processing: {str(e)}") 317 | raise 318 | 319 | # Prepare Obsidian metadata if enabled 320 | obsidian_metadata = None 321 | if settings.get('use_obsidian', False): 322 | obsidian_metadata = { 323 | 'tags': settings.get('obsidian_tags', 'book,main'), 324 | 'author': settings.get('obsidian_author', doc_metadata.get('author', '')), 325 | 'coverUrl': settings.get('obsidian_cover_url', ''), 326 | 'review': settings.get('obsidian_review', '') 327 | } 328 | 329 | # Save the results 330 | job.add_log(f"Saving summary to {job.output_path}") 331 | output_file = processor.save_summaries( 332 | summaries, 333 | images_by_chunk, 334 | job.output_path, 335 | doc_name, 336 | doc_type, 337 | doc_metadata, 338 | obsidian_metadata 339 | ) 340 | 341 | # Handle image directory 342 | if settings.get('extract_images', True): 343 | images_dir = os.path.join(os.path.dirname(job.file_path), f"{doc_name}_images") 344 | if os.path.exists(images_dir): 345 | result_images_dir = os.path.join(results_dir, f"{job.job_id}_images") 346 | shutil.copytree(images_dir, result_images_dir) 347 | job.add_log(f"Copied images to {result_images_dir}") 348 | 349 | # Add images directory to result files 350 | job.result_files.append({ 351 | "type": "directory", 352 | "name": f"{doc_name}_images", 353 | "path": 
f"{job.job_id}_images" 354 | }) 355 | 356 | # Add summary file to result files 357 | job.result_files.append({ 358 | "type": "file", 359 | "name": os.path.basename(job.output_path), 360 | "path": os.path.basename(job.output_path) 361 | }) 362 | 363 | # Copy summary file to results directory 364 | shutil.copy2(job.output_path, os.path.join(results_dir, os.path.basename(job.output_path))) 365 | 366 | # Save to Obsidian if requested 367 | if settings.get('use_obsidian', False) and settings.get('obsidian_vault_path'): 368 | obsidian_path = settings.get('obsidian_vault_path') 369 | if os.path.exists(obsidian_path): 370 | job.add_log(f"Saving to Obsidian vault: {obsidian_path}") 371 | obsidian_file = save_to_obsidian(job.output_path, obsidian_path, job, settings) 372 | if obsidian_file: 373 | job.result_files.append({ 374 | "type": "obsidian", 375 | "name": f"Obsidian: {os.path.basename(obsidian_file)}", 376 | "path": obsidian_file 377 | }) 378 | 379 | # Clean up temporary files 380 | temp_files = [ 381 | f for f in os.listdir('.') 382 | if (f.startswith(f"temp_{doc_name}_chunk_") or f.startswith(f"temp_{doc_name}_chapter_")) 383 | and f.endswith(".md") 384 | ] 385 | for temp_file in temp_files: 386 | try: 387 | os.remove(temp_file) 388 | job.add_log(f"Removed temporary file: {temp_file}") 389 | except Exception as e: 390 | job.add_log(f"Could not remove temporary file {temp_file}: {e}") 391 | 392 | # Report any failed chunks 393 | if job.failed_chunks: 394 | job.add_log(f"⚠️ Note: {len(job.failed_chunks)} chunks failed processing.") 395 | 396 | # Save failed chunks for future retry 397 | failed_chunks_file = os.path.join(results_dir, f"{job.job_id}_failed_chunks.json") 398 | with open(failed_chunks_file, "w", encoding="utf-8") as f: 399 | json.dump(job.failed_chunks, f, ensure_ascii=False, indent=2) 400 | 401 | job.add_log(f"Failed chunks saved to {failed_chunks_file}") 402 | 403 | # Add failed chunks file to result files 404 | job.result_files.append({ 405 | "type": "file", 406 | "name": f"{doc_name}_failed_chunks.json", 407 | "path": f"{job.job_id}_failed_chunks.json" 408 | }) 409 | 410 | # Add option to retry failed chunks 411 | job.result_files.append({ 412 | "type": "action", 413 | "name": f"Retry {len(job.failed_chunks)} Failed Chunks", 414 | "action": "retry_chunks", 415 | "job_id": job.job_id 416 | }) 417 | 418 | job.status = "completed" 419 | job.progress = 100 420 | job.add_log("✅ Processing completed successfully!") 421 | 422 | except Exception as e: 423 | error_message = f"Error processing document: {str(e)}" 424 | job.status = "failed" 425 | job.error = error_message 426 | job.add_log(f"❌ {error_message}") 427 | logger.error(error_message, exc_info=True) 428 | 429 | # Restore stdout 430 | sys.stdout = original_stdout 431 | 432 | 433 | def process_retry_chunks(job_id): 434 | """Process only the failed chunks from a previous job.""" 435 | job = jobs[job_id] 436 | original_job_id = job.retry_for_job_id 437 | 438 | if not original_job_id or original_job_id not in jobs: 439 | job.add_log("❌ Original job not found") 440 | job.status = "failed" 441 | job.error = "Original job not found" 442 | return 443 | 444 | original_job = jobs[original_job_id] 445 | 446 | if not original_job.failed_chunks: 447 | job.add_log("No failed chunks to retry") 448 | job.status = "completed" 449 | return 450 | 451 | # Start processing 452 | job.status = "processing" 453 | job.add_log(f"Retrying {len(original_job.failed_chunks)} failed chunks from job {original_job_id}") 454 | 455 | # Initialize progress 
tracking 456 | job.total_chunks = len(original_job.failed_chunks) 457 | job.current_chunk = 0 458 | job.update_progress() 459 | 460 | # Redirect stdout to capture logs 461 | original_stdout = sys.stdout 462 | sys.stdout = capture_output(job) 463 | 464 | try: 465 | # Load original settings with adjustments for retry 466 | settings = job.settings.copy() 467 | 468 | # Use more aggressive settings for retries 469 | settings['chunk_timeout'] = int(settings.get('chunk_timeout', 300)) * 2 470 | settings['api_timeout'] = int(settings.get('api_timeout', 60)) * 2 471 | settings['max_retries'] = int(settings.get('max_retries', 3)) + 2 472 | 473 | # Initialize processor 474 | processor = GeminiDocumentProcessor( 475 | chunk_size=int(settings.get('chunk_size', 7)), 476 | api_key=settings.get('api_key', ""), 477 | model_name=settings.get('model_name', "gemini-2.0-flash"), 478 | language="thai", # Always Thai for this tool 479 | max_retries=int(settings.get('max_retries', 5)), # More retries 480 | extract_images=settings.get('extract_images', True), 481 | min_img_width=int(settings.get('min_img_width', 100)), 482 | min_img_height=int(settings.get('min_img_height', 100)), 483 | img_format=settings.get('img_format', "png"), 484 | max_workers=int(settings.get('max_workers', 4)), 485 | request_timeout=int(settings.get('api_timeout', 120)) # Longer timeout 486 | ) 487 | 488 | # Load the original summaries file 489 | original_output_path = original_job.output_path 490 | 491 | # Read original file to get existing summaries 492 | with open(original_output_path, 'r', encoding='utf-8') as f: 493 | original_content = f.read() 494 | 495 | # Process failed chunks 496 | successful_retries = 0 497 | for idx, failed_chunk in enumerate(original_job.failed_chunks): 498 | chunk_num = failed_chunk.get('chunk_number') 499 | if not chunk_num: 500 | continue 501 | 502 | # Update progress 503 | job.current_chunk = idx + 1 504 | job.update_progress() 505 | 506 | # Try to extract page range from failed chunk info 507 | page_start = failed_chunk.get('page_start', 0) 508 | page_end = failed_chunk.get('page_end', 0) 509 | 510 | # If we don't have page info, try to calculate it 511 | if page_start <= 0 or page_end <= 0: 512 | chunk_size = int(settings.get('chunk_size', 7)) 513 | page_start = (chunk_num - 1) * chunk_size + 1 514 | page_end = chunk_num * chunk_size 515 | 516 | job.add_log(f"Retrying chunk {chunk_num} (pages {page_start}-{page_end})...") 517 | 518 | try: 519 | # Process this chunk with extended timeout 520 | summary, images = processor.process_chunk( 521 | job.file_path, 522 | page_start, 523 | page_end, 524 | timeout=int(settings.get('api_timeout', 120)) 525 | ) 526 | 527 | # Save temporary result 528 | temp_file = f"temp_retry_{job.job_id}_chunk_{chunk_num}.md" 529 | with open(temp_file, "w", encoding="utf-8") as f: 530 | f.write(summary) 531 | 532 | job.add_log(f" Successfully reprocessed chunk {chunk_num}") 533 | successful_retries += 1 534 | 535 | # We would need to update the original content here 536 | # This is a simplistic approach - in practice, you'd need more sophisticated parsing 537 | chunk_marker = f"### Chunk {chunk_num}" 538 | next_chunk_marker = f"### Chunk {chunk_num + 1}" 539 | 540 | if chunk_marker in original_content: 541 | start_idx = original_content.find(chunk_marker) 542 | end_idx = original_content.find(next_chunk_marker, start_idx) 543 | 544 | if end_idx == -1: # Last chunk 545 | end_idx = original_content.find("---\n*Summary generated", start_idx) 546 | 547 | if start_idx != -1 and end_idx 
!= -1: 548 | # Replace the chunk content 549 | new_content = original_content[:start_idx] + chunk_marker + "\n\n" + summary + "\n\n" 550 | if end_idx != -1: 551 | new_content += original_content[end_idx:] 552 | 553 | original_content = new_content 554 | 555 | except Exception as e: 556 | job.add_log(f"❌ Error retrying chunk {chunk_num}: {str(e)}") 557 | job.failed_chunks.append({ 558 | "chunk_number": chunk_num, 559 | "page_start": page_start, 560 | "page_end": page_end, 561 | "error": str(e) 562 | }) 563 | 564 | # Save the updated content to a new file 565 | new_output_path = job.output_path 566 | with open(new_output_path, 'w', encoding='utf-8') as f: 567 | f.write(original_content) 568 | 569 | # Copy to results directory 570 | shutil.copy2(new_output_path, os.path.join(results_dir, os.path.basename(new_output_path))) 571 | 572 | # Add to result files 573 | job.result_files.append({ 574 | "type": "file", 575 | "name": os.path.basename(new_output_path), 576 | "path": os.path.basename(new_output_path) 577 | }) 578 | 579 | # Summary of retry results 580 | if successful_retries > 0: 581 | job.add_log(f"✅ Successfully reprocessed {successful_retries} of {len(original_job.failed_chunks)} chunks") 582 | else: 583 | job.add_log("❌ Failed to reprocess any chunks") 584 | 585 | # Set final progress and status 586 | job.status = "completed" 587 | job.progress = 100 588 | 589 | except Exception as e: 590 | error_message = f"Error during retry processing: {str(e)}" 591 | job.status = "failed" 592 | job.error = error_message 593 | job.add_log(f"❌ {error_message}") 594 | logger.error(error_message, exc_info=True) 595 | 596 | # Restore stdout 597 | sys.stdout = original_stdout 598 | 599 | @app.route('/') 600 | def index(): 601 | """Main page""" 602 | return render_template('index.html', obsidian_dir=obsidian_dir) 603 | 604 | 605 | @app.route('/upload', methods=['POST']) 606 | def upload_file(): 607 | """Handle file upload and job creation""" 608 | if 'file' not in request.files: 609 | return jsonify({'error': 'No file part'}), 400 610 | 611 | file = request.files['file'] 612 | if file.filename == '': 613 | return jsonify({'error': 'No selected file'}), 400 614 | 615 | # Check valid file types 616 | if not (file.filename.lower().endswith('.pdf') or file.filename.lower().endswith('.epub')): 617 | return jsonify({'error': 'File must be a PDF or EPUB'}), 400 618 | 619 | # Generate a unique job ID 620 | job_id = str(uuid.uuid4()) 621 | 622 | # Save the uploaded file 623 | file_filename = os.path.basename(file.filename) 624 | file_name_without_ext = os.path.splitext(file_filename)[0] 625 | file_path = os.path.join(uploads_dir, f"{job_id}_{file_filename}") 626 | file.save(file_path) 627 | 628 | # Get settings from form 629 | settings = { 630 | 'model_name': request.form.get('model_name', 'gemini-2.0-flash'), 631 | 'chunk_size': request.form.get('chunk_size', '7'), 632 | 'api_key': request.form.get('api_key', ''), 633 | 'extract_images': request.form.get('extract_images') == 'on', 634 | 'max_retries': request.form.get('max_retries', '3'), 635 | 'min_img_width': request.form.get('min_img_width', '100'), 636 | 'min_img_height': request.form.get('min_img_height', '100'), 637 | 'img_format': request.form.get('img_format', 'png'), 638 | 'max_workers': request.form.get('max_workers', '4'), 639 | 'chunk_timeout': request.form.get('chunk_timeout', '300'), # 5 minutes default 640 | 'api_timeout': request.form.get('api_timeout', '60'), # 1 minute default 641 | 642 | # Obsidian settings 643 | 'use_obsidian': 
request.form.get('use_obsidian') == 'on',
644 |         'obsidian_vault_path': request.form.get('obsidian_vault_path', ''),
645 |         'obsidian_tags': request.form.get('obsidian_tags', 'book,main,verified'),
646 |         'obsidian_author': request.form.get('obsidian_author', ''),
647 |         'obsidian_cover_url': request.form.get('obsidian_cover_url', ''),
648 |         'obsidian_review': request.form.get('obsidian_review', '')
649 |     }
650 | 
651 |     # Save Obsidian path to settings if provided
652 |     if settings['obsidian_vault_path'] and os.path.exists(settings['obsidian_vault_path']):
653 |         try:
654 |             settings_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'settings.json')
655 |             settings_data = {}
656 |             if os.path.exists(settings_file):
657 |                 with open(settings_file, 'r') as f:
658 |                     settings_data = json.load(f)
659 | 
660 |             settings_data['obsidian_vault_path'] = settings['obsidian_vault_path']
661 | 
662 |             with open(settings_file, 'w') as f:
663 |                 json.dump(settings_data, f, indent=2)
664 | 
665 |             global obsidian_dir
666 |             obsidian_dir = settings['obsidian_vault_path']
667 |             logger.info(f"Saved Obsidian vault path to settings: {obsidian_dir}")
668 |         except Exception as e:
669 |             logger.error(f"Error saving settings: {e}")
670 | 
671 |     # Create output path
672 |     output_path = os.path.join(uploads_dir, f"{job_id}_{file_name_without_ext}_summary.md")
673 | 
674 |     # Create and store the job
675 |     job = ProcessingJob(job_id, file_path, output_path, settings)
676 |     jobs[job_id] = job
677 | 
678 |     # Start processing in a background thread
679 |     job.thread = threading.Thread(target=process_document_job, args=(job_id,))
680 |     job.thread.daemon = True
681 |     job.thread.start()
682 | 
683 |     return jsonify({'job_id': job_id, 'redirect': url_for('job_status', job_id=job_id)})
684 | 
685 | 
686 | @app.route('/job/<job_id>')
687 | def job_status(job_id):
688 |     """Show job status page"""
689 |     if job_id not in jobs:
690 |         return "Job not found", 404
691 | 
692 |     return render_template('job_status.html', job_id=job_id)
693 | 
694 | 
695 | @app.route('/api/job/<job_id>')
696 | def api_job_status(job_id):
697 |     """API to get job status"""
698 |     if job_id not in jobs:
699 |         return jsonify({'error': 'Job not found'}), 404
700 | 
701 |     return jsonify(jobs[job_id].to_dict())
702 | 
703 | 
704 | @app.route('/download/<filename>')
705 | def download_file(filename):
706 |     """Serve result files for download"""
707 |     return send_from_directory(results_dir, filename, as_attachment=True)
708 | 
709 | 
710 | @app.route('/view/<filename>')
711 | def view_file(filename):
712 |     """View result files in browser"""
713 |     return send_from_directory(results_dir, filename)
714 | 
715 | 
716 | @app.route('/retry_chunks/<job_id>', methods=['POST'])
717 | def retry_failed_chunks(job_id):
718 |     """Retry processing failed chunks for a job."""
719 |     if job_id not in jobs:
720 |         return jsonify({'error': 'Job not found'}), 404
721 | 
722 |     job = jobs[job_id]
723 | 
724 |     if not hasattr(job, 'failed_chunks') or not job.failed_chunks:
725 |         return jsonify({'message': 'No failed chunks to retry'}), 200
726 | 
727 |     # Create a new job for retry
728 |     retry_job_id = str(uuid.uuid4())
729 |     file_name_without_ext = os.path.splitext(os.path.basename(job.file_path))[0]
730 |     retry_output_path = os.path.join(uploads_dir, f"{retry_job_id}_{file_name_without_ext}_summary_retry.md")
731 | 
732 |     retry_job = ProcessingJob(
733 |         retry_job_id,
734 |         job.file_path,
735 |         retry_output_path,
736 |         job.settings
737 |     )
738 | 
739 |     # Set special flag to indicate this is a retry job
740 |     retry_job.retry_for_job_id = job_id
741 |     retry_job.failed_chunks = job.failed_chunks.copy()
742 | 
743 |     # Store the job
744 |     jobs[retry_job_id] = retry_job
745 | 
746 |     # Start processing in a background thread
747 |     retry_job.thread = threading.Thread(target=process_retry_chunks, args=(retry_job_id,))
748 |     retry_job.thread.daemon = True
749 |     retry_job.thread.start()
750 | 
751 |     return jsonify({'job_id': retry_job_id, 'redirect': url_for('job_status', job_id=retry_job_id)})
752 | 
753 | 
754 | @app.route('/obsidian_check', methods=['POST'])
755 | def check_obsidian_path():
756 |     """Check if the Obsidian vault path is valid"""
757 |     path = request.json.get('path', '')
758 |     if not path:
759 |         return jsonify({'valid': False, 'message': 'Path is empty'})
760 | 
761 |     if not os.path.exists(path):
762 |         return jsonify({'valid': False, 'message': 'Path does not exist'})
763 | 
764 |     if not os.path.isdir(path):
765 |         return jsonify({'valid': False, 'message': 'Path is not a directory'})
766 | 
767 |     # Check for .obsidian folder to confirm it's an Obsidian vault
768 |     obsidian_folder = os.path.join(path, '.obsidian')
769 |     if not os.path.exists(obsidian_folder) or not os.path.isdir(obsidian_folder):
770 |         return jsonify({'valid': False, 'message': 'Not an Obsidian vault (no .obsidian folder)'})
771 | 
772 |     return jsonify({'valid': True, 'message': 'Valid Obsidian vault'})
773 | 
774 | 
775 | if __name__ == '__main__':
776 |     # Create HTML templates folder and files if they don't exist
777 |     templates_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'templates')
778 |     os.makedirs(templates_dir, exist_ok=True)
779 | 
780 |     # Create index.html template with additional timeout settings
781 |     index_html = """
      [index.html markup (lines 782-1045) not preserved in this dump. Recoverable content: a page titled
      "Gemini Document Processor" with an upload form (handled by the /upload route) for a PDF or EPUB file,
      model selection, chunk size ("For PDFs: Number of pages to process in each API call. For EPUBs:
      Controls how much content is processed at once."), Gemini API key, an image-extraction toggle, an
      "Obsidian Integration" section (vault path saved to a 'books' folder, tags, author, cover URL, and a
      review/progress field such as "20/33"), and an "Advanced Options" section (max retries, chunk timeout
      of 5-30 minutes, API timeout of 30-300 seconds, minimum image width/height, image format, and the
      number of parallel threads for image extraction).]
1046 |     """
1047 | 
1048 |     # Create job_status.html template with retry buttons
1049 |     job_status_html = """
      [job_status.html markup (lines 1050-1386) not preserved in this dump. Recoverable content: a
      "Document Processing Status" page showing the fields returned by /api/job/<job_id>: job status, a
      progress bar ("Processing chunk X of Y"), the current task, a rolling log, and a results list with
      download/view links, plus a "Retry Failed Chunks" action and a "Back to Home" link.]
1387 |     """
1388 | 
1389 |     with open(os.path.join(templates_dir, 'index.html'), 'w') as f:
1390 |         f.write(index_html)
1391 | 
1392 |     with open(os.path.join(templates_dir, 'job_status.html'), 'w') as f:
1393 |         f.write(job_status_html)
1394 | 
1395 |     print("Starting web server at http://127.0.0.1:8081/")
1396 |     print("You can access the document processor by opening this URL in your browser")
1397 |     app.run(debug=True, port=8081)
--------------------------------------------------------------------------------
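
A small client sketch against the Flask endpoints defined in document_gui.py, using the requests library. The endpoint paths, form-field names, and response keys are taken from that file; the server address matches the default printed at startup, while the sample file name and "YOUR_API_KEY" are placeholders.

    import time
    import requests

    BASE = "http://127.0.0.1:8081"

    # 1. Upload a PDF/EPUB and start a processing job.
    with open("book.pdf", "rb") as fh:
        resp = requests.post(
            f"{BASE}/upload",
            files={"file": fh},
            data={
                "model_name": "gemini-2.0-flash",
                "chunk_size": "7",
                "api_key": "YOUR_API_KEY",   # placeholder
                "extract_images": "on",
            },
        )
    job_id = resp.json()["job_id"]

    # 2. Poll the job status endpoint until processing finishes or fails.
    while True:
        status = requests.get(f"{BASE}/api/job/{job_id}").json()
        print(status["progress"], status["message"])
        if status["status"] in ("completed", "failed"):
            break
        time.sleep(5)

    # 3. Download any result files the job produced.
    for item in status.get("result_files", []):
        if item.get("type") == "file":
            content = requests.get(f"{BASE}/download/{item['path']}").content
            with open(item["path"], "wb") as out:
                out.write(content)
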