├── .gitignore ├── .coverage ├── settings.json ├── Screenshot.png ├── requirements.txt ├── LICENSE ├── cleanup.sh ├── tests ├── test_epub_processor.py └── test_document_processor.py ├── README.md ├── epub_processor.py ├── templates ├── job_status.html └── index.html ├── document_processor.py └── document_gui.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | venv -------------------------------------------------------------------------------- /.coverage: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidpeterpan/gemini-document-processor/HEAD/.coverage -------------------------------------------------------------------------------- /settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "obsidian_vault_path": "/Users/kidpeterpan/Documents/Pan's Vault" 3 | } -------------------------------------------------------------------------------- /Screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidpeterpan/gemini-document-processor/HEAD/Screenshot.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Core dependencies 2 | Flask==2.3.3 3 | PyMuPDF # For PDF processing (fitz) 4 | ebooklib==0.18.0 # For EPUB processing 5 | beautifulsoup4==4.12.2 # For parsing HTML content in EPUBs 6 | html2text==2020.1.16 # For converting HTML to markdown text 7 | requests==2.31.0 # For API calls to Gemini 8 | pathlib==1.0.1 # For path manipulations 9 | pypdf==5.4.0 10 | google-generativeai 11 | 12 | # Optional dependencies (comment out if not needed) 13 | # PIL or Pillow might be needed for more advanced image processing 14 | Pillow 15 | 16 | # Testing dependencies 17 | pytest 18 | pytest-cov -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 kidpeterpan@github.com 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /cleanup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Script to clean up results, uploads, and log files 4 | # Usage: ./cleanup.sh 5 | 6 | # Set the base directory - default to the current directory 7 | BASE_DIR="$(pwd)" 8 | 9 | # File paths 10 | RESULTS_DIR="$BASE_DIR/results" 11 | UPLOADS_DIR="$BASE_DIR/uploads" 12 | 13 | echo "========================================" 14 | echo "Starting cleanup process..." 15 | echo "========================================" 16 | 17 | # Clean results directory 18 | if [ -d "$RESULTS_DIR" ]; then 19 | echo "Cleaning results directory..." 20 | rm -rf "$RESULTS_DIR"/* 21 | echo "✓ Results directory emptied" 22 | else 23 | echo "! Results directory not found at $RESULTS_DIR" 24 | fi 25 | 26 | # Clean uploads directory 27 | if [ -d "$UPLOADS_DIR" ]; then 28 | echo "Cleaning uploads directory..." 29 | rm -rf "$UPLOADS_DIR"/* 30 | echo "✓ Uploads directory emptied" 31 | else 32 | echo "! Uploads directory not found at $UPLOADS_DIR" 33 | fi 34 | 35 | # Remove log files 36 | echo "Removing log files..." 37 | find "$BASE_DIR" -name "*.log" -type f -delete 38 | echo "✓ Log files removed" 39 | 40 | # Done 41 | echo "========================================" 42 | echo "Cleanup completed!" 43 | echo "========================================" -------------------------------------------------------------------------------- /tests/test_epub_processor.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 4 | import pytest 5 | from epub_processor import SimpleEpubProcessor 6 | 7 | class DummyBook: 8 | def get_metadata(self, *args, **kwargs): 9 | return {'title': [('Test Book',)]} 10 | def get_items(self): 11 | class DummyItem: 12 | def get_type(self): return 9 # ebooklib.ITEM_DOCUMENT 13 | def get_content(self): return b'

<html><body><h1>Chapter</h1><p>Content</p></body></html>
' 14 | return [DummyItem()] 15 | 16 | def test_process_epub(monkeypatch, tmp_path): 17 | epub_path = tmp_path / "test.epub" 18 | with open(epub_path, "wb") as f: 19 | f.write(b"dummy epub content") 20 | monkeypatch.setattr('ebooklib.epub.read_epub', lambda path: DummyBook()) 21 | processor = SimpleEpubProcessor(extract_images=True) 22 | chapters, images_by_chapter, name, meta = processor.process_epub(str(epub_path), image_output_dir=str(tmp_path)) 23 | assert isinstance(chapters, dict) 24 | assert isinstance(images_by_chapter, dict) 25 | assert name == "test" 26 | assert meta['title'] == 'Test Book' 27 | 28 | def test_process_epub_missing_file(tmp_path): 29 | processor = SimpleEpubProcessor() 30 | chapters, images_by_chapter, name, meta = processor.process_epub(str(tmp_path / "no_file.epub")) 31 | assert "Error" in chapters or "Failed" in list(chapters.values())[0] 32 | 33 | def test_process_epub_no_chapters(monkeypatch, tmp_path): 34 | class DummyBook: 35 | def get_metadata(self, *args, **kwargs): 36 | return {'title': [('Test Book',)]} 37 | def get_items(self): 38 | return [] 39 | epub_path = tmp_path / "test.epub" 40 | with open(epub_path, "wb") as f: 41 | f.write(b"dummy epub content") 42 | monkeypatch.setattr('ebooklib.epub.read_epub', lambda path: DummyBook()) 43 | processor = SimpleEpubProcessor() 44 | chapters, images_by_chapter, name, meta = processor.process_epub(str(epub_path), image_output_dir=str(tmp_path)) 45 | assert "No Content" in chapters 46 | 47 | def test_has_content(): 48 | processor = SimpleEpubProcessor() 49 | assert not processor._has_content("") 50 | assert not processor._has_content("\n\t ") 51 | assert processor._has_content("a" * 101) 52 | 53 | def test_extract_images_basic(tmp_path): 54 | processor = SimpleEpubProcessor() 55 | class DummyBook: pass 56 | class DummyItem: pass 57 | from bs4 import BeautifulSoup 58 | soup = BeautifulSoup('A', 'html.parser') 59 | images = processor._extract_images_basic(DummyBook(), DummyItem(), soup, str(tmp_path), "book", 1) 60 | assert len(images) == 2 61 | assert images[0]['filename'].endswith('.png') 62 | assert images[0]['alt'] == 'A' 63 | 64 | def test_extract_images_basic_malformed_html(tmp_path): 65 | processor = SimpleEpubProcessor() 66 | class DummyBook: pass 67 | class DummyItem: pass 68 | from bs4 import BeautifulSoup 69 | soup = BeautifulSoup('
', 'html.parser') 70 | images = processor._extract_images_basic(DummyBook(), DummyItem(), soup, str(tmp_path), "book", 1) 71 | assert isinstance(images, list) 72 | 73 | def test_has_content_special_chars(): 74 | processor = SimpleEpubProcessor() 75 | assert not processor._has_content("!@#$%^&*()_+-=\n\t ") 76 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Gemini Document Processor 2 | 3 | A powerful document processing tool that uses Google's Gemini AI to generate high-quality Thai language summaries from PDF and EPUB files, with image extraction and Obsidian integration. 4 | 5 | ![Screenshot.png](Screenshot.png) 6 | 7 | ## Features 8 | 9 | ### Core Functionality 10 | - **AI-Powered Summarization**: Uses Google's latest Gemini models (gemini-2.0-flash, gemini-2.5-flash-preview, gemini-1.5-pro) 11 | - **Multiple Document Formats**: Processes both PDF and EPUB files 12 | - **Thai-Focused Summaries**: Optimized for creating comprehensive Thai language summaries 13 | 14 | ### Advanced Processing 15 | - **Smart Chunking**: Processes documents in manageable chunks for better AI performance 16 | - **Image Extraction**: Extracts and filters images from documents with size thresholds 17 | - **Robust Error Handling**: Includes intelligent retry mechanisms with model fallbacks 18 | - **Timeout Management**: Configurable timeouts for both API calls and chunk processing 19 | 20 | ### User Experience 21 | - **Web Interface**: Clean, tabbed web application for document processing 22 | - **Real-time Progress Tracking**: Live updates during processing 23 | - **Job Status Monitoring**: Track failed chunks and retry problematic sections 24 | - **Parallel Processing**: Multi-threaded image extraction for improved performance 25 | 26 | ### Obsidian Integration 27 | - **Direct Export**: Create markdown files directly in your Obsidian vault 28 | - **Metadata Support**: Includes YAML frontmatter with tags and other metadata 29 | - **Customizable Tags**: Define your own Obsidian tags for processed documents 30 | 31 | ## Installation 32 | 33 | 1. Clone this repository: 34 | ```bash 35 | git clone https://github.com/kidpeterpan/gemini-document-processor.git 36 | cd gemini-document-processor 37 | ``` 38 | 39 | 2. Install the required dependencies: 40 | ```bash 41 | pip install -r requirements.txt 42 | ``` 43 | 44 | 3. Get a Google Gemini API key from [Google AI Studio](https://aistudio.google.com/) 45 | 46 | ## Usage 47 | 48 | ### Starting the Web Interface 49 | 50 | Run the web server: 51 | 52 | ```bash 53 | python document_gui.py 54 | ``` 55 | 56 | Then open your web browser and navigate to: http://127.0.0.1:8081/ 57 | 58 | ### Web Interface Features 59 | 60 | The interface is organized into three tabs: 61 | 62 | 1. **Basic Settings**: 63 | - Upload PDF or EPUB files 64 | - Select Gemini model: 65 | - gemini-2.0-flash (Faster) 66 | - gemini-2.5-flash-preview (More accurate) 67 | - gemini-1.5-pro (Backup option) 68 | - Adjust chunk size (pages per processing unit) 69 | - Enter your Gemini API key 70 | - Toggle image extraction 71 | 72 | 2. **Obsidian Integration**: 73 | - Enable automatic export to Obsidian 74 | - Verify and set Obsidian vault path 75 | - Configure tags, author, cover URL, and review ratings 76 | - Automatic path validation 77 | 78 | 3. 
**Advanced Settings**: 79 | - Configure timeout settings: 80 | - Chunk processing timeout (60-1800 seconds) 81 | - API request timeout (30-300 seconds) 82 | - Set retry attempts for API calls 83 | - Configure image size thresholds 84 | - Select image format (PNG/JPG) 85 | - Adjust worker thread count (1-16) 86 | 87 | ### Job Status and Monitoring 88 | 89 | - **Real-time Progress**: View detailed progress during processing 90 | - **Log Viewer**: See all processing events as they happen 91 | - **Failed Chunks**: Identify and retry problematic sections 92 | - **Result Management**: Download or view generated summaries 93 | - **Obsidian Export**: Track files exported to your Obsidian vault 94 | 95 | ## How It Works 96 | 97 | 1. **Document Loading**: The application loads PDF or EPUB files and extracts text content 98 | 2. **Chunking**: Content is divided into manageable chunks (by page for PDFs, by chapter for EPUBs) 99 | 3. **Image Extraction**: Images are extracted with size filtering and saved separately 100 | 4. **AI Processing**: Each chunk is sent to Gemini API with timeout handling and retries 101 | 5. **Error Recovery**: Failed chunks are tracked and can be retried with more robust settings 102 | 6. **Summary Creation**: Results are compiled into a well-formatted Markdown document 103 | 7. **Integration**: Summary and images are saved locally and (optionally) to Obsidian 104 | 105 | ## Troubleshooting 106 | 107 | ### Common Issues 108 | 109 | - **API Errors**: Check your API key and internet connection 110 | - **Processing Timeouts**: Increase the chunk and API timeout values in Advanced Settings 111 | - **Failed Chunks**: Use the "Retry Failed Chunks" button on the job status page 112 | - **Obsidian Integration**: Ensure your Obsidian vault path is correct and contains a .obsidian folder 113 | 114 | ### Error Logs 115 | 116 | For detailed error information, check the application logs in your terminal or command prompt. 117 | 118 | ## Project Structure 119 | 120 | - `document_gui.py` - Web interface and job management 121 | - `document_processor.py` - Core processing logic for documents 122 | - `epub_processor.py` - EPUB-specific processing functionality 123 | - `templates/` - HTML templates for web interface 124 | - `uploads/` - Temporary storage for uploaded files and processing results 125 | 126 | ## License 127 | 128 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. 
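
## Programmatic Use

The web UI is a thin wrapper around `GeminiDocumentProcessor` in `document_processor.py`, so the same pipeline can also be scripted. The example below is an illustrative sketch only: the API key and input file name are placeholders, and the constructor arguments simply mirror the defaults defined in the class.

```bash
# nothing to install beyond requirements.txt
```

```python
from document_processor import GeminiDocumentProcessor

# Placeholder key and file path -- replace with your own values.
processor = GeminiDocumentProcessor(
    api_key="YOUR_GEMINI_API_KEY",
    model_name="gemini-2.0-flash",
    chunk_size=7,            # pages per chunk for PDFs
    extract_images=True,
    request_timeout=60,      # seconds allowed per API call
)

# Optional progress reporting.
processor.progress_callback = lambda current, total: print(f"chunk {current}/{total}")

# Returns (summaries, images_by_chunk, doc_name, doc_type, metadata).
summaries, images, name, doc_type, meta = processor.process_document("book.pdf")

# Retry any chunks that failed, then write the combined Markdown summary.
if processor.failed_chunks:
    summaries = processor.retry_failed_chunks("book.pdf", summaries)

output_path = processor.save_summaries(summaries, images, f"{name}_summary.md",
                                       name, doc_type, meta)
print(f"Summary written to {output_path}")
```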
129 | 130 | ## Credits 131 | 132 | This project uses the following technologies: 133 | - [Google Generative AI API](https://ai.google.dev/) 134 | - [Flask](https://flask.palletsprojects.com/) 135 | - [PyPDF](https://pypdf.readthedocs.io/en/latest/) 136 | - [ebooklib](https://github.com/aerkalov/ebooklib) 137 | - [Bootstrap](https://getbootstrap.com/) for the web interface 138 | -------------------------------------------------------------------------------- /tests/test_document_processor.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 4 | 5 | import pytest 6 | from document_processor import GeminiDocumentProcessor, ChunkTimeoutError 7 | 8 | class DummyModel: 9 | def generate_content(self, prompt, generation_config=None): 10 | class Response: 11 | text = "Summary" 12 | return Response() 13 | 14 | def test_initialize_api(monkeypatch): 15 | monkeypatch.setattr('google.generativeai.GenerativeModel', lambda name: DummyModel()) 16 | monkeypatch.setattr('google.generativeai.configure', lambda api_key: None) 17 | processor = GeminiDocumentProcessor(api_key='dummy', model_name='gemini-2.0-flash') 18 | assert processor.model_name == 'gemini-2.0-flash' 19 | 20 | def test_get_total_pages(tmp_path, monkeypatch): 21 | # Create a dummy PDF file 22 | pdf_path = tmp_path / "test.pdf" 23 | with open(pdf_path, "wb") as f: 24 | f.write(b"%PDF-1.4\n1 0 obj\n<< /Type /Catalog >>\nendobj\ntrailer\n<< /Root 1 0 R >>\n%%EOF") 25 | monkeypatch.setattr('pypdf.PdfReader', lambda f: type('R', (), {'pages': [1, 2, 3]})()) 26 | monkeypatch.setattr('google.generativeai.GenerativeModel', lambda name: DummyModel()) 27 | monkeypatch.setattr('google.generativeai.configure', lambda api_key: None) 28 | processor = GeminiDocumentProcessor(api_key='dummy') 29 | assert processor.get_total_pages(str(pdf_path)) == 3 30 | 31 | def test_get_total_pages_nonexistent(): 32 | processor = GeminiDocumentProcessor(api_key='dummy') 33 | assert processor.get_total_pages('nonexistent.pdf') == 0 34 | 35 | def test_extract_metadata_pdf(monkeypatch, tmp_path): 36 | pdf_path = tmp_path / "test.pdf" 37 | with open(pdf_path, "wb") as f: 38 | f.write(b"%PDF-1.4\n1 0 obj\n<< /Type /Catalog >>\nendobj\ntrailer\n<< /Root 1 0 R >>\n%%EOF") 39 | monkeypatch.setattr('pypdf.PdfReader', lambda f: type('R', (), {'metadata': {'/Title': 'Test'}})()) 40 | monkeypatch.setattr('google.generativeai.GenerativeModel', lambda name: DummyModel()) 41 | monkeypatch.setattr('google.generativeai.configure', lambda api_key: None) 42 | processor = GeminiDocumentProcessor(api_key='dummy') 43 | meta = processor._extract_pdf_metadata(str(pdf_path)) 44 | assert meta['title'] == 'Test' 45 | 46 | def test_extract_text_from_pdf_pages(monkeypatch, tmp_path): 47 | pdf_path = tmp_path / "test.pdf" 48 | with open(pdf_path, "wb") as f: 49 | f.write(b"%PDF-1.4\n1 0 obj\n<< /Type /Catalog >>\nendobj\ntrailer\n<< /Root 1 0 R >>\n%%EOF") 50 | class DummyPage: 51 | def extract_text(self): 52 | return "Hello" 53 | class DummyReader: 54 | pages = [DummyPage(), DummyPage()] 55 | monkeypatch.setattr('pypdf.PdfReader', lambda f: DummyReader()) 56 | monkeypatch.setattr('google.generativeai.GenerativeModel', lambda name: DummyModel()) 57 | monkeypatch.setattr('google.generativeai.configure', lambda api_key: None) 58 | processor = GeminiDocumentProcessor(api_key='dummy') 59 | text = processor._extract_text_from_pdf_pages(str(pdf_path), 1, 2) 60 | 
assert "Hello" in text 61 | 62 | def test_extract_text_from_pdf_pages_corrupted(monkeypatch, tmp_path): 63 | pdf_path = tmp_path / "corrupt.pdf" 64 | with open(pdf_path, "wb") as f: 65 | f.write(b"not a real pdf") 66 | monkeypatch.setattr('google.generativeai.GenerativeModel', lambda name: DummyModel()) 67 | monkeypatch.setattr('google.generativeai.configure', lambda api_key: None) 68 | processor = GeminiDocumentProcessor(api_key='dummy') 69 | with pytest.raises(Exception): 70 | processor._extract_text_from_pdf_pages(str(pdf_path), 1, 1) 71 | 72 | def test_summarize_text_with_timeout(monkeypatch): 73 | monkeypatch.setattr('google.generativeai.GenerativeModel', lambda name: DummyModel()) 74 | monkeypatch.setattr('google.generativeai.configure', lambda api_key: None) 75 | processor = GeminiDocumentProcessor(api_key='dummy') 76 | summary = processor._summarize_text_with_timeout("test", 1, 1, 1) 77 | assert summary == "Summary" 78 | 79 | def test_save_summaries(tmp_path, monkeypatch): 80 | monkeypatch.setattr('google.generativeai.GenerativeModel', lambda name: DummyModel()) 81 | monkeypatch.setattr('google.generativeai.configure', lambda api_key: None) 82 | processor = GeminiDocumentProcessor(api_key='dummy') 83 | output = tmp_path / "out.md" 84 | summaries = {"Chunk 1": "Summary text"} 85 | images = {"Chunk 1": [{"path": "img.png", "alt": "img"}]} 86 | meta = {"title": "Test"} 87 | path = processor.save_summaries(summaries, images, str(output), "Test", "pdf", meta) 88 | assert os.path.exists(path) 89 | 90 | def test_process_chunk(monkeypatch, tmp_path): 91 | pdf_path = tmp_path / "test.pdf" 92 | with open(pdf_path, "wb") as f: 93 | f.write(b"%PDF-1.4\n1 0 obj\n<< /Type /Catalog >>\nendobj\ntrailer\n<< /Root 1 0 R >>\n%%EOF") 94 | class DummyPage: 95 | def extract_text(self): 96 | return "Hello" 97 | class DummyReader: 98 | pages = [DummyPage(), DummyPage()] 99 | monkeypatch.setattr('pypdf.PdfReader', lambda f: DummyReader()) 100 | monkeypatch.setattr('google.generativeai.GenerativeModel', lambda name: DummyModel()) 101 | monkeypatch.setattr('google.generativeai.configure', lambda api_key: None) 102 | processor = GeminiDocumentProcessor(api_key='dummy') 103 | summary, images = processor.process_chunk(str(pdf_path), 1, 2) 104 | assert isinstance(summary, str) 105 | assert isinstance(images, list) 106 | 107 | def test_process_chunk_no_images(monkeypatch, tmp_path): 108 | pdf_path = tmp_path / "test.pdf" 109 | with open(pdf_path, "wb") as f: 110 | f.write(b"%PDF-1.4\n1 0 obj\n<< /Type /Catalog >>\nendobj\ntrailer\n<< /Root 1 0 R >>\n%%EOF") 111 | class DummyPage: 112 | def extract_text(self): 113 | return "Hello" 114 | class DummyReader: 115 | pages = [DummyPage(), DummyPage()] 116 | monkeypatch.setattr('pypdf.PdfReader', lambda f: DummyReader()) 117 | monkeypatch.setattr('google.generativeai.GenerativeModel', lambda name: DummyModel()) 118 | monkeypatch.setattr('google.generativeai.configure', lambda api_key: None) 119 | processor = GeminiDocumentProcessor(api_key='dummy', extract_images=False) 120 | summary, images = processor.process_chunk(str(pdf_path), 1, 2) 121 | assert isinstance(summary, str) 122 | assert images == [] 123 | 124 | def test_retry_failed_chunks_no_failed(monkeypatch): 125 | monkeypatch.setattr('google.generativeai.GenerativeModel', lambda name: DummyModel()) 126 | monkeypatch.setattr('google.generativeai.configure', lambda api_key: None) 127 | processor = GeminiDocumentProcessor(api_key='dummy') 128 | summaries = {"Chunk 1": "Summary"} 129 | updated = 
processor.retry_failed_chunks('dummy.pdf', summaries) 130 | assert updated == summaries 131 | -------------------------------------------------------------------------------- /epub_processor.py: -------------------------------------------------------------------------------- 1 | import ebooklib 2 | from ebooklib import epub 3 | from bs4 import BeautifulSoup 4 | import os 5 | import logging 6 | import re 7 | import html2text 8 | 9 | # Set up logging 10 | logger = logging.getLogger("GeminiEbookProcessor") 11 | 12 | 13 | class SimpleEpubProcessor: 14 | """ 15 | A simplified EPUB processor that doesn't require PIL for image processing. 16 | """ 17 | 18 | def __init__(self, extract_images=True, img_format="png"): 19 | self.extract_images = extract_images 20 | self.img_format = img_format.lower() 21 | self.html_converter = html2text.HTML2Text() 22 | self.html_converter.ignore_links = False 23 | self.html_converter.ignore_images = False 24 | self.html_converter.ignore_tables = False 25 | self.html_converter.body_width = 0 # No wrapping 26 | 27 | def process_epub(self, epub_path, image_output_dir=None): 28 | """ 29 | Process an EPUB file and extract text and images. 30 | 31 | Args: 32 | epub_path (str): Path to the EPUB file. 33 | image_output_dir (str, optional): Directory to save extracted images. 34 | If None, a temp directory will be created. 35 | 36 | Returns: 37 | tuple: (chapters, images_by_chapter, epub_name_without_ext, metadata) 38 | chapters: Dictionary of chapter number to text content 39 | images_by_chapter: Dictionary of chapter number to list of image information 40 | epub_name_without_ext: Name of the EPUB file without extension 41 | metadata: Dictionary of metadata extracted from the EPUB 42 | """ 43 | # Always set epub_name_without_ext early for error handling 44 | epub_filename = os.path.basename(epub_path) 45 | epub_name_without_ext = os.path.splitext(epub_filename)[0] 46 | try: 47 | if not os.path.exists(epub_path): 48 | raise FileNotFoundError(f"EPUB file not found: {epub_path}") 49 | 50 | # Create image output directory if extracting images 51 | if self.extract_images and image_output_dir: 52 | os.makedirs(image_output_dir, exist_ok=True) 53 | logger.info(f"Images will be saved to: {image_output_dir}") 54 | 55 | # Read the EPUB file 56 | book = epub.read_epub(epub_path) 57 | 58 | # Extract metadata 59 | metadata = {} 60 | try: 61 | for key, value in book.get_metadata('DC', 'http://purl.org/dc/elements/1.1/').items(): 62 | if value: 63 | # Strip namespace from key 64 | key = key.split("}")[-1] 65 | metadata[key] = value[0][0] 66 | except Exception as e: 67 | logger.error(f"Error extracting metadata: {str(e)}") 68 | metadata = {"title": epub_name_without_ext} 69 | 70 | # Get chapters and images 71 | chapters = {} # Always initialize as a dictionary 72 | images_by_chapter = {} 73 | chapter_index = 1 74 | 75 | try: 76 | for item in book.get_items(): 77 | if item.get_type() == ebooklib.ITEM_DOCUMENT: 78 | try: 79 | # Extract text from HTML content 80 | content = item.get_content().decode('utf-8') 81 | soup = BeautifulSoup(content, 'html.parser') 82 | 83 | # Get text content 84 | text_content = self.html_converter.handle(content) 85 | 86 | # Skip if there's no meaningful content 87 | if not self._has_content(text_content): 88 | continue 89 | 90 | # Add chapter 91 | chapter_key = f"Chapter {chapter_index}" 92 | chapters[chapter_key] = text_content 93 | 94 | # Basic image extraction that doesn't depend on PIL 95 | if self.extract_images and image_output_dir: 96 | chapter_images = 
self._extract_images_basic( 97 | book, item, soup, image_output_dir, 98 | epub_name_without_ext, chapter_index 99 | ) 100 | 101 | if chapter_images: 102 | images_by_chapter[chapter_key] = chapter_images 103 | logger.info(f"Found {len(chapter_images)} image references in {chapter_key}") 104 | 105 | chapter_index += 1 106 | except Exception as e: 107 | logger.error(f"Error processing chapter {chapter_index}: {str(e)}") 108 | # Continue with next chapter 109 | chapter_index += 1 110 | except Exception as e: 111 | logger.error(f"Error processing chapters: {str(e)}") 112 | # If we hit an error in the chapter processing loop, add an error chapter 113 | chapters = {"Error": f"Failed to process chapters: {str(e)}"} 114 | 115 | # If no chapters were found, add a placeholder 116 | if not chapters: 117 | chapters = {"No Content": "No readable content found in the EPUB file."} 118 | 119 | return chapters, images_by_chapter, epub_name_without_ext, metadata 120 | 121 | except Exception as e: 122 | logger.error(f"Error processing EPUB file: {str(e)}") 123 | # Return minimal data to not break the pipeline - ensure everything is a dictionary 124 | return {"Error": f"Failed to process EPUB: {str(e)}"}, {}, epub_name_without_ext, { 125 | "title": epub_name_without_ext} 126 | 127 | def _has_content(self, text): 128 | """Check if there's substantial content in the text.""" 129 | # Remove whitespace, special characters, and common EPUB navigation elements 130 | if not text: 131 | return False 132 | clean_text = re.sub(r'\s+|[^\w]', '', text) 133 | return len(clean_text) > 100 # Arbitrary threshold 134 | 135 | def _extract_images_basic(self, book, item, soup, output_dir, epub_name, chapter_index): 136 | """Extract basic image references without using PIL.""" 137 | extracted_images = [] 138 | img_elements = soup.find_all('img') 139 | 140 | for img_index, img in enumerate(img_elements): 141 | try: 142 | # Get image source 143 | src = img.get('src') 144 | if not src: 145 | continue 146 | 147 | # Just record the image reference without saving it 148 | # This way we don't depend on PIL but still have image info 149 | img_filename = f"{epub_name}_chapter{chapter_index:03d}_img{img_index + 1:03d}.{self.img_format}" 150 | img_path = os.path.join(output_dir, img_filename) 151 | 152 | # Store image information 153 | extracted_images.append({ 154 | 'filename': img_filename, 155 | 'path': img_path, 156 | 'chapter': chapter_index, 157 | 'alt': img.get('alt', ''), 158 | 'src': src 159 | }) 160 | 161 | except Exception as e: 162 | logger.error(f"Error extracting image {img_index} reference from chapter {chapter_index}: {str(e)}") 163 | continue 164 | 165 | return extracted_images -------------------------------------------------------------------------------- /templates/job_status.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Job Status - Gemini Document Processor 8 | 9 | 34 | 35 | 36 |
[templates/job_status.html — only the page text survives here: the "Job Status - Gemini Document Processor" page shows a "Document Processing Status" header, a "Job Status" card with a "Back to Home" link, and panels for Status ("Loading..."), Progress (progress bar, "0%", "Processing chunk 0 of 0"), Current Task ("Initializing..."), Log, and Results; the page's markup and inline script are not recoverable here.]
-------------------------------------------------------------------------------- /templates/index.html: --------------------------------------------------------------------------------
[templates/index.html — only the form text survives here. The "Gemini Document Processor" upload page is organized into Basic Settings, Obsidian Integration, and Advanced Options sections, with these field notes:
- File upload: "You can upload either a PDF or EPUB file for processing"
- Chunk size: "For PDFs: Number of pages to process in each API call. For EPUBs: Controls how much content is processed at once."
- Obsidian vault path: "Path to your Obsidian vault. The summarized document will be saved to a 'books' folder in this location."
- Author: "If not specified, will use author from document metadata if available"
- Review rating: Example: "20/33" or "5/5"
- Retry attempts: "Maximum number of retry attempts for API calls"
- Chunk timeout: "Maximum time allowed for processing a single chunk before it's considered failed (5-30 minutes)"
- API timeout: "Maximum time allowed for a single API call (30-300 seconds)"
- Minimum image width/height: "Images smaller than this width/height will be ignored"
- Worker threads: "Number of parallel threads for image extraction"
The page's markup and inline script are not recoverable here.]
181 | 182 | 183 | 264 | 265 | 266 | -------------------------------------------------------------------------------- /document_processor.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import time 5 | import logging 6 | import tempfile 7 | import threading 8 | from concurrent.futures import ThreadPoolExecutor 9 | from typing import Dict, List, Tuple, Any, Optional 10 | 11 | import pypdf 12 | import google.generativeai as genai 13 | from PIL import Image 14 | 15 | # Import the epub processor 16 | from epub_processor import SimpleEpubProcessor 17 | 18 | # Set up logging 19 | logging.basicConfig( 20 | level=logging.INFO, 21 | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' 22 | ) 23 | logger = logging.getLogger("GeminiDocumentProcessor") 24 | 25 | 26 | class ChunkTimeoutError(Exception): 27 | """Exception raised when a chunk processing times out.""" 28 | pass 29 | 30 | 31 | class GeminiDocumentProcessor: 32 | """ 33 | Enhanced processor for PDF and EPUB documents using Gemini API with improved 34 | timeout handling and chunk-level processing. 35 | """ 36 | 37 | def __init__( 38 | self, 39 | api_key: str = None, 40 | model_name: str = "gemini-2.0-flash", 41 | language: str = "thai", 42 | chunk_size: int = 7, 43 | max_retries: int = 3, 44 | retry_delay: int = 5, 45 | extract_images: bool = True, 46 | min_img_width: int = 100, 47 | min_img_height: int = 100, 48 | img_format: str = "png", 49 | max_workers: int = 4, 50 | request_timeout: int = 60 51 | ): 52 | self.api_key = api_key 53 | self.model_name = model_name 54 | self.language = language 55 | self.chunk_size = chunk_size 56 | self.max_retries = max_retries 57 | self.retry_delay = retry_delay 58 | self.extract_images = extract_images 59 | self.min_img_width = min_img_width 60 | self.min_img_height = min_img_height 61 | self.img_format = img_format.lower() 62 | self.max_workers = max_workers 63 | self.request_timeout = request_timeout 64 | self.failed_chunks = [] 65 | self.progress_callback = None # Added progress callback 66 | # Do NOT initialize Gemini API here 67 | self.model = None 68 | 69 | def _initialize_api(self): 70 | """Initialize the Gemini API client.""" 71 | if self.model is not None: 72 | return # Already initialized 73 | try: 74 | if self.api_key: 75 | logger.info("Using provided API key") 76 | genai.configure(api_key=self.api_key) 77 | self.model = genai.GenerativeModel(self.model_name) 78 | # Test the API connection 79 | response = self.model.generate_content("Hello, this is a test.") 80 | if response: 81 | logger.info(f"Successfully connected to Gemini API using model {self.model_name}") 82 | except Exception as e: 83 | logger.error(f"Error initializing Gemini API: {str(e)}") 84 | raise RuntimeError(f"Failed to initialize Gemini API: {str(e)}") 85 | 86 | def get_total_pages(self, pdf_path: str) -> int: 87 | """Get the total number of pages in a PDF document.""" 88 | try: 89 | with open(pdf_path, 'rb') as file: 90 | pdf_reader = pypdf.PdfReader(file) 91 | return len(pdf_reader.pages) 92 | except Exception as e: 93 | logger.error(f"Error counting PDF pages: {e}") 94 | return 0 95 | 96 | def process_document(self, file_path: str) -> Tuple[Dict[str, str], Dict[str, List[Dict]], str, str, Dict]: 97 | """ 98 | Process a document (PDF or EPUB) and return summaries and images. 99 | 100 | This is a wrapper around the chunk-level processing functions that ensures 101 | all chunks are processed with proper error handling. 
102 | 103 | Args: 104 | file_path: Path to the document 105 | 106 | Returns: 107 | Tuple of: 108 | - Dictionary of chunk/chapter summaries 109 | - Dictionary of images by chunk/chapter 110 | - Document name 111 | - Document type ("pdf" or "epub") 112 | - Document metadata 113 | """ 114 | # Determine document type 115 | file_extension = os.path.splitext(file_path)[1].lower() 116 | doc_type = "epub" if file_extension == ".epub" else "pdf" 117 | doc_name = os.path.splitext(os.path.basename(file_path))[0] 118 | 119 | # Reset failed chunks list 120 | self.failed_chunks = [] 121 | 122 | # Create images directory if extracting images 123 | if self.extract_images: 124 | images_dir = os.path.join(os.path.dirname(file_path), f"{doc_name}_images") 125 | os.makedirs(images_dir, exist_ok=True) 126 | logger.info(f"Images will be saved to: {images_dir}") 127 | 128 | # Process based on document type 129 | if doc_type == "pdf": 130 | return self._process_pdf(file_path) 131 | else: 132 | return self._process_epub(file_path) 133 | 134 | def _process_pdf(self, pdf_path: str) -> Tuple[Dict[str, str], Dict[str, List[Dict]], str, str, Dict]: 135 | """ 136 | Process a PDF document chunk by chunk with immediate retry on failure. 137 | 138 | Args: 139 | pdf_path: Path to the PDF file 140 | 141 | Returns: 142 | Tuple containing: 143 | - Dictionary of summaries by chunk 144 | - Dictionary of images by chunk 145 | - Document name without extension 146 | - Document type ("pdf") 147 | - Document metadata 148 | """ 149 | # Get document name 150 | pdf_name = os.path.splitext(os.path.basename(pdf_path))[0] 151 | 152 | # Count total pages 153 | total_pages = self.get_total_pages(pdf_path) 154 | logger.info(f"Total pages in PDF: {total_pages}") 155 | 156 | # Calculate total chunks 157 | total_chunks = (total_pages + self.chunk_size - 1) // self.chunk_size 158 | 159 | # Extract metadata 160 | metadata = self.extract_metadata(pdf_path) 161 | 162 | # Process each chunk 163 | summaries = {} 164 | images_by_chunk = {} 165 | 166 | # Process chunks sequentially to avoid overwhelming the API 167 | for chunk_num in range(1, total_chunks + 1): 168 | page_start = (chunk_num - 1) * self.chunk_size + 1 169 | page_end = min(chunk_num * self.chunk_size, total_pages) 170 | 171 | # Update progress via callback if it exists 172 | if self.progress_callback: 173 | self.progress_callback(chunk_num, total_chunks) 174 | 175 | # Immediate retry loop for each chunk 176 | max_immediate_retries = 2 # Try up to 2 additional times (total 3 attempts) 177 | immediate_retry_count = 0 178 | chunk_success = False 179 | last_error = None 180 | 181 | while immediate_retry_count <= max_immediate_retries and not chunk_success: 182 | try: 183 | if immediate_retry_count > 0: 184 | # Log that we're retrying this chunk immediately 185 | logger.info( 186 | f"Immediately retrying chunk {chunk_num} (attempt {immediate_retry_count + 1}/{max_immediate_retries + 1})...") 187 | 188 | # Adjust model for retry - use a more capable model on retry 189 | if immediate_retry_count == 1 and self.model_name == "gemini-2.0-flash": 190 | original_model = self.model_name 191 | self.model_name = "gemini-2.5-flash-preview-04-17" 192 | logger.info(f" Switching from {original_model} to {self.model_name} for retry") 193 | elif immediate_retry_count == 2: 194 | # On second retry, increase timeout 195 | original_timeout = self.request_timeout 196 | self.request_timeout = original_timeout * 1.5 197 | logger.info(f" Increasing timeout to {self.request_timeout} seconds for final retry") 198 | 
else: 199 | # First attempt 200 | logger.info(f"Summarizing chunk {chunk_num} (pages {page_start}-{page_end})...") 201 | 202 | # Extract text and images from this chunk 203 | chunk_text = self._extract_text_from_pdf_pages(pdf_path, page_start, page_end) 204 | chunk_images = [] 205 | 206 | if self.extract_images: 207 | chunk_images = self._extract_images_from_pdf_pages( 208 | pdf_path, page_start, page_end, 209 | os.path.join(os.path.dirname(pdf_path), f"{pdf_name}_images"), 210 | chunk_num 211 | ) 212 | 213 | # Summarize the chunk text 214 | chunk_summary = self._summarize_text_with_timeout( 215 | chunk_text, chunk_num, page_start, page_end, 216 | doc_type="pdf", doc_filename=pdf_name 217 | ) 218 | 219 | # Save temporary summary 220 | temp_file = f"temp_{pdf_name}_chunk_{chunk_num}.md" 221 | with open(temp_file, "w", encoding="utf-8") as f: 222 | f.write(chunk_summary) 223 | logger.info(f" Saved temporary summary to {temp_file}") 224 | 225 | # Store results 226 | summaries[f"Chunk {chunk_num}"] = chunk_summary 227 | 228 | if chunk_images: 229 | images_by_chunk[f"Chunk {chunk_num}"] = chunk_images 230 | logger.info(f" Associated {len(chunk_images)} images with chunk {chunk_num}") 231 | 232 | # If we get here, the chunk was processed successfully 233 | chunk_success = True 234 | 235 | # Reset any modified parameters 236 | if immediate_retry_count > 0: 237 | if immediate_retry_count == 1 and 'original_model' in locals(): 238 | logger.info(f" Resetting model back to {original_model}") 239 | self.model_name = original_model 240 | if immediate_retry_count == 2 and 'original_timeout' in locals(): 241 | logger.info(f" Resetting timeout back to {original_timeout}") 242 | self.request_timeout = original_timeout 243 | 244 | except Exception as e: 245 | last_error = e 246 | logger.error(f"Error processing chunk {chunk_num} (pages {page_start}-{page_end}): {str(e)}") 247 | immediate_retry_count += 1 248 | 249 | # Reset any modified parameters before next retry or moving on 250 | if 'original_model' in locals() and self.model_name != original_model: 251 | logger.info(f" Resetting model back to {original_model}") 252 | self.model_name = original_model 253 | if 'original_timeout' in locals() and self.request_timeout != original_timeout: 254 | logger.info(f" Resetting timeout back to {original_timeout}") 255 | 256 | # After all retry attempts, if still not successful, add to failed chunks 257 | if not chunk_success: 258 | self.failed_chunks.append({ 259 | "chunk_number": chunk_num, 260 | "page_start": page_start, 261 | "page_end": page_end, 262 | "error": str(last_error) 263 | }) 264 | 265 | # Create a placeholder summary for the failed chunk 266 | summaries[ 267 | f"Chunk {chunk_num}"] = f"**Error processing pages {page_start}-{page_end} after {max_immediate_retries + 1} attempts:** {str(last_error)}" 268 | 269 | # Final progress update 270 | if self.progress_callback: 271 | self.progress_callback(total_chunks, total_chunks) 272 | 273 | return summaries, images_by_chunk, pdf_name, "pdf", metadata 274 | 275 | def _extract_text_from_pdf_pages(self, pdf_path: str, start_page: int, end_page: int) -> str: 276 | """ 277 | Extract text from a range of PDF pages. 
278 | 279 | Args: 280 | pdf_path: Path to the PDF file 281 | start_page: Start page number (1-based index) 282 | end_page: End page number (inclusive) 283 | 284 | Returns: 285 | Extracted text from the specified pages 286 | """ 287 | try: 288 | with open(pdf_path, 'rb') as file: 289 | pdf_reader = pypdf.PdfReader(file) 290 | 291 | # Adjust for 0-based indexing 292 | start_idx = start_page - 1 293 | end_idx = min(end_page, len(pdf_reader.pages)) - 1 294 | 295 | text_parts = [] 296 | for i in range(start_idx, end_idx + 1): 297 | try: 298 | page = pdf_reader.pages[i] 299 | page_text = page.extract_text() or "" 300 | text_parts.append(f"--- Page {i + 1} ---\n{page_text}") 301 | except Exception as e: 302 | logger.error(f"Error extracting text from page {i + 1}: {str(e)}") 303 | text_parts.append(f"--- Page {i + 1} ---\n[Error extracting text: {str(e)}]") 304 | 305 | return "\n\n".join(text_parts) 306 | 307 | except Exception as e: 308 | logger.error(f"Error opening PDF file: {str(e)}") 309 | raise 310 | 311 | def _extract_images_from_pdf_pages( 312 | self, pdf_path: str, start_page: int, end_page: int, 313 | output_dir: str, chunk_num: int = None) -> List[Dict]: 314 | """ 315 | Extract images from a range of PDF pages using PyPDF. 316 | 317 | Args: 318 | pdf_path: Path to the PDF file 319 | start_page: Start page number (1-based index) 320 | end_page: End page number (inclusive) 321 | output_dir: Directory to save extracted images 322 | chunk_num: Current chunk number (for logging) 323 | 324 | Returns: 325 | List of dictionaries with image information 326 | """ 327 | # Store information about extracted images 328 | extracted_images = [] 329 | 330 | try: 331 | # Make sure output directory exists 332 | os.makedirs(output_dir, exist_ok=True) 333 | 334 | # Let's try with PyPDF first - it's more widely available 335 | # For each page in range, extract images if any 336 | with open(pdf_path, "rb") as pdf_file: 337 | pdf = pypdf.PdfReader(pdf_file) 338 | 339 | # Adjust for 0-based indexing 340 | start_idx = start_page - 1 341 | end_idx = min(end_page, len(pdf.pages)) - 1 342 | 343 | # Process each page in range 344 | for page_idx in range(start_idx, end_idx + 1): 345 | page_num = page_idx + 1 # 1-based page number for display 346 | page = pdf.pages[page_idx] 347 | 348 | # Try to get images from the page (if any) 349 | try: 350 | # Extract image data through page.images property 351 | if hasattr(page, "images") and page.images: 352 | images_extracted = 0 353 | 354 | for i, image in enumerate(page.images): 355 | try: 356 | # Skip small images 357 | if hasattr(image, "width") and hasattr(image, "height"): 358 | if image.width < self.min_img_width or image.height < self.min_img_height: 359 | continue 360 | 361 | # Create a unique filename for this image 362 | img_filename = f"page{page_num:03d}_img{i + 1:03d}.{self.img_format}" 363 | img_path = os.path.join(output_dir, img_filename) 364 | 365 | # Try to save the image 366 | with open(img_path, "wb") as img_file: 367 | img_file.write(image.data) 368 | 369 | # Check if image was saved successfully 370 | if os.path.exists(img_path) and os.path.getsize(img_path) > 0: 371 | # Store image information 372 | extracted_images.append({ 373 | "filename": img_filename, 374 | "path": img_path, 375 | "page": page_num, 376 | "width": getattr(image, "width", 0), 377 | "height": getattr(image, "height", 0), 378 | "alt": f"Image {i + 1} from page {page_num}" 379 | }) 380 | images_extracted += 1 381 | 382 | except Exception as e: 383 | logger.error(f"Error extracting image 
{i + 1} from page {page_num}: {str(e)}") 384 | 385 | if images_extracted > 0: 386 | logger.info(f"Extracted {images_extracted} images from page {page_num}") 387 | 388 | except Exception as e: 389 | logger.error(f"Error accessing images on page {page_num}: {str(e)}") 390 | continue 391 | 392 | except Exception as e: 393 | logger.error(f"Error in image extraction process: {str(e)}") 394 | 395 | # Return information about extracted images 396 | return extracted_images 397 | 398 | def _summarize_text_with_timeout( 399 | self, text: str, chunk_num: int, page_start: int, page_end: int, 400 | doc_type: str = "pdf", doc_filename: str = "") -> str: 401 | """ 402 | Summarize text using Gemini API with timeout handling. 403 | 404 | Args: 405 | text: Text to summarize 406 | chunk_num: Chunk number for error reporting 407 | page_start: Start page of this chunk 408 | page_end: End page of this chunk 409 | doc_type: Document type ('pdf' or 'epub') 410 | doc_filename: Name of the document file 411 | 412 | Returns: 413 | Summarized text 414 | 415 | Raises: 416 | ChunkTimeoutError: If the API call times out 417 | """ 418 | start_time = time.time() 419 | max_time = self.request_timeout # seconds 420 | 421 | # Ensure Gemini API is initialized before summarization 422 | self._initialize_api() 423 | 424 | # Prepare the original Thai prompt 425 | page_or_chapter = "หน้า" if doc_type == 'pdf' else "บท" 426 | prompt = ( 427 | "คุณคือ expert ด้าน summarizer analyzing\n\n" 428 | f"ช่วยสรุปเนื้อหา**จาก{doc_type}** ({doc_filename}) **เป็น ภาษาไทย** โดย:\n" 429 | "1. ใช้หัวข้อเดิมตามไฟล์ได้เลย (ไม่ต้องแปลหัวข้อ)\n" 430 | "2. ไม่อยากให้ตกหล่นแม้แต่เรื่องเดียว (ขอแบบละเอียดจนไม่ต้องกลับไปอ่านต้นฉบับเลย)\n" 431 | f"3. บอกด้วยว่ากำลังสรุป{page_or_chapter}ไหนของไฟล์ เช่น , , , (use markdown comment)\n" 432 | "4. ไม่จำเป็นต้องกระชับ และรักษาความถูกต้องของข้อมูลสำคัญ เนื้อหาสำคัญไม่ตกหล่น\n" 433 | "5. ไม่ต้องแปล technical terminology จากภาษาอังกฤษให้เป็นภาษาไทย\n" 434 | "6. ถ้าใน file มีตัวอย่าง code ก็ใส่มาให้ด้วย\n" 435 | "7. 
output ใน format ที่ดีที่สุด\n\n" 436 | f"เนื้อหาต่อไปนี้มาจาก{page_or_chapter} {page_start} ถึง{page_or_chapter} {page_end}:\n\n{text}" 437 | ) 438 | 439 | # Define a timeout function using a separate thread 440 | def call_with_timeout(func, *args, **kwargs): 441 | result = [None] 442 | error = [None] 443 | 444 | def target(): 445 | try: 446 | result[0] = func(*args, **kwargs) 447 | except Exception as e: 448 | error[0] = e 449 | 450 | thread = threading.Thread(target=target) 451 | thread.daemon = True 452 | thread.start() 453 | thread.join(max_time) 454 | 455 | if thread.is_alive(): 456 | # Thread is still running after timeout 457 | raise ChunkTimeoutError(f"API call timed out after {max_time} seconds") 458 | 459 | if error[0]: 460 | raise error[0] 461 | 462 | return result[0] 463 | 464 | # Try multiple times with backoff 465 | for attempt in range(1, self.max_retries + 1): 466 | try: 467 | # Check if we've already exceeded our timeout 468 | elapsed = time.time() - start_time 469 | if elapsed > max_time: 470 | raise ChunkTimeoutError(f"Processing timeout after {elapsed:.1f} seconds") 471 | 472 | # Set up the generation config (without timeout parameter) 473 | generation_config = { 474 | "temperature": 0.1, 475 | "top_p": 0.95, 476 | "top_k": 50, 477 | "max_output_tokens": 65000, 478 | "response_mime_type": "text/plain" 479 | } 480 | 481 | # Make the API call with our own timeout handling 482 | # Using only valid roles (user) 483 | response = call_with_timeout( 484 | self.model.generate_content, 485 | prompt, # Send the full prompt as a user message 486 | generation_config=generation_config 487 | ) 488 | 489 | # Successfully got a response 490 | return response.text 491 | 492 | except ChunkTimeoutError: 493 | # Re-raise timeout errors directly 494 | raise 495 | 496 | except Exception as e: 497 | error_msg = str(e) 498 | logger.warning(f"Attempt {attempt} failed for chunk {chunk_num}: {error_msg}") 499 | 500 | if attempt < self.max_retries: 501 | # Wait before retrying with exponential backoff 502 | wait_time = self.retry_delay * (2 ** (attempt - 1)) 503 | logger.info(f"Waiting {wait_time} seconds before retry...") 504 | time.sleep(wait_time) 505 | else: 506 | # Failed all attempts 507 | logger.error(f"All {self.max_retries} attempts failed for chunk {chunk_num}") 508 | raise RuntimeError(f"Failed to summarize after {self.max_retries} attempts: {error_msg}") 509 | 510 | def _process_epub(self, epub_path: str) -> Tuple[Dict[str, str], Dict[str, List[Dict]], str, str, Dict]: 511 | """ 512 | Process an EPUB document chapter by chapter. 
513 | 514 | Args: 515 | epub_path: Path to the EPUB file 516 | 517 | Returns: 518 | Tuple containing: 519 | - Dictionary of summaries by chapter 520 | - Dictionary of images by chapter 521 | - Document name without extension 522 | - Document type ("epub") 523 | - Document metadata 524 | """ 525 | # Create EPUB processor 526 | epub_processor = SimpleEpubProcessor(extract_images=self.extract_images, img_format=self.img_format) 527 | 528 | # Extract chapters and metadata 529 | chapters, images_by_chapter, epub_name, metadata = epub_processor.process_epub( 530 | epub_path, 531 | image_output_dir=os.path.join(os.path.dirname(epub_path), 532 | f"{os.path.splitext(os.path.basename(epub_path))[0]}_images") 533 | ) 534 | 535 | # Process each chapter with timeout handling 536 | summaries = {} 537 | processed_images = {} 538 | 539 | # Calculate total chapters 540 | total_chapters = len(chapters) 541 | 542 | # Initialize progress 543 | chapter_count = 0 544 | 545 | for chapter_key, chapter_text in chapters.items(): 546 | chapter_count += 1 547 | 548 | # Update progress via callback if it exists 549 | if self.progress_callback: 550 | self.progress_callback(chapter_count, total_chapters) 551 | 552 | try: 553 | logger.info(f"Summarizing {chapter_key}...") 554 | 555 | # Extract chapter number if possible 556 | chapter_num = chapter_key.replace("Chapter ", "") 557 | 558 | # Summarize this chapter with timeout handling 559 | chapter_summary = self._summarize_text_with_timeout( 560 | chapter_text, 561 | chapter_num, 562 | chapter_num, chapter_num, # Using chapter number for both start and end 563 | doc_type="epub", 564 | doc_filename=epub_name 565 | ) 566 | 567 | # Save temporary summary 568 | temp_file = f"temp_{epub_name}_{chapter_key.lower().replace(' ', '_')}.md" 569 | with open(temp_file, "w", encoding="utf-8") as f: 570 | f.write(chapter_summary) 571 | logger.info(f" Saved temporary summary to {temp_file}") 572 | 573 | # Store results 574 | summaries[chapter_key] = chapter_summary 575 | 576 | # Store images for this chapter if any 577 | if chapter_key in images_by_chapter: 578 | processed_images[chapter_key] = images_by_chapter[chapter_key] 579 | logger.info(f" Associated {len(images_by_chapter[chapter_key])} images with {chapter_key}") 580 | 581 | except Exception as e: 582 | logger.error(f"Error processing {chapter_key}: {str(e)}") 583 | 584 | # Record the failure 585 | self.failed_chunks.append({ 586 | "chapter": chapter_key, 587 | "error": str(e) 588 | }) 589 | 590 | # Create a placeholder summary for the failed chapter 591 | summaries[chapter_key] = f"**Error processing {chapter_key}:** {str(e)}" 592 | 593 | # Continue with the next chapter 594 | 595 | # Final progress update 596 | if self.progress_callback: 597 | self.progress_callback(total_chapters, total_chapters) 598 | 599 | return summaries, processed_images, epub_name, "epub", metadata 600 | def extract_metadata(self, file_path: str) -> Dict[str, str]: 601 | """ 602 | Extract metadata from a document. 
603 | 604 | Args: 605 | file_path: Path to the document 606 | 607 | Returns: 608 | Dictionary of metadata 609 | """ 610 | file_extension = os.path.splitext(file_path)[1].lower() 611 | 612 | if file_extension == ".pdf": 613 | return self._extract_pdf_metadata(file_path) 614 | elif file_extension == ".epub": 615 | return self._extract_epub_metadata(file_path) 616 | else: 617 | return {"title": os.path.splitext(os.path.basename(file_path))[0]} 618 | 619 | def _extract_pdf_metadata(self, pdf_path: str) -> Dict[str, str]: 620 | """Extract metadata from a PDF file.""" 621 | try: 622 | with open(pdf_path, 'rb') as file: 623 | pdf_reader = pypdf.PdfReader(file) 624 | metadata = {} 625 | 626 | if pdf_reader.metadata: 627 | for key in pdf_reader.metadata: 628 | # Clean up the key name 629 | clean_key = key.lower().replace('/', '').strip() 630 | if pdf_reader.metadata[key]: 631 | metadata[clean_key] = str(pdf_reader.metadata[key]) 632 | 633 | # If no title is found, use the filename 634 | if 'title' not in metadata or not metadata['title']: 635 | metadata['title'] = os.path.splitext(os.path.basename(pdf_path))[0] 636 | 637 | return metadata 638 | 639 | except Exception as e: 640 | logger.error(f"Error extracting PDF metadata: {str(e)}") 641 | return {"title": os.path.splitext(os.path.basename(pdf_path))[0]} 642 | 643 | def _extract_epub_metadata(self, epub_path: str) -> Dict[str, str]: 644 | """Extract metadata from an EPUB file.""" 645 | # This would ideally call a method from your epub_processor 646 | # For now, returning basic info 647 | return {"title": os.path.splitext(os.path.basename(epub_path))[0]} 648 | 649 | def save_summaries( 650 | self, 651 | summaries: Dict[str, str], 652 | images_by_chunk: Dict[str, List[Dict]], 653 | output_path: str, 654 | doc_name: str, 655 | doc_type: str, 656 | metadata: Dict, 657 | obsidian_metadata: Optional[Dict] = None 658 | ) -> str: 659 | """ 660 | Save summaries to a Markdown file. 
661 | 662 | Args: 663 | summaries: Dictionary of summaries by chunk/chapter 664 | images_by_chunk: Dictionary of images by chunk/chapter 665 | output_path: Path to save the output Markdown file 666 | doc_name: Document name 667 | doc_type: Document type ("pdf" or "epub") 668 | metadata: Document metadata 669 | obsidian_metadata: Optional metadata for Obsidian 670 | 671 | Returns: 672 | Path to the saved file 673 | """ 674 | try: 675 | with open(output_path, "w", encoding="utf-8") as f: 676 | # Add YAML frontmatter for Obsidian if provided 677 | if obsidian_metadata: 678 | f.write("---\n") 679 | 680 | # Add tags 681 | if 'tags' in obsidian_metadata and obsidian_metadata['tags']: 682 | tags = obsidian_metadata['tags'].split(',') 683 | f.write("tags:\n") 684 | for tag in tags: 685 | f.write(f" - {tag.strip()}\n") 686 | 687 | # Add other metadata fields 688 | for key, value in obsidian_metadata.items(): 689 | if key != 'tags' and value: # Skip tags as we handled them above 690 | f.write(f"{key}: {value}\n") 691 | 692 | f.write("---\n\n") 693 | 694 | # Add title and metadata 695 | title = metadata.get('title', doc_name) 696 | f.write(f"# {title}\n\n") 697 | 698 | if 'author' in metadata and metadata['author']: 699 | f.write(f"**Author:** {metadata['author']}\n\n") 700 | 701 | # Add other metadata 702 | f.write("## Document Information\n\n") 703 | f.write(f"- **Type:** {doc_type.upper()}\n") 704 | 705 | for key, value in metadata.items(): 706 | if key not in ['title', 'author'] and value: 707 | key_display = key.replace('_', ' ').title() 708 | f.write(f"- **{key_display}:** {value}\n") 709 | 710 | f.write("\n") 711 | 712 | # Add table of contents 713 | f.write("## Table of Contents\n\n") 714 | 715 | for idx, chunk_key in enumerate(summaries.keys()): 716 | f.write(f"{idx + 1}. [{chunk_key}](#{chunk_key.lower().replace(' ', '-')})\n") 717 | 718 | f.write("\n") 719 | 720 | # Add summaries 721 | f.write("## Summary\n\n") 722 | 723 | for chunk_key, summary in summaries.items(): 724 | # Add chunk header 725 | f.write(f"### {chunk_key}\n\n") 726 | 727 | # Add summary content 728 | f.write(f"{summary}\n\n") 729 | 730 | # Add images for this chunk if any 731 | if chunk_key in images_by_chunk and images_by_chunk[chunk_key]: 732 | f.write("#### Images\n\n") 733 | 734 | for img_info in images_by_chunk[chunk_key]: 735 | img_path = img_info.get('path', '') 736 | alt_text = img_info.get('alt', 'Image') or f"Image from {chunk_key}" 737 | 738 | # Create relative path for embedding 739 | rel_path = os.path.relpath( 740 | img_path, 741 | os.path.dirname(output_path) 742 | ) 743 | 744 | # Add image with markdown 745 | f.write(f"![{alt_text}]({rel_path})\n\n") 746 | 747 | # Add footer with generation info 748 | f.write("---\n") 749 | f.write(f"*Summary generated using Gemini {self.model_name}*\n") 750 | 751 | return output_path 752 | 753 | except Exception as e: 754 | logger.error(f"Error saving summaries: {str(e)}") 755 | raise 756 | 757 | def process_chunk(self, file_path: str, page_start: int, page_end: int, timeout: int = 60) -> Tuple[ 758 | str, List[Dict]]: 759 | """ 760 | Process a single chunk of a document with timeout. 
761 | 762 | Args: 763 | file_path: Path to the document 764 | page_start: Start page number (1-based index) 765 | page_end: End page number (inclusive) 766 | timeout: Timeout in seconds 767 | 768 | Returns: 769 | Tuple containing: 770 | - Summarized text 771 | - List of image information dictionaries 772 | """ 773 | # Save the current timeout setting 774 | original_timeout = self.request_timeout 775 | 776 | try: 777 | # Update timeout for this call 778 | self.request_timeout = timeout 779 | 780 | # Get document info 781 | doc_type = "pdf" if file_path.lower().endswith('.pdf') else "epub" 782 | doc_filename = os.path.basename(file_path) 783 | 784 | # Extract text from pages 785 | text = self._extract_text_from_pdf_pages(file_path, page_start, page_end) 786 | 787 | # Summarize text with timeout 788 | summary = self._summarize_text_with_timeout( 789 | text, 790 | 0, # Not using chunk number here 791 | page_start, 792 | page_end, 793 | doc_type=doc_type, 794 | doc_filename=doc_filename 795 | ) 796 | 797 | # Extract images if enabled 798 | images = [] 799 | if self.extract_images: 800 | doc_name = os.path.splitext(os.path.basename(file_path))[0] 801 | images_dir = os.path.join(os.path.dirname(file_path), f"{doc_name}_images") 802 | 803 | images = self._extract_images_from_pdf_pages( 804 | file_path, page_start, page_end, images_dir 805 | ) 806 | 807 | return summary, images 808 | 809 | finally: 810 | # Restore original timeout 811 | self.request_timeout = original_timeout 812 | 813 | def retry_failed_chunks(self, file_path: str, existing_summaries: Dict[str, str]) -> Dict[str, str]: 814 | """ 815 | Retry processing previously failed chunks. 816 | 817 | Args: 818 | file_path: Path to the document 819 | existing_summaries: Dictionary of existing summaries 820 | 821 | Returns: 822 | Updated dictionary of summaries 823 | """ 824 | # Copy existing summaries 825 | updated_summaries = existing_summaries.copy() 826 | 827 | # Nothing to do if no failed chunks 828 | if not self.failed_chunks: 829 | return updated_summaries 830 | 831 | logger.info(f"Retrying {len(self.failed_chunks)} failed chunks with increased timeout") 832 | 833 | # Save original settings 834 | original_timeout = self.request_timeout 835 | original_retries = self.max_retries 836 | 837 | try: 838 | # Use more aggressive settings for retries 839 | self.request_timeout = original_timeout * 2 840 | self.max_retries = original_retries + 2 841 | 842 | # Process each failed chunk 843 | for failed_chunk in self.failed_chunks: 844 | chunk_num = failed_chunk.get("chunk_number") 845 | if not chunk_num: 846 | continue 847 | 848 | page_start = failed_chunk.get("page_start", 0) 849 | page_end = failed_chunk.get("page_end", 0) 850 | 851 | if page_start <= 0 or page_end <= 0: 852 | continue 853 | 854 | try: 855 | logger.info(f"Retrying chunk {chunk_num} (pages {page_start}-{page_end})...") 856 | 857 | # Process this chunk 858 | summary, _ = self.process_chunk(file_path, page_start, page_end) 859 | 860 | # Update the summaries 861 | chunk_key = f"Chunk {chunk_num}" 862 | updated_summaries[chunk_key] = summary 863 | 864 | # Remove from failed chunks 865 | logger.info(f"Successfully reprocessed chunk {chunk_num}") 866 | 867 | except Exception as e: 868 | logger.error(f"Failed to reprocess chunk {chunk_num}: {str(e)}") 869 | # Keep the existing summary or error message 870 | 871 | return updated_summaries 872 | 873 | finally: 874 | # Restore original settings 875 | self.request_timeout = original_timeout 876 | self.max_retries = original_retries 
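The module above records failures in `failed_chunks` and exposes `process_chunk` for single-chunk reprocessing, and a caller such as the web layer that follows can combine the two. A minimal, illustrative sketch; the helper name `retry_individually`, the file path, and the doubled timeout are assumptions, not part of the module:

```python
from document_processor import GeminiDocumentProcessor

def retry_individually(processor: GeminiDocumentProcessor, pdf_path: str,
                       summaries: dict) -> dict:
    """Re-run each recorded failed PDF chunk one at a time with a doubled timeout."""
    for failed in list(processor.failed_chunks):
        start, end = failed.get("page_start"), failed.get("page_end")
        if not start or not end:
            continue  # EPUB failures record a chapter key instead of page numbers
        try:
            summary, _images = processor.process_chunk(
                pdf_path, start, end, timeout=processor.request_timeout * 2
            )
            summaries[f"Chunk {failed['chunk_number']}"] = summary
        except Exception as exc:
            # Keep the existing placeholder error summary if the chunk still fails.
            print(f"Chunk {failed['chunk_number']} still failing: {exc}")
    return summaries
```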
-------------------------------------------------------------------------------- /document_gui.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import json 4 | import threading 5 | import logging 6 | import shutil 7 | import time 8 | import uuid 9 | import signal 10 | from pathlib import Path 11 | from threading import Timer 12 | from flask import Flask, render_template, request, redirect, url_for, jsonify, send_from_directory 13 | 14 | # Import the enhanced document processor 15 | from document_processor import GeminiDocumentProcessor, ChunkTimeoutError 16 | 17 | # Set up logging 18 | logging.basicConfig( 19 | level=logging.INFO, 20 | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', 21 | handlers=[ 22 | logging.FileHandler("document_processor_web.log"), 23 | logging.StreamHandler() 24 | ] 25 | ) 26 | logger = logging.getLogger("GeminiDocumentProcessorWeb") 27 | 28 | app = Flask(__name__) 29 | 30 | # Store processing jobs 31 | jobs = {} 32 | uploads_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'uploads') 33 | results_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'results') 34 | obsidian_dir = None # Will be set from settings or form 35 | 36 | # Create directories if they don't exist 37 | os.makedirs(uploads_dir, exist_ok=True) 38 | os.makedirs(results_dir, exist_ok=True) 39 | 40 | # Try to load obsidian_dir from settings 41 | try: 42 | settings_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'settings.json') 43 | if os.path.exists(settings_file): 44 | with open(settings_file, 'r') as f: 45 | settings = json.load(f) 46 | obsidian_dir = settings.get('obsidian_vault_path', None) 47 | if obsidian_dir and os.path.exists(obsidian_dir): 48 | logger.info(f"Loaded Obsidian vault path from settings: {obsidian_dir}") 49 | else: 50 | obsidian_dir = None 51 | logger.warning("Obsidian vault path not found or invalid in settings") 52 | except Exception as e: 53 | logger.error(f"Error loading settings: {e}") 54 | 55 | 56 | class ProcessingJob: 57 | def __init__(self, job_id, file_path, output_path, settings): 58 | self.job_id = job_id 59 | self.file_path = file_path 60 | self.output_path = output_path 61 | self.settings = settings 62 | self.status = "pending" 63 | self.progress = 0 64 | self.message = "Waiting to start..." 
65 | self.log = [] 66 | self.result_files = [] 67 | self.error = None 68 | self.thread = None 69 | self.document_type = None 70 | self.document_metadata = None 71 | self.current_chunk = None 72 | self.total_chunks = None 73 | self.chunk_start_time = None 74 | self.chunk_timer = None 75 | self.processing_paused = False 76 | self.failed_chunks = [] 77 | self.retry_for_job_id = None # For retry jobs, reference to original job 78 | 79 | def add_log(self, message): 80 | timestamp = time.strftime('%H:%M:%S') 81 | log_entry = f"[{timestamp}] {message}" 82 | self.log.append(log_entry) 83 | self.message = message 84 | logger.info(f"Job {self.job_id}: {message}") 85 | 86 | def start_chunk_timer(self, chunk_number, timeout_seconds=300): # 5-minute default timeout 87 | """Start a timer for the current chunk being processed.""" 88 | self.current_chunk = chunk_number 89 | self.chunk_start_time = time.time() 90 | 91 | # Cancel any existing timer 92 | if self.chunk_timer: 93 | self.chunk_timer.cancel() 94 | 95 | # Set a new timer 96 | self.chunk_timer = Timer(timeout_seconds, self.handle_chunk_timeout) 97 | self.chunk_timer.daemon = True 98 | self.chunk_timer.start() 99 | 100 | def handle_chunk_timeout(self): 101 | """Handle a chunk processing timeout by logging and adding to failed chunks.""" 102 | elapsed = time.time() - self.chunk_start_time 103 | self.add_log(f"⚠️ WARNING: Chunk {self.current_chunk} processing timed out after {elapsed:.1f} seconds") 104 | 105 | # Add to failed chunks list 106 | if self.current_chunk is not None: 107 | self.failed_chunks.append({ 108 | "chunk_number": self.current_chunk, 109 | "reason": "timeout" 110 | }) 111 | 112 | # Set the job as paused so we can try to recover 113 | self.processing_paused = True 114 | 115 | def clear_chunk_timer(self): 116 | """Clear the chunk timer after successful completion.""" 117 | if self.chunk_timer: 118 | self.chunk_timer.cancel() 119 | self.chunk_timer = None 120 | self.chunk_start_time = None 121 | 122 | def update_progress(self): 123 | """Update progress percentage based on chunks processed.""" 124 | if self.total_chunks and self.current_chunk is not None: 125 | # Ensure current_chunk is not greater than total_chunks 126 | current = min(self.current_chunk, self.total_chunks) 127 | # Calculate progress as percentage of chunks completed 128 | self.progress = min(round((current / self.total_chunks) * 100), 99) 129 | # Only set to 100% when fully complete 130 | if self.status == "completed": 131 | self.progress = 100 132 | 133 | # Log progress updates for debugging 134 | logger.info( 135 | f"Job {self.job_id}: Progress updated to {self.progress}% (chunk {current}/{self.total_chunks})") 136 | 137 | def to_dict(self): 138 | return { 139 | "job_id": self.job_id, 140 | "status": self.status, 141 | "progress": self.progress, 142 | "message": self.message, 143 | "log": self.log[-15:] if len(self.log) > 15 else self.log, # Return last 15 log entries 144 | "result_files": self.result_files, 145 | "error": self.error, 146 | "document_type": self.document_type, 147 | "document_metadata": self.document_metadata, 148 | "current_chunk": self.current_chunk, 149 | "total_chunks": self.total_chunks, 150 | "failed_chunks": len(self.failed_chunks) if self.failed_chunks else 0 151 | } 152 | 153 | 154 | def capture_output(job): 155 | """Redirect stdout to capture logs for a job""" 156 | 157 | class OutputCapture: 158 | def __init__(self, job): 159 | self.job = job 160 | self.original_stdout = sys.stdout 161 | self.buffer = "" 162 | 163 | def write(self, 
message): 164 | self.original_stdout.write(message) 165 | 166 | # Add to buffer 167 | self.buffer += message 168 | 169 | # Check if we have a complete line 170 | if "\n" in self.buffer: 171 | lines = self.buffer.split("\n") 172 | # Keep the last incomplete line in the buffer 173 | self.buffer = lines[-1] 174 | 175 | # Log complete lines 176 | for line in lines[:-1]: 177 | if line.strip(): # Skip empty lines 178 | self.job.add_log(line.strip()) 179 | 180 | def flush(self): 181 | self.original_stdout.flush() 182 | 183 | # Flush any remaining content in the buffer 184 | if self.buffer.strip(): 185 | self.job.add_log(self.buffer.strip()) 186 | self.buffer = "" 187 | 188 | return OutputCapture(job) 189 | 190 | 191 | def save_to_obsidian(file_path, obsidian_dir, job, settings=None): 192 | """ 193 | Save a file to the Obsidian vault. 194 | 195 | Args: 196 | file_path (str): Path to the source file 197 | obsidian_dir (str): Path to the Obsidian vault 198 | job (ProcessingJob): The job object for logging 199 | settings (dict, optional): Additional settings 200 | 201 | Returns: 202 | str: Path to the file in the Obsidian vault 203 | """ 204 | if not obsidian_dir or not os.path.exists(obsidian_dir): 205 | job.add_log("❌ Invalid Obsidian vault path") 206 | return None 207 | 208 | try: 209 | # Create target directory (books folder in Obsidian vault) 210 | file_name = os.path.basename(file_path) 211 | books_dir = os.path.join(obsidian_dir, "books") 212 | os.makedirs(books_dir, exist_ok=True) 213 | 214 | # Target path in Obsidian vault 215 | target_path = os.path.join(books_dir, file_name) 216 | 217 | # Copy file to Obsidian vault 218 | shutil.copy2(file_path, target_path) 219 | 220 | # Copy associated images directory if it exists 221 | if job.document_type: 222 | base_name = os.path.splitext(os.path.basename(job.file_path))[0] 223 | images_dir = os.path.join(os.path.dirname(job.file_path), f"{base_name}_images") 224 | if os.path.exists(images_dir): 225 | target_images_dir = os.path.join(books_dir, f"{base_name}_images") 226 | 227 | # Remove existing images directory if it exists 228 | if os.path.exists(target_images_dir): 229 | shutil.rmtree(target_images_dir) 230 | 231 | # Copy images directory 232 | shutil.copytree(images_dir, target_images_dir) 233 | job.add_log(f"✅ Copied images to Obsidian: {target_images_dir}") 234 | 235 | job.add_log(f"✅ File saved to Obsidian: {target_path}") 236 | return target_path 237 | 238 | except Exception as e: 239 | job.add_log(f"❌ Error saving to Obsidian: {str(e)}") 240 | return None 241 | 242 | 243 | def process_document_job(job_id): 244 | """Process document in a separate thread with improved error handling and recovery""" 245 | job = jobs[job_id] 246 | job.status = "processing" 247 | file_extension = os.path.splitext(job.file_path)[1].lower() 248 | file_type = "EPUB" if file_extension == ".epub" else "PDF" 249 | job.add_log(f"Starting {file_type} processing for {os.path.basename(job.file_path)}") 250 | 251 | # Redirect stdout to capture logs 252 | original_stdout = sys.stdout 253 | sys.stdout = capture_output(job) 254 | 255 | try: 256 | # Apply settings 257 | settings = job.settings 258 | chunk_size = int(settings.get('chunk_size', 7)) 259 | chunk_timeout = int(settings.get('chunk_timeout', 300)) # Default 5 minutes 260 | api_timeout = int(settings.get('api_timeout', 60)) # Default 1 minute 261 | 262 | # Initialize processor 263 | processor = GeminiDocumentProcessor( 264 | chunk_size=chunk_size, 265 | api_key=settings.get('api_key', ""), 266 | 
model_name=settings.get('model_name', "gemini-2.0-flash"), 267 | language="thai", # Always Thai for this tool 268 | max_retries=int(settings.get('max_retries', 3)), 269 | extract_images=settings.get('extract_images', True), 270 | min_img_width=int(settings.get('min_img_width', 100)), 271 | min_img_height=int(settings.get('min_img_height', 100)), 272 | img_format=settings.get('img_format', "png"), 273 | max_workers=int(settings.get('max_workers', 4)), 274 | request_timeout=api_timeout 275 | ) 276 | 277 | # Define a progress callback function 278 | def progress_callback(chunk_num, total_chunks): 279 | job.current_chunk = chunk_num 280 | job.total_chunks = total_chunks 281 | job.update_progress() 282 | 283 | # Add the progress callback to the processor 284 | processor.progress_callback = progress_callback 285 | 286 | # Process the document with improved chunk handling 287 | job.add_log(f"Processing {file_type} with {settings.get('model_name')} model") 288 | 289 | # First, analyze the document to get total pages/chunks 290 | if file_type == "PDF": 291 | try: 292 | # For PDF, we can estimate total chunks before processing 293 | total_pages = processor.get_total_pages(job.file_path) 294 | job.total_chunks = (total_pages + chunk_size - 1) // chunk_size # Ceiling division 295 | job.add_log( 296 | f"Document has {total_pages} pages, will process in approximately {job.total_chunks} chunks") 297 | # Initialize progress 298 | job.current_chunk = 0 299 | job.update_progress() 300 | except Exception as e: 301 | job.add_log(f"Error estimating document size: {str(e)}") 302 | # Continue anyway, we'll handle progress differently 303 | 304 | # Now process the document 305 | try: 306 | # Process document with standard approach and let the processor handle chunk-by-chunk processing 307 | summaries, images_by_chunk, doc_name, doc_type, doc_metadata = processor.process_document(job.file_path) 308 | 309 | # Store document type and metadata 310 | job.document_type = doc_type 311 | job.document_metadata = doc_metadata 312 | 313 | # Save any info about failed chunks 314 | job.failed_chunks = processor.failed_chunks.copy() if hasattr(processor, 'failed_chunks') else [] 315 | except Exception as e: 316 | job.add_log(f"❌ Error during document processing: {str(e)}") 317 | raise 318 | 319 | # Prepare Obsidian metadata if enabled 320 | obsidian_metadata = None 321 | if settings.get('use_obsidian', False): 322 | obsidian_metadata = { 323 | 'tags': settings.get('obsidian_tags', 'book,main'), 324 | 'author': settings.get('obsidian_author', doc_metadata.get('author', '')), 325 | 'coverUrl': settings.get('obsidian_cover_url', ''), 326 | 'review': settings.get('obsidian_review', '') 327 | } 328 | 329 | # Save the results 330 | job.add_log(f"Saving summary to {job.output_path}") 331 | output_file = processor.save_summaries( 332 | summaries, 333 | images_by_chunk, 334 | job.output_path, 335 | doc_name, 336 | doc_type, 337 | doc_metadata, 338 | obsidian_metadata 339 | ) 340 | 341 | # Handle image directory 342 | if settings.get('extract_images', True): 343 | images_dir = os.path.join(os.path.dirname(job.file_path), f"{doc_name}_images") 344 | if os.path.exists(images_dir): 345 | result_images_dir = os.path.join(results_dir, f"{job.job_id}_images") 346 | shutil.copytree(images_dir, result_images_dir) 347 | job.add_log(f"Copied images to {result_images_dir}") 348 | 349 | # Add images directory to result files 350 | job.result_files.append({ 351 | "type": "directory", 352 | "name": f"{doc_name}_images", 353 | "path": 
f"{job.job_id}_images" 354 | }) 355 | 356 | # Add summary file to result files 357 | job.result_files.append({ 358 | "type": "file", 359 | "name": os.path.basename(job.output_path), 360 | "path": os.path.basename(job.output_path) 361 | }) 362 | 363 | # Copy summary file to results directory 364 | shutil.copy2(job.output_path, os.path.join(results_dir, os.path.basename(job.output_path))) 365 | 366 | # Save to Obsidian if requested 367 | if settings.get('use_obsidian', False) and settings.get('obsidian_vault_path'): 368 | obsidian_path = settings.get('obsidian_vault_path') 369 | if os.path.exists(obsidian_path): 370 | job.add_log(f"Saving to Obsidian vault: {obsidian_path}") 371 | obsidian_file = save_to_obsidian(job.output_path, obsidian_path, job, settings) 372 | if obsidian_file: 373 | job.result_files.append({ 374 | "type": "obsidian", 375 | "name": f"Obsidian: {os.path.basename(obsidian_file)}", 376 | "path": obsidian_file 377 | }) 378 | 379 | # Clean up temporary files 380 | temp_files = [ 381 | f for f in os.listdir('.') 382 | if (f.startswith(f"temp_{doc_name}_chunk_") or f.startswith(f"temp_{doc_name}_chapter_")) 383 | and f.endswith(".md") 384 | ] 385 | for temp_file in temp_files: 386 | try: 387 | os.remove(temp_file) 388 | job.add_log(f"Removed temporary file: {temp_file}") 389 | except Exception as e: 390 | job.add_log(f"Could not remove temporary file {temp_file}: {e}") 391 | 392 | # Report any failed chunks 393 | if job.failed_chunks: 394 | job.add_log(f"⚠️ Note: {len(job.failed_chunks)} chunks failed processing.") 395 | 396 | # Save failed chunks for future retry 397 | failed_chunks_file = os.path.join(results_dir, f"{job.job_id}_failed_chunks.json") 398 | with open(failed_chunks_file, "w", encoding="utf-8") as f: 399 | json.dump(job.failed_chunks, f, ensure_ascii=False, indent=2) 400 | 401 | job.add_log(f"Failed chunks saved to {failed_chunks_file}") 402 | 403 | # Add failed chunks file to result files 404 | job.result_files.append({ 405 | "type": "file", 406 | "name": f"{doc_name}_failed_chunks.json", 407 | "path": f"{job.job_id}_failed_chunks.json" 408 | }) 409 | 410 | # Add option to retry failed chunks 411 | job.result_files.append({ 412 | "type": "action", 413 | "name": f"Retry {len(job.failed_chunks)} Failed Chunks", 414 | "action": "retry_chunks", 415 | "job_id": job.job_id 416 | }) 417 | 418 | job.status = "completed" 419 | job.progress = 100 420 | job.add_log("✅ Processing completed successfully!") 421 | 422 | except Exception as e: 423 | error_message = f"Error processing document: {str(e)}" 424 | job.status = "failed" 425 | job.error = error_message 426 | job.add_log(f"❌ {error_message}") 427 | logger.error(error_message, exc_info=True) 428 | 429 | # Restore stdout 430 | sys.stdout = original_stdout 431 | 432 | 433 | def process_retry_chunks(job_id): 434 | """Process only the failed chunks from a previous job.""" 435 | job = jobs[job_id] 436 | original_job_id = job.retry_for_job_id 437 | 438 | if not original_job_id or original_job_id not in jobs: 439 | job.add_log("❌ Original job not found") 440 | job.status = "failed" 441 | job.error = "Original job not found" 442 | return 443 | 444 | original_job = jobs[original_job_id] 445 | 446 | if not original_job.failed_chunks: 447 | job.add_log("No failed chunks to retry") 448 | job.status = "completed" 449 | return 450 | 451 | # Start processing 452 | job.status = "processing" 453 | job.add_log(f"Retrying {len(original_job.failed_chunks)} failed chunks from job {original_job_id}") 454 | 455 | # Initialize progress 
tracking 456 | job.total_chunks = len(original_job.failed_chunks) 457 | job.current_chunk = 0 458 | job.update_progress() 459 | 460 | # Redirect stdout to capture logs 461 | original_stdout = sys.stdout 462 | sys.stdout = capture_output(job) 463 | 464 | try: 465 | # Load original settings with adjustments for retry 466 | settings = job.settings.copy() 467 | 468 | # Use more aggressive settings for retries 469 | settings['chunk_timeout'] = int(settings.get('chunk_timeout', 300)) * 2 470 | settings['api_timeout'] = int(settings.get('api_timeout', 60)) * 2 471 | settings['max_retries'] = int(settings.get('max_retries', 3)) + 2 472 | 473 | # Initialize processor 474 | processor = GeminiDocumentProcessor( 475 | chunk_size=int(settings.get('chunk_size', 7)), 476 | api_key=settings.get('api_key', ""), 477 | model_name=settings.get('model_name', "gemini-2.0-flash"), 478 | language="thai", # Always Thai for this tool 479 | max_retries=int(settings.get('max_retries', 5)), # More retries 480 | extract_images=settings.get('extract_images', True), 481 | min_img_width=int(settings.get('min_img_width', 100)), 482 | min_img_height=int(settings.get('min_img_height', 100)), 483 | img_format=settings.get('img_format', "png"), 484 | max_workers=int(settings.get('max_workers', 4)), 485 | request_timeout=int(settings.get('api_timeout', 120)) # Longer timeout 486 | ) 487 | 488 | # Load the original summaries file 489 | original_output_path = original_job.output_path 490 | 491 | # Read original file to get existing summaries 492 | with open(original_output_path, 'r', encoding='utf-8') as f: 493 | original_content = f.read() 494 | 495 | # Process failed chunks 496 | successful_retries = 0 497 | for idx, failed_chunk in enumerate(original_job.failed_chunks): 498 | chunk_num = failed_chunk.get('chunk_number') 499 | if not chunk_num: 500 | continue 501 | 502 | # Update progress 503 | job.current_chunk = idx + 1 504 | job.update_progress() 505 | 506 | # Try to extract page range from failed chunk info 507 | page_start = failed_chunk.get('page_start', 0) 508 | page_end = failed_chunk.get('page_end', 0) 509 | 510 | # If we don't have page info, try to calculate it 511 | if page_start <= 0 or page_end <= 0: 512 | chunk_size = int(settings.get('chunk_size', 7)) 513 | page_start = (chunk_num - 1) * chunk_size + 1 514 | page_end = chunk_num * chunk_size 515 | 516 | job.add_log(f"Retrying chunk {chunk_num} (pages {page_start}-{page_end})...") 517 | 518 | try: 519 | # Process this chunk with extended timeout 520 | summary, images = processor.process_chunk( 521 | job.file_path, 522 | page_start, 523 | page_end, 524 | timeout=int(settings.get('api_timeout', 120)) 525 | ) 526 | 527 | # Save temporary result 528 | temp_file = f"temp_retry_{job.job_id}_chunk_{chunk_num}.md" 529 | with open(temp_file, "w", encoding="utf-8") as f: 530 | f.write(summary) 531 | 532 | job.add_log(f" Successfully reprocessed chunk {chunk_num}") 533 | successful_retries += 1 534 | 535 | # We would need to update the original content here 536 | # This is a simplistic approach - in practice, you'd need more sophisticated parsing 537 | chunk_marker = f"### Chunk {chunk_num}" 538 | next_chunk_marker = f"### Chunk {chunk_num + 1}" 539 | 540 | if chunk_marker in original_content: 541 | start_idx = original_content.find(chunk_marker) 542 | end_idx = original_content.find(next_chunk_marker, start_idx) 543 | 544 | if end_idx == -1: # Last chunk 545 | end_idx = original_content.find("---\n*Summary generated", start_idx) 546 | 547 | if start_idx != -1 and end_idx 
!= -1: 548 | # Replace the chunk content 549 | new_content = original_content[:start_idx] + chunk_marker + "\n\n" + summary + "\n\n" 550 | if end_idx != -1: 551 | new_content += original_content[end_idx:] 552 | 553 | original_content = new_content 554 | 555 | except Exception as e: 556 | job.add_log(f"❌ Error retrying chunk {chunk_num}: {str(e)}") 557 | job.failed_chunks.append({ 558 | "chunk_number": chunk_num, 559 | "page_start": page_start, 560 | "page_end": page_end, 561 | "error": str(e) 562 | }) 563 | 564 | # Save the updated content to a new file 565 | new_output_path = job.output_path 566 | with open(new_output_path, 'w', encoding='utf-8') as f: 567 | f.write(original_content) 568 | 569 | # Copy to results directory 570 | shutil.copy2(new_output_path, os.path.join(results_dir, os.path.basename(new_output_path))) 571 | 572 | # Add to result files 573 | job.result_files.append({ 574 | "type": "file", 575 | "name": os.path.basename(new_output_path), 576 | "path": os.path.basename(new_output_path) 577 | }) 578 | 579 | # Summary of retry results 580 | if successful_retries > 0: 581 | job.add_log(f"✅ Successfully reprocessed {successful_retries} of {len(original_job.failed_chunks)} chunks") 582 | else: 583 | job.add_log("❌ Failed to reprocess any chunks") 584 | 585 | # Set final progress and status 586 | job.status = "completed" 587 | job.progress = 100 588 | 589 | except Exception as e: 590 | error_message = f"Error during retry processing: {str(e)}" 591 | job.status = "failed" 592 | job.error = error_message 593 | job.add_log(f"❌ {error_message}") 594 | logger.error(error_message, exc_info=True) 595 | 596 | # Restore stdout 597 | sys.stdout = original_stdout 598 | 599 | @app.route('/') 600 | def index(): 601 | """Main page""" 602 | return render_template('index.html', obsidian_dir=obsidian_dir) 603 | 604 | 605 | @app.route('/upload', methods=['POST']) 606 | def upload_file(): 607 | """Handle file upload and job creation""" 608 | if 'file' not in request.files: 609 | return jsonify({'error': 'No file part'}), 400 610 | 611 | file = request.files['file'] 612 | if file.filename == '': 613 | return jsonify({'error': 'No selected file'}), 400 614 | 615 | # Check valid file types 616 | if not (file.filename.lower().endswith('.pdf') or file.filename.lower().endswith('.epub')): 617 | return jsonify({'error': 'File must be a PDF or EPUB'}), 400 618 | 619 | # Generate a unique job ID 620 | job_id = str(uuid.uuid4()) 621 | 622 | # Save the uploaded file 623 | file_filename = os.path.basename(file.filename) 624 | file_name_without_ext = os.path.splitext(file_filename)[0] 625 | file_path = os.path.join(uploads_dir, f"{job_id}_{file_filename}") 626 | file.save(file_path) 627 | 628 | # Get settings from form 629 | settings = { 630 | 'model_name': request.form.get('model_name', 'gemini-2.0-flash'), 631 | 'chunk_size': request.form.get('chunk_size', '7'), 632 | 'api_key': request.form.get('api_key', ''), 633 | 'extract_images': request.form.get('extract_images') == 'on', 634 | 'max_retries': request.form.get('max_retries', '3'), 635 | 'min_img_width': request.form.get('min_img_width', '100'), 636 | 'min_img_height': request.form.get('min_img_height', '100'), 637 | 'img_format': request.form.get('img_format', 'png'), 638 | 'max_workers': request.form.get('max_workers', '4'), 639 | 'chunk_timeout': request.form.get('chunk_timeout', '300'), # 5 minutes default 640 | 'api_timeout': request.form.get('api_timeout', '60'), # 1 minute default 641 | 642 | # Obsidian settings 643 | 'use_obsidian': 
request.form.get('use_obsidian') == 'on',
644 |         'obsidian_vault_path': request.form.get('obsidian_vault_path', ''),
645 |         'obsidian_tags': request.form.get('obsidian_tags', 'book,main,verified'),
646 |         'obsidian_author': request.form.get('obsidian_author', ''),
647 |         'obsidian_cover_url': request.form.get('obsidian_cover_url', ''),
648 |         'obsidian_review': request.form.get('obsidian_review', '')
649 |     }
650 | 
651 |     # Save Obsidian path to settings if provided
652 |     if settings['obsidian_vault_path'] and os.path.exists(settings['obsidian_vault_path']):
653 |         try:
654 |             settings_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'settings.json')
655 |             settings_data = {}
656 |             if os.path.exists(settings_file):
657 |                 with open(settings_file, 'r') as f:
658 |                     settings_data = json.load(f)
659 | 
660 |             settings_data['obsidian_vault_path'] = settings['obsidian_vault_path']
661 | 
662 |             with open(settings_file, 'w') as f:
663 |                 json.dump(settings_data, f, indent=2)
664 | 
665 |             global obsidian_dir
666 |             obsidian_dir = settings['obsidian_vault_path']
667 |             logger.info(f"Saved Obsidian vault path to settings: {obsidian_dir}")
668 |         except Exception as e:
669 |             logger.error(f"Error saving settings: {e}")
670 | 
671 |     # Create output path
672 |     output_path = os.path.join(uploads_dir, f"{job_id}_{file_name_without_ext}_summary.md")
673 | 
674 |     # Create and store the job
675 |     job = ProcessingJob(job_id, file_path, output_path, settings)
676 |     jobs[job_id] = job
677 | 
678 |     # Start processing in a background thread
679 |     job.thread = threading.Thread(target=process_document_job, args=(job_id,))
680 |     job.thread.daemon = True
681 |     job.thread.start()
682 | 
683 |     return jsonify({'job_id': job_id, 'redirect': url_for('job_status', job_id=job_id)})
684 | 
685 | 
686 | @app.route('/job/<job_id>')
687 | def job_status(job_id):
688 |     """Show job status page"""
689 |     if job_id not in jobs:
690 |         return "Job not found", 404
691 | 
692 |     return render_template('job_status.html', job_id=job_id)
693 | 
694 | 
695 | @app.route('/api/job/<job_id>')
696 | def api_job_status(job_id):
697 |     """API to get job status"""
698 |     if job_id not in jobs:
699 |         return jsonify({'error': 'Job not found'}), 404
700 | 
701 |     return jsonify(jobs[job_id].to_dict())
702 | 
703 | 
704 | @app.route('/download/<filename>')
705 | def download_file(filename):
706 |     """Serve result files for download"""
707 |     return send_from_directory(results_dir, filename, as_attachment=True)
708 | 
709 | 
710 | @app.route('/view/<filename>')
711 | def view_file(filename):
712 |     """View result files in browser"""
713 |     return send_from_directory(results_dir, filename)
714 | 
715 | 
716 | @app.route('/retry_chunks/<job_id>', methods=['POST'])
717 | def retry_failed_chunks(job_id):
718 |     """Retry processing failed chunks for a job."""
719 |     if job_id not in jobs:
720 |         return jsonify({'error': 'Job not found'}), 404
721 | 
722 |     job = jobs[job_id]
723 | 
724 |     if not hasattr(job, 'failed_chunks') or not job.failed_chunks:
725 |         return jsonify({'message': 'No failed chunks to retry'}), 200
726 | 
727 |     # Create a new job for retry
728 |     retry_job_id = str(uuid.uuid4())
729 |     file_name_without_ext = os.path.splitext(os.path.basename(job.file_path))[0]
730 |     retry_output_path = os.path.join(uploads_dir, f"{retry_job_id}_{file_name_without_ext}_summary_retry.md")
731 | 
732 |     retry_job = ProcessingJob(
733 |         retry_job_id,
734 |         job.file_path,
735 |         retry_output_path,
736 |         job.settings
737 |     )
738 | 
739 |     # Set special flag to indicate this is a retry job
740 |     retry_job.retry_for_job_id = job_id
741 |     retry_job.failed_chunks = job.failed_chunks.copy()
742 | 
743 |     # Store the job
744 |     jobs[retry_job_id] = retry_job
745 | 
746 |     # Start processing in a background thread
747 |     retry_job.thread = threading.Thread(target=process_retry_chunks, args=(retry_job_id,))
748 |     retry_job.thread.daemon = True
749 |     retry_job.thread.start()
750 | 
751 |     return jsonify({'job_id': retry_job_id, 'redirect': url_for('job_status', job_id=retry_job_id)})
752 | 
753 | 
754 | @app.route('/obsidian_check', methods=['POST'])
755 | def check_obsidian_path():
756 |     """Check if the Obsidian vault path is valid"""
757 |     path = request.json.get('path', '')
758 |     if not path:
759 |         return jsonify({'valid': False, 'message': 'Path is empty'})
760 | 
761 |     if not os.path.exists(path):
762 |         return jsonify({'valid': False, 'message': 'Path does not exist'})
763 | 
764 |     if not os.path.isdir(path):
765 |         return jsonify({'valid': False, 'message': 'Path is not a directory'})
766 | 
767 |     # Check for .obsidian folder to confirm it's an Obsidian vault
768 |     obsidian_folder = os.path.join(path, '.obsidian')
769 |     if not os.path.exists(obsidian_folder) or not os.path.isdir(obsidian_folder):
770 |         return jsonify({'valid': False, 'message': 'Not an Obsidian vault (no .obsidian folder)'})
771 | 
772 |     return jsonify({'valid': True, 'message': 'Valid Obsidian vault'})
773 | 
774 | 
775 | if __name__ == '__main__':
776 |     # Create HTML templates folder and files if they don't exist
777 |     templates_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'templates')
778 |     os.makedirs(templates_dir, exist_ok=True)
779 | 
780 |     # Create index.html template with additional timeout settings
781 |     index_html = """
      [index.html markup (lines 782-1045) not preserved in this dump. Recoverable content: a page titled
      "Gemini Document Processor" with an upload form (handled by the /upload route) for a PDF or EPUB file,
      model selection, chunk size ("For PDFs: Number of pages to process in each API call. For EPUBs:
      Controls how much content is processed at once."), Gemini API key, an image-extraction toggle, an
      "Obsidian Integration" section (vault path saved to a 'books' folder, tags, author, cover URL, and a
      review/progress field such as "20/33"), and an "Advanced Options" section (max retries, chunk timeout
      of 5-30 minutes, API timeout of 30-300 seconds, minimum image width/height, image format, and the
      number of parallel threads for image extraction).]
1046 |     """
1047 | 
1048 |     # Create job_status.html template with retry buttons
1049 |     job_status_html = """
      [job_status.html markup (lines 1050-1386) not preserved in this dump. Recoverable content: a
      "Document Processing Status" page showing the fields returned by /api/job/<job_id>: job status, a
      progress bar ("Processing chunk X of Y"), the current task, a rolling log, and a results list with
      download/view links, plus a "Retry Failed Chunks" action and a "Back to Home" link.]
1387 |     """
1388 | 
1389 |     with open(os.path.join(templates_dir, 'index.html'), 'w') as f:
1390 |         f.write(index_html)
1391 | 
1392 |     with open(os.path.join(templates_dir, 'job_status.html'), 'w') as f:
1393 |         f.write(job_status_html)
1394 | 
1395 |     print("Starting web server at http://127.0.0.1:8081/")
1396 |     print("You can access the document processor by opening this URL in your browser")
1397 |     app.run(debug=True, port=8081)
--------------------------------------------------------------------------------
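
A small client sketch against the Flask endpoints defined in document_gui.py, using the requests library. The endpoint paths, form-field names, and response keys are taken from that file; the server address matches the default printed at startup, while the sample file name and "YOUR_API_KEY" are placeholders.

    import time
    import requests

    BASE = "http://127.0.0.1:8081"

    # 1. Upload a PDF/EPUB and start a processing job.
    with open("book.pdf", "rb") as fh:
        resp = requests.post(
            f"{BASE}/upload",
            files={"file": fh},
            data={
                "model_name": "gemini-2.0-flash",
                "chunk_size": "7",
                "api_key": "YOUR_API_KEY",   # placeholder
                "extract_images": "on",
            },
        )
    job_id = resp.json()["job_id"]

    # 2. Poll the job status endpoint until processing finishes or fails.
    while True:
        status = requests.get(f"{BASE}/api/job/{job_id}").json()
        print(status["progress"], status["message"])
        if status["status"] in ("completed", "failed"):
            break
        time.sleep(5)

    # 3. Download any result files the job produced.
    for item in status.get("result_files", []):
        if item.get("type") == "file":
            content = requests.get(f"{BASE}/download/{item['path']}").content
            with open(item["path"], "wb") as out:
                out.write(content)
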