├── .gitignore ├── .coverage ├── settings.json ├── Screenshot.png ├── requirements.txt ├── LICENSE ├── cleanup.sh ├── tests ├── test_epub_processor.py └── test_document_processor.py ├── README.md ├── epub_processor.py ├── templates ├── job_status.html └── index.html ├── document_processor.py └── document_gui.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | venv -------------------------------------------------------------------------------- /.coverage: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidpeterpan/gemini-document-processor/HEAD/.coverage -------------------------------------------------------------------------------- /settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "obsidian_vault_path": "/Users/kidpeterpan/Documents/Pan's Vault" 3 | } -------------------------------------------------------------------------------- /Screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidpeterpan/gemini-document-processor/HEAD/Screenshot.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Core dependencies 2 | Flask==2.3.3 3 | PyMuPDF # For PDF processing (fitz) 4 | ebooklib==0.18.0 # For EPUB processing 5 | beautifulsoup4==4.12.2 # For parsing HTML content in EPUBs 6 | html2text==2020.1.16 # For converting HTML to markdown text 7 | requests==2.31.0 # For API calls to Gemini 8 | pathlib==1.0.1 # For path manipulations 9 | pypdf==5.4.0 10 | google-generativeai 11 | 12 | # Optional dependencies (comment out if not needed) 13 | # PIL or Pillow might be needed for more advanced image processing 14 | Pillow 15 | 16 | # Testing dependencies 17 | pytest 18 | pytest-cov 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 kidpeterpan@github.com 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /cleanup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Script to clean up results, uploads, and log files 4 | # Usage: ./cleanup.sh 5 | 6 | # Set the base directory - default to the current directory 7 | BASE_DIR="$(pwd)" 8 | 9 | # File paths 10 | RESULTS_DIR="$BASE_DIR/results" 11 | UPLOADS_DIR="$BASE_DIR/uploads" 12 | 13 | echo "========================================" 14 | echo "Starting cleanup process..." 
15 | echo "========================================" 16 | 17 | # Clean results directory 18 | if [ -d "$RESULTS_DIR" ]; then 19 | echo "Cleaning results directory..." 20 | rm -rf "$RESULTS_DIR"/* 21 | echo "✓ Results directory emptied" 22 | else 23 | echo "! Results directory not found at $RESULTS_DIR" 24 | fi 25 | 26 | # Clean uploads directory 27 | if [ -d "$UPLOADS_DIR" ]; then 28 | echo "Cleaning uploads directory..." 29 | rm -rf "$UPLOADS_DIR"/* 30 | echo "✓ Uploads directory emptied" 31 | else 32 | echo "! Uploads directory not found at $UPLOADS_DIR" 33 | fi 34 | 35 | # Remove log files 36 | echo "Removing log files..." 37 | find "$BASE_DIR" -name "*.log" -type f -delete 38 | echo "✓ Log files removed" 39 | 40 | # Done 41 | echo "========================================" 42 | echo "Cleanup completed!" 43 | echo "========================================" -------------------------------------------------------------------------------- /tests/test_epub_processor.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 4 | import pytest 5 | from epub_processor import SimpleEpubProcessor 6 | 7 | class DummyBook: 8 | def get_metadata(self, *args, **kwargs): 9 | return {'title': [('Test Book',)]} 10 | def get_items(self): 11 | class DummyItem: 12 | def get_type(self): return 9 # ebooklib.ITEM_DOCUMENT 13 | def get_content(self): return b'
Content
'
14 | return [DummyItem()]
15 |
def test_process_epub(monkeypatch, tmp_path):
    """Process a stubbed EPUB and check the shape of the four returned values.

    ``ebooklib.epub.read_epub`` is replaced with a stub that returns a
    ``DummyBook``; the test then checks that chapters and images come back
    as dicts, that the name is the file stem, and that the title is read
    from the book metadata.
    """
    # A file is created on disk at the path handed to the processor.
    epub_path = tmp_path / "test.epub"
    epub_path.write_bytes(b"dummy epub content")

    def fake_read_epub(path):
        return DummyBook()

    monkeypatch.setattr('ebooklib.epub.read_epub', fake_read_epub)

    processor = SimpleEpubProcessor(extract_images=True)
    outcome = processor.process_epub(str(epub_path), image_output_dir=str(tmp_path))
    chapters, images_by_chapter, name, meta = outcome

    for mapping in (chapters, images_by_chapter):
        assert isinstance(mapping, dict)
    assert name == "test"
    assert meta['title'] == 'Test Book'
27 |
def test_process_epub_missing_file(tmp_path):
    """A non-existent EPUB path must be reported through the chapters mapping.

    The call is expected to return its usual four values rather than raise.
    The failure counts as reported when the mapping has an "Error" key or
    when the first chapter's text contains "Failed".
    """
    processor = SimpleEpubProcessor()
    missing_path = tmp_path / "no_file.epub"
    chapters, _images_by_chapter, _name, _meta = processor.process_epub(str(missing_path))

    # Fall back to "" so that an empty mapping fails the assertion below
    # instead of raising IndexError while indexing an empty list.
    first_text = next(iter(chapters.values()), "")
    assert "Error" in chapters or "Failed" in first_text, (
        f"missing file was not reported as an error: {chapters!r}"
    )
32 |
def test_process_epub_no_chapters(monkeypatch, tmp_path):
    """A book that exposes no items should yield a "No Content" chapter key."""

    class DummyBook:
        # Local stand-in that shadows the module-level DummyBook: it has a
        # title but returns an empty item list.
        def get_items(self):
            return []

        def get_metadata(self, *args, **kwargs):
            return {'title': [('Test Book',)]}

    epub_path = tmp_path / "test.epub"
    epub_path.write_bytes(b"dummy epub content")
    monkeypatch.setattr('ebooklib.epub.read_epub', lambda path: DummyBook())

    processor = SimpleEpubProcessor()
    outcome = processor.process_epub(str(epub_path), image_output_dir=str(tmp_path))
    chapters, images_by_chapter, name, meta = outcome

    assert "No Content" in chapters
46 |
def test_has_content():
    """_has_content is falsy for empty or whitespace-only text and truthy for 101 characters."""
    processor = SimpleEpubProcessor()

    for blank in ("", "\n\t "):
        assert not processor._has_content(blank)

    long_text = "a" * 101
    assert processor._has_content(long_text)
52 |
def test_extract_images_basic(tmp_path):
    """Two <img> tags should yield two image records.

    The first record is expected to have a filename ending in ".png" and
    the alt text "A".
    """
    from bs4 import BeautifulSoup

    class DummyBook: pass
    class DummyItem: pass

    processor = SimpleEpubProcessor()
    # NOTE(review): the HTML passed to BeautifulSoup was missing here, leaving
    # an unterminated string literal. The two <img> tags below are
    # reconstructed from the assertions (two images; first is a .png with
    # alt "A"). The second tag's src and alt are assumptions - confirm
    # against the upstream test file.
    soup = BeautifulSoup('<img src="a.png" alt="A"><img src="b.jpg" alt="B">', 'html.parser')
    images = processor._extract_images_basic(DummyBook(), DummyItem(), soup, str(tmp_path), "book", 1)

    assert len(images) == 2
    assert images[0]['filename'].endswith('.png')
    assert images[0]['alt'] == 'A'
63 |
64 | def test_extract_images_basic_malformed_html(tmp_path):
65 | processor = SimpleEpubProcessor()
66 | class DummyBook: pass
67 | class DummyItem: pass
68 | from bs4 import BeautifulSoup
69 | soup = BeautifulSoup('