├── tests
│   ├── __init__.py
│   ├── conftest.py
│   ├── test_epub_processor.py
│   ├── test_pdf_processor.py
│   └── test_utils.py
├── requirements.txt
├── LICENSE
├── .gitignore
├── epub_splitter.py
├── utils.py
├── splitter.py
├── README.md
├── epub_to_markdown.py
├── epub_processor.py
└── pdf_to_markdown.py


/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # Tests for book-splitter
2 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | ebooklib>=0.17.1
2 | beautifulsoup4>=4.9.3
3 | nltk>=3.6.0
4 | html2text>=2020.1.16
5 | PyPDF2>=2.0.0
6 | pdfminer.six>=20201018
7 | 
8 | # Development dependencies
9 | pytest>=7.0.0
10 | pytest-cov>=4.0.0
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2024
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | """
2 | Pytest configuration and fixtures for book-splitter tests.
3 | """
4 | 
5 | import os
6 | import pytest
7 | import tempfile
8 | 
9 | 
10 | @pytest.fixture
11 | def temp_dir():
12 |     """Create a temporary directory for test outputs."""
13 |     with tempfile.TemporaryDirectory() as tmpdir:
14 |         yield tmpdir
15 | 
16 | 
17 | @pytest.fixture
18 | def sample_text():
19 |     """Sample text for word counting tests."""
20 |     return "This is a sample text with exactly ten words here."
21 | 
22 | 
23 | @pytest.fixture
24 | def sample_html():
25 |     """Sample HTML content for word counting tests."""
26 |     return "
<div>This is <b>bold</b> and <i>italic</i> text.</div>
" 27 | 28 | 29 | @pytest.fixture 30 | def sample_text_with_sentences(): 31 | """Sample text with multiple sentences for splitting tests.""" 32 | sentences = [ 33 | "This is the first sentence.", 34 | "Here comes the second one!", 35 | "And a third sentence follows?", 36 | "The fourth sentence is here.", 37 | "Fifth sentence completes the set.", 38 | ] 39 | return " ".join(sentences) 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.so 6 | .Python 7 | build/ 8 | develop-eggs/ 9 | dist/ 10 | downloads/ 11 | eggs/ 12 | .eggs/ 13 | lib/ 14 | lib64/ 15 | parts/ 16 | sdist/ 17 | var/ 18 | wheels/ 19 | *.egg-info/ 20 | .installed.cfg 21 | *.egg 22 | 23 | # Virtual Environment 24 | venv/ 25 | ENV/ 26 | env/ 27 | 28 | # EPUB and output files 29 | *.epub 30 | *.md 31 | !README.md 32 | !requirements.md 33 | !technical_design.md 34 | 35 | # IDE files 36 | .idea/ 37 | .vscode/ 38 | *.swp 39 | *.swo 40 | 41 | # OS specific files 42 | .DS_Store 43 | Thumbs.db 44 | 45 | # Test files - completely ignore the test_files directory 46 | /test_files/ 47 | /test/ 48 | 49 | # The following pattern was causing issues - commenting out 50 | # !test_files/ 51 | # !test_files/README.md 52 | # !test_files/tests/ 53 | # !test_files/tests/*/ 54 | # !test_files/tests/*/README.md 55 | # !test_files/input/ 56 | # !test_files/output/ 57 | # test_files/input/* 58 | # test_files/output/* 59 | # test_files/tests/*/*.epub 60 | # test_files/tests/*/*.md 61 | # test_files/tests/*/*.pdf 62 | 63 | # But keep README.md files 64 | # !test_files/README.md 65 | # !test_files/input/README.md 66 | # !test_files/output/README.md 67 | # !test_files/tests/*/README.md -------------------------------------------------------------------------------- /epub_splitter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | EPUB Splitter - A tool to split large EPUB books into smaller parts. 4 | """ 5 | 6 | import argparse 7 | import os 8 | import sys 9 | from typing import List, Optional 10 | from epub_processor import EPUBProcessor 11 | from utils import normalize_output_dir, ensure_output_dir 12 | 13 | def split_epub_to_epub(input_file: str, max_words: int = 80000, output_dir: Optional[str] = None, 14 | strict_boundaries: bool = False, verbose: bool = False) -> List[str]: 15 | """ 16 | Split an EPUB file into multiple EPUB files. 17 | 18 | Args: 19 | input_file: Path to the input EPUB file. 20 | max_words: Maximum words per output file. 21 | output_dir: Directory for output files (defaults to input file's directory). 22 | strict_boundaries: Whether to split only at chapter boundaries. 23 | verbose: Whether to print verbose output. 24 | 25 | Returns: 26 | List of paths to the created EPUB files. 
27 | """ 28 | output_dir = normalize_output_dir(output_dir, input_file) 29 | try: 30 | processor = EPUBProcessor( 31 | input_file=input_file, 32 | max_words=max_words, 33 | output_dir=output_dir, 34 | strict_chapters=strict_boundaries, 35 | verbose=verbose 36 | ) 37 | 38 | return processor.process() 39 | except Exception as e: 40 | if verbose: 41 | import traceback 42 | traceback.print_exc() 43 | raise Exception(f"Error processing EPUB: {str(e)}") 44 | 45 | def parse_arguments(): 46 | """Parse command line arguments.""" 47 | parser = argparse.ArgumentParser(description='Split a large EPUB file into smaller parts') 48 | 49 | parser.add_argument('input_file', help='Path to input EPUB file') 50 | parser.add_argument('--max-words', type=int, default=80000, 51 | help='Maximum words per output file (default: 80000)') 52 | parser.add_argument('--output-dir', default=None, 53 | help='Directory for output files (default: same directory as input file)') 54 | parser.add_argument('--strict-boundaries', action='store_true', 55 | help='Only split at chapter boundaries, even if exceeding word limit') 56 | parser.add_argument('--verbose', action='store_true', 57 | help='Print detailed processing information') 58 | 59 | return parser.parse_args() 60 | 61 | def validate_arguments(args): 62 | """Validate the provided arguments.""" 63 | # Check if input file exists 64 | if not os.path.isfile(args.input_file): 65 | print(f"Error: Input file '{args.input_file}' does not exist.") 66 | return False 67 | 68 | # Check if input file is an EPUB 69 | if not args.input_file.lower().endswith('.epub'): 70 | print(f"Error: Input file '{args.input_file}' is not an EPUB file.") 71 | return False 72 | 73 | # Check if output directory exists or can be created 74 | if args.output_dir is not None and not os.path.exists(args.output_dir): 75 | try: 76 | ensure_output_dir(args.output_dir) 77 | except OSError as e: 78 | print(f"Error: Cannot create output directory '{args.output_dir}': {e}") 79 | return False 80 | 81 | # Check if max words is positive 82 | if args.max_words <= 0: 83 | print("Error: Maximum words must be a positive number.") 84 | return False 85 | 86 | return True 87 | 88 | def main(): 89 | """Main entry point for the EPUB splitter tool.""" 90 | args = parse_arguments() 91 | 92 | if not validate_arguments(args): 93 | sys.exit(1) 94 | 95 | try: 96 | result = split_epub_to_epub( 97 | args.input_file, 98 | args.max_words, 99 | args.output_dir, 100 | args.strict_boundaries, 101 | args.verbose 102 | ) 103 | 104 | if args.verbose: 105 | print(f"\nProcessing complete. Created {len(result)} output files:") 106 | for i, output_file in enumerate(result, 1): 107 | print(f" {i}. {output_file}") 108 | else: 109 | print(f"\nProcessing complete. Created {len(result)} output files.") 110 | 111 | except Exception as e: 112 | print(f"Error: {str(e)}") 113 | if args.verbose: 114 | import traceback 115 | traceback.print_exc() 116 | sys.exit(1) 117 | 118 | if __name__ == "__main__": 119 | main() -------------------------------------------------------------------------------- /tests/test_epub_processor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for EPUB processor functionality. 
3 | """ 4 | 5 | import os 6 | import pytest 7 | from unittest.mock import MagicMock, patch 8 | from collections import namedtuple 9 | 10 | # Import the module components we want to test 11 | from epub_processor import EPUBProcessor, Chapter 12 | 13 | 14 | class TestEPUBProcessorInit: 15 | """Tests for EPUBProcessor initialization.""" 16 | 17 | def test_default_output_dir(self, temp_dir): 18 | """Test that output_dir defaults to input file's directory.""" 19 | input_file = os.path.join(temp_dir, "test.epub") 20 | processor = EPUBProcessor(input_file) 21 | assert processor.output_dir == temp_dir 22 | 23 | def test_custom_output_dir(self, temp_dir): 24 | """Test custom output directory.""" 25 | input_file = os.path.join(temp_dir, "test.epub") 26 | custom_dir = "/custom/path" 27 | processor = EPUBProcessor(input_file, output_dir=custom_dir) 28 | assert processor.output_dir == custom_dir 29 | 30 | def test_default_max_words(self, temp_dir): 31 | """Test default max_words value.""" 32 | input_file = os.path.join(temp_dir, "test.epub") 33 | processor = EPUBProcessor(input_file) 34 | assert processor.max_words == 80000 35 | 36 | def test_book_name_extraction(self, temp_dir): 37 | """Test book name is extracted from filename.""" 38 | input_file = os.path.join(temp_dir, "My Book Title.epub") 39 | processor = EPUBProcessor(input_file) 40 | assert processor.book_name == "My Book Title" 41 | 42 | 43 | class TestDetermineSplitPoints: 44 | """Tests for split point determination logic.""" 45 | 46 | @pytest.fixture 47 | def processor(self, temp_dir): 48 | """Create a processor with mock chapters.""" 49 | input_file = os.path.join(temp_dir, "test.epub") 50 | return EPUBProcessor(input_file, max_words=1000) 51 | 52 | def test_single_part_small_book(self, processor): 53 | """Test book that fits in single part.""" 54 | processor.chapters = [ 55 | Chapter('ch1', 'Chapter 1', '', 'ch1.html', 300), 56 | Chapter('ch2', 'Chapter 2', '', 'ch2.html', 400), 57 | ] 58 | split_points = processor.determine_split_points() 59 | 60 | # Should have one split point (end of book) 61 | assert len(split_points) == 1 62 | assert split_points[0] == 1 # Last chapter index 63 | 64 | def test_multiple_parts(self, processor): 65 | """Test book that needs multiple parts.""" 66 | processor.chapters = [ 67 | Chapter('ch1', 'Chapter 1', '', 'ch1.html', 600), 68 | Chapter('ch2', 'Chapter 2', '', 'ch2.html', 600), 69 | Chapter('ch3', 'Chapter 3', '', 'ch3.html', 600), 70 | ] 71 | split_points = processor.determine_split_points() 72 | 73 | # Should have multiple parts 74 | assert len(split_points) >= 2 75 | 76 | def test_strict_chapters_mode(self, temp_dir): 77 | """Test strict chapters mode respects boundaries.""" 78 | input_file = os.path.join(temp_dir, "test.epub") 79 | processor = EPUBProcessor(input_file, max_words=1000, strict_chapters=True) 80 | 81 | processor.chapters = [ 82 | Chapter('ch1', 'Chapter 1', '', 'ch1.html', 800), 83 | Chapter('ch2', 'Chapter 2', '', 'ch2.html', 800), 84 | ] 85 | split_points = processor.determine_split_points() 86 | 87 | # In strict mode, should split at chapter boundary 88 | assert len(split_points) == 2 89 | assert split_points[0] == 0 # Split after first chapter 90 | 91 | def test_avoids_small_parts(self, processor): 92 | """Test that very small parts are avoided (40% threshold).""" 93 | processor.chapters = [ 94 | Chapter('ch1', 'Chapter 1', '', 'ch1.html', 200), # Less than 40% of max 95 | Chapter('ch2', 'Chapter 2', '', 'ch2.html', 900), 96 | ] 97 | split_points = processor.determine_split_points() 
98 | 99 | # Should combine small chapter with next 100 | assert len(split_points) == 1 101 | 102 | def test_empty_book(self, processor): 103 | """Test handling of empty book.""" 104 | processor.chapters = [] 105 | split_points = processor.determine_split_points() 106 | assert split_points == [] 107 | 108 | 109 | class TestChapterNamedTuple: 110 | """Tests for Chapter namedtuple structure.""" 111 | 112 | def test_chapter_creation(self): 113 | """Test creating a Chapter object.""" 114 | chapter = Chapter( 115 | id='ch1', 116 | title='Test Chapter', 117 | content='Content
', 118 | file_name='chapter1.html', 119 | word_count=100 120 | ) 121 | 122 | assert chapter.id == 'ch1' 123 | assert chapter.title == 'Test Chapter' 124 | assert chapter.content == 'Content
' 125 | assert chapter.file_name == 'chapter1.html' 126 | assert chapter.word_count == 100 127 | 128 | def test_chapter_immutable(self): 129 | """Test that Chapter is immutable (namedtuple behavior).""" 130 | chapter = Chapter('ch1', 'Title', 'content', 'file.html', 100) 131 | 132 | with pytest.raises(AttributeError): 133 | chapter.title = 'New Title' 134 | -------------------------------------------------------------------------------- /tests/test_pdf_processor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for PDF processor functionality. 3 | """ 4 | 5 | import pytest 6 | 7 | from pdf_to_markdown import ( 8 | detect_section_boundaries, 9 | determine_split_points, 10 | ) 11 | 12 | 13 | class TestDetectSectionBoundaries: 14 | """Tests for PDF section boundary detection.""" 15 | 16 | def test_chapter_pattern_basic(self): 17 | """Test detection of 'Chapter N' pattern.""" 18 | pages = [ 19 | {'text': 'Some intro text\nMore content', 'word_count': 100}, 20 | {'text': 'Chapter 1\nThe beginning of the story', 'word_count': 200}, 21 | {'text': 'More content here', 'word_count': 150}, 22 | ] 23 | boundaries = detect_section_boundaries(pages) 24 | assert 1 in boundaries 25 | 26 | def test_chapter_pattern_with_title(self): 27 | """Test detection of 'Chapter N: Title' pattern.""" 28 | pages = [ 29 | {'text': 'Chapter 1: Introduction\nContent here', 'word_count': 200}, 30 | {'text': 'Regular content page', 'word_count': 150}, 31 | ] 32 | boundaries = detect_section_boundaries(pages) 33 | assert 0 in boundaries 34 | 35 | def test_numbered_section_pattern(self): 36 | """Test detection of '1. Introduction' pattern.""" 37 | pages = [ 38 | {'text': '1. Introduction\nThe first section', 'word_count': 200}, 39 | {'text': 'Content continues', 'word_count': 150}, 40 | {'text': '2. Background\nSecond section content', 'word_count': 200}, 41 | ] 42 | boundaries = detect_section_boundaries(pages) 43 | assert 0 in boundaries 44 | assert 2 in boundaries 45 | 46 | def test_roman_numeral_sections(self): 47 | """Test detection of 'I. Title' pattern.""" 48 | pages = [ 49 | {'text': 'I. Introduction\nFirst section', 'word_count': 200}, 50 | {'text': 'Content page', 'word_count': 150}, 51 | {'text': 'II. 
Methods\nSecond section', 'word_count': 200}, 52 | ] 53 | boundaries = detect_section_boundaries(pages) 54 | assert 0 in boundaries 55 | assert 2 in boundaries 56 | 57 | def test_special_sections(self): 58 | """Test detection of special section names.""" 59 | pages = [ 60 | {'text': 'Prologue\nBefore the main story', 'word_count': 200}, 61 | {'text': 'Chapter content', 'word_count': 150}, 62 | {'text': 'Epilogue\nAfter the main story', 'word_count': 200}, 63 | ] 64 | boundaries = detect_section_boundaries(pages) 65 | assert 0 in boundaries 66 | assert 2 in boundaries 67 | 68 | def test_all_caps_headings(self): 69 | """Test detection of ALL CAPS headings.""" 70 | pages = [ 71 | {'text': 'THE BEGINNING\nStory starts here', 'word_count': 200}, 72 | {'text': 'Regular content', 'word_count': 150}, 73 | ] 74 | boundaries = detect_section_boundaries(pages) 75 | assert 0 in boundaries 76 | 77 | def test_no_boundaries_in_regular_text(self): 78 | """Test that regular text doesn't create false boundaries.""" 79 | pages = [ 80 | {'text': 'Regular paragraph text here.\nMore regular text follows.', 'word_count': 200}, 81 | {'text': 'Another page of normal content.\nNothing special here.', 'word_count': 150}, 82 | ] 83 | boundaries = detect_section_boundaries(pages) 84 | # Should have few or no boundaries 85 | assert len(boundaries) <= 1 86 | 87 | def test_empty_pages(self): 88 | """Test handling of empty pages.""" 89 | pages = [ 90 | {'text': '', 'word_count': 0}, 91 | {'text': 'Chapter 1\nContent', 'word_count': 100}, 92 | ] 93 | boundaries = detect_section_boundaries(pages) 94 | assert 1 in boundaries 95 | 96 | def test_appendix_detection(self): 97 | """Test detection of appendix sections.""" 98 | pages = [ 99 | {'text': 'Main content', 'word_count': 200}, 100 | {'text': 'Appendix A\nSupplementary material', 'word_count': 150}, 101 | ] 102 | boundaries = detect_section_boundaries(pages) 103 | assert 1 in boundaries 104 | 105 | 106 | class TestDetermineSplitPoints: 107 | """Tests for PDF split point determination.""" 108 | 109 | def test_single_part_small_pdf(self): 110 | """Test PDF that fits in single part.""" 111 | pages = [ 112 | {'text': 'Page 1', 'word_count': 300}, 113 | {'text': 'Page 2', 'word_count': 400}, 114 | ] 115 | split_points = determine_split_points(pages, max_words=1000) 116 | 117 | assert len(split_points) == 1 118 | assert split_points[0] == 1 # Last page index 119 | 120 | def test_multiple_parts(self): 121 | """Test PDF that needs multiple parts.""" 122 | pages = [ 123 | {'text': 'Page ' + str(i), 'word_count': 500} 124 | for i in range(10) 125 | ] 126 | split_points = determine_split_points(pages, max_words=1000) 127 | 128 | # Should have multiple parts 129 | assert len(split_points) >= 2 130 | 131 | def test_respects_word_limit(self): 132 | """Test that word limit is respected.""" 133 | pages = [ 134 | {'text': 'Page ' + str(i), 'word_count': 400} 135 | for i in range(10) 136 | ] 137 | split_points = determine_split_points(pages, max_words=1000) 138 | 139 | # Verify each part is within limits 140 | prev_split = -1 141 | for split_point in split_points: 142 | part_words = sum( 143 | pages[i]['word_count'] 144 | for i in range(prev_split + 1, split_point + 1) 145 | ) 146 | # Allow some flexibility 147 | assert part_words <= 1200 # max_words + buffer 148 | prev_split = split_point 149 | 150 | def test_empty_pdf(self): 151 | """Test handling of empty PDF.""" 152 | split_points = determine_split_points([], max_words=1000) 153 | assert split_points == [] 154 | 155 | def 
test_single_page_pdf(self): 156 | """Test handling of single page PDF.""" 157 | pages = [{'text': 'Content', 'word_count': 500}] 158 | split_points = determine_split_points(pages, max_words=1000) 159 | 160 | assert len(split_points) == 1 161 | assert split_points[0] == 0 162 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Shared utility functions for book splitting operations. 3 | """ 4 | 5 | import os 6 | import re 7 | from typing import Optional, List, Dict, Any 8 | 9 | 10 | def count_words(text: str) -> int: 11 | """ 12 | Count words in text using a simple regex approach. 13 | 14 | Args: 15 | text: Text to count words in. 16 | 17 | Returns: 18 | Number of words in the text. 19 | """ 20 | # Remove HTML tags if any remain 21 | text = re.sub(r'<[^>]+>', ' ', text) 22 | 23 | # Normalize whitespace 24 | text = re.sub(r'\s+', ' ', text).strip() 25 | 26 | # Count words (sequences of alphanumeric characters) 27 | words = re.findall(r'\w+', text) 28 | return len(words) 29 | 30 | 31 | def normalize_output_dir(output_dir: Optional[str], input_file: str) -> str: 32 | """ 33 | Normalize the output directory path. 34 | 35 | If output_dir is None, uses the directory of the input file. 36 | If input_file has no directory part, defaults to current directory. 37 | 38 | Args: 39 | output_dir: Specified output directory, or None for default. 40 | input_file: Path to the input file. 41 | 42 | Returns: 43 | Normalized output directory path. 44 | """ 45 | if output_dir is None: 46 | output_dir = os.path.dirname(os.path.abspath(input_file)) 47 | if not output_dir: 48 | output_dir = '.' 49 | return output_dir 50 | 51 | 52 | def validate_input_file(file_path: str, expected_extensions: Optional[List[str]] = None) -> None: 53 | """ 54 | Validate that the input file exists and has the expected extension. 55 | 56 | Args: 57 | file_path: Path to the file to validate. 58 | expected_extensions: List of valid extensions (e.g., ['.epub', '.pdf']). 59 | 60 | Raises: 61 | FileNotFoundError: If the file does not exist. 62 | ValueError: If the file has an invalid extension. 63 | """ 64 | if not os.path.isfile(file_path): 65 | raise FileNotFoundError(f"Input file '{file_path}' does not exist.") 66 | 67 | if expected_extensions: 68 | ext = os.path.splitext(file_path)[1].lower() 69 | if ext not in [e.lower() for e in expected_extensions]: 70 | raise ValueError( 71 | f"Input file '{file_path}' has invalid extension '{ext}'. " 72 | f"Expected: {', '.join(expected_extensions)}" 73 | ) 74 | 75 | 76 | def ensure_output_dir(output_dir: str) -> None: 77 | """ 78 | Ensure the output directory exists, creating it if necessary. 79 | 80 | Args: 81 | output_dir: Path to the output directory. 82 | 83 | Raises: 84 | OSError: If the directory cannot be created. 85 | """ 86 | if not os.path.exists(output_dir): 87 | os.makedirs(output_dir) 88 | 89 | 90 | def sanitize_for_anchor(text: str) -> str: 91 | """ 92 | Sanitize text for use as an anchor in markdown. 93 | 94 | Args: 95 | text: Text to sanitize. 96 | 97 | Returns: 98 | Sanitized text suitable for markdown anchors. 
99 | """ 100 | # Replace spaces and special characters with hyphens 101 | text = re.sub(r'[^\w\s-]', '', text.lower()) 102 | text = re.sub(r'[\s-]+', '-', text) 103 | return text 104 | 105 | 106 | def split_long_text(text: str, max_words: int, overlap_words: int = 1000) -> List[str]: 107 | """ 108 | Split long text into smaller chunks when chapter-based chunking isn't sufficient. 109 | Used as fallback when individual chapters/sections are too large. 110 | 111 | Ported from dingran.me chunking.ts implementation. 112 | 113 | Args: 114 | text: Text to split. 115 | max_words: Maximum words per chunk. 116 | overlap_words: Number of words to overlap between chunks (default: 1000). 117 | 118 | Returns: 119 | List of text chunks. 120 | """ 121 | words = text.strip().split() 122 | total_words = len(words) 123 | 124 | if total_words <= max_words: 125 | return [text] 126 | 127 | chunks = [] 128 | start = 0 129 | 130 | while start < total_words: 131 | end = min(start + max_words, total_words) 132 | 133 | # Try to find a good break point (sentence ending) 134 | actual_end = end 135 | if end < total_words: 136 | # Look backwards for a sentence ending within the last 20% of the chunk 137 | min_end = start + int(max_words * 0.8) 138 | for i in range(end, min_end, -1): 139 | word = words[i - 1] # -1 because we're looking at the word before position i 140 | if word.endswith('.') or word.endswith('!') or word.endswith('?'): 141 | actual_end = i 142 | break 143 | 144 | chunk_words = words[start:actual_end] 145 | chunks.append(' '.join(chunk_words)) 146 | 147 | # Move start point (overlap only if not at end) 148 | if actual_end < total_words: 149 | start = max(actual_end - overlap_words, actual_end) 150 | else: 151 | break 152 | 153 | return chunks 154 | 155 | 156 | def get_chunking_stats(chunks: List[Dict[str, Any]]) -> Dict[str, Any]: 157 | """ 158 | Get statistics for a set of chunks. 159 | 160 | Ported from dingran.me chunking.ts implementation. 161 | 162 | Args: 163 | chunks: List of chunk dictionaries with 'word_count' and optional 'chapter_info' keys. 164 | 165 | Returns: 166 | Dictionary with statistics including total_chunks, total_words, avg_words_per_chunk, 167 | min_words, max_words, and chapter_spread. 168 | """ 169 | if not chunks: 170 | return { 171 | 'total_chunks': 0, 172 | 'total_words': 0, 173 | 'avg_words_per_chunk': 0, 174 | 'min_words': 0, 175 | 'max_words': 0, 176 | 'chapter_spread': 'No chunks', 177 | } 178 | 179 | word_counts = [c.get('word_count', 0) for c in chunks] 180 | total_chunks = len(chunks) 181 | total_words = sum(word_counts) 182 | avg_words_per_chunk = round(total_words / total_chunks) if total_chunks > 0 else 0 183 | min_words = min(word_counts) 184 | max_words = max(word_counts) 185 | 186 | # Analyze chapter spread 187 | chapter_info_list = [] 188 | for c in chunks: 189 | chapter_info = c.get('chapter_info') 190 | if chapter_info: 191 | start = chapter_info.get('start_chapter', '?') 192 | end = chapter_info.get('end_chapter', '?') 193 | chapter_info_list.append(f"{start}-{end}") 194 | 195 | chapter_spread = ', '.join(chapter_info_list) if chapter_info_list else 'No chapter info' 196 | 197 | return { 198 | 'total_chunks': total_chunks, 199 | 'total_words': total_words, 200 | 'avg_words_per_chunk': avg_words_per_chunk, 201 | 'min_words': min_words, 202 | 'max_words': max_words, 203 | 'chapter_spread': chapter_spread, 204 | } 205 | 206 | 207 | def repair_hyphenation(text: str) -> str: 208 | """ 209 | Repair words that were hyphenated across line breaks. 
210 | 211 | Common in PDFs where words are split at line endings. 212 | Ported from dingran.me pdf.ts implementation. 213 | 214 | Args: 215 | text: Text with potentially broken hyphenations. 216 | 217 | Returns: 218 | Text with repaired hyphenations. 219 | """ 220 | # Fix hyphenated words broken across lines: word- \n next -> wordnext 221 | return re.sub(r'(\w+)-\s*\n\s*(\w+)', r'\1\2', text) 222 | -------------------------------------------------------------------------------- /splitter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Unified Book Splitter - A tool that can split EPUB or PDF files into multiple files 4 | with support for different output formats (Markdown, EPUB, PDF). 5 | """ 6 | 7 | import os 8 | import sys 9 | import argparse 10 | from typing import List, Optional 11 | 12 | from utils import normalize_output_dir, ensure_output_dir 13 | 14 | def split_book(input_file: str, 15 | output_format: str = 'markdown', 16 | max_words: int = 80000, 17 | output_dir: Optional[str] = None, 18 | strict_boundaries: bool = False, 19 | verbose: bool = False) -> List[str]: 20 | """ 21 | Split any supported book file (EPUB or PDF) into multiple files of the specified format. 22 | 23 | Args: 24 | input_file: Path to the input book file (EPUB or PDF). 25 | output_format: Output format ('markdown', 'epub', or 'pdf'). 26 | max_words: Maximum words per output file. 27 | output_dir: Directory for output files (defaults to input file's directory). 28 | strict_boundaries: Whether to split only at detected chapter/section boundaries. 29 | verbose: Whether to print verbose output. 30 | 31 | Returns: 32 | List of paths to the created files. 33 | """ 34 | output_dir = normalize_output_dir(output_dir, input_file) 35 | input_ext = os.path.splitext(input_file)[1].lower() 36 | 37 | # Validate input format 38 | if input_ext not in ['.epub', '.pdf']: 39 | print(f"Error: Unsupported input format: {input_ext}") 40 | print("Supported input formats: .epub, .pdf") 41 | return [] 42 | 43 | # Validate output format 44 | if output_format.lower() not in ['markdown', 'epub', 'pdf']: 45 | print(f"Error: Unsupported output format: {output_format}") 46 | print("Supported output formats: markdown, epub, pdf") 47 | return [] 48 | 49 | # EPUB to Markdown conversion 50 | if input_ext == '.epub' and output_format.lower() == 'markdown': 51 | try: 52 | from epub_to_markdown import split_epub 53 | return split_epub( 54 | input_file=input_file, 55 | max_words=max_words, 56 | output_dir=output_dir, 57 | strict_chapters=strict_boundaries, 58 | verbose=verbose 59 | ) 60 | except ImportError: 61 | print("Error: EPUB to Markdown support not found. Make sure epub_to_markdown.py is in the same directory.") 62 | return [] 63 | 64 | # EPUB to EPUB conversion (splitting) 65 | elif input_ext == '.epub' and output_format.lower() == 'epub': 66 | try: 67 | from epub_splitter import split_epub_to_epub 68 | return split_epub_to_epub( 69 | input_file=input_file, 70 | max_words=max_words, 71 | output_dir=output_dir, 72 | strict_boundaries=strict_boundaries, 73 | verbose=verbose 74 | ) 75 | except ImportError: 76 | print("Error: EPUB to EPUB support not found. 
Make sure epub_splitter.py is in the same directory.") 77 | return [] 78 | 79 | # PDF to Markdown conversion 80 | elif input_ext == '.pdf' and output_format.lower() == 'markdown': 81 | try: 82 | from pdf_to_markdown import split_pdf 83 | return split_pdf( 84 | input_file=input_file, 85 | max_words=max_words, 86 | output_dir=output_dir, 87 | strict_boundaries=strict_boundaries, 88 | verbose=verbose 89 | ) 90 | except ImportError: 91 | print("Error: PDF to Markdown support not found. Make sure pdf_to_markdown.py is in the same directory.") 92 | return [] 93 | 94 | # PDF to PDF conversion (splitting) 95 | elif input_ext == '.pdf' and output_format.lower() == 'pdf': 96 | print("Error: PDF to PDF splitting is not yet implemented.") 97 | return [] 98 | 99 | # EPUB to PDF conversion 100 | elif input_ext == '.epub' and output_format.lower() == 'pdf': 101 | print("Error: EPUB to PDF conversion is not yet implemented.") 102 | return [] 103 | 104 | # PDF to EPUB conversion 105 | elif input_ext == '.pdf' and output_format.lower() == 'epub': 106 | print("Error: PDF to EPUB conversion is not yet implemented.") 107 | return [] 108 | 109 | return [] 110 | 111 | def main(): 112 | """Main entry point for the script.""" 113 | parser = argparse.ArgumentParser(description='Split a book file (EPUB or PDF) into multiple parts') 114 | 115 | parser.add_argument('input_file', help='Path to input book file (EPUB or PDF)') 116 | parser.add_argument('--output-format', choices=['markdown', 'epub', 'pdf'], default='markdown', 117 | help='Output format (default: markdown)') 118 | parser.add_argument('--max-words', type=int, default=80000, 119 | help='Maximum words per output file (default: 80000)') 120 | parser.add_argument('--output-dir', default=None, 121 | help='Directory for output files (default: same directory as input file)') 122 | parser.add_argument('--strict-boundaries', action='store_true', 123 | help='Only split at chapter/section boundaries when possible') 124 | parser.add_argument('--verbose', action='store_true', 125 | help='Print detailed processing information') 126 | 127 | args = parser.parse_args() 128 | 129 | # Validate arguments 130 | if not os.path.isfile(args.input_file): 131 | print(f"Error: Input file '{args.input_file}' does not exist.") 132 | return 1 133 | 134 | if not args.input_file.lower().endswith(('.epub', '.pdf')): 135 | print(f"Error: Input file '{args.input_file}' is not a supported format (EPUB or PDF).") 136 | return 1 137 | 138 | if args.output_dir is not None and not os.path.exists(args.output_dir): 139 | try: 140 | ensure_output_dir(args.output_dir) 141 | except OSError as e: 142 | print(f"Error: Cannot create output directory '{args.output_dir}': {e}") 143 | return 1 144 | 145 | if args.max_words <= 0: 146 | print("Error: Maximum words must be a positive number.") 147 | return 1 148 | 149 | # Display configuration 150 | effective_output_dir = args.output_dir or os.path.dirname(os.path.abspath(args.input_file)) or '.' 
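    # Display-only value: split_book() normalizes the output directory itself via
    # normalize_output_dir() in utils.py, which applies the same fallback chain
    # (input file's directory, then '.').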
151 | print(f"Processing '{args.input_file}'") 152 | print(f"Output format: {args.output_format}") 153 | print(f"Maximum words per file: {args.max_words}") 154 | print(f"Output directory: {effective_output_dir}") 155 | print(f"Strict boundaries: {'Yes' if args.strict_boundaries else 'No'}") 156 | print(f"Verbose mode: {'Yes' if args.verbose else 'No'}") 157 | print("---") 158 | 159 | try: 160 | result = split_book( 161 | input_file=args.input_file, 162 | output_format=args.output_format, 163 | max_words=args.max_words, 164 | output_dir=args.output_dir, 165 | strict_boundaries=args.strict_boundaries, 166 | verbose=args.verbose 167 | ) 168 | 169 | if result: 170 | print(f"\nProcessing complete. Created {len(result)} files:") 171 | for i, output_file in enumerate(result, 1): 172 | print(f" {i}. {output_file}") 173 | return 0 174 | else: 175 | print("No output files were created.") 176 | return 1 177 | 178 | except Exception as e: 179 | print(f"Error: {str(e)}") 180 | return 1 181 | 182 | if __name__ == "__main__": 183 | sys.exit(main()) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Book Splitter 2 | 3 | A set of command-line tools to split large EPUB and PDF books into smaller, more manageable parts. 4 | 5 | ## Features 6 | 7 | - Split EPUB files based on word count (default: 80,000 words per part) 8 | - Split PDF files based on word count and page boundaries 9 | - Intelligently split at chapter boundaries when possible 10 | - Multiple output formats supported: 11 | - Multiple EPUB files 12 | - Multiple Markdown files 13 | - (Future support for PDF output) 14 | - Preserve book structure and content 15 | - Option for strict chapter boundary splitting 16 | - Unified command-line interface for all formats 17 | 18 | ## Installation 19 | 20 | ### Prerequisites 21 | 22 | - Python 3.7 or higher 23 | - pip (Python package manager) 24 | 25 | ### Setup 26 | 27 | 1. Clone the repository: 28 | 29 | ```bash 30 | git clone https://github.com/dingran/book-splitter.git 31 | cd book-splitter 32 | ``` 33 | 34 | 2. Create and activate a virtual environment (recommended): 35 | 36 | ```bash 37 | python -m venv venv 38 | source venv/bin/activate # On Windows: venv\Scripts\activate 39 | ``` 40 | 41 | 3. Install the required packages: 42 | 43 | ```bash 44 | pip install -r requirements.txt 45 | ``` 46 | 47 | ## Usage 48 | 49 | ### Unified Splitter (Recommended) 50 | 51 | The easiest way to use this tool is through the unified `splitter.py` script: 52 | 53 | ```bash 54 | python splitter.py my_book.epub 55 | python splitter.py my_book.pdf 56 | ``` 57 | 58 | By default, this will convert your input file to multiple markdown files. You can specify the output format: 59 | 60 | ```bash 61 | python splitter.py my_book.epub --output-format epub 62 | python splitter.py my_book.pdf --output-format markdown 63 | ``` 64 | 65 | ### EPUB to EPUB Splitting 66 | 67 | Basic usage: 68 | 69 | ```bash 70 | python epub_splitter.py my_book.epub 71 | ``` 72 | 73 | This will split `my_book.epub` into multiple EPUB files with a maximum of 80,000 words each, preferring to split at chapter boundaries. 74 | 75 | ### EPUB to Markdown Splitting 76 | 77 | To convert an EPUB to multiple markdown files: 78 | 79 | ```bash 80 | python epub_to_markdown.py my_book.epub 81 | ``` 82 | 83 | This will split `my_book.epub` into multiple markdown (.md) files with a maximum of 80,000 words each. 
Each markdown file will include a table of contents and chapter headers.
84 | 
85 | ### PDF to Markdown Splitting
86 | 
87 | To convert a PDF to multiple markdown files:
88 | 
89 | ```bash
90 | python pdf_to_markdown.py my_book.pdf
91 | ```
92 | 
93 | This will split `my_book.pdf` into multiple markdown (.md) files with a maximum of 80,000 words each. Each markdown file will include page numbers and word counts.
94 | 
95 | ### Command-line options for the unified splitter
96 | 
97 | ```
98 | usage: splitter.py [-h] [--output-format {markdown,epub,pdf}] [--max-words MAX_WORDS]
99 |                    [--output-dir OUTPUT_DIR] [--strict-boundaries] [--verbose] input_file
100 | 
101 | Split a book file (EPUB or PDF) into multiple parts
102 | 
103 | positional arguments:
104 |   input_file            Path to input book file (EPUB or PDF)
105 | 
106 | optional arguments:
107 |   -h, --help            show this help message and exit
108 |   --output-format {markdown,epub,pdf}
109 |                         Output format (default: markdown)
110 |   --max-words MAX_WORDS
111 |                         Maximum words per output file (default: 80000)
112 |   --output-dir OUTPUT_DIR
113 |                         Directory for output files (default: same directory as input file)
114 |   --strict-boundaries   Only split at chapter/section boundaries when possible
115 |   --verbose             Print detailed processing information
116 | ```
117 | 
118 | Note: Individual tools use `--strict-chapters` or `--strict-boundaries` depending on the script, but the unified splitter standardizes on `--strict-boundaries` for all formats.
119 | 
120 | ### Examples
121 | 
122 | Split a book with a maximum of 50,000 words per part:
123 | 
124 | ```bash
125 | python epub_to_markdown.py --max-words 50000 my_book.epub
126 | ```
127 | 
128 | Only split at chapter boundaries, even if it means some parts might be larger than the word limit:
129 | 
130 | ```bash
131 | python epub_splitter.py --strict-boundaries my_book.epub
132 | ```
133 | 
134 | Output files to a specific directory and show detailed processing information:
135 | 
136 | ```bash
137 | python epub_to_markdown.py --output-dir ./split_books --verbose my_book.epub
138 | ```
139 | 
140 | Split a PDF book with detailed processing information:
141 | 
142 | ```bash
143 | python pdf_to_markdown.py --verbose my_book.pdf
144 | ```
145 | 
146 | Use the unified splitter to convert an EPUB to markdown with custom word count:
147 | 
148 | ```bash
149 | python splitter.py --output-format markdown --max-words 50000 my_book.epub
150 | ```
151 | 
152 | ## Project Structure
153 | 
154 | The project includes these main components:
155 | 
156 | - **`splitter.py`** - Unified tool that handles both EPUB and PDF conversion to various formats
157 |   - Serves as the main entry point with a standardized interface
158 |   - Can process both EPUB and PDF files as input
159 |   - Supports multiple output formats (markdown, EPUB, with PDF planned for future)
160 |   - Provides a consistent command-line interface for all operations
161 | 
162 | - **`epub_splitter.py`** - Tool for splitting EPUB files into multiple EPUB files
163 |   - Contains the `split_epub_to_epub` function for processing EPUBs
164 |   - Maintains chapter boundaries and book structure
165 |   - Preserves metadata, CSS, images, and other assets from the original EPUB
166 | 
167 | - **`epub_to_markdown.py`** - Tool for converting EPUB files to markdown
168 |   - Extracts chapter content from EPUB files
169 |   - Formats the content as markdown with proper headers and links
170 |   - Creates tables of contents for each output file
171 |   - Splits content into multiple files based on word count
172 | 
173 | - 
**`pdf_to_markdown.py`** - Tool for converting PDF files to markdown 174 | - Extracts text from PDF pages using multiple extraction methods for accuracy 175 | - Detects section boundaries where possible 176 | - Preserves page numbers in the output markdown 177 | - Cleans up common OCR and formatting issues 178 | 179 | - **`epub_processor.py`** - Core library for handling EPUB processing 180 | - Contains the `EPUBProcessor` class that manages EPUB manipulation 181 | - Handles chapter extraction, word counting, and split point determination 182 | - Preserves metadata and assets between split files 183 | - Used by epub_splitter.py to perform the actual EPUB splitting 184 | 185 | - **`requirements.txt`** - Required Python packages for running the tools 186 | - **`LICENSE`** - MIT License for the project 187 | - **`technical_design.md`** - Technical design documentation explaining architecture decisions 188 | 189 | ## Code Organization 190 | 191 | The codebase is modular and follows these design principles: 192 | 193 | 1. **Separation of concerns**: Each file focuses on a specific task (EPUB splitting, PDF processing, etc.) 194 | 2. **Common interfaces**: Similar functions across files follow the same parameter patterns 195 | 3. **Unified frontend**: The `splitter.py` script provides a single entry point for all functionality 196 | 4. **Error handling**: Comprehensive error checking with informative messages 197 | 5. **Documentation**: Detailed docstrings and command-line help 198 | 199 | The unified interface in `splitter.py` makes it easy to use for most common tasks, while the specialized scripts provide more tailored functionality for specific use cases. 200 | 201 | ## Limitations 202 | 203 | - Some complex EPUB structures might not be handled perfectly 204 | - Internal links between different parts of the book might be broken after splitting 205 | - Very large chapters cannot be split internally (will be kept as a single unit) 206 | - PDF extraction quality depends on the PDF structure; scanned PDFs may have lower quality text extraction 207 | - Some PDF formatting and layout may be lost in the conversion to markdown 208 | 209 | ## License 210 | 211 | This project is licensed under the MIT License - see the LICENSE file for details. -------------------------------------------------------------------------------- /epub_to_markdown.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | EPUB to Markdown Splitter - Splits an EPUB book into multiple markdown files based on word count. 4 | """ 5 | 6 | import os 7 | import sys 8 | import re 9 | import argparse 10 | from typing import List, Optional 11 | from bs4 import BeautifulSoup 12 | import ebooklib 13 | from ebooklib import epub 14 | import html2text 15 | 16 | from utils import count_words, normalize_output_dir, sanitize_for_anchor, ensure_output_dir 17 | 18 | def extract_markdown_from_html(html_content): 19 | """Convert HTML content to markdown.""" 20 | h = html2text.HTML2Text() 21 | h.ignore_links = False 22 | h.ignore_images = False 23 | h.ignore_tables = False 24 | h.ignore_emphasis = False 25 | return h.handle(html_content) 26 | 27 | def split_epub(input_file: str, max_words: int = 80000, output_dir: Optional[str] = None, 28 | strict_chapters: bool = False, verbose: bool = False) -> List[str]: 29 | """Split an EPUB file into multiple markdown files. 30 | 31 | Args: 32 | input_file: Path to the input EPUB file. 33 | max_words: Maximum words per output file. 
34 | output_dir: Directory for output files (defaults to input file's directory). 35 | strict_chapters: Only split at chapter boundaries. 36 | verbose: Print verbose output. 37 | 38 | Returns: 39 | List of paths to created markdown files. 40 | """ 41 | output_dir = normalize_output_dir(output_dir, input_file) 42 | def log(message): 43 | if verbose: 44 | print(message) 45 | 46 | log(f"Processing '{input_file}'...") 47 | 48 | # Extract book name from input file path 49 | book_name = os.path.splitext(os.path.basename(input_file))[0] 50 | 51 | try: 52 | # Load the book 53 | book = epub.read_epub(input_file) 54 | 55 | # Get book title 56 | book_title = getattr(book, 'title', book_name) or book_name 57 | 58 | # Get spine items (ordered document items) 59 | log("Extracting chapters...") 60 | chapters = [] 61 | total_word_count = 0 62 | 63 | # Collect all document items from the spine 64 | for itemref in book.spine: 65 | item_id = itemref[0] 66 | item = book.get_item_with_id(item_id) 67 | if item is not None and item.get_type() == ebooklib.ITEM_DOCUMENT: 68 | # Get content and convert to markdown 69 | content = item.get_content().decode('utf-8') 70 | soup = BeautifulSoup(content, 'html.parser') 71 | 72 | # Try to extract title 73 | title_elem = soup.find(['h1', 'h2', 'h3', 'title']) 74 | title = title_elem.get_text().strip() if title_elem else f"Chapter {len(chapters) + 1}" 75 | 76 | # Convert to markdown 77 | markdown_content = extract_markdown_from_html(content) 78 | 79 | # Count words 80 | word_count = count_words(markdown_content) 81 | total_word_count += word_count 82 | 83 | # Add chapter to list 84 | chapters.append({ 85 | 'title': title, 86 | 'markdown': markdown_content, 87 | 'word_count': word_count 88 | }) 89 | 90 | log(f" - {title} ({word_count} words)") 91 | 92 | log(f"Found {len(chapters)} chapters with a total of {total_word_count} words.") 93 | 94 | # Determine split points 95 | log("Determining split points...") 96 | split_points = [] 97 | current_word_count = 0 98 | current_chapters = [] 99 | 100 | for i, chapter in enumerate(chapters): 101 | # Check if adding this chapter would exceed the word limit 102 | if current_word_count + chapter['word_count'] > max_words and current_chapters: 103 | if strict_chapters or current_word_count >= 0.4 * max_words: 104 | # Split before current chapter 105 | split_points.append(i - 1) 106 | current_word_count = chapter['word_count'] 107 | current_chapters = [chapter] 108 | else: 109 | # Add chapter and split after it 110 | current_chapters.append(chapter) 111 | current_word_count += chapter['word_count'] 112 | split_points.append(i) 113 | current_word_count = 0 114 | current_chapters = [] 115 | else: 116 | # Add chapter to current part 117 | current_chapters.append(chapter) 118 | current_word_count += chapter['word_count'] 119 | 120 | # Add the last part if there are remaining chapters 121 | if current_chapters: 122 | split_points.append(len(chapters) - 1) 123 | 124 | # Log split information 125 | part_start = 0 126 | for i, split_point in enumerate(split_points): 127 | part_chapters = chapters[part_start:split_point + 1] 128 | part_words = sum(c['word_count'] for c in part_chapters) 129 | log(f" Part {i+1}: Chapters {part_start+1}-{split_point+1} ({part_words} words)") 130 | part_start = split_point + 1 131 | 132 | # Create output files 133 | log("Creating output markdown files...") 134 | output_files = [] 135 | 136 | part_start = 0 137 | for i, split_point in enumerate(split_points): 138 | part_num = i + 1 139 | output_filename = 
f"{book_name}_part{part_num}.md" 140 | output_path = os.path.join(output_dir, output_filename) 141 | 142 | log(f" Creating {output_filename}...") 143 | 144 | # Get chapters for this part 145 | part_chapters = chapters[part_start:split_point + 1] 146 | 147 | # Create markdown content for this part 148 | markdown_content = f"# {book_title} - Part {part_num}\n\n" 149 | markdown_content += f"*Words: {sum(c['word_count'] for c in part_chapters)}*\n\n" 150 | 151 | # Add table of contents 152 | markdown_content += "## Table of Contents\n\n" 153 | for j, chapter in enumerate(part_chapters): 154 | markdown_content += f"{j+1}. [{chapter['title']}](#{sanitize_for_anchor(chapter['title'])})\n" 155 | markdown_content += "\n---\n\n" 156 | 157 | # Add chapter content 158 | for chapter in part_chapters: 159 | markdown_content += f"## {chapter['title']}\n\n" 160 | markdown_content += chapter['markdown'] 161 | markdown_content += "\n\n---\n\n" 162 | 163 | # Write to file 164 | with open(output_path, 'w', encoding='utf-8') as f: 165 | f.write(markdown_content) 166 | 167 | output_files.append(output_path) 168 | part_start = split_point + 1 169 | 170 | return output_files 171 | 172 | except Exception as e: 173 | if verbose: 174 | import traceback 175 | traceback.print_exc() 176 | raise Exception(f"Error processing EPUB: {str(e)}") 177 | 178 | def main(): 179 | """Main entry point for the script.""" 180 | parser = argparse.ArgumentParser(description='Split an EPUB file into multiple markdown files') 181 | 182 | parser.add_argument('input_file', help='Path to input EPUB file') 183 | parser.add_argument('--max-words', type=int, default=80000, 184 | help='Maximum words per output file (default: 80000)') 185 | parser.add_argument('--output-dir', default=None, 186 | help='Directory for output files (default: same directory as input file)') 187 | parser.add_argument('--strict-chapters', action='store_true', 188 | help='Only split at chapter boundaries, even if exceeding word limit') 189 | parser.add_argument('--verbose', action='store_true', 190 | help='Print detailed processing information') 191 | 192 | args = parser.parse_args() 193 | 194 | # Validate arguments 195 | if not os.path.isfile(args.input_file): 196 | print(f"Error: Input file '{args.input_file}' does not exist.") 197 | return 1 198 | 199 | if not args.input_file.lower().endswith('.epub'): 200 | print(f"Error: Input file '{args.input_file}' is not an EPUB file.") 201 | return 1 202 | 203 | if args.output_dir is not None and not os.path.exists(args.output_dir): 204 | try: 205 | ensure_output_dir(args.output_dir) 206 | except OSError as e: 207 | print(f"Error: Cannot create output directory '{args.output_dir}': {e}") 208 | return 1 209 | 210 | if args.max_words <= 0: 211 | print("Error: Maximum words must be a positive number.") 212 | return 1 213 | 214 | try: 215 | result = split_epub( 216 | input_file=args.input_file, 217 | max_words=args.max_words, 218 | output_dir=args.output_dir, 219 | strict_chapters=args.strict_chapters, 220 | verbose=args.verbose 221 | ) 222 | 223 | print(f"\nProcessing complete. Created {len(result)} markdown files:") 224 | for i, output_file in enumerate(result, 1): 225 | print(f" {i}. 
{output_file}") 226 | 227 | except Exception as e: 228 | print(f"Error: {str(e)}") 229 | return 1 230 | 231 | return 0 232 | 233 | if __name__ == "__main__": 234 | sys.exit(main()) -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for shared utility functions. 3 | """ 4 | 5 | import os 6 | import pytest 7 | import tempfile 8 | 9 | from utils import ( 10 | count_words, 11 | normalize_output_dir, 12 | validate_input_file, 13 | ensure_output_dir, 14 | sanitize_for_anchor, 15 | split_long_text, 16 | get_chunking_stats, 17 | repair_hyphenation, 18 | ) 19 | 20 | 21 | class TestCountWords: 22 | """Tests for count_words function.""" 23 | 24 | def test_simple_text(self): 25 | """Test word counting with simple text.""" 26 | text = "Hello world this is a test" 27 | assert count_words(text) == 6 28 | 29 | def test_with_html_tags(self): 30 | """Test that HTML tags are stripped before counting.""" 31 | text = "Hello world
" 32 | assert count_words(text) == 2 33 | 34 | def test_with_multiple_whitespace(self): 35 | """Test that multiple whitespace is normalized.""" 36 | text = "Hello world\n\nthis is\ta test" 37 | assert count_words(text) == 6 38 | 39 | def test_empty_string(self): 40 | """Test word counting with empty string.""" 41 | assert count_words("") == 0 42 | 43 | def test_only_whitespace(self): 44 | """Test word counting with only whitespace.""" 45 | assert count_words(" \n\t ") == 0 46 | 47 | def test_with_numbers(self): 48 | """Test that numbers are counted as words.""" 49 | text = "There are 5 items and 10 more" 50 | assert count_words(text) == 7 51 | 52 | def test_with_punctuation(self): 53 | """Test that punctuation doesn't create extra words.""" 54 | text = "Hello, world! How are you?" 55 | assert count_words(text) == 5 56 | 57 | 58 | class TestNormalizeOutputDir: 59 | """Tests for normalize_output_dir function.""" 60 | 61 | def test_with_none_output_dir(self, temp_dir): 62 | """Test that None output_dir uses input file's directory.""" 63 | input_file = os.path.join(temp_dir, "test.epub") 64 | result = normalize_output_dir(None, input_file) 65 | assert result == temp_dir 66 | 67 | def test_with_provided_output_dir(self, temp_dir): 68 | """Test that provided output_dir is returned unchanged.""" 69 | input_file = os.path.join(temp_dir, "test.epub") 70 | output_dir = "/custom/output/path" 71 | result = normalize_output_dir(output_dir, input_file) 72 | assert result == output_dir 73 | 74 | def test_with_relative_input_file(self): 75 | """Test with relative input file path.""" 76 | result = normalize_output_dir(None, "test.epub") 77 | # Should return current directory since input has no directory part 78 | assert os.path.isabs(result) 79 | 80 | 81 | class TestValidateInputFile: 82 | """Tests for validate_input_file function.""" 83 | 84 | def test_nonexistent_file(self): 85 | """Test that nonexistent file raises FileNotFoundError.""" 86 | with pytest.raises(FileNotFoundError): 87 | validate_input_file("/nonexistent/path/file.epub") 88 | 89 | def test_invalid_extension(self, temp_dir): 90 | """Test that invalid extension raises ValueError.""" 91 | test_file = os.path.join(temp_dir, "test.txt") 92 | with open(test_file, "w") as f: 93 | f.write("test") 94 | 95 | with pytest.raises(ValueError): 96 | validate_input_file(test_file, [".epub", ".pdf"]) 97 | 98 | def test_valid_file(self, temp_dir): 99 | """Test that valid file passes validation.""" 100 | test_file = os.path.join(temp_dir, "test.epub") 101 | with open(test_file, "w") as f: 102 | f.write("test") 103 | 104 | # Should not raise 105 | validate_input_file(test_file, [".epub", ".pdf"]) 106 | 107 | def test_no_extension_check(self, temp_dir): 108 | """Test validation without extension check.""" 109 | test_file = os.path.join(temp_dir, "test.txt") 110 | with open(test_file, "w") as f: 111 | f.write("test") 112 | 113 | # Should not raise when no extensions specified 114 | validate_input_file(test_file) 115 | 116 | 117 | class TestEnsureOutputDir: 118 | """Tests for ensure_output_dir function.""" 119 | 120 | def test_creates_new_directory(self, temp_dir): 121 | """Test that new directory is created.""" 122 | new_dir = os.path.join(temp_dir, "new_subdir") 123 | assert not os.path.exists(new_dir) 124 | ensure_output_dir(new_dir) 125 | assert os.path.exists(new_dir) 126 | 127 | def test_existing_directory(self, temp_dir): 128 | """Test that existing directory doesn't cause error.""" 129 | # Should not raise 130 | ensure_output_dir(temp_dir) 131 | 132 | 
def test_nested_directories(self, temp_dir): 133 | """Test creating nested directories.""" 134 | nested_dir = os.path.join(temp_dir, "a", "b", "c") 135 | ensure_output_dir(nested_dir) 136 | assert os.path.exists(nested_dir) 137 | 138 | 139 | class TestSanitizeForAnchor: 140 | """Tests for sanitize_for_anchor function.""" 141 | 142 | def test_simple_text(self): 143 | """Test simple text sanitization.""" 144 | assert sanitize_for_anchor("Hello World") == "hello-world" 145 | 146 | def test_special_characters(self): 147 | """Test removal of special characters.""" 148 | assert sanitize_for_anchor("Chapter 1: Introduction!") == "chapter-1-introduction" 149 | 150 | def test_multiple_spaces(self): 151 | """Test that multiple spaces become single hyphen.""" 152 | assert sanitize_for_anchor("Hello World") == "hello-world" 153 | 154 | def test_already_hyphenated(self): 155 | """Test text with existing hyphens.""" 156 | assert sanitize_for_anchor("well-known-term") == "well-known-term" 157 | 158 | 159 | class TestSplitLongText: 160 | """Tests for split_long_text function.""" 161 | 162 | def test_short_text_not_split(self): 163 | """Test that short text is not split.""" 164 | text = "This is a short text" 165 | result = split_long_text(text, max_words=100) 166 | assert len(result) == 1 167 | assert result[0] == text 168 | 169 | def test_long_text_split(self): 170 | """Test that long text is split at sentence boundaries.""" 171 | # Create text with multiple sentences 172 | sentences = ["This is sentence number {}.".format(i) for i in range(50)] 173 | text = " ".join(sentences) 174 | 175 | result = split_long_text(text, max_words=50) 176 | assert len(result) > 1 177 | 178 | # Verify each chunk is within limit (approximately) 179 | for chunk in result: 180 | word_count = len(chunk.split()) 181 | # Allow some flexibility due to sentence boundary seeking 182 | assert word_count <= 60 # max_words + some buffer 183 | 184 | def test_respects_sentence_boundaries(self): 185 | """Test that splits try to occur at sentence endings when possible.""" 186 | # Use longer text so the algorithm has room to find sentence boundaries 187 | sentences = ["Sentence number {} here.".format(i) for i in range(20)] 188 | text = " ".join(sentences) 189 | result = split_long_text(text, max_words=15, overlap_words=0) 190 | 191 | # Count how many chunks end with sentence-ending punctuation 192 | sentence_endings = 0 193 | for chunk in result[:-1]: # Exclude last chunk 194 | last_word = chunk.split()[-1] 195 | if last_word.endswith('.') or last_word.endswith('!') or last_word.endswith('?'): 196 | sentence_endings += 1 197 | 198 | # At least some chunks should end at sentence boundaries 199 | # (algorithm tries but can't always succeed) 200 | assert sentence_endings > 0 or len(result) == 1 201 | 202 | 203 | class TestGetChunkingStats: 204 | """Tests for get_chunking_stats function.""" 205 | 206 | def test_empty_chunks(self): 207 | """Test stats for empty chunk list.""" 208 | stats = get_chunking_stats([]) 209 | assert stats['total_chunks'] == 0 210 | assert stats['total_words'] == 0 211 | 212 | def test_single_chunk(self): 213 | """Test stats for single chunk.""" 214 | chunks = [{'word_count': 1000}] 215 | stats = get_chunking_stats(chunks) 216 | assert stats['total_chunks'] == 1 217 | assert stats['total_words'] == 1000 218 | assert stats['avg_words_per_chunk'] == 1000 219 | assert stats['min_words'] == 1000 220 | assert stats['max_words'] == 1000 221 | 222 | def test_multiple_chunks(self): 223 | """Test stats for multiple chunks.""" 
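        # 500 + 1000 + 750 = 2250 words across 3 chunks, so round(2250 / 3) == 750.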
202 | 
203 | class TestGetChunkingStats:
204 |     """Tests for get_chunking_stats function."""
205 | 
206 |     def test_empty_chunks(self):
207 |         """Test stats for empty chunk list."""
208 |         stats = get_chunking_stats([])
209 |         assert stats['total_chunks'] == 0
210 |         assert stats['total_words'] == 0
211 | 
212 |     def test_single_chunk(self):
213 |         """Test stats for single chunk."""
214 |         chunks = [{'word_count': 1000}]
215 |         stats = get_chunking_stats(chunks)
216 |         assert stats['total_chunks'] == 1
217 |         assert stats['total_words'] == 1000
218 |         assert stats['avg_words_per_chunk'] == 1000
219 |         assert stats['min_words'] == 1000
220 |         assert stats['max_words'] == 1000
221 | 
222 |     def test_multiple_chunks(self):
223 |         """Test stats for multiple chunks."""
224 |         chunks = [
225 |             {'word_count': 500},
226 |             {'word_count': 1000},
227 |             {'word_count': 750},
228 |         ]
229 |         stats = get_chunking_stats(chunks)
230 |         assert stats['total_chunks'] == 3
231 |         assert stats['total_words'] == 2250
232 |         assert stats['avg_words_per_chunk'] == 750
233 |         assert stats['min_words'] == 500
234 |         assert stats['max_words'] == 1000
235 | 
236 |     def test_with_chapter_info(self):
237 |         """Test stats include chapter spread."""
238 |         chunks = [
239 |             {'word_count': 500, 'chapter_info': {'start_chapter': 0, 'end_chapter': 2}},
240 |             {'word_count': 500, 'chapter_info': {'start_chapter': 3, 'end_chapter': 5}},
241 |         ]
242 |         stats = get_chunking_stats(chunks)
243 |         assert '0-2' in stats['chapter_spread']
244 |         assert '3-5' in stats['chapter_spread']
245 | 
246 | 
247 | class TestRepairHyphenation:
248 |     """Tests for repair_hyphenation function."""
249 | 
250 |     def test_simple_hyphenation(self):
251 |         """Test fixing simple hyphenated word."""
252 |         text = "This is a hyphen-\nated word."
253 |         result = repair_hyphenation(text)
254 |         assert "hyphenated" in result
255 |         assert "hyphen-\nated" not in result
256 | 
257 |     def test_multiple_hyphenations(self):
258 |         """Test fixing multiple hyphenated words."""
259 |         text = "First hyphen-\nated word and second broken-\nword here."
260 |         result = repair_hyphenation(text)
261 |         assert "hyphenated" in result
262 |         assert "brokenword" in result
263 | 
264 |     def test_no_hyphenation(self):
265 |         """Test text without hyphenation is unchanged."""
266 |         text = "Normal text without any hyphenation."
267 |         result = repair_hyphenation(text)
268 |         assert result == text
269 | 
270 |     def test_regular_hyphens_preserved(self):
271 |         """Test that regular hyphens (not at line breaks) are preserved."""
272 |         text = "This is a well-known fact."
273 |         result = repair_hyphenation(text)
274 |         assert "well-known" in result
275 | 
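# A minimal sketch of the behavior pinned down by TestRepairHyphenation
# (illustrative only; the real utils.repair_hyphenation may differ in detail):
#
#   import re
#   def repair_hyphenation(text: str) -> str:
#       # Join a word broken across a line break: "hyphen-\nated" -> "hyphenated".
#       # A hyphen with no following newline ("well-known") is left alone.
#       return re.sub(r'(\w)-\n(\w)', r'\1\2', text)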
32 | """ 33 | self.input_file = input_file 34 | self.max_words = max_words 35 | self.output_dir = normalize_output_dir(output_dir, input_file) 36 | self.strict_chapters = strict_chapters 37 | self.verbose = verbose 38 | self.book = None 39 | self.chapters = [] 40 | self.output_files = [] 41 | 42 | # Extract book name from input file path 43 | self.book_name = os.path.splitext(os.path.basename(input_file))[0] 44 | 45 | def log(self, message): 46 | """Print a message if verbose mode is enabled.""" 47 | if self.verbose: 48 | print(message) 49 | 50 | def process(self): 51 | """Process the EPUB file and split it into smaller parts. 52 | 53 | Returns: 54 | list: Paths to the generated output files. 55 | """ 56 | self.log(f"Processing '{self.input_file}'...") 57 | 58 | # Load the book 59 | self.book = epub.read_epub(self.input_file) 60 | 61 | # Extract chapters 62 | self.extract_chapters() 63 | 64 | # Determine split points 65 | split_points = self.determine_split_points() 66 | 67 | # Create output EPUBs 68 | self.create_output_epubs(split_points) 69 | 70 | return self.output_files 71 | 72 | def extract_chapters(self): 73 | """Extract chapters from the EPUB book.""" 74 | self.log("Extracting chapters...") 75 | 76 | # Get spine items (ordered document items) 77 | spine_items = [] 78 | for itemref in self.book.spine: 79 | item_id = itemref[0] 80 | item = self.book.get_item_with_id(item_id) 81 | if item is not None and item.get_type() == ebooklib.ITEM_DOCUMENT: 82 | spine_items.append(item) 83 | 84 | # Process each spine item as a chapter 85 | for i, item in enumerate(spine_items): 86 | # Get content 87 | content = item.get_content().decode('utf-8') 88 | 89 | # Parse HTML with BeautifulSoup 90 | soup = BeautifulSoup(content, 'html.parser') 91 | 92 | # Try to extract title 93 | title = None 94 | title_elem = soup.find(['h1', 'h2', 'h3', 'title']) 95 | if title_elem: 96 | title = title_elem.get_text().strip() 97 | 98 | # If no title found, use generic name 99 | if not title: 100 | title = f"Chapter {i+1}" 101 | 102 | # Count words using shared utility 103 | text = soup.get_text() 104 | word_count = count_words(text) 105 | 106 | # Create chapter object 107 | chapter = Chapter( 108 | id=item.get_id(), 109 | title=title, 110 | content=content, 111 | file_name=item.file_name, 112 | word_count=word_count 113 | ) 114 | 115 | self.chapters.append(chapter) 116 | 117 | self.log(f" - {title} ({word_count} words)") 118 | 119 | self.log(f"Found {len(self.chapters)} chapters with a total of {sum(c.word_count for c in self.chapters)} words.") 120 | 121 | def determine_split_points(self): 122 | """Determine where to split the book. 123 | 124 | Returns: 125 | list: Indices of the last chapter in each part. 
126 | """ 127 | self.log("Determining split points...") 128 | 129 | split_points = [] 130 | current_word_count = 0 131 | current_chapters = [] 132 | 133 | for i, chapter in enumerate(self.chapters): 134 | # If adding this chapter would exceed the word limit 135 | if current_word_count + chapter.word_count > self.max_words and current_chapters: 136 | # If strict chapter mode is enabled or current word count is at least 40% of max 137 | # (avoiding creating very small parts) 138 | if self.strict_chapters or current_word_count >= 0.4 * self.max_words: 139 | split_points.append(i - 1) # Split before current chapter 140 | current_word_count = chapter.word_count 141 | current_chapters = [chapter] 142 | else: 143 | # Add chapter anyway and split after it 144 | current_chapters.append(chapter) 145 | current_word_count += chapter.word_count 146 | split_points.append(i) 147 | current_word_count = 0 148 | current_chapters = [] 149 | else: 150 | # Add chapter to current part 151 | current_chapters.append(chapter) 152 | current_word_count += chapter.word_count 153 | 154 | # Add the last part if there are remaining chapters 155 | if current_chapters: 156 | split_points.append(len(self.chapters) - 1) 157 | 158 | # Log split points 159 | part_start = 0 160 | for i, split_point in enumerate(split_points): 161 | part_chapters = self.chapters[part_start:split_point + 1] 162 | part_words = sum(c.word_count for c in part_chapters) 163 | self.log(f" Part {i+1}: Chapters {part_start+1}-{split_point+1} ({part_words} words)") 164 | part_start = split_point + 1 165 | 166 | return split_points 167 | 168 | def create_output_epubs(self, split_points): 169 | """Create output EPUB files based on split points. 170 | 171 | Args: 172 | split_points (list): Indices of the last chapter in each part. 
173 | """ 174 | self.log("Creating output EPUB files...") 175 | 176 | part_start = 0 177 | for i, split_point in enumerate(split_points): 178 | part_num = i + 1 179 | output_filename = f"{self.book_name}_part{part_num}.epub" 180 | output_path = os.path.join(self.output_dir, output_filename) 181 | 182 | self.log(f" Creating {output_filename}...") 183 | 184 | # Create a new EPUB book 185 | book = epub.EpubBook() 186 | 187 | # Copy metadata from original book 188 | self._copy_metadata(book) 189 | 190 | # Update title to indicate it's a part 191 | book.set_title(f"{book.title} - Part {part_num}") 192 | 193 | # Get chapters for this part 194 | part_chapters = self.chapters[part_start:split_point + 1] 195 | 196 | # Add chapters to the book 197 | epub_chapters = [] 198 | for chapter in part_chapters: 199 | # Create EpubHtml item 200 | epub_chapter = epub.EpubHtml( 201 | title=chapter.title, 202 | file_name=chapter.file_name, 203 | content=chapter.content 204 | ) 205 | epub_chapter.id = chapter.id 206 | book.add_item(epub_chapter) 207 | epub_chapters.append(epub_chapter) 208 | 209 | # Copy CSS, images, and other assets from original book 210 | self._copy_assets(book) 211 | 212 | # Add chapters to spine 213 | for chapter in epub_chapters: 214 | book.spine.append(chapter) 215 | 216 | # Create table of contents 217 | book.toc = [(epub.Section(chapter.title), [chapter]) for chapter in epub_chapters] 218 | 219 | # Add default NCX and Nav files 220 | book.add_item(epub.EpubNcx()) 221 | book.add_item(epub.EpubNav()) 222 | 223 | # Save the EPUB file 224 | epub.write_epub(output_path, book, {}) 225 | 226 | self.output_files.append(output_path) 227 | part_start = split_point + 1 228 | 229 | def _copy_metadata(self, book): 230 | """Copy metadata from original book to the new book. 231 | 232 | Args: 233 | book (EpubBook): The target EPUB book. 234 | """ 235 | # Copy basic metadata 236 | if self.book.title: 237 | book.set_title(self.book.title) 238 | if self.book.language: 239 | book.set_language(self.book.language) 240 | 241 | # Copy identifiers safely 242 | if 'http://purl.org/dc/elements/1.1/' in self.book.metadata: 243 | dc_metadata = self.book.metadata['http://purl.org/dc/elements/1.1/'] 244 | 245 | # Copy identifier 246 | if 'identifier' in dc_metadata: 247 | for item in dc_metadata['identifier']: 248 | # Handle different formats of metadata 249 | if len(item) >= 2: # At least has id and value 250 | item_id = item[0] if len(item) > 0 else 'id' 251 | value = item[1] if len(item) > 1 else '' 252 | book.add_metadata('DC', 'identifier', value, {'id': item_id}) 253 | 254 | # Copy creator (author) 255 | if 'creator' in dc_metadata: 256 | for item in dc_metadata['creator']: 257 | if len(item) >= 2: 258 | value = item[1] 259 | book.add_author(value) 260 | 261 | def _copy_assets(self, book): 262 | """Copy CSS, images, and other assets from original book to the new book. 263 | 264 | Args: 265 | book (EpubBook): The target EPUB book. 266 | """ 267 | # Copy items that are not documents 268 | for item in self.book.get_items(): 269 | if item.get_type() != ebooklib.ITEM_DOCUMENT: 270 | new_item = deepcopy(item) 271 | book.add_item(new_item) -------------------------------------------------------------------------------- /pdf_to_markdown.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | PDF to Markdown Splitter - Splits a PDF file into multiple markdown files based on page count. 
4 | """ 5 | 6 | import os 7 | import sys 8 | import re 9 | import argparse 10 | import io 11 | from pathlib import Path 12 | from typing import List, Dict, Any, Tuple, Optional 13 | import PyPDF2 14 | from pdfminer.high_level import extract_text 15 | from pdfminer.pdfpage import PDFPage 16 | 17 | from utils import count_words, normalize_output_dir, ensure_output_dir, repair_hyphenation 18 | 19 | def extract_text_from_pdf(pdf_path: str, verbose: bool = False) -> List[Dict[str, Any]]: 20 | """ 21 | Extract text from PDF file, page by page. 22 | 23 | Args: 24 | pdf_path: Path to the PDF file 25 | verbose: Whether to print verbose output 26 | 27 | Returns: 28 | List of dictionaries with page number, text, and word count 29 | """ 30 | if verbose: 31 | print(f"Extracting text from {pdf_path}...") 32 | 33 | pages = [] 34 | 35 | # Use PyPDF2 to get total page count and basic metadata 36 | with open(pdf_path, 'rb') as file: 37 | reader = PyPDF2.PdfReader(file) 38 | total_pages = len(reader.pages) 39 | 40 | # Extract document info if available 41 | title = None 42 | if reader.metadata: 43 | title = reader.metadata.title 44 | 45 | # Use pdfminer for better text extraction 46 | for page_num in range(total_pages): 47 | if verbose and page_num % 10 == 0: 48 | print(f" Processing page {page_num + 1}/{total_pages}...") 49 | 50 | # Extract using PyPDF2 first 51 | page = reader.pages[page_num] 52 | text = page.extract_text() 53 | 54 | # If PyPDF2 extraction is poor, try pdfminer 55 | if not text or len(text.strip()) < 100: 56 | # Re-open file for pdfminer 57 | with open(pdf_path, 'rb') as miner_file: 58 | # Skip to the desired page 59 | pdf_pages = list(PDFPage.get_pages(miner_file, 60 | pagenos=[page_num], 61 | maxpages=1)) 62 | if pdf_pages: 63 | # Use StringIO to capture the text 64 | output = io.StringIO() 65 | text = extract_text(pdf_path, page_numbers=[page_num]) 66 | 67 | # Count words 68 | word_count = count_words(text) 69 | 70 | pages.append({ 71 | 'page_num': page_num + 1, 72 | 'text': text, 73 | 'word_count': word_count 74 | }) 75 | 76 | if verbose: 77 | if page_num % 10 == 0 or page_num == total_pages - 1: 78 | print(f" Page {page_num + 1}: {word_count} words") 79 | 80 | total_words = sum(page['word_count'] for page in pages) 81 | if verbose: 82 | print(f"Extracted {total_pages} pages with a total of {total_words} words.") 83 | 84 | return pages, title 85 | 86 | def determine_split_points(pages: List[Dict[str, Any]], 87 | max_words: int, 88 | strict_boundaries: bool = False, 89 | verbose: bool = False) -> List[int]: 90 | """ 91 | Determine where to split the PDF. 
86 | def determine_split_points(pages: List[Dict[str, Any]],
87 |                            max_words: int,
88 |                            strict_boundaries: bool = False,
89 |                            verbose: bool = False) -> List[int]:
90 |     """
91 |     Determine where to split the PDF.
92 | 
93 |     Args:
94 |         pages: List of page dictionaries with text and word count
95 |         max_words: Maximum words per output file
96 |         strict_boundaries: Whether to split only at detected section boundaries
97 |         verbose: Whether to print verbose output
98 | 
99 |     Returns:
100 |         List of indices of the last page in each part
101 |     """
102 |     if verbose:
103 |         print("Determining split points...")
104 | 
105 |     # Try to detect section/chapter boundaries
106 |     section_boundaries = detect_section_boundaries(pages)
107 | 
108 |     split_points = []
109 |     current_word_count = 0
110 |     current_pages = []
111 | 
112 |     for i, page in enumerate(pages):
113 |         # Check if adding this page would exceed the word limit
114 |         if current_word_count + page['word_count'] > max_words and current_pages:
115 |             # Find the nearest section boundary if strict mode is enabled
116 |             if strict_boundaries and section_boundaries:
117 |                 # Find the closest boundary at or before the current position
118 |                 closest_boundary = None
119 |                 for boundary in section_boundaries:
120 |                     if boundary <= i:
121 |                         if closest_boundary is None or boundary > closest_boundary:
122 |                             closest_boundary = boundary
123 | 
124 |                 # If a nearby boundary was found within the last 5 pages (and not already used)
125 |                 if closest_boundary is not None and closest_boundary > i - 5 and closest_boundary not in split_points:
126 |                     # Split at the boundary
127 |                     split_points.append(closest_boundary)
128 |                     current_word_count = sum(p['word_count'] for p in pages[closest_boundary+1:i+1])
129 |                     current_pages = pages[closest_boundary+1:i+1]
130 |                     continue
131 | 
132 |             # Otherwise split at the current position
133 |             split_points.append(i - 1)
134 |             current_word_count = page['word_count']
135 |             current_pages = [page]
136 |         else:
137 |             # Add page to current part
138 |             current_pages.append(page)
139 |             current_word_count += page['word_count']
140 | 
141 |     # Add the last part if there are remaining pages
142 |     if current_pages:
143 |         split_points.append(len(pages) - 1)
144 | 
145 |     # Log split information
146 |     if verbose:
147 |         part_start = 0
148 |         for i, split_point in enumerate(split_points):
149 |             part_pages = pages[part_start:split_point + 1]
150 |             part_words = sum(p['word_count'] for p in part_pages)
151 |             print(f"  Part {i+1}: Pages {part_start+1}-{split_point+1} ({part_words} words)")
152 |             part_start = split_point + 1
153 | 
154 |     return split_points
155 | 
Introduction" 175 | r'^[IVXLC]+\.\s+[A-Z][A-Za-z\s]+', # Roman numeral sections 176 | r'^section\s+\d+', 177 | r'^part\s+\d+', 178 | r'^prologue', 179 | r'^epilogue', 180 | r'^introduction', 181 | r'^conclusion', 182 | r'^appendix', 183 | ] 184 | 185 | for i, page in enumerate(pages): 186 | text = page['text'] 187 | 188 | # Check first few lines of the page 189 | lines = text.split('\n')[:10] 190 | found_boundary = False 191 | 192 | for line in lines: 193 | line_lower = line.strip().lower() 194 | line_stripped = line.strip() 195 | 196 | # Skip empty lines 197 | if not line_lower: 198 | continue 199 | 200 | # Strategy 1: Check for common chapter/section patterns 201 | for pattern in chapter_patterns: 202 | if re.match(pattern, line_lower, re.IGNORECASE): 203 | if i not in boundaries: 204 | boundaries.append(i) 205 | found_boundary = True 206 | break 207 | 208 | if found_boundary: 209 | break 210 | 211 | # Strategy 2: Check for short, all-caps headings (likely titles) 212 | if (len(line_stripped) < 60 and len(line_stripped) > 3 and 213 | line_stripped.isupper() and not line_stripped.isdigit()): 214 | if i not in boundaries: 215 | boundaries.append(i) 216 | found_boundary = True 217 | break 218 | 219 | # Strategy 3: Check for short headings followed by empty/whitespace line 220 | line_idx = lines.index(line) if line in lines else -1 221 | if (line_idx >= 0 and line_idx < len(lines) - 1 and 222 | len(line_stripped) < 50 and len(line_stripped) > 3 and 223 | not lines[line_idx + 1].strip()): 224 | # Only if it looks like a title (starts with capital or number) 225 | if line_stripped[0].isupper() or line_stripped[0].isdigit(): 226 | if i not in boundaries: 227 | boundaries.append(i) 228 | found_boundary = True 229 | break 230 | 231 | return sorted(set(boundaries)) 232 | 233 | def create_markdown_files(pages: List[Dict[str, Any]], 234 | split_points: List[int], 235 | output_dir: str, 236 | input_file: str, 237 | title: str = None, 238 | verbose: bool = False) -> List[str]: 239 | """ 240 | Create markdown files based on split points. 
233 | def create_markdown_files(pages: List[Dict[str, Any]],
234 |                           split_points: List[int],
235 |                           output_dir: str,
236 |                           input_file: str,
237 |                           title: Optional[str] = None,
238 |                           verbose: bool = False) -> List[str]:
239 |     """
240 |     Create markdown files based on split points.
241 | 
242 |     Args:
243 |         pages: List of page dictionaries with text and word count
244 |         split_points: List of indices of the last page in each part
245 |         output_dir: Directory for output files
246 |         input_file: Path to the input PDF file
247 |         title: Title of the document
248 |         verbose: Whether to print verbose output
249 | 
250 |     Returns:
251 |         List of paths to the created markdown files
252 |     """
253 |     if verbose:
254 |         print("Creating output markdown files...")
255 | 
256 |     # Extract base name without extension
257 |     base_name = os.path.splitext(os.path.basename(input_file))[0]
258 | 
259 |     # Use file name as title if no title is provided
260 |     if not title:
261 |         title = base_name
262 | 
263 |     output_files = []
264 |     part_start = 0
265 | 
266 |     for i, split_point in enumerate(split_points):
267 |         part_num = i + 1
268 |         output_filename = f"{base_name}_part{part_num}.md"
269 |         output_path = os.path.join(output_dir, output_filename)
270 | 
271 |         if verbose:
272 |             print(f"  Creating {output_filename}...")
273 | 
274 |         # Get pages for this part
275 |         part_pages = pages[part_start:split_point + 1]
276 | 
277 |         # Create markdown content for this part
278 |         markdown_content = f"# {title} - Part {part_num}\n\n"
279 |         markdown_content += f"*Pages: {part_pages[0]['page_num']}-{part_pages[-1]['page_num']}*\n\n"
280 |         markdown_content += f"*Words: {sum(p['word_count'] for p in part_pages)}*\n\n"
281 | 
282 |         # Add page content
283 |         for page in part_pages:
284 |             markdown_content += f"## Page {page['page_num']}\n\n"
285 | 
286 |             # Clean up text
287 |             text = page['text']
288 | 
289 |             # Remove excessive newlines
290 |             text = re.sub(r'\n{3,}', '\n\n', text)
291 | 
292 |             # Fix hyphenated words broken across lines
293 |             text = repair_hyphenation(text)
294 | 
295 |             markdown_content += text + "\n\n---\n\n"
296 | 
297 |         # Write to file
298 |         with open(output_path, 'w', encoding='utf-8') as f:
299 |             f.write(markdown_content)
300 | 
301 |         output_files.append(output_path)
302 |         part_start = split_point + 1
303 | 
304 |     return output_files
305 | 
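# Output layout, for reference (hypothetical input "mybook.pdf" split into
# two parts): mybook_part1.md and mybook_part2.md, each starting with
# "# <title> - Part N", a page range and word count, then one "## Page N"
# section per source page, separated by horizontal rules.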
323 | """ 324 | output_dir = normalize_output_dir(output_dir, input_file) 325 | try: 326 | # Extract text from PDF 327 | pages, title = extract_text_from_pdf(input_file, verbose) 328 | 329 | # Determine split points 330 | split_points = determine_split_points(pages, max_words, strict_boundaries, verbose) 331 | 332 | # Create output files 333 | output_files = create_markdown_files(pages, split_points, output_dir, input_file, title, verbose) 334 | 335 | return output_files 336 | 337 | except Exception as e: 338 | if verbose: 339 | import traceback 340 | traceback.print_exc() 341 | raise Exception(f"Error processing PDF: {str(e)}") 342 | 343 | def main(): 344 | """Main entry point for the script.""" 345 | parser = argparse.ArgumentParser(description='Split a PDF file into multiple markdown files') 346 | 347 | parser.add_argument('input_file', help='Path to input PDF file') 348 | parser.add_argument('--max-words', type=int, default=80000, 349 | help='Maximum words per output file (default: 80000)') 350 | parser.add_argument('--output-dir', default=None, 351 | help='Directory for output files (default: same directory as input file)') 352 | parser.add_argument('--strict-boundaries', action='store_true', 353 | help='Only split at detected section boundaries when possible') 354 | parser.add_argument('--verbose', action='store_true', 355 | help='Print detailed processing information') 356 | 357 | args = parser.parse_args() 358 | 359 | # Validate arguments 360 | if not os.path.isfile(args.input_file): 361 | print(f"Error: Input file '{args.input_file}' does not exist.") 362 | return 1 363 | 364 | if not args.input_file.lower().endswith('.pdf'): 365 | print(f"Error: Input file '{args.input_file}' is not a PDF file.") 366 | return 1 367 | 368 | if args.output_dir is not None and not os.path.exists(args.output_dir): 369 | try: 370 | ensure_output_dir(args.output_dir) 371 | except OSError as e: 372 | print(f"Error: Cannot create output directory '{args.output_dir}': {e}") 373 | return 1 374 | 375 | if args.max_words <= 0: 376 | print("Error: Maximum words must be a positive number.") 377 | return 1 378 | 379 | try: 380 | result = split_pdf( 381 | input_file=args.input_file, 382 | max_words=args.max_words, 383 | output_dir=args.output_dir, 384 | strict_boundaries=args.strict_boundaries, 385 | verbose=args.verbose 386 | ) 387 | 388 | print(f"\nProcessing complete. Created {len(result)} markdown files:") 389 | for i, output_file in enumerate(result, 1): 390 | print(f" {i}. {output_file}") 391 | 392 | except Exception as e: 393 | print(f"Error: {str(e)}") 394 | return 1 395 | 396 | return 0 397 | 398 | if __name__ == "__main__": 399 | sys.exit(main()) --------------------------------------------------------------------------------