├── .idea └── .idea.PdfTocExtractor │ └── .idea │ ├── .name │ ├── encodings.xml │ ├── vcs.xml │ ├── indexLayout.xml │ └── .gitignore ├── tests └── PdfTocExtractor.Tests │ ├── GlobalUsings.cs │ ├── PdfTocExtractor.Tests.csproj │ ├── TestResults.md │ ├── DebugTest.cs │ ├── Exporters │ ├── ExportOptionsTests.cs │ ├── MarkdownExporterTests.cs │ ├── JsonExporterTests.cs │ └── TextExporterTests.cs │ ├── Models │ └── TocItemTests.cs │ ├── TestData │ ├── MockHelpers.cs │ └── TestDataBuilder.cs │ └── PdfTocExtractorTests.cs ├── src ├── PdfTocExtractor.Example │ ├── PdfTocExtractor.Example.csproj │ └── Program.cs ├── PdfTocExtractor.Cli │ ├── Program.cs │ ├── PdfTocExtractor.Cli.csproj │ ├── README.md │ └── Commands │ │ ├── ExtractCommand.cs │ │ ├── SmartCommand.cs │ │ ├── DiagnoseCommand.cs │ │ └── SemanticCommand.cs └── PdfTocExtractor │ ├── PdfTocExtractor.csproj │ ├── Exporters │ ├── IExporter.cs │ ├── TextExporter.cs │ ├── JsonExporter.cs │ ├── MarkdownExporter.cs │ └── XmlExporter.cs │ ├── Models │ └── TocItem.cs │ ├── Semantic │ ├── TextFragment.cs │ ├── SemanticAnalysisOptions.cs │ ├── SemanticTocExtractor.cs │ ├── PdfTextExtractor.cs │ └── SemanticHeadingAnalyzer.cs │ ├── README.md │ └── PdfTocExtractor.cs ├── docs ├── CHANGELOG.md └── UPGRADE_SUMMARY.md ├── .github └── workflows │ ├── ci.yaml │ ├── README.md │ └── publish.yaml └── PdfTocExtractor.sln /.idea/.idea.PdfTocExtractor/.idea/.name: -------------------------------------------------------------------------------- 1 | PdfTocExtractor -------------------------------------------------------------------------------- /.idea/.idea.PdfTocExtractor/.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/.idea.PdfTocExtractor/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 
-------------------------------------------------------------------------------- /tests/PdfTocExtractor.Tests/GlobalUsings.cs: -------------------------------------------------------------------------------- 1 | global using Xunit; 2 | global using FluentAssertions; 3 | global using Moq; 4 | global using PdfTocExtractor.Models; 5 | global using PdfTocExtractor.Exporters; 6 | global using PdfTocExtractor.Tests.TestData; 7 | -------------------------------------------------------------------------------- /.idea/.idea.PdfTocExtractor/.idea/indexLayout.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/.idea.PdfTocExtractor/.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Rider ignored files 5 | /contentModel.xml 6 | /projectSettingsUpdater.xml 7 | /.idea.PdfTocExtractor.iml 8 | /modules.xml 9 | # Editor-based HTTP Client requests 10 | /httpRequests/ 11 | # Datasource local storage ignored files 12 | /dataSources/ 13 | /dataSources.local.xml 14 | -------------------------------------------------------------------------------- /src/PdfTocExtractor.Example/PdfTocExtractor.Example.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Exe 5 | net8.0 6 | enable 7 | enable 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /tests/PdfTocExtractor.Tests/PdfTocExtractor.Tests.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | net8.0 5 | enable 6 | enable 7 | false 8 | true 9 | 10 | 11 | 12 | 13 | 14 | 15 | runtime; build; native; contentfiles; analyzers; buildtransitive 16 | all 17 | 18 | 19 | runtime; build; native; contentfiles; analyzers; 
buildtransitive 20 | all 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /src/PdfTocExtractor.Cli/Program.cs: -------------------------------------------------------------------------------- 1 | using System.CommandLine; 2 | using PdfTocExtractor.Cli.Commands; 3 | 4 | namespace PdfTocExtractor.Cli; 5 | 6 | /// <summary> 7 | /// Entry point of the command-line tool; wires the sub-commands into a System.CommandLine root command. 8 | /// </summary> 9 | class Program 10 | { 11 | /// <summary> 12 | /// Builds the root command, registers the Extract, Smart, Semantic and Diagnose commands, and invokes it. 13 | /// </summary> 14 | /// <param name="args">Raw command-line arguments.</param> 15 | /// <returns>The exit code produced by invoking the root command.</returns> 16 | static async Task<int> Main(string[] args) 17 | { 18 | var rootCommand = new RootCommand("PDF Table of Contents Extractor - 从PDF文件提取目录并导出为多种格式") 19 | { 20 | ExtractCommand.Create(), 21 | SmartCommand.Create(), 22 | SemanticCommand.Create(), 23 | DiagnoseCommand.Create() 24 | }; 25 | 26 | // Handler of the root command itself (no sub-command selected): print a short usage hint. 27 | rootCommand.SetHandler(() => 28 | { 29 | Console.WriteLine("PDF Table of Contents Extractor"); 30 | Console.WriteLine("使用 --help 查看可用命令"); 31 | Console.WriteLine(); 32 | Console.WriteLine("示例:"); 33 | Console.WriteLine(" pdftoc extract input.pdf -o output.md # 提取PDF书签"); 34 | Console.WriteLine(" pdftoc semantic input.pdf -o output.md # 语义分析提取"); 35 | Console.WriteLine(" pdftoc smart input.pdf -o output.md # 智能提取(推荐)"); 36 | Console.WriteLine(" pdftoc extract input.pdf -o output.json -f json"); 37 | Console.WriteLine(" pdftoc semantic input.pdf --mode strict --debug # 严格模式+调试"); 38 | Console.WriteLine(" pdftoc diagnose input.pdf # 诊断PDF文件问题"); 39 | }); 40 | 41 | return await rootCommand.InvokeAsync(args); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/PdfTocExtractor/PdfTocExtractor.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | net8.0 5 | enable 6 | enable 7 | true 8 | 9 | 10 | PdfTocExtractor 11 | 2.0.0 12 | DealiAxy 13 | A powerful library for extracting table of contents (TOC) from PDF files with advanced semantic analysis capabilities. Supports both bookmark extraction and intelligent structure recognition with multiple output formats. 
14 | pdf;toc;table-of-contents;extraction;bookmark;semantic-analysis;nlp;DealiAxy;itext 15 | https://github.com/star-plan/pdf-toc-extractor 16 | https://github.com/star-plan/pdf-toc-extractor 17 | MIT 18 | README.md 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /docs/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # 更新日志 2 | 3 | 本文档记录了PdfTocExtractor项目的所有重要更改。 4 | 5 | 格式基于 [Keep a Changelog](https://keepachangelog.com/zh-CN/1.0.0/), 6 | 并且本项目遵循 [语义化版本](https://semver.org/lang/zh-CN/)。 7 | 8 | ## [2.0.0] - 2025-08-03 9 | 10 | ### 🎉 重大更新:语义分析功能 11 | 12 | #### 新增 13 | - **🧠 语义分析引擎**:全新的基于NLP的智能章节标题识别系统 14 | - **📝 新增 `semantic` 命令**:专门用于语义分析的命令行工具 15 | - **🎯 智能文本分类**:能够区分真正的标题和操作步骤、列表项 16 | - **🔍 上下文分析**:基于文本位置、字体、间距等特征进行综合判断 17 | - **⚙️ 可配置置信度**:支持调整识别精度的置信度阈值 18 | - **📊 多种分析模式**:默认、严格、宽松、调试模式可选 19 | - **🚫 智能过滤**:自动排除页码、IP地址、操作步骤等非标题内容 20 | 21 | #### 改进 22 | - **🔄 重构 `smart` 命令**:现在会自动尝试语义分析作为备选方案 23 | - **📈 大幅提升识别准确率**:从识别1000+错误标题降低到59个精确标题 24 | - **🎨 优化用户界面**:更清晰的调试输出和进度显示 25 | 26 | #### 技术架构 27 | - **🏗️ 模块化设计**:新增 `PdfTocExtractor.Semantic` 命名空间 28 | - **🔧 可扩展框架**:支持自定义语义规则和分析策略 29 | - **⚡ 高性能处理**:优化的文本提取和合并算法 30 | 31 | #### 移除 32 | - **🧹 清理过时代码**:移除了基于规则的复杂分析代码 33 | 34 | ## [未发布] 35 | 36 | ### 新增 37 | - 初始版本发布 38 | - PDF目录提取核心功能 39 | - 支持多种输出格式:Markdown、JSON、XML、纯文本 40 | - 命令行工具 `pdftoc` 41 | - NuGet包发布:核心库和CLI工具 42 | - AOT编译支持,生成原生可执行文件 43 | - 跨平台支持:Windows、Linux、macOS 44 | - GitHub Actions CI/CD流程 45 | - 可扩展的导出器架构 46 | 47 | ### 技术特性 48 | - 基于 .NET 8.0 49 | - 使用 iText 9.2.0 进行PDF处理 50 | - 支持异步操作 51 | - 完整的单元测试覆盖 52 | - 代码质量检查和格式化 53 | 54 | ## [1.0.0] - 即将发布 55 | 56 | ### 新增 57 | - 🎉 首次正式发布 58 | - 📖 完整的PDF目录提取功能 59 | - 🛠️ 命令行工具和库支持 60 | - 🚀 AOT编译和多平台支持 61 | - 📚 完整的文档和示例 62 | 63 | --- 64 | 65 | ## 版本说明 66 | 67 | - **主版本号**:当你做了不兼容的 API 修改 68 | - **次版本号**:当你做了向下兼容的功能性新增 69 | - **修订号**:当你做了向下兼容的问题修正 70 | 71 | ## 
贡献指南 72 | 73 | 如果您想为此项目做出贡献,请: 74 | 75 | 1. 查看 [Issues](https://github.com/star-plan/PdfTocExtractor/issues) 了解当前的问题和功能请求 76 | 2. 提交 Pull Request 时请更新此更新日志 77 | 3. 遵循 [语义化版本](https://semver.org/lang/zh-CN/) 规范 78 | -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: 持续集成 2 | run-name: ${{ github.actor }} 正在运行CI检查 🔍 3 | 4 | on: 5 | push: 6 | branches: [ master, develop ] 7 | pull_request: 8 | branches: [ master, develop ] 9 | 10 | jobs: 11 | # 基础构建和测试 12 | test: 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v4 17 | 18 | - name: Setup .NET 19 | uses: actions/setup-dotnet@v4 20 | with: 21 | dotnet-version: 8.0.x 22 | 23 | - name: 缓存NuGet包 24 | uses: actions/cache@v4 25 | with: 26 | path: ~/.nuget/packages 27 | key: ${{ runner.os }}-nuget-${{ hashFiles('**/packages.lock.json') }} 28 | restore-keys: | 29 | ${{ runner.os }}-nuget- 30 | 31 | - name: 恢复依赖 32 | run: dotnet restore 33 | 34 | - name: 构建项目 35 | run: dotnet build --no-restore --configuration Release 36 | 37 | - name: 运行测试 38 | run: dotnet test --no-build --configuration Release --verbosity normal 39 | 40 | # 简化的AOT编译测试 (仅Linux) 41 | aot-test: 42 | runs-on: ubuntu-latest 43 | needs: test # 只有基础测试通过后才运行AOT测试 44 | 45 | steps: 46 | - uses: actions/checkout@v4 47 | 48 | - name: Setup .NET 49 | uses: actions/setup-dotnet@v4 50 | with: 51 | dotnet-version: 8.0.x 52 | 53 | - name: 安装Linux依赖 54 | run: | 55 | sudo apt-get update 56 | sudo apt-get install -y clang zlib1g-dev 57 | 58 | - name: 恢复依赖 59 | run: dotnet restore 60 | 61 | - name: AOT编译测试 62 | run: dotnet publish src/PdfTocExtractor.Cli/PdfTocExtractor.Cli.csproj -c Release -r linux-x64 --self-contained true -p:PublishAot=true -o ./test-publish 63 | 64 | - name: 测试可执行文件 65 | run: ./test-publish/pdftoc --help 66 | 
-------------------------------------------------------------------------------- /src/PdfTocExtractor/Exporters/IExporter.cs: -------------------------------------------------------------------------------- 1 | using PdfTocExtractor.Models; 2 | 3 | namespace PdfTocExtractor.Exporters; 4 | 5 | /// <summary> 6 | /// Contract implemented by table-of-contents exporters. 7 | /// </summary> 8 | public interface IExporter 9 | { 10 | /// <summary> 11 | /// Name of the export format. 12 | /// </summary> 13 | string FormatName { get; } 14 | 15 | /// <summary> 16 | /// File extension of the format, without the leading dot. 17 | /// </summary> 18 | string FileExtension { get; } 19 | 20 | /// <summary> 21 | /// Exports the TOC items to a string. 22 | /// </summary> 23 | /// <param name="tocItems">TOC items to export.</param> 24 | /// <param name="options">Export options.</param> 25 | /// <returns>The exported content as a string.</returns> 26 | string Export(IEnumerable<TocItem> tocItems, ExportOptions? options = null); 27 | 28 | /// <summary> 29 | /// Asynchronously exports the TOC items to a file. 30 | /// </summary> 31 | /// <param name="tocItems">TOC items to export.</param> 32 | /// <param name="filePath">Path of the output file.</param> 33 | /// <param name="options">Export options.</param> 34 | Task ExportToFileAsync(IEnumerable<TocItem> tocItems, string filePath, ExportOptions? options = null); 35 | } 36 | 37 | /// <summary> 38 | /// Options that control how a table of contents is exported. 39 | /// </summary> 40 | public class ExportOptions 41 | { 42 | /// <summary> 43 | /// Indentation string (defaults to two spaces). 44 | /// </summary> 45 | public string IndentString { get; set; } = "  "; 46 | 47 | /// <summary> 48 | /// Whether to include page numbers. 49 | /// </summary> 50 | public bool IncludePageNumbers { get; set; } = true; 51 | 52 | /// <summary> 53 | /// Whether to include links (if the format supports them). 54 | /// </summary> 55 | public bool IncludeLinks { get; set; } = false; 56 | 57 | /// <summary> 58 | /// Maximum hierarchy depth (0 means unlimited). 59 | /// </summary> 60 | public int MaxDepth { get; set; } = 0; 61 | 62 | /// <summary> 63 | /// Format string applied to page numbers. 64 | /// </summary> 65 | public string PageNumberFormat { get; set; } = "第 {0} 页"; 66 | 67 | /// <summary> 68 | /// Custom title (used as the document title by some formats). 69 | /// </summary> 70 | public string? CustomTitle { get; set; } 71 | 72 | /// <summary> 73 | /// Text encoding (defaults to UTF-8). 74 | /// </summary> 75 | public System.Text.Encoding Encoding { get; set; } = System.Text.Encoding.UTF8; 76 | } 77 | -------------------------------------------------------------------------------- /.github/workflows/README.md: -------------------------------------------------------------------------------- 1 | # GitHub Actions 工作流说明 2 | 3 | 本项目包含两个主要的GitHub Actions工作流: 4 | 5 | ## 📋 工作流概览 6 | 7 | ### 1. 
持续集成 (CI) - `ci.yaml` 8 | **触发条件**:推送到 `master` 或 `develop` 分支,或创建针对这些分支的Pull Request 9 | 10 | **功能**: 11 | - 📦 NuGet包缓存与依赖恢复 12 | - 🔨 Release配置构建(ubuntu-latest) 13 | - 🧪 单元测试执行 14 | - 🚀 AOT编译测试(仅Linux x64) 15 | - ✅ 通过 `pdftoc --help` 验证生成的可执行文件 16 | 17 | ### 2. 发布流程 (Publish) - `publish.yaml` 18 | **触发条件**:推送版本标签(格式:`v*.*.*`,如 `v1.0.0`) 19 | 20 | **功能**: 21 | - 🧪 运行完整测试套件 22 | - 📦 发布NuGet包(核心库 + CLI工具) 23 | - 🔨 多平台AOT编译(Windows、Linux、macOS) 24 | - 📋 创建GitHub Release 25 | - 📁 上传可执行文件到Release 26 | 27 | ## 🚀 发布新版本 28 | 29 | ### 步骤1:准备发布 30 | 1. 确保所有测试通过 31 | 2. 更新版本号(在项目文件中) 32 | 3. 更新 `docs/CHANGELOG.md` 33 | 34 | ### 步骤2:创建并推送标签 35 | ```bash 36 | # 创建标签 37 | git tag v1.0.0 38 | 39 | # 推送标签到远程仓库 40 | git push origin v1.0.0 41 | ``` 42 | 43 | ### 步骤3:自动化流程 44 | 推送标签后,GitHub Actions将自动: 45 | 1. 运行测试 46 | 2. 发布NuGet包 47 | 3. 编译多平台可执行文件 48 | 4. 创建GitHub Release 49 | 50 | ## 🔧 配置要求 51 | 52 | ### 必需的Secrets 53 | 在GitHub仓库设置中添加以下Secrets: 54 | 55 | - `NUGET_GALLERY_TOKEN`:NuGet.org的API密钥 56 | - 获取方式:登录 [NuGet.org](https://www.nuget.org) → Account Settings → API Keys 57 | 58 | ### 权限设置 59 | 工作流需要以下权限(已在YAML中配置): 60 | - `contents: write` - 创建Release 61 | - `id-token: write` - 身份验证 62 | - `issues: write` - 更新Issue 63 | 64 | ## 📦 发布产物 65 | 66 | ### NuGet包 67 | - `PdfTocExtractor` - 核心库 68 | - `PdfTocExtractor.Cli` - CLI工具包 69 | 70 | ### 可执行文件 71 | - `PdfTocExtractor-windows-{version}.zip` - Windows可执行文件 72 | - `PdfTocExtractor-linux-{version}.tar.gz` - Linux可执行文件 73 | - `PdfTocExtractor-macOS-{version}.tar.gz` - macOS可执行文件 74 | 75 | ## 🔍 监控和调试 76 | 77 | ### 查看工作流状态 78 | 1. 访问仓库的 "Actions" 标签页 79 | 2. 选择相应的工作流运行 80 | 3. 查看详细日志 81 | 82 | ### 常见问题 83 | 1. **AOT编译失败**:检查代码是否AOT兼容 84 | 2. **NuGet发布失败**:验证API密钥是否正确 85 | 3. 
**测试失败**:确保所有测试在本地通过 86 | 87 | ## 📝 工作流特性 88 | 89 | ### 优化特性 90 | - ✅ NuGet包缓存,提升构建速度 91 | - ✅ 并行构建多个平台 92 | - ✅ 失败时不影响其他平台构建 93 | - ✅ 自动生成Release说明 94 | 95 | ### 安全特性 96 | - ✅ 最小权限原则 97 | - ✅ 安全的密钥管理 98 | - ✅ 构建产物验证 99 | 100 | ## 🎯 下一步优化 101 | 102 | 可考虑的改进: 103 | - 添加安全扫描 104 | - 集成代码质量检查工具 105 | - 添加性能基准测试 106 | - 支持预发布版本 107 | -------------------------------------------------------------------------------- /src/PdfTocExtractor/Models/TocItem.cs: -------------------------------------------------------------------------------- 1 | namespace PdfTocExtractor.Models; 2 | 3 | /// <summary> 4 | /// Represents a single entry in a PDF table of contents. 5 | /// </summary> 6 | public class TocItem 7 | { 8 | /// <summary> 9 | /// Title of the entry. 10 | /// </summary> 11 | public string Title { get; set; } = string.Empty; 12 | 13 | /// <summary> 14 | /// Page number of the entry, stored as text. 15 | /// </summary> 16 | public string Page { get; set; } = string.Empty; 17 | 18 | /// <summary> 19 | /// Nesting depth of the entry (zero-based). 20 | /// </summary> 21 | public int Level { get; set; } 22 | 23 | /// <summary> 24 | /// Child entries. 25 | /// </summary> 26 | public List<TocItem> Children { get; set; } = new(); 27 | 28 | /// <summary> 29 | /// Parent entry (used to build the hierarchy). 30 | /// </summary> 31 | public TocItem? Parent { get; set; } 32 | 33 | /// <summary> 34 | /// Numeric part of <see cref="Page"/>; 0 when the page is empty, a "no page" placeholder, or not an integer. 35 | /// </summary> 36 | public int PageNumber 37 | { 38 | get 39 | { 40 | if (string.IsNullOrEmpty(Page) || Page == "无页码" || Page == "N/A") 41 | return 0; 42 | 43 | // Handle the "5 XYZ ..." form: keep only the token before the first space. 44 | var pageStr = Page.Contains(' ') ? Page.Split(' ')[0] : Page; 45 | return int.TryParse(pageStr, out var pageNum) ? 
pageNum : 0; 46 | } 47 | } 48 | 49 | /// <summary> 50 | /// Whether the entry has any child entries. 51 | /// </summary> 52 | public bool HasChildren => Children.Count > 0; 53 | 54 | /// <summary> 55 | /// Enumerates all descendants recursively, yielding each child before that child's own descendants. 56 | /// </summary> 57 | public IEnumerable<TocItem> GetAllDescendants() 58 | { 59 | foreach (var child in Children) 60 | { 61 | yield return child; 62 | foreach (var descendant in child.GetAllDescendants()) 63 | { 64 | yield return descendant; 65 | } 66 | } 67 | } 68 | 69 | /// <summary> 70 | /// Builds the title path from the root entry down to this entry, joined with <paramref name="separator"/>. 71 | /// </summary> 72 | public string GetFullPath(string separator = " > ") 73 | { 74 | var path = new List<string>(); 75 | var current = this; 76 | 77 | while (current != null) 78 | { 79 | path.Insert(0, current.Title); 80 | current = current.Parent; 81 | } 82 | 83 | return string.Join(separator, path); 84 | } 85 | 86 | public override string ToString() 87 | { 88 | return $"{new string(' ', Level * 2)}- {Title} (第 {Page} 页)"; 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /src/PdfTocExtractor.Cli/PdfTocExtractor.Cli.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Exe 5 | net8.0 6 | enable 7 | enable 8 | pdftoc 9 | 10 | 11 | true 12 | pdftoc 13 | ./nupkg 14 | 15 | 16 | PdfTocExtractor.Cli 17 | 2.0.0 18 | DealiAxy 19 | A powerful command-line tool for extracting table of contents (TOC) from PDF files with advanced semantic analysis capabilities. Supports bookmark extraction, intelligent structure recognition, and multiple output formats. 
20 | pdf;toc;table-of-contents;extraction;bookmark;semantic-analysis;nlp;cli;tool;dotnet-tool;DealiAxy 21 | https://github.com/star-plan/pdf-toc-extractor 22 | https://github.com/star-plan/pdf-toc-extractor 23 | MIT 24 | README.md 25 | true 26 | 27 | 28 | true 29 | 30 | 31 | 32 | 33 | true 34 | partial 35 | true 36 | false 37 | Size 38 | true 39 | true 40 | 41 | true 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /src/PdfTocExtractor/Semantic/TextFragment.cs: -------------------------------------------------------------------------------- 1 | namespace PdfTocExtractor.Semantic; 2 | 3 | /// 4 | /// 表示PDF中的一个文本片段及其上下文信息 5 | /// 6 | public class TextFragment 7 | { 8 | /// 9 | /// 文本内容 10 | /// 11 | public string Text { get; set; } = string.Empty; 12 | 13 | /// 14 | /// 字体大小 15 | /// 16 | public float FontSize { get; set; } 17 | 18 | /// 19 | /// 字体名称 20 | /// 21 | public string FontName { get; set; } = string.Empty; 22 | 23 | /// 24 | /// 是否为粗体 25 | /// 26 | public bool IsBold { get; set; } 27 | 28 | /// 29 | /// 是否为斜体 30 | /// 31 | public bool IsItalic { get; set; } 32 | 33 | /// 34 | /// X坐标位置 35 | /// 36 | public float X { get; set; } 37 | 38 | /// 39 | /// Y坐标位置 40 | /// 41 | public float Y { get; set; } 42 | 43 | /// 44 | /// 页码 45 | /// 46 | public int PageNumber { get; set; } 47 | 48 | /// 49 | /// 文本宽度 50 | /// 51 | public float Width { get; set; } 52 | 53 | /// 54 | /// 文本高度 55 | /// 56 | public float Height { get; set; } 57 | 58 | /// 59 | /// 是否独立成行 60 | /// 61 | public bool IsStandalone { get; set; } 62 | 63 | /// 64 | /// 前后的垂直间距 65 | /// 66 | public float VerticalSpaceBefore { get; set; } 67 | public float VerticalSpaceAfter { get; set; } 68 | 69 | /// 70 | /// 语义分析结果 71 | /// 72 | public SemanticAnalysisResult? 
SemanticResult { get; set; } 73 | 74 | public override string ToString() 75 | { 76 | return $"[Page {PageNumber}] \"{Text}\" - Font: {FontName}, Size: {FontSize}, Bold: {IsBold}"; 77 | } 78 | } 79 | 80 | /// 81 | /// 语义分析结果 82 | /// 83 | public class SemanticAnalysisResult 84 | { 85 | /// 86 | /// 是否可能是标题 87 | /// 88 | public bool IsLikelyHeading { get; set; } 89 | 90 | /// 91 | /// 标题置信度 (0-1) 92 | /// 93 | public float HeadingConfidence { get; set; } 94 | 95 | /// 96 | /// 推测的标题层级 (1-6) 97 | /// 98 | public int EstimatedLevel { get; set; } 99 | 100 | /// 101 | /// 分析原因 102 | /// 103 | public List Reasons { get; set; } = new(); 104 | 105 | /// 106 | /// 排除原因(如果不是标题) 107 | /// 108 | public List ExclusionReasons { get; set; } = new(); 109 | } 110 | -------------------------------------------------------------------------------- /src/PdfTocExtractor/Semantic/SemanticAnalysisOptions.cs: -------------------------------------------------------------------------------- 1 | namespace PdfTocExtractor.Semantic; 2 | 3 | /// 4 | /// 语义分析配置选项 5 | /// 6 | public class SemanticAnalysisOptions 7 | { 8 | /// 9 | /// 标题的最小长度 10 | /// 11 | public int MinHeadingLength { get; set; } = 3; 12 | 13 | /// 14 | /// 标题的最大长度 15 | /// 16 | public int MaxHeadingLength { get; set; } = 100; 17 | 18 | /// 19 | /// 字体大小倍数阈值(相对于平均字体大小) 20 | /// 21 | public float FontSizeMultiplier { get; set; } = 1.1f; 22 | 23 | /// 24 | /// 是否将粗体视为标题指示器 25 | /// 26 | public bool ConsiderBoldAsHeading { get; set; } = true; 27 | 28 | /// 29 | /// 最小垂直间距 30 | /// 31 | public float MinVerticalSpacing { get; set; } = 5f; 32 | 33 | /// 34 | /// 最小置信度阈值 35 | /// 36 | public float MinConfidenceThreshold { get; set; } = 0.3f; 37 | 38 | /// 39 | /// 最大标题层级数 40 | /// 41 | public int MaxHeadingLevels { get; set; } = 6; 42 | 43 | /// 44 | /// 是否启用调试模式 45 | /// 46 | public bool DebugMode { get; set; } = false; 47 | 48 | /// 49 | /// 跳过的页面范围(通常是目录页) 50 | /// 51 | public List SkipPages { get; set; } = new() { 1, 2, 3 }; 52 | 53 | 
/// 54 | /// 是否忽略页眉页脚 55 | /// 56 | public bool IgnoreHeaderFooter { get; set; } = true; 57 | 58 | /// 59 | /// 页眉高度 60 | /// 61 | public float HeaderHeight { get; set; } = 50f; 62 | 63 | /// 64 | /// 页脚高度 65 | /// 66 | public float FooterHeight { get; set; } = 50f; 67 | 68 | /// 69 | /// 默认配置 70 | /// 71 | public static SemanticAnalysisOptions Default => new(); 72 | 73 | /// 74 | /// 严格模式配置 75 | /// 76 | public static SemanticAnalysisOptions Strict => new() 77 | { 78 | MinHeadingLength = 5, 79 | MaxHeadingLength = 80, 80 | FontSizeMultiplier = 1.3f, 81 | MinConfidenceThreshold = 0.5f, 82 | MinVerticalSpacing = 8f 83 | }; 84 | 85 | /// 86 | /// 宽松模式配置 87 | /// 88 | public static SemanticAnalysisOptions Relaxed => new() 89 | { 90 | MinHeadingLength = 2, 91 | MaxHeadingLength = 150, 92 | FontSizeMultiplier = 1.05f, 93 | MinConfidenceThreshold = 0.2f, 94 | MinVerticalSpacing = 2f 95 | }; 96 | 97 | /// 98 | /// 调试模式配置 99 | /// 100 | public static SemanticAnalysisOptions Debug => new() 101 | { 102 | DebugMode = true, 103 | MinConfidenceThreshold = 0.1f 104 | }; 105 | } 106 | -------------------------------------------------------------------------------- /src/PdfTocExtractor/Exporters/TextExporter.cs: -------------------------------------------------------------------------------- 1 | using System.Text; 2 | using PdfTocExtractor.Models; 3 | 4 | namespace PdfTocExtractor.Exporters; 5 | 6 | /// 7 | /// 纯文本格式导出器 8 | /// 9 | public class TextExporter : IExporter 10 | { 11 | public string FormatName => "Text"; 12 | public string FileExtension => "txt"; 13 | 14 | public string Export(IEnumerable tocItems, ExportOptions? options = null) 15 | { 16 | options ??= new ExportOptions(); 17 | var sb = new StringBuilder(); 18 | 19 | // 添加文档标题 20 | var title = options.CustomTitle ?? 
"PDF 目录"; 21 | sb.AppendLine(title); 22 | sb.AppendLine(new string('=', title.Length)); 23 | sb.AppendLine(); 24 | 25 | // 过滤深度并导出目录项 26 | var filteredItems = FilterByDepth(tocItems, options.MaxDepth); 27 | ExportItems(filteredItems, sb, options); 28 | 29 | return sb.ToString(); 30 | } 31 | 32 | public async Task ExportToFileAsync(IEnumerable tocItems, string filePath, ExportOptions? options = null) 33 | { 34 | var content = Export(tocItems, options); 35 | options ??= new ExportOptions(); 36 | await File.WriteAllTextAsync(filePath, content, options.Encoding); 37 | } 38 | 39 | private void ExportItems(IEnumerable items, StringBuilder sb, ExportOptions options) 40 | { 41 | foreach (var item in items) 42 | { 43 | // 生成缩进 44 | var indentLevel = Math.Max(0, item.Level); 45 | var indent = string.Concat(Enumerable.Repeat(options.IndentString, indentLevel)); 46 | 47 | // 构建项目文本 48 | var itemText = new StringBuilder(); 49 | itemText.Append($"{indent}- {item.Title}"); 50 | 51 | // 添加页码信息 52 | if (options.IncludePageNumbers && !string.IsNullOrEmpty(item.Page) && item.Page != "无页码" && item.Page != "N/A") 53 | { 54 | var pageText = string.Format(options.PageNumberFormat, item.Page); 55 | // 根据页码格式决定括号类型:默认中文格式使用中文括号,自定义格式使用英文括号 56 | var isDefaultFormat = options.PageNumberFormat == "第 {0} 页"; 57 | var brackets = isDefaultFormat ? ("(", ")") : ("(", ")"); 58 | // 中文格式不需要空格,英文格式需要空格 59 | var spacing = isDefaultFormat ? 
"" : " "; 60 | itemText.Append($"{spacing}{brackets.Item1}{pageText}{brackets.Item2}"); 61 | } 62 | 63 | sb.AppendLine(itemText.ToString()); 64 | 65 | // 递归处理子项目 66 | if (item.HasChildren) 67 | { 68 | ExportItems(item.Children, sb, options); 69 | } 70 | } 71 | } 72 | 73 | private IEnumerable FilterByDepth(IEnumerable items, int maxDepth) 74 | { 75 | if (maxDepth <= 0) return items; 76 | 77 | return items.Where(item => item.Level <= maxDepth).Select(item => new TocItem 78 | { 79 | Title = item.Title, 80 | Page = item.Page, 81 | Level = item.Level, 82 | Parent = item.Parent, 83 | Children = FilterByDepth(item.Children, maxDepth).ToList() 84 | }); 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/PdfTocExtractor/Exporters/JsonExporter.cs: -------------------------------------------------------------------------------- 1 | using System.Text.Json; 2 | using System.Text.Json.Serialization; 3 | using PdfTocExtractor.Models; 4 | 5 | namespace PdfTocExtractor.Exporters; 6 | 7 | /// 8 | /// JSON导出数据结构 9 | /// 10 | public class JsonExportData 11 | { 12 | public string Title { get; set; } = string.Empty; 13 | public DateTime GeneratedAt { get; set; } 14 | public IEnumerable Items { get; set; } = Enumerable.Empty(); 15 | } 16 | 17 | /// 18 | /// JSON格式导出器 19 | /// 20 | public class JsonExporter : IExporter 21 | { 22 | public string FormatName => "JSON"; 23 | public string FileExtension => "json"; 24 | 25 | private static readonly JsonSerializerOptions DefaultJsonOptions = new() 26 | { 27 | WriteIndented = true, 28 | PropertyNamingPolicy = JsonNamingPolicy.CamelCase, 29 | DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingNull 30 | }; 31 | 32 | public string Export(IEnumerable tocItems, ExportOptions? options = null) 33 | { 34 | options ??= new ExportOptions(); 35 | 36 | var exportData = new JsonExportData 37 | { 38 | Title = options.CustomTitle ?? 
"PDF 目录", 39 | GeneratedAt = DateTime.Now, 40 | Items = FilterByDepth(tocItems, options.MaxDepth).Select(item => ConvertToJsonObject(item, options)) 41 | }; 42 | 43 | return JsonSerializer.Serialize(exportData, DefaultJsonOptions); 44 | } 45 | 46 | public async Task ExportToFileAsync(IEnumerable tocItems, string filePath, ExportOptions? options = null) 47 | { 48 | var content = Export(tocItems, options); 49 | options ??= new ExportOptions(); 50 | await File.WriteAllTextAsync(filePath, content, options.Encoding); 51 | } 52 | 53 | private object ConvertToJsonObject(TocItem item, ExportOptions options) 54 | { 55 | var result = new Dictionary 56 | { 57 | ["title"] = item.Title, 58 | ["level"] = item.Level 59 | }; 60 | 61 | if (options.IncludePageNumbers && !string.IsNullOrEmpty(item.Page) && item.Page != "无页码" && item.Page != "N/A") 62 | { 63 | result["page"] = item.Page; 64 | if (item.PageNumber > 0) 65 | { 66 | result["pageNumber"] = item.PageNumber; 67 | } 68 | } 69 | 70 | if (item.HasChildren) 71 | { 72 | result["children"] = item.Children.Select(child => ConvertToJsonObject(child, options)).ToArray(); 73 | } 74 | 75 | // 添加完整路径信息 76 | result["fullPath"] = item.GetFullPath(); 77 | 78 | return result; 79 | } 80 | 81 | private IEnumerable FilterByDepth(IEnumerable items, int maxDepth) 82 | { 83 | if (maxDepth <= 0) return items; 84 | 85 | return items.Where(item => item.Level <= maxDepth).Select(item => new TocItem 86 | { 87 | Title = item.Title, 88 | Page = item.Page, 89 | Level = item.Level, 90 | Parent = item.Parent, 91 | Children = FilterByDepth(item.Children, maxDepth).ToList() 92 | }); 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /PdfTocExtractor.sln: -------------------------------------------------------------------------------- 1 | 2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "src", "src", "{C669A9F3-C228-4EDE-9136-7AF17B055AFA}" 
4 | EndProject 5 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "PdfTocExtractor", "src\PdfTocExtractor\PdfTocExtractor.csproj", "{87484A40-376B-4E0C-AF0D-6D60AECA5603}" 6 | EndProject 7 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "PdfTocExtractor.Cli", "src\PdfTocExtractor.Cli\PdfTocExtractor.Cli.csproj", "{6D134678-C077-4E9A-968D-51520CE6FB5C}" 8 | EndProject 9 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "PdfTocExtractor.Example", "src\PdfTocExtractor.Example\PdfTocExtractor.Example.csproj", "{B60EFDEA-D11C-4317-9637-F239F9FFF6DF}" 10 | EndProject 11 | Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "docs", "docs", "{3AA68D4C-0689-44CC-B61B-011B6AB8112C}" 12 | ProjectSection(SolutionItems) = preProject 13 | README.md = README.md 14 | EndProjectSection 15 | EndProject 16 | Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "tests", "tests", "{E5F6G7H8-I9J0-1234-EFGH-567890123456}" 17 | EndProject 18 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "PdfTocExtractor.Tests", "tests\PdfTocExtractor.Tests\PdfTocExtractor.Tests.csproj", "{D4E5F6G7-H8I9-0123-DEFG-456789012345}" 19 | EndProject 20 | Global 21 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 22 | Debug|Any CPU = Debug|Any CPU 23 | Release|Any CPU = Release|Any CPU 24 | EndGlobalSection 25 | GlobalSection(NestedProjects) = preSolution 26 | {87484A40-376B-4E0C-AF0D-6D60AECA5603} = {C669A9F3-C228-4EDE-9136-7AF17B055AFA} 27 | {6D134678-C077-4E9A-968D-51520CE6FB5C} = {C669A9F3-C228-4EDE-9136-7AF17B055AFA} 28 | {B60EFDEA-D11C-4317-9637-F239F9FFF6DF} = {C669A9F3-C228-4EDE-9136-7AF17B055AFA} 29 | {D4E5F6G7-H8I9-0123-DEFG-456789012345} = {E5F6G7H8-I9J0-1234-EFGH-567890123456} 30 | EndGlobalSection 31 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 32 | {87484A40-376B-4E0C-AF0D-6D60AECA5603}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 33 | {87484A40-376B-4E0C-AF0D-6D60AECA5603}.Debug|Any CPU.Build.0 = Debug|Any CPU 34 | 
{87484A40-376B-4E0C-AF0D-6D60AECA5603}.Release|Any CPU.ActiveCfg = Release|Any CPU 35 | {87484A40-376B-4E0C-AF0D-6D60AECA5603}.Release|Any CPU.Build.0 = Release|Any CPU 36 | {6D134678-C077-4E9A-968D-51520CE6FB5C}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 37 | {6D134678-C077-4E9A-968D-51520CE6FB5C}.Debug|Any CPU.Build.0 = Debug|Any CPU 38 | {6D134678-C077-4E9A-968D-51520CE6FB5C}.Release|Any CPU.ActiveCfg = Release|Any CPU 39 | {6D134678-C077-4E9A-968D-51520CE6FB5C}.Release|Any CPU.Build.0 = Release|Any CPU 40 | {B60EFDEA-D11C-4317-9637-F239F9FFF6DF}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 41 | {B60EFDEA-D11C-4317-9637-F239F9FFF6DF}.Debug|Any CPU.Build.0 = Debug|Any CPU 42 | {B60EFDEA-D11C-4317-9637-F239F9FFF6DF}.Release|Any CPU.ActiveCfg = Release|Any CPU 43 | {B60EFDEA-D11C-4317-9637-F239F9FFF6DF}.Release|Any CPU.Build.0 = Release|Any CPU 44 | {D4E5F6G7-H8I9-0123-DEFG-456789012345}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 45 | {D4E5F6G7-H8I9-0123-DEFG-456789012345}.Debug|Any CPU.Build.0 = Debug|Any CPU 46 | {D4E5F6G7-H8I9-0123-DEFG-456789012345}.Release|Any CPU.ActiveCfg = Release|Any CPU 47 | {D4E5F6G7-H8I9-0123-DEFG-456789012345}.Release|Any CPU.Build.0 = Release|Any CPU 48 | EndGlobalSection 49 | EndGlobal 50 | -------------------------------------------------------------------------------- /docs/UPGRADE_SUMMARY.md: -------------------------------------------------------------------------------- 1 | # iTextSharp 升级到 iText 9.2.0 总结 2 | 3 | ## 升级概述 4 | 5 | 成功将项目从 iTextSharp 5.5.13.4 升级到 iText 9.2.0,这是一次重大的现代化升级。 6 | 7 | ## 主要变更 8 | 9 | ### 1. 包依赖更新 10 | - **移除**: `iTextSharp 5.5.13.4` 11 | - **添加**: `itext 9.2.0` 12 | - **添加**: `itext7.bouncy-castle-adapter 9.2.0` (必需,用于加密PDF支持) 13 | 14 | ### 2. 命名空间变更 15 | ```csharp 16 | // 旧版本 17 | using iTextSharp.text.pdf; 18 | 19 | // 新版本 20 | using iText.Kernel.Pdf; 21 | ``` 22 | 23 | ### 3. 
API 变更 24 | 25 | #### PDF 文档读取 26 | ```csharp 27 | // 旧版本 28 | using var reader = new PdfReader(pdfPath); 29 | var bookmarks = SimpleBookmark.GetBookmark(reader); 30 | 31 | // 新版本 32 | using var reader = new PdfReader(pdfPath); 33 | using var pdfDoc = new PdfDocument(reader); 34 | var outlines = pdfDoc.GetOutlines(false); 35 | var bookmarks = outlines.GetAllChildren(); 36 | ``` 37 | 38 | #### 书签处理 39 | ```csharp 40 | // 旧版本 41 | private List ConvertBookmarksToTocItems(IList> bookmarks) 42 | 43 | // 新版本 44 | private List ConvertBookmarksToTocItems(IList bookmarks, PdfDocument pdfDoc) 45 | ``` 46 | 47 | #### 页码获取 48 | ```csharp 49 | // 旧版本 50 | private static string GetBookmarkPage(Dictionary bookmark) 51 | { 52 | if (!bookmark.TryGetValue("Page", out var pageObj) || pageObj is not string page) 53 | return "无页码"; 54 | // 处理 "5 XYZ ..." 格式 55 | if (page.Contains(' ')) 56 | page = page.Split(' ')[0]; 57 | return page; 58 | } 59 | 60 | // 新版本 61 | private static string GetBookmarkPage(PdfOutline bookmark, PdfDocument pdfDoc) 62 | { 63 | try 64 | { 65 | var destination = bookmark.GetDestination(); 66 | if (destination != null && destination.GetPdfObject() != null) 67 | { 68 | // 通过目标对象和页面引用获取页码 69 | // 详细实现见源码 70 | } 71 | } 72 | catch { } 73 | return "无页码"; 74 | } 75 | ``` 76 | 77 | ## 升级优势 78 | 79 | ### 1. 技术优势 80 | - ✅ **原生 .NET 8 支持**: 消除了 NU1701 兼容性警告 81 | - ✅ **现代化 API**: 更清晰、更一致的 API 设计 82 | - ✅ **更好的性能**: 基于近十年的优化经验重新设计 83 | - ✅ **持续维护**: 活跃的开发和支持 84 | - ✅ **更好的文档**: 完善的官方文档和示例 85 | 86 | ### 2. 功能优势 87 | - ✅ **更好的 Unicode 支持**: 改进的多语言文本处理 88 | - ✅ **增强的安全性**: 支持最新的 PDF 安全标准 89 | - ✅ **扩展性**: 模块化设计,支持各种插件 90 | - ✅ **PDF 2.0 支持**: 支持最新的 PDF 标准 91 | 92 | ### 3. 
开发体验 93 | - ✅ **类型安全**: 更强的类型检查 94 | - ✅ **异常处理**: 更清晰的错误信息 95 | - ✅ **调试支持**: 更好的调试体验 96 | - ✅ **诊断工具**: 新增 `pdftoc diagnose` 命令用于故障排除 97 | 98 | ## 测试结果 99 | 100 | 升级后所有测试通过: 101 | - **总测试数**: 155 个 102 | - **通过数**: 155 个 (100%) 103 | - **失败数**: 0 个 104 | - **执行时间**: 0.8 秒 105 | 106 | ## 兼容性说明 107 | 108 | ### 许可证变更 109 | - **iTextSharp 5.x**: AGPL v3 许可证 110 | - **iText 9.x**: AGPL v3 许可证(商业使用需要商业许可证) 111 | 112 | ### 向后兼容性 113 | - API 有重大变更,不是简单的替换 114 | - 需要代码迁移,但核心功能保持一致 115 | - 所有现有功能都能正常工作 116 | 117 | ### 加密PDF支持 118 | - **重要**: iText 9.x 需要额外的 `itext7.bouncy-castle-adapter` 依赖来处理加密PDF 119 | - 没有此依赖会导致 "PdfEncryption exception" 错误 120 | - 添加依赖后,可以正常处理各种加密类型的PDF文件 121 | 122 | ## 后续建议 123 | 124 | ### 1. 立即行动 125 | - ✅ 升级已完成,所有功能正常 126 | - ✅ 测试全部通过 127 | - ✅ 文档已更新 128 | 129 | ### 2. 长期优化 130 | - 🔄 考虑利用 iText 9 的新功能 131 | - 🔄 优化页码获取算法(如需要) 132 | - 🔄 探索 iText 插件生态系统 133 | 134 | ### 3. 监控 135 | - 📊 监控生产环境性能 136 | - 📊 收集用户反馈 137 | - 📊 关注 iText 版本更新 138 | 139 | ## 结论 140 | 141 | 升级成功!项目现在使用现代化的 iText 9.2.0 库,具有更好的性能、安全性和可维护性。所有现有功能保持完整,同时为未来的功能扩展奠定了坚实基础。 142 | 143 | --- 144 | 145 | **升级日期**: 2025-08-01 146 | **升级人员**: AI Assistant 147 | **测试状态**: ✅ 全部通过 148 | **生产就绪**: ✅ 是 149 | -------------------------------------------------------------------------------- /src/PdfTocExtractor.Example/Program.cs: -------------------------------------------------------------------------------- 1 | using PdfTocExtractor; 2 | using PdfTocExtractor.Exporters; 3 | 4 | Console.WriteLine("=== PDF TOC Extractor 示例程序 ==="); 5 | Console.WriteLine(); 6 | 7 | // 示例PDF文件路径(您需要替换为实际的PDF文件路径) 8 | var pdfPath = @"C:\path\to\your\document.pdf"; 9 | 10 | // 检查文件是否存在 11 | if (!File.Exists(pdfPath)) 12 | { 13 | Console.WriteLine("请修改 Program.cs 中的 pdfPath 变量,指向一个实际的PDF文件。"); 14 | Console.WriteLine("当前路径: " + pdfPath); 15 | return; 16 | } 17 | 18 | try 19 | { 20 | // 创建提取器实例 21 | var extractor = new PdfTocExtractor.PdfTocExtractor(); 22 | 23 | Console.WriteLine($"正在从PDF文件提取目录: 
{Path.GetFileName(pdfPath)}"); 24 | 25 | // 提取目录 26 | var tocItems = await extractor.ExtractTocAsync(pdfPath); 27 | 28 | Console.WriteLine($"成功提取 {tocItems.Count} 个顶级目录项"); 29 | 30 | // 显示目录结构 31 | Console.WriteLine("\n=== 目录结构 ==="); 32 | PrintTocItems(tocItems); 33 | 34 | // 导出为不同格式 35 | var baseFileName = Path.GetFileNameWithoutExtension(pdfPath); 36 | var outputDir = Path.GetDirectoryName(pdfPath) ?? Environment.CurrentDirectory; 37 | 38 | // 配置导出选项 39 | var exportOptions = new ExportOptions 40 | { 41 | CustomTitle = $"{baseFileName} - 目录", 42 | MaxDepth = 0, // 无限制 43 | IncludePageNumbers = true, 44 | IncludeLinks = false 45 | }; 46 | 47 | // 导出为Markdown 48 | var markdownPath = Path.Combine(outputDir, $"{baseFileName}_toc.md"); 49 | await extractor.ExportToFileAsync(tocItems, markdownPath, "markdown", exportOptions); 50 | Console.WriteLine($"\n✓ Markdown格式已导出到: {markdownPath}"); 51 | 52 | // 导出为JSON 53 | var jsonPath = Path.Combine(outputDir, $"{baseFileName}_toc.json"); 54 | await extractor.ExportToFileAsync(tocItems, jsonPath, "json", exportOptions); 55 | Console.WriteLine($"✓ JSON格式已导出到: {jsonPath}"); 56 | 57 | // 导出为XML 58 | var xmlPath = Path.Combine(outputDir, $"{baseFileName}_toc.xml"); 59 | await extractor.ExportToFileAsync(tocItems, xmlPath, "xml", exportOptions); 60 | Console.WriteLine($"✓ XML格式已导出到: {xmlPath}"); 61 | 62 | // 导出为纯文本 63 | var textPath = Path.Combine(outputDir, $"{baseFileName}_toc.txt"); 64 | await extractor.ExportToFileAsync(tocItems, textPath, "text", exportOptions); 65 | Console.WriteLine($"✓ 纯文本格式已导出到: {textPath}"); 66 | 67 | // 演示字符串导出 68 | Console.WriteLine("\n=== Markdown格式预览 ==="); 69 | var markdownContent = extractor.ExportToString(tocItems, "markdown", exportOptions); 70 | Console.WriteLine(markdownContent.Length > 500 71 | ? markdownContent.Substring(0, 500) + "..." 
72 | : markdownContent); 73 | 74 | Console.WriteLine("\n=== 操作完成 ==="); 75 | } 76 | catch (FileNotFoundException ex) 77 | { 78 | Console.WriteLine($"错误: {ex.Message}"); 79 | } 80 | catch (InvalidOperationException ex) 81 | { 82 | Console.WriteLine($"错误: {ex.Message}"); 83 | 84 | if (ex.Message.Contains("没有目录(书签)信息")) 85 | { 86 | Console.WriteLine(); 87 | Console.WriteLine("这个PDF没有嵌入的书签信息。"); 88 | Console.WriteLine("语义分析功能正在开发中,敬请期待!"); 89 | } 90 | } 91 | catch (Exception ex) 92 | { 93 | Console.WriteLine($"未知错误: {ex.Message}"); 94 | } 95 | 96 | static void PrintTocItems(IEnumerable items, int level = 0) 97 | { 98 | foreach (var item in items) 99 | { 100 | var indent = new string(' ', level * 2); 101 | Console.WriteLine($"{indent}- {item.Title} (第 {item.Page} 页)"); 102 | 103 | if (item.HasChildren) 104 | { 105 | PrintTocItems(item.Children, level + 1); 106 | } 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /src/PdfTocExtractor/Exporters/MarkdownExporter.cs: --------------------------------------------------------------------------------
using System.Text;
using PdfTocExtractor.Models;

namespace PdfTocExtractor.Exporters;

/// <summary>
/// Exports table-of-contents items as a Markdown document: a level-1 heading
/// followed by a nested bullet list, one bullet per item.
/// </summary>
public class MarkdownExporter : IExporter
{
    /// <summary>
    /// Page-number format that is rendered with full-width brackets and no
    /// leading space. Any other format is rendered with ASCII brackets
    /// preceded by a space.
    /// </summary>
    private const string DefaultPageNumberFormat = "第 {0} 页";

    /// <summary>Display name of the format.</summary>
    public string FormatName => "Markdown";

    /// <summary>File extension, without the leading dot.</summary>
    public string FileExtension => "md";

    /// <summary>
    /// Renders the items as Markdown text.
    /// </summary>
    /// <param name="tocItems">Items to export.</param>
    /// <param name="options">Export settings; a default <see cref="ExportOptions"/> is used when null.</param>
    /// <returns>The Markdown document as a string.</returns>
    public string Export(IEnumerable<TocItem> tocItems, ExportOptions? options = null)
    {
        options ??= new ExportOptions();
        var sb = new StringBuilder();

        // Document heading followed by a blank line.
        var title = options.CustomTitle ?? "PDF 目录";
        sb.AppendLine($"# {title}");
        sb.AppendLine();

        // Apply the depth limit first, then write the remaining tree.
        var filteredItems = FilterByDepth(tocItems, options.MaxDepth);
        ExportItems(filteredItems, sb, options);

        return sb.ToString();
    }

    /// <summary>
    /// Renders the items and writes the result to <paramref name="filePath"/>
    /// using the encoding from the options.
    /// </summary>
    /// <param name="tocItems">Items to export.</param>
    /// <param name="filePath">Destination file path.</param>
    /// <param name="options">Export settings; a default <see cref="ExportOptions"/> is used when null.</param>
    public async Task ExportToFileAsync(IEnumerable<TocItem> tocItems, string filePath, ExportOptions? options = null)
    {
        options ??= new ExportOptions();
        var content = Export(tocItems, options);
        await File.WriteAllTextAsync(filePath, content, options.Encoding);
    }

    /// <summary>
    /// Appends one bullet line per item, then recurses into the item's children.
    /// </summary>
    private void ExportItems(IEnumerable<TocItem> items, StringBuilder sb, ExportOptions options)
    {
        foreach (var item in items)
        {
            // One IndentString per level; a negative level is treated as 0.
            var indentLevel = Math.Max(0, item.Level);
            sb.Append(string.Concat(Enumerable.Repeat(options.IndentString, indentLevel)));
            sb.Append("- ");

            if (options.IncludeLinks && item.PageNumber > 0)
            {
                // The page number doubles as the anchor target.
                sb.Append($"[{item.Title}](#{item.PageNumber})");
            }
            else
            {
                sb.Append(item.Title);
            }

            if (options.IncludePageNumbers && HasUsablePage(item.Page))
            {
                var pageText = string.Format(options.PageNumberFormat, item.Page);

                if (options.PageNumberFormat == DefaultPageNumberFormat)
                {
                    // Default Chinese format: full-width brackets, no separating space.
                    sb.Append('(').Append(pageText).Append(')');
                }
                else
                {
                    // Custom format: ASCII brackets preceded by a space.
                    sb.Append(" (").Append(pageText).Append(')');
                }
            }

            sb.AppendLine();

            if (item.HasChildren)
            {
                ExportItems(item.Children, sb, options);
            }
        }
    }

    /// <summary>
    /// Returns true when the page text is present and is not one of the
    /// "no page" placeholder values.
    /// </summary>
    private static bool HasUsablePage(string? page)
    {
        return !string.IsNullOrEmpty(page) && page != "无页码" && page != "N/A";
    }

    /// <summary>
    /// Returns the items whose Level does not exceed <paramref name="maxDepth"/>,
    /// as copies whose children are filtered the same way. A value of 0 or less
    /// disables filtering and returns the input unchanged.
    /// </summary>
    private IEnumerable<TocItem> FilterByDepth(IEnumerable<TocItem> items, int maxDepth)
    {
        if (maxDepth <= 0)
        {
            return items;
        }

        return items
            .Where(item => item.Level <= maxDepth)
            .Select(item => new TocItem
            {
                Title = item.Title,
                Page = item.Page,
                Level = item.Level,
                Parent = item.Parent,
                Children = FilterByDepth(item.Children, maxDepth).ToList()
            });
    }
}
-------------------------------------------------------------------------------- /tests/PdfTocExtractor.Tests/TestResults.md: -------------------------------------------------------------------------------- 1 | # PdfTocExtractor 单元测试结果报告 2 | 3 | ## 项目概述 4 | 5 | PdfTocExtractor 是一个用于从PDF文件提取目录(TOC)的.NET库,支持多种输出格式。本报告总结了为该项目创建的全面单元测试套件。 6 | 7 | ## 测试项目结构 8 | 9 | ``` 10 | tests/PdfTocExtractor.Tests/ 11 | ├── Models/ 12 | │ └── TocItemTests.cs # TocItem模型测试 13 | ├── Exporters/ 14 | │ ├── ExportOptionsTests.cs # 导出选项测试 15 | │ ├── MarkdownExporterTests.cs # Markdown导出器测试 16 | │ ├── JsonExporterTests.cs # JSON导出器测试 17 | │ ├── XmlExporterTests.cs # XML导出器测试 18 | │ └── TextExporterTests.cs # 文本导出器测试 19 | ├── Integration/ 20 | │ └── IntegrationTests.cs # 集成测试 21 | ├── TestData/ 22 | │ ├── TestDataBuilder.cs # 测试数据构建器 23 | │ └── MockHelpers.cs # Mock对象辅助类 24 | ├── PdfTocExtractorTests.cs # 主类测试 25 | ├── GlobalUsings.cs # 全局引用 26 | └── PdfTocExtractor.Tests.csproj # 项目文件 27 | ``` 28 | 29 | ## 测试覆盖范围 30 | 31 | ### 1. 
核心模型测试 (TocItemTests.cs) 32 | - ✅ 构造函数和默认值 33 | - ✅ 属性设置和获取 34 | - ✅ 页码解析逻辑 35 | - ✅ 层级关系处理 36 | - ✅ 子项目管理 37 | - ✅ 路径生成功能 38 | - ✅ ToString格式化 39 | 40 | ### 2. 导出选项测试 (ExportOptionsTests.cs) 41 | - ✅ 默认值验证 42 | - ✅ 各属性的设置和获取 43 | - ✅ 编码设置 44 | - ✅ 格式化选项 45 | 46 | ### 3. 导出器测试 47 | #### Markdown导出器 (MarkdownExporterTests.cs) 48 | - ✅ 基本导出功能 49 | - ✅ 层级结构处理 50 | - ✅ 自定义选项支持 51 | - ✅ 文件导出功能 52 | - ⚠️ 括号格式差异(实际使用中文括号) 53 | 54 | #### JSON导出器 (JsonExporterTests.cs) 55 | - ✅ JSON结构生成 56 | - ✅ 层级关系序列化 57 | - ✅ 选项过滤功能 58 | - ⚠️ 时间戳字段差异 59 | 60 | #### XML导出器 (XmlExporterTests.cs) 61 | - ⚠️ XML结构与测试期望不匹配 62 | - ⚠️ 根元素名称差异 63 | - ⚠️ 属性结构差异 64 | 65 | #### 文本导出器 (TextExporterTests.cs) 66 | - ✅ 基本文本格式 67 | - ✅ 标题下划线生成 68 | - ⚠️ 括号格式差异 69 | 70 | ### 4. 主类测试 (PdfTocExtractorTests.cs) 71 | - ✅ 构造函数和初始化 72 | - ✅ 导出器注册功能 73 | - ✅ 格式支持查询 74 | - ✅ 字符串导出功能 75 | - ✅ 文件导出功能 76 | - ⚠️ 异常类型差异(NotSupportedException vs ArgumentException) 77 | 78 | ### 5. 集成测试 (IntegrationTests.cs) 79 | - ✅ 端到端工作流程 80 | - ✅ 多格式导出一致性 81 | - ✅ 性能测试 82 | - ✅ 并发导出测试 83 | - ✅ 自定义导出器集成 84 | 85 | ## 测试运行结果 86 | 87 | **总计**: 150个测试 88 | **成功**: 106个测试 89 | **失败**: 44个测试 90 | **跳过**: 0个测试 91 | 92 | ## 主要问题分析 93 | 94 | ### 1. 格式差异问题 95 | **问题**: 实际实现使用中文括号"()",测试期望英文括号"()" 96 | **影响**: Markdown和Text导出器的多个测试失败 97 | **建议**: 更新测试以匹配实际实现,或修改实现以使用英文括号 98 | 99 | ### 2. XML结构差异 100 | **问题**: XML导出器的实际结构与测试期望不匹配 101 | - 根元素: `PdfTableOfContents` vs `TableOfContents` 102 | - 属性结构差异 103 | - 元素嵌套方式不同 104 | **建议**: 重新设计XML测试以匹配实际实现 105 | 106 | ### 3. 异常类型不匹配 107 | **问题**: 主类抛出`NotSupportedException`,测试期望`ArgumentException` 108 | **建议**: 统一异常类型或更新测试期望 109 | 110 | ### 4. 
JSON字段差异 111 | **问题**: JSON导出器的时间戳字段名称和结构与测试期望不同 112 | **建议**: 更新测试以匹配实际JSON结构 113 | 114 | ## 测试框架和工具 115 | 116 | - **测试框架**: xUnit 2.6.1 117 | - **断言库**: FluentAssertions 6.12.0 118 | - **Mock框架**: Moq 4.20.69 119 | - **覆盖率工具**: coverlet.collector 6.0.0 120 | - **测试运行器**: Microsoft.NET.Test.Sdk 17.8.0 121 | 122 | ## 测试数据和辅助工具 123 | 124 | ### TestDataBuilder 125 | 提供各种测试场景的数据构建方法: 126 | - 简单目录项 127 | - 层级结构目录项 128 | - 深层嵌套目录项 129 | - 特殊字符目录项 130 | - 各种页码格式 131 | - 大量数据集 132 | 133 | ### MockHelpers 134 | 提供Mock对象和辅助功能: 135 | - Mock导出器创建 136 | - 临时文件管理 137 | - 数据验证工具 138 | - 格式验证方法 139 | 140 | ## 建议的改进措施 141 | 142 | ### 1. 立即修复 143 | 1. 更新测试以匹配实际的括号格式 144 | 2. 修复XML导出器测试结构 145 | 3. 统一异常类型处理 146 | 4. 更新JSON测试期望 147 | 148 | ### 2. 长期改进 149 | 1. 添加更多边界条件测试 150 | 2. 增加性能基准测试 151 | 3. 添加内存使用测试 152 | 4. 实现测试覆盖率报告 153 | 154 | ### 3. 代码质量 155 | 1. 考虑标准化括号使用(建议使用英文括号) 156 | 2. 统一异常处理策略 157 | 3. 改进XML结构的可读性 158 | 4. 添加更多文档和示例 159 | 160 | ## 结论 161 | 162 | 虽然当前有44个测试失败,但这些主要是由于测试期望与实际实现之间的格式差异造成的,而不是功能性问题。核心功能(目录提取、格式转换、文件操作)都工作正常。 163 | 164 | 通过修复这些格式差异,测试套件将为PdfTocExtractor项目提供全面的质量保证,确保代码的可靠性和可维护性。 165 | 166 | ## 下一步行动 167 | 168 | 1. 修复测试中的格式期望差异 169 | 2. 重新运行测试验证修复效果 170 | 3. 生成测试覆盖率报告 171 | 4. 完善文档和使用示例 172 | 5. 考虑添加更多实际PDF文件的集成测试 173 | -------------------------------------------------------------------------------- /src/PdfTocExtractor/Exporters/XmlExporter.cs: --------------------------------------------------------------------------------
using System.Globalization;
using System.Text;
using System.Xml;
using PdfTocExtractor.Models;

namespace PdfTocExtractor.Exporters;

/// <summary>
/// Exports table-of-contents items as an XML document whose root element is
/// <c>PdfTableOfContents</c> and whose entries are nested <c>Item</c> elements.
/// </summary>
public class XmlExporter : IExporter
{
    /// <summary>Display name of the format.</summary>
    public string FormatName => "XML";

    /// <summary>File extension, without the leading dot.</summary>
    public string FileExtension => "xml";

    /// <summary>
    /// Renders the items as an XML document string.
    /// </summary>
    /// <param name="tocItems">Items to export.</param>
    /// <param name="options">Export settings; a default <see cref="ExportOptions"/> is used when null.</param>
    /// <returns>The XML text, including an XML declaration that names <c>options.Encoding</c>.</returns>
    public string Export(IEnumerable<TocItem> tocItems, ExportOptions? options = null)
    {
        options ??= new ExportOptions();

        var settings = new XmlWriterSettings
        {
            Indent = true,
            IndentChars = options.IndentString,
            Encoding = options.Encoding,
            OmitXmlDeclaration = false
        };

        // An XmlWriter created over a TextWriter takes the encoding name for the
        // XML declaration from that TextWriter and ignores settings.Encoding.
        // A plain StringWriter reports UTF-16, which would contradict the bytes
        // ExportToFileAsync writes with options.Encoding.
        using var stringWriter = new EncodingStringWriter(options.Encoding);

        using (var xmlWriter = XmlWriter.Create(stringWriter, settings))
        {
            xmlWriter.WriteStartDocument();
            xmlWriter.WriteStartElement("PdfTableOfContents");

            // Metadata attributes on the root element.
            xmlWriter.WriteAttributeString("title", options.CustomTitle ?? "PDF 目录");
            xmlWriter.WriteAttributeString(
                "generated",
                DateTime.Now.ToString("yyyy-MM-dd HH:mm:ss", CultureInfo.InvariantCulture));

            foreach (var item in FilterByDepth(tocItems, options.MaxDepth))
            {
                WriteItem(xmlWriter, item, options);
            }

            xmlWriter.WriteEndElement(); // PdfTableOfContents
            xmlWriter.WriteEndDocument();
        } // Disposing the XmlWriter flushes everything into the StringWriter.

        return stringWriter.ToString();
    }

    /// <summary>
    /// Renders the items and writes the result to <paramref name="filePath"/>
    /// using the encoding from the options.
    /// </summary>
    /// <param name="tocItems">Items to export.</param>
    /// <param name="filePath">Destination file path.</param>
    /// <param name="options">Export settings; a default <see cref="ExportOptions"/> is used when null.</param>
    public async Task ExportToFileAsync(IEnumerable<TocItem> tocItems, string filePath, ExportOptions? options = null)
    {
        options ??= new ExportOptions();
        var content = Export(tocItems, options);
        await File.WriteAllTextAsync(filePath, content, options.Encoding);
    }

    /// <summary>
    /// Writes one <c>Item</c> element (attributes, title, full path) and
    /// recursively writes its children inside a <c>Children</c> element.
    /// </summary>
    private void WriteItem(XmlWriter writer, TocItem item, ExportOptions options)
    {
        writer.WriteStartElement("Item");

        writer.WriteAttributeString("level", item.Level.ToString());

        // Page attributes are skipped for empty or placeholder page values.
        var hasPage = !string.IsNullOrEmpty(item.Page) && item.Page != "无页码" && item.Page != "N/A";
        if (options.IncludePageNumbers && hasPage)
        {
            writer.WriteAttributeString("page", item.Page);
            if (item.PageNumber > 0)
            {
                writer.WriteAttributeString("pageNumber", item.PageNumber.ToString());
            }
        }

        writer.WriteElementString("Title", item.Title);
        writer.WriteElementString("FullPath", item.GetFullPath());

        if (item.HasChildren)
        {
            writer.WriteStartElement("Children");
            foreach (var child in item.Children)
            {
                WriteItem(writer, child, options);
            }
            writer.WriteEndElement(); // Children
        }

        writer.WriteEndElement(); // Item
    }

    /// <summary>
    /// Returns the items whose Level does not exceed <paramref name="maxDepth"/>,
    /// as copies whose children are filtered the same way. A value of 0 or less
    /// disables filtering and returns the input unchanged.
    /// </summary>
    private IEnumerable<TocItem> FilterByDepth(IEnumerable<TocItem> items, int maxDepth)
    {
        if (maxDepth <= 0)
        {
            return items;
        }

        return items
            .Where(item => item.Level <= maxDepth)
            .Select(item => new TocItem
            {
                Title = item.Title,
                Page = item.Page,
                Level = item.Level,
                Parent = item.Parent,
                Children = FilterByDepth(item.Children, maxDepth).ToList()
            });
    }

    /// <summary>
    /// A <see cref="StringWriter"/> that reports a caller-chosen encoding, so the
    /// XML declaration names the encoding the text will be saved with.
    /// </summary>
    private sealed class EncodingStringWriter : StringWriter
    {
        private readonly Encoding _encoding;

        public EncodingStringWriter(Encoding encoding)
        {
            _encoding = encoding;
        }

        public override Encoding Encoding => _encoding;
    }
}
-------------------------------------------------------------------------------- /src/PdfTocExtractor/README.md: -------------------------------------------------------------------------------- 1 | # PdfTocExtractor 2 | 3 | ![License](https://img.shields.io/badge/license-MIT-blue) 4 | ![.NET](https://img.shields.io/badge/.NET-8.0-purple) 5 | ![NuGet](https://img.shields.io/nuget/v/PdfTocExtractor) 6 | 7 | 
PdfTocExtractor 是一个纯 C# 实现的轻量级PDF目录提取库,用于从PDF文件中提取目录(TOC)并导出为多种格式。支持Markdown、JSON、XML、纯文本等格式,完全摆脱命令行依赖,无需额外的PDF处理工具,适合在 .NET 项目中内嵌、分发或集成自动化流程中使用。 8 | 9 | 通过PdfTocExtractor,您可以轻松从PDF文档中提取书签和目录结构,生成清晰的导航文档,完美适用于文档处理、内容分析和自动化工作流。 10 | 11 | 🚀 跨平台、零依赖、极速提取,一切尽在 PdfTocExtractor! 12 | 13 | ## ✨ 功能特点 14 | 15 | - 📖 从PDF文件提取书签/目录信息 16 | - 📄 支持多种输出格式:Markdown、JSON、XML、纯文本 17 | - 🎯 可配置的导出选项(层级深度、页码格式等) 18 | - 🔧 可扩展的导出器架构,支持自定义格式 19 | - ⚡ 异步操作支持,高性能处理 20 | - 🌐 跨平台支持:Windows、Linux、macOS 21 | 22 | ## 📦 安装 23 | 24 | ### 通过 NuGet 安装 25 | 26 | ```bash 27 | dotnet add package PdfTocExtractor 28 | ``` 29 | 30 | 或在 Package Manager Console 中: 31 | 32 | ```powershell 33 | Install-Package PdfTocExtractor 34 | ``` 35 | 36 | ## 🚀 快速开始 37 | 38 | ### 基本用法 39 | 40 | ```csharp 41 | using PdfTocExtractor; 42 | using PdfTocExtractor.Exporters; 43 | 44 | // 创建提取器实例(类名与命名空间同名,需使用完整限定名) 45 | var extractor = new PdfTocExtractor.PdfTocExtractor(); 46 | 47 | // 提取目录 48 | var tocItems = await extractor.ExtractTocAsync("document.pdf"); 49 | 50 | // 导出为Markdown 51 | await extractor.ExportToFileAsync(tocItems, "output.md", "markdown"); 52 | 53 | // 导出为JSON 54 | await extractor.ExportToFileAsync(tocItems, "output.json", "json"); 55 | ``` 56 | 57 | ### 高级用法 58 | 59 | ```csharp 60 | // 使用自定义导出选项 61 | var exportOptions = new ExportOptions 62 | { 63 | MaxDepth = 3, 64 | IncludePageNumbers = true, 65 | CustomTitle = "文档目录", 66 | IndentString = " ", 67 | PageNumberFormat = "第 {0} 页" 68 | }; 69 | 70 | await extractor.ExportToFileAsync(tocItems, "output.md", "markdown", exportOptions); 71 | 72 | // 直接从PDF提取并导出 73 | await extractor.ExtractAndExportAsync("document.pdf", "output.xml"); 74 | ``` 75 | 76 | ## 📄 支持的输出格式 77 | 78 | - **Markdown** (`md`, `markdown`) - 适合文档和网页显示,支持层级结构 79 | - **JSON** (`json`) - 适合程序处理和API集成,包含完整元数据 80 | - **XML** (`xml`) - 结构化数据交换,标准化格式 81 | - **Text** (`txt`, `text`) - 纯文本格式,简洁易读 82 | 83 | ## ⚙️ 导出选项 84 | 85 | - `MaxDepth` - 最大层级深度(0表示无限制) 86 | - `IncludePageNumbers` - 是否包含页码信息 87 | - `IncludeLinks` - 是否包含链接(如果格式支持) 88 | - 
`CustomTitle` - 自定义文档标题 89 | - `IndentString` - 缩进字符串 90 | - `PageNumberFormat` - 页码格式化字符串 91 | - `Encoding` - 文件编码格式 92 | 93 | ## 🔧 扩展性 94 | 95 | 您可以通过实现 `IExporter` 接口来创建自定义导出器: 96 | 97 | ```csharp 98 | public class CustomExporter : IExporter 99 | { 100 | public string FormatName => "Custom"; 101 | public string FileExtension => "custom"; 102 | 103 | public string Export(IEnumerable<TocItem> tocItems, ExportOptions? options = null) 104 | { 105 | // 实现自定义导出逻辑 106 | return "custom format content"; 107 | } 108 | 109 | public async Task ExportToFileAsync(IEnumerable<TocItem> tocItems, string filePath, ExportOptions? options = null) 110 | { 111 | var content = Export(tocItems, options); 112 | await File.WriteAllTextAsync(filePath, content); 113 | } 114 | } 115 | 116 | // 注册自定义导出器 117 | extractor.RegisterExporter("custom", new CustomExporter()); 118 | ``` 119 | 120 | ## 📝 示例输出 121 | 122 | ### Markdown 输出示例 123 | 124 | ```markdown 125 | # 文档目录 126 | 127 | - [第1章 概述](#第1章-概述) (第 1 页) 128 | - [1.1 背景](#11-背景) (第 2 页) 129 | - [1.2 目标](#12-目标) (第 3 页) 130 | - [第2章 技术架构](#第2章-技术架构) (第 5 页) 131 | - [2.1 系统设计](#21-系统设计) (第 6 页) 132 | - [2.1.1 核心组件](#211-核心组件) (第 7 页) 133 | - [2.1.2 数据流](#212-数据流) (第 8 页) 134 | ``` 135 | 136 | ### JSON 输出示例 137 | 138 | ```json 139 | { 140 | "title": "文档目录", 141 | "generatedAt": "2024-01-15T10:30:00", 142 | "items": [ 143 | { 144 | "title": "第1章 概述", 145 | "level": 1, 146 | "page": "第 1 页", 147 | "pageNumber": 1, 148 | "children": [ 149 | { 150 | "title": "1.1 背景", 151 | "level": 2, 152 | "page": "第 2 页", 153 | "pageNumber": 2 154 | } 155 | ] 156 | } 157 | ] 158 | } 159 | ``` 160 | 161 | ## 🛠️ 技术实现 162 | 163 | PdfTocExtractor 使用以下技术: 164 | 165 | - **.NET 8.0** - 现代化的.NET平台 166 | - **[iText 9.2.0](https://github.com/itext/itext7-dotnet)** - 强大的PDF处理库 167 | - **[iText7.bouncy-castle-adapter 9.2.0](https://www.nuget.org/packages/itext7.bouncy-castle-adapter)** - 加密PDF支持(必需) 168 | - **[Newtonsoft.Json 13.0.3](https://github.com/JamesNK/Newtonsoft.Json)** - JSON序列化 
169 | 170 | ## 🔧 故障排除 171 | 172 | ### 常见问题 173 | 174 | #### "PdfEncryption exception" 错误 175 | 176 | 如果遇到此错误,通常是因为PDF文件使用了加密或权限保护。请确保已安装 `itext7.bouncy-castle-adapter` 包: 177 | 178 | ```bash 179 | dotnet add package itext7.bouncy-castle-adapter 180 | ``` 181 | 182 | #### "此PDF文件没有目录(书签)信息" 错误 183 | 184 | 这表示PDF文件确实没有嵌入的书签/目录信息。可以: 185 | - 检查PDF是否在其他阅读器中显示目录面板 186 | - 考虑使用其他工具为PDF添加书签 187 | 188 | ## 📄 许可证 189 | 190 | MIT License 191 | 192 | ## 🤝 相关项目 193 | 194 | - [PdfTocExtractor.Cli](https://www.nuget.org/packages/PdfTocExtractor.Cli) - 命令行工具版本 195 | - [项目主页](https://github.com/star-plan/pdf-toc-extractor) - 完整的项目文档和示例 196 | 197 | ## 📞 支持 198 | 199 | 如果您遇到问题或有建议,请: 200 | 201 | - 📋 [提交Issue](https://github.com/star-plan/pdf-toc-extractor/issues) 202 | - 💬 [参与讨论](https://github.com/star-plan/pdf-toc-extractor/discussions) 203 | - ⭐ 如果这个项目对您有帮助,请给个Star! 204 | -------------------------------------------------------------------------------- /src/PdfTocExtractor.Cli/README.md: -------------------------------------------------------------------------------- 1 | # PdfTocExtractor.Cli 2 | 3 | ![License](https://img.shields.io/badge/license-MIT-blue) 4 | ![.NET](https://img.shields.io/badge/.NET-8.0-purple) 5 | ![NuGet](https://img.shields.io/nuget/v/PdfTocExtractor.Cli) 6 | 7 | PdfTocExtractor.Cli 是一个纯 C# AOT 实现的轻量级PDF目录提取命令行工具,用于从PDF文件中提取目录(TOC)并导出为多种格式。支持Markdown、JSON、XML、纯文本等格式,完全摆脱命令行依赖,无需额外的PDF处理工具,适合在自动化流程中使用。 8 | 9 | 通过PdfTocExtractor.Cli,您可以轻松从PDF文档中提取书签和目录结构,生成清晰的导航文档,完美适用于文档处理、内容分析和自动化工作流。 10 | 11 | 🚀 跨平台、零依赖、极速提取,一切尽在 PdfTocExtractor.Cli! 
12 | 13 | ## ✨ 功能特点 14 | 15 | - 📖 从PDF文件提取书签/目录信息 16 | - 📄 支持多种输出格式:Markdown、JSON、XML、纯文本 17 | - 🎯 可配置的导出选项(层级深度、页码格式等) 18 | - 🛠️ 提供诊断工具,帮助排查PDF读取问题 19 | - ⚡ 异步操作支持,高性能处理 20 | - 🚀 支持AOT编译,原生性能无需.NET运行时 21 | - 🌐 跨平台支持:Windows、Linux、macOS 22 | 23 | ## 📦 安装 24 | 25 | ### 作为 .NET Global Tool 安装 26 | 27 | ```bash 28 | dotnet tool install --global PdfTocExtractor.Cli 29 | ``` 30 | 31 | ### 从 GitHub Releases 下载 32 | 33 | 访问 [Releases页面](https://github.com/star-plan/pdf-toc-extractor/releases) 下载适合您平台的可执行文件: 34 | 35 | - **Windows**: `PdfTocExtractor-windows-{version}.zip` 36 | - **Linux**: `PdfTocExtractor-linux-{version}.tar.gz` 37 | - **macOS**: `PdfTocExtractor-macOS-{version}.tar.gz` 38 | 39 | ### 从源码构建 40 | 41 | ```bash 42 | git clone https://github.com/star-plan/pdf-toc-extractor.git 43 | cd pdf-toc-extractor/src/PdfTocExtractor.Cli 44 | dotnet build -c Release 45 | ``` 46 | 47 | ## 🚀 使用方法 48 | 49 | ### 基本用法 50 | 51 | ```bash 52 | # 基本用法 - 提取PDF目录并保存为Markdown 53 | pdftoc extract document.pdf -o output.md 54 | 55 | # 指定输出格式 56 | pdftoc extract document.pdf -o output.json -f json 57 | 58 | # 设置最大层级深度 59 | pdftoc extract document.pdf -o output.xml --max-depth 3 60 | ``` 61 | 62 | ### 高级用法 63 | 64 | ```bash 65 | # 自定义标题和页码格式 66 | pdftoc extract document.pdf -o output.txt --title "我的文档目录" --page-format "第 {0} 页" 67 | 68 | # 包含页码和链接信息 69 | pdftoc extract document.pdf -o output.md --include-pages --include-links 70 | 71 | # 显示详细输出 72 | pdftoc extract document.pdf -o output.md --verbose 73 | 74 | # 诊断PDF文件问题(当遇到读取错误时很有用) 75 | pdftoc diagnose document.pdf 76 | ``` 77 | 78 | ## 📋 参数说明 79 | 80 | ### 提取命令(extract) 81 | 82 | | 参数 | 缩写 | 说明 | 是否必需 | 83 | | --- | --- | --- | --- | 84 | | `input` | - | 输入PDF文件路径 | 是 | 85 | | `--output` | `-o` | 输出文件路径 | 否,默认为控制台输出 | 86 | | `--format` | `-f` | 输出格式 (markdown/json/xml/text) | 否,默认为markdown | 87 | | `--max-depth` | - | 最大层级深度 | 否,默认为0(无限制) | 88 | | `--include-pages` | - | 包含页码信息 | 否,默认为true | 89 | | `--include-links` | - | 包含链接信息 | 
否,默认为false | 90 | | `--title` | - | 自定义文档标题 | 否 | 91 | | `--indent` | - | 缩进字符串 | 否,默认为" " | 92 | | `--page-format` | - | 页码格式化字符串 | 否,默认为"第 {0} 页" | 93 | | `--verbose` | `-v` | 显示详细输出 | 否 | 94 | 95 | ### 诊断命令(diagnose) 96 | 97 | | 参数 | 说明 | 是否必需 | 98 | | --- | --- | --- | 99 | | `pdf-file` | 要诊断的PDF文件路径 | 是 | 100 | 101 | ## 📄 支持的输出格式 102 | 103 | - **Markdown** (`md`, `markdown`) - 适合文档和网页显示,支持层级结构 104 | - **JSON** (`json`) - 适合程序处理和API集成,包含完整元数据 105 | - **XML** (`xml`) - 结构化数据交换,标准化格式 106 | - **Text** (`txt`, `text`) - 纯文本格式,简洁易读 107 | 108 | ## 📝 示例输出 109 | 110 | ### 提取PDF目录 111 | 112 | ``` 113 | 正在从 document.pdf 提取目录... 114 | 找到 15 个目录项 115 | 正在导出为 Markdown 格式... 116 | 目录已成功导出到: output.md 117 | ``` 118 | 119 | ### Markdown 输出示例 120 | 121 | ```markdown 122 | # 文档目录 123 | 124 | - [第1章 概述](#第1章-概述) (第 1 页) 125 | - [1.1 背景](#11-背景) (第 2 页) 126 | - [1.2 目标](#12-目标) (第 3 页) 127 | - [第2章 技术架构](#第2章-技术架构) (第 5 页) 128 | - [2.1 系统设计](#21-系统设计) (第 6 页) 129 | - [2.1.1 核心组件](#211-核心组件) (第 7 页) 130 | - [2.1.2 数据流](#212-数据流) (第 8 页) 131 | ``` 132 | 133 | ### JSON 输出示例 134 | 135 | ```json 136 | { 137 | "title": "文档目录", 138 | "generatedAt": "2024-01-15T10:30:00", 139 | "items": [ 140 | { 141 | "title": "第1章 概述", 142 | "level": 1, 143 | "page": "第 1 页", 144 | "pageNumber": 1, 145 | "children": [ 146 | { 147 | "title": "1.1 背景", 148 | "level": 2, 149 | "page": "第 2 页", 150 | "pageNumber": 2 151 | } 152 | ] 153 | } 154 | ] 155 | } 156 | ``` 157 | 158 | ## 🔧 故障排除 159 | 160 | ### 常见问题 161 | 162 | #### "PdfEncryption exception" 错误 163 | 164 | 如果遇到此错误,通常是因为PDF文件使用了加密或权限保护。请尝试以下解决方案: 165 | 166 | 1. **使用诊断命令**: 167 | ```bash 168 | pdftoc diagnose your-document.pdf 169 | ``` 170 | 这会显示PDF文件的详细信息,帮助诊断问题。 171 | 172 | 2. 
**PDF文件类型**: 173 | - ✅ 支持:有权限保护但无用户密码的PDF 174 | - ✅ 支持:未加密的PDF 175 | - ❌ 不支持:需要用户密码的PDF(功能开发中) 176 | 177 | #### "此PDF文件没有目录(书签)信息" 错误 178 | 179 | 这表示PDF文件确实没有嵌入的书签/目录信息。可以: 180 | - 使用诊断命令确认:`pdftoc diagnose your-document.pdf` 181 | - 检查PDF是否在其他阅读器中显示目录面板 182 | - 考虑使用其他工具为PDF添加书签 183 | 184 | #### 输出文件为空或格式错误 185 | 186 | 1. 检查输入PDF是否有有效的目录结构 187 | 2. 尝试不同的输出格式:`-f json` 或 `-f xml` 188 | 3. 使用 `--verbose` 选项查看详细处理信息 189 | 190 | ## 🛠️ 技术实现 191 | 192 | PdfTocExtractor.Cli 使用以下技术: 193 | 194 | - **.NET 8.0** - 现代化的.NET平台 195 | - **[System.CommandLine 2.0.0](https://github.com/dotnet/command-line-api)** - 命令行参数解析 196 | - **[PdfTocExtractor](https://www.nuget.org/packages/PdfTocExtractor)** - 核心PDF处理库 197 | - **AOT编译支持** - 原生性能,无需.NET运行时 198 | 199 | ## 📄 许可证 200 | 201 | MIT License 202 | 203 | ## 🤝 相关项目 204 | 205 | - [PdfTocExtractor](https://www.nuget.org/packages/PdfTocExtractor) - 核心库,用于在.NET项目中集成 206 | - [项目主页](https://github.com/star-plan/pdf-toc-extractor) - 完整的项目文档和示例 207 | 208 | ## 📞 支持 209 | 210 | 如果您遇到问题或有建议,请: 211 | 212 | - 📋 [提交Issue](https://github.com/star-plan/pdf-toc-extractor/issues) 213 | - 💬 [参与讨论](https://github.com/star-plan/pdf-toc-extractor/discussions) 214 | - ⭐ 如果这个项目对您有帮助,请给个Star! 
215 | -------------------------------------------------------------------------------- /tests/PdfTocExtractor.Tests/DebugTest.cs: --------------------------------------------------------------------------------
using PdfTocExtractor.Exporters;
using PdfTocExtractor.Models;
using Xunit;

namespace PdfTocExtractor.Tests;

/// <summary>
/// Diagnostic tests that print the raw output of each exporter so a failing
/// format expectation can be compared against what the exporter really emits.
/// Each test also asserts on the content it prints, so it fails when an
/// exporter stops producing the expected entries.
/// </summary>
public class DebugTest
{
    /// <summary>Builds a list holding one top-level "Chapter 1" item on page 5.</summary>
    private static List<TocItem> SingleChapter()
    {
        return new List<TocItem>
        {
            new TocItem { Title = "Chapter 1", Page = "5", Level = 0 }
        };
    }

    /// <summary>Prints the length and content of an export result, then the end marker.</summary>
    private static void WriteResultBody(string result)
    {
        System.Console.WriteLine($"Length: {result.Length}");
        System.Console.WriteLine($"Content: '{result}'");
        System.Console.WriteLine("=== End ===");
    }

    [Fact]
    public void Debug_XmlExporter_ActualOutput()
    {
        // Arrange
        var exporter = new XmlExporter();
        var tocItems = SingleChapter();

        // Act
        try
        {
            System.Console.WriteLine("=== Starting XML Export ===");
            System.Console.WriteLine($"TocItems count: {tocItems.Count}");
            System.Console.WriteLine($"First item: Title='{tocItems[0].Title}', Page='{tocItems[0].Page}', Level={tocItems[0].Level}");

            var result = exporter.Export(tocItems);

            // Assert - print the actual result for debugging
            System.Console.WriteLine("=== XML Export Result ===");
            WriteResultBody(result);

            Assert.NotNull(result);
            // The exporter writes each title into a <Title> element.
            Assert.Contains("Chapter 1", result);
        }
        catch (Exception ex)
        {
            // Print the failure details before letting the test fail.
            System.Console.WriteLine($"=== XML Export Exception ===");
            System.Console.WriteLine($"Type: {ex.GetType().Name}");
            System.Console.WriteLine($"Message: {ex.Message}");
            System.Console.WriteLine($"StackTrace: {ex.StackTrace}");
            System.Console.WriteLine("=== End ===");
            throw;
        }
    }

    [Fact]
    public void Debug_XmlExporter_StepByStep()
    {
        // Walks through the XML export one step at a time so the failing step is visible.
        try
        {
            System.Console.WriteLine("=== Step by Step XML Export ===");

            // Step 1: Create exporter
            var exporter = new XmlExporter();
            System.Console.WriteLine("Step 1: Exporter created");

            // Step 2: Create simple item
            var item = new TocItem { Title = "Test", Page = "1", Level = 0 };
            System.Console.WriteLine($"Step 2: Item created - Title: '{item.Title}'");

            // Step 3: Test GetFullPath
            var fullPath = item.GetFullPath();
            System.Console.WriteLine($"Step 3: GetFullPath result: '{fullPath}'");

            // Step 4: Create list
            var items = new List<TocItem> { item };
            System.Console.WriteLine($"Step 4: List created with {items.Count} items");

            // Step 5: Export
            var result = exporter.Export(items);
            System.Console.WriteLine($"Step 5: Export completed, length: {result.Length}");

            if (result.Length > 0)
            {
                System.Console.WriteLine($"Content: {result}");
            }
            else
            {
                System.Console.WriteLine("Result is empty!");
            }

            System.Console.WriteLine("=== End Step by Step ===");
        }
        catch (Exception ex)
        {
            System.Console.WriteLine($"Exception in step by step: {ex}");
            throw;
        }
    }

    [Fact]
    public void Debug_MarkdownExporter_ActualOutput()
    {
        // Arrange
        var exporter = new MarkdownExporter();
        var tocItems = SingleChapter();

        // Act
        var result = exporter.Export(tocItems);

        // Assert - print the actual result for debugging
        System.Console.WriteLine("=== Markdown Export Result ===");
        WriteResultBody(result);

        Assert.NotNull(result);
        Assert.Contains("Chapter 1", result);
    }

    [Fact]
    public void Debug_JsonExporter_ActualOutput()
    {
        // Arrange
        var exporter = new JsonExporter();
        var tocItems = SingleChapter();

        // Act
        var result = exporter.Export(tocItems);

        // Assert - print the actual result for debugging
        System.Console.WriteLine("=== JSON Export Result ===");
        WriteResultBody(result);

        Assert.NotNull(result);
        Assert.NotEmpty(result);
    }

    [Fact]
    public void Debug_MarkdownExporter_MaxDepth()
    {
        // Arrange: three nested levels (0, 1, 2) with a depth limit of 1.
        var exporter = new MarkdownExporter();
        var tocItems = new List<TocItem>
        {
            new TocItem
            {
                Title = "Chapter 1",
                Page = "5",
                Level = 0,
                Children = new List<TocItem>
                {
                    new TocItem
                    {
                        Title = "Section 1.1",
                        Page = "6",
                        Level = 1,
                        Children = new List<TocItem>
                        {
                            new TocItem { Title = "Subsection 1.1.1", Page = "7", Level = 2 }
                        }
                    }
                }
            }
        };
        var options = new ExportOptions { MaxDepth = 1 };

        // Act
        var result = exporter.Export(tocItems, options);

        // Assert - print the actual result for debugging
        System.Console.WriteLine("=== Markdown MaxDepth Test ===");
        System.Console.WriteLine($"MaxDepth: {options.MaxDepth}");
        WriteResultBody(result);

        Assert.NotNull(result);
        // Items with Level <= MaxDepth are kept; the level-2 item is dropped.
        Assert.Contains("Section 1.1", result);
        Assert.DoesNotContain("Subsection 1.1.1", result);
    }
}
-------------------------------------------------------------------------------- /tests/PdfTocExtractor.Tests/Exporters/ExportOptionsTests.cs: -------------------------------------------------------------------------------- 1 | using FluentAssertions; 2 | using PdfTocExtractor.Exporters; 3 | using System.Text; 4 | using Xunit; 5 | 6 | namespace PdfTocExtractor.Tests.Exporters; 7 | 8 | public class ExportOptionsTests 9 | { 10 | [Fact] 11 | public void Constructor_ShouldInitializeWithDefaultValues() 12 | { 13 | // Arrange & Act 14 | var options = new ExportOptions(); 15 | 16 | // Assert 17 | options.IndentString.Should().Be(" "); 18 | options.IncludePageNumbers.Should().BeTrue(); 19 | options.IncludeLinks.Should().BeFalse(); 20 | 
/// <summary>
/// Verifies that <c>ExportOptions.MaxDepth</c> returns exactly the value it was given,
/// for zero and for several positive depths.
/// </summary>
[Theory]
[InlineData(0)]
[InlineData(1)]
[InlineData(3)]
[InlineData(10)]
public void MaxDepth_ShouldSetAndGetCorrectly(int depth)
{
    // Arrange & Act: the initializer runs the property setter right after construction.
    var sut = new ExportOptions { MaxDepth = depth };

    // Assert
    sut.MaxDepth.Should().Be(depth);
}
/// <summary>
/// Sets every <c>ExportOptions</c> property to a non-default value and verifies that each
/// one reads back unchanged, i.e. that setting one property does not disturb another.
/// </summary>
[Fact]
public void AllProperties_ShouldBeSettableIndependently()
{
    // Arrange & Act: assign all seven properties in the same order as before.
    var configured = new ExportOptions
    {
        IndentString = "\t",
        IncludePageNumbers = false,
        IncludeLinks = true,
        MaxDepth = 5,
        PageNumberFormat = "Page {0}",
        CustomTitle = "Test Document",
        Encoding = Encoding.Unicode
    };

    // Assert
    configured.IndentString.Should().Be("\t");
    configured.IncludePageNumbers.Should().BeFalse();
    configured.IncludeLinks.Should().BeTrue();
    configured.MaxDepth.Should().Be(5);
    configured.PageNumberFormat.Should().Be("Page {0}");
    configured.CustomTitle.Should().Be("Test Document");
    configured.Encoding.Should().Be(Encoding.Unicode);
}
/// <summary>
/// Builds the <c>extract</c> sub-command: one required input argument plus the options
/// that control output location, format and export formatting.
/// </summary>
/// <remarks>
/// The handler forwards the parsed values to <c>ExecuteExtractCommand</c>. Any exception
/// is reported as a red one-line message and turned into exit code 1.
/// </remarks>
/// <returns>The configured command.</returns>
public static Command Create()
{
    var inputArgument = new Argument<FileInfo>("input", "PDF文件路径")
    {
        Arity = ArgumentArity.ExactlyOne
    };

    var outputOption = new Option<FileInfo?>(
        aliases: ["-o", "--output"],
        description: "输出文件路径(如果未指定,将使用输入文件名加上相应扩展名)");

    var formatOption = new Option<string?>(
        aliases: ["-f", "--format"],
        description: "输出格式 (markdown, json, xml, text)。如果未指定,将根据输出文件扩展名推断");

    var maxDepthOption = new Option<int>(
        aliases: ["--max-depth"],
        description: "最大层级深度(0表示无限制)",
        getDefaultValue: () => 0);

    var includePageNumbersOption = new Option<bool>(
        aliases: ["--include-pages"],
        description: "是否包含页码信息",
        getDefaultValue: () => true);

    var includeLinksOption = new Option<bool>(
        aliases: ["--include-links"],
        description: "是否包含链接(如果格式支持)",
        getDefaultValue: () => false);

    var customTitleOption = new Option<string?>(
        aliases: ["--title"],
        description: "自定义文档标题");

    // NOTE(review): the indent default is reproduced as it appears in this view of the
    // file; confirm it matches the ExportOptions.IndentString default.
    var indentOption = new Option<string>(
        aliases: ["--indent"],
        description: "缩进字符串",
        getDefaultValue: () => " ");

    var pageFormatOption = new Option<string>(
        aliases: ["--page-format"],
        description: "页码格式化字符串",
        getDefaultValue: () => "第 {0} 页");

    var verboseOption = new Option<bool>(
        aliases: ["-v", "--verbose"],
        description: "显示详细输出");

    var command = new Command("extract", "从PDF文件提取目录")
    {
        inputArgument,
        outputOption,
        formatOption,
        maxDepthOption,
        includePageNumbersOption,
        includeLinksOption,
        customTitleOption,
        indentOption,
        pageFormatOption,
        verboseOption
    };

    command.SetHandler(async context =>
    {
        try
        {
            var parsed = context.ParseResult;

            // Read every value first so a binding failure surfaces before any work starts.
            var input = parsed.GetValueForArgument(inputArgument);
            var output = parsed.GetValueForOption(outputOption);
            var format = parsed.GetValueForOption(formatOption);
            var maxDepth = parsed.GetValueForOption(maxDepthOption);
            var includePages = parsed.GetValueForOption(includePageNumbersOption);
            var includeLinks = parsed.GetValueForOption(includeLinksOption);
            var customTitle = parsed.GetValueForOption(customTitleOption);
            // The fallbacks repeat the option defaults in case the binder yields null.
            var indent = parsed.GetValueForOption(indentOption) ?? " ";
            var pageFormat = parsed.GetValueForOption(pageFormatOption) ?? "第 {0} 页";
            var verbose = parsed.GetValueForOption(verboseOption);

            await ExecuteExtractCommand(
                input, output, format, maxDepth, includePages,
                includeLinks, customTitle, indent, pageFormat, verbose);
        }
        catch (Exception ex)
        {
            // Show a short message instead of a stack trace and signal failure to the shell.
            Console.ForegroundColor = ConsoleColor.Red;
            Console.WriteLine($"错误: {ex.Message}");
            Console.ResetColor();
            context.ExitCode = 1;
        }
    });

    return command;
}
/// <summary>
/// Resolves the output file and export format from the optional <c>--output</c> and
/// <c>--format</c> values.
/// </summary>
/// <param name="input">The source PDF; its path is the basis of the generated output path.</param>
/// <param name="output">Explicit output file, or <c>null</c> to derive one from <paramref name="input"/>.</param>
/// <param name="format">Explicit format name, or <c>null</c>/blank to infer it.</param>
/// <returns>
/// The output file and the format name. With an explicit output file the format falls back
/// to the file's extension, then to <c>"md"</c>. Without one, the input path is reused with
/// the extension that corresponds to the format (default <c>"md"</c>).
/// </returns>
private static (FileInfo outputFile, string format) DetermineOutputFileAndFormat(FileInfo input, FileInfo? output, string? format)
{
    const string DefaultFormat = "md"; // markdown

    // A blank --format value carries no information; treat it as "not specified" so it can
    // neither suppress extension inference nor produce an empty file extension.
    var requested = string.IsNullOrWhiteSpace(format) ? null : format.Trim();

    if (output is not null)
    {
        var inferred = requested ?? Path.GetExtension(output.FullName).TrimStart('.');
        return (output, string.IsNullOrEmpty(inferred) ? DefaultFormat : inferred);
    }

    var outputFormat = requested ?? DefaultFormat;

    // Match known format names case-insensitively so "JSON" or "Markdown" still map to the
    // canonical lower-case extension. Unknown formats are used as the extension verbatim.
    var extension = outputFormat.ToLowerInvariant() switch
    {
        "markdown" or "md" => "md",
        "json" => "json",
        "xml" => "xml",
        "text" or "txt" => "txt",
        _ => outputFormat
    };

    var outputPath = Path.ChangeExtension(input.FullName, extension);
    return (new FileInfo(outputPath), outputFormat);
}
/// <summary>
/// Runs the <c>smart</c> extraction: tries the PDF's bookmarks first and, when the
/// extractor reports that the file has none, falls back to semantic analysis. The result
/// is written to <paramref name="output"/> or, when no file is given, to the console.
/// </summary>
/// <param name="input">PDF file to read.</param>
/// <param name="output">Destination file, or <c>null</c> to print to the console.</param>
/// <param name="format">Export format name; console output defaults to <c>"text"</c>.</param>
/// <param name="maxDepth">Forwarded to <c>ExportOptions.MaxDepth</c>.</param>
/// <param name="includePages">Forwarded to <c>ExportOptions.IncludePageNumbers</c>.</param>
/// <param name="includeLinks">Forwarded to <c>ExportOptions.IncludeLinks</c>.</param>
/// <param name="customTitle">Forwarded to <c>ExportOptions.CustomTitle</c>.</param>
/// <param name="indent">Forwarded to <c>ExportOptions.IndentString</c>.</param>
/// <param name="pageFormat">Forwarded to <c>ExportOptions.PageNumberFormat</c>.</param>
/// <param name="verbose">When true, progress and item counts are printed.</param>
/// <exception cref="FileNotFoundException">The input file does not exist.</exception>
private static async Task ExecuteSmartCommand(
    FileInfo input,
    FileInfo? output,
    string? format,
    int maxDepth,
    bool includePages,
    bool includeLinks,
    string? customTitle,
    string indent,
    string pageFormat,
    bool verbose)
{
    if (!input.Exists)
    {
        throw new FileNotFoundException($"输入文件不存在: {input.FullName}");
    }

    if (verbose)
    {
        Console.WriteLine($"正在智能处理PDF文件: {input.FullName}");
    }

    // Counts every entry in the tree: each top-level item plus all of its descendants.
    static int CountAllItems(List<TocItem> items) =>
        items.Sum(item => 1 + item.GetAllDescendants().Count());

    var extractor = new PdfTocExtractor();
    List<TocItem> tocItems;

    try
    {
        // First attempt: bookmark (outline) extraction.
        if (verbose)
        {
            Console.WriteLine("尝试提取PDF书签...");
        }

        tocItems = await extractor.ExtractTocAsync(input.FullName);

        if (verbose)
        {
            Console.WriteLine("成功提取PDF书签");
            Console.WriteLine($"成功提取 {tocItems.Count} 个顶级目录项");
            Console.WriteLine($"总共 {CountAllItems(tocItems)} 个目录项");
        }
    }
    // The fallback is triggered by matching the exception message text, so it depends on
    // the extractor keeping this exact wording.
    catch (InvalidOperationException ex) when (ex.Message.Contains("没有目录(书签)信息"))
    {
        if (verbose)
        {
            Console.WriteLine("PDF文件没有书签信息,切换到语义分析模式...");
        }

        try
        {
            // Second attempt: semantic analysis with the default options.
            var semanticOptions = SemanticAnalysisOptions.Default;
            tocItems = await extractor.ExtractTocSemanticAsync(input.FullName, semanticOptions);

            if (verbose)
            {
                Console.WriteLine("语义分析完成");
                Console.WriteLine($"成功识别 {tocItems.Count} 个顶级目录项");
                Console.WriteLine($"总共 {CountAllItems(tocItems)} 个目录项");
            }
        }
        catch (Exception semanticEx)
        {
            // Both strategies failed: explain and stop without exporting anything.
            Console.WriteLine("错误: PDF文件没有书签信息,语义分析也失败了");
            Console.WriteLine($"语义分析错误: {semanticEx.Message}");
            Console.WriteLine("建议:");
            Console.WriteLine(" - 使用 'semantic' 命令进行更精细的语义分析");
            Console.WriteLine(" - 尝试调整语义分析参数");
            return;
        }
    }

    if (tocItems.Count == 0)
    {
        Console.WriteLine("警告: 未能提取到任何目录信息");
        return;
    }

    var exportOptions = new ExportOptions
    {
        MaxDepth = maxDepth,
        IncludePageNumbers = includePages,
        IncludeLinks = includeLinks,
        CustomTitle = customTitle,
        IndentString = indent,
        PageNumberFormat = pageFormat
    };

    if (output != null)
    {
        // NOTE(review): format may be null here; presumably ExportToFileAsync infers it
        // from the output file extension - confirm against its signature.
        await extractor.ExportToFileAsync(tocItems, output.FullName, format, exportOptions);

        if (verbose)
        {
            Console.WriteLine($"目录已导出到: {output.FullName}");
        }
    }
    else
    {
        // No output file: render to a string and print it to the console.
        var outputFormat = format ?? "text";
        var result = extractor.ExportToString(tocItems, outputFormat, exportOptions);
        Console.WriteLine(result);
    }
}
# Step 3: build native (AOT) executables for each platform.
# One matrix entry per platform supplies the runner image, the .NET runtime identifier
# and the archive extension used when packaging.
  build-executables:
    needs: publish-nuget # run only after the NuGet packages have been published
    strategy:
      fail-fast: false
      matrix:
        kind: ['windows', 'linux', 'macOS']
        include:
          - kind: windows
            os: windows-latest
            target: win-x64
            extension: '.zip'
          - kind: linux
            os: ubuntu-latest
            target: linux-x64
            extension: '.tar.gz'
          - kind: macOS
            os: macos-latest
            target: osx-x64
            extension: '.tar.gz'

    runs-on: ${{ matrix.os }}

    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0 # fetch the full history for version calculation

      # Strip the "refs/tags/v" prefix from the pushed tag ref to get the bare version.
      - name: 提取版本号
        id: get_version
        shell: bash
        run: echo "VERSION=${GITHUB_REF#refs/tags/v}" >> $GITHUB_OUTPUT

      - name: Setup .NET
        uses: actions/setup-dotnet@v4
        with:
          dotnet-version: 8.0.x

      # Cache the NuGet package folder, keyed on the hash of any packages.lock.json files.
      - name: 缓存NuGet包
        uses: actions/cache@v4
        with:
          path: ~/.nuget/packages
          key: ${{ runner.os }}-nuget-${{ hashFiles('**/packages.lock.json') }}
          restore-keys: |
            ${{ runner.os }}-nuget-

      # Linux only: install clang and the zlib/krb5 development packages before publishing.
      - name: 安装Linux依赖
        if: matrix.kind == 'linux'
        run: |
          sudo apt-get update
          sudo apt-get install -y clang zlib1g-dev libkrb5-dev

      # Windows only: install the Visual Studio 2022 build tools via Chocolatey.
      - name: 设置Windows环境
        if: matrix.kind == 'windows'
        shell: pwsh
        run: |
          Write-Host "设置Windows编译环境..."
          # 确保有最新的开发者工具
          choco install visualstudio2022buildtools -y --no-progress

      - name: 恢复依赖
        run: dotnet restore

      # Publish the CLI as a self-contained AOT binary into ./publish/<kind>.
      - name: AOT编译
        run: |
          echo "正在为 ${{ matrix.kind }} 平台进行AOT编译..."
          dotnet publish src/PdfTocExtractor.Cli/PdfTocExtractor.Cli.csproj -c Release -r ${{ matrix.target }} --self-contained true -p:PublishAot=true -p:Version=${{ steps.get_version.outputs.VERSION }} -o ./publish/${{ matrix.kind }}

      # Windows output is packaged as a zip archive.
      - name: 打包Windows可执行文件
        if: matrix.kind == 'windows'
        run: |
          cd ./publish/${{ matrix.kind }}
          7z a -tzip ../../PdfTocExtractor-${{ matrix.kind }}-${{ steps.get_version.outputs.VERSION }}${{ matrix.extension }} *

      # Linux and macOS output is packaged as a gzip-compressed tarball.
      - name: 打包Linux/macOS可执行文件
        if: matrix.kind != 'windows'
        run: |
          cd ./publish/${{ matrix.kind }}
          tar -czvf ../../PdfTocExtractor-${{ matrix.kind }}-${{ steps.get_version.outputs.VERSION }}${{ matrix.extension }} *

      # Upload the packaged archive as a workflow artifact (kept for one day).
      - name: 上传构建产物
        uses: actions/upload-artifact@v4
        with:
          name: PdfTocExtractor-${{ matrix.kind }}-${{ steps.get_version.outputs.VERSION }}
          path: ./PdfTocExtractor-${{ matrix.kind }}-${{ steps.get_version.outputs.VERSION }}${{ matrix.extension }}
          retention-days: 1
/// <summary>
/// Prints the first bytes of the file (as ASCII and as hex) and reports whether they
/// start with the <c>%PDF-</c> signature.
/// </summary>
/// <param name="input">The file to inspect.</param>
/// <remarks>
/// Read failures are reported on the console and do not propagate. Only the bytes actually
/// read are printed, so a file shorter than eight bytes is not padded with NUL bytes.
/// </remarks>
private static async Task DiagnoseFileBasics(FileInfo input)
{
    Console.WriteLine("=== 基本文件信息 ===");

    try
    {
        await using var fs = new FileStream(input.FullName, FileMode.Open, FileAccess.Read);

        // A single Read call may return fewer bytes than requested. ReadAtLeastAsync keeps
        // reading until the buffer is full or the stream ends, and reports the real count.
        var header = new byte[8];
        var bytesRead = await fs.ReadAtLeastAsync(header, header.Length, throwOnEndOfStream: false);

        var headerString = Encoding.ASCII.GetString(header, 0, bytesRead);
        Console.WriteLine($"文件头: {headerString}");
        Console.WriteLine($"文件头(十六进制): {Convert.ToHexString(header, 0, bytesRead)}");

        // The signature is a byte-level marker, so compare ordinally rather than by culture.
        if (headerString.StartsWith("%PDF-", StringComparison.Ordinal))
        {
            Console.WriteLine("✓ 文件头格式正确");
        }
        else
        {
            Console.WriteLine("✗ 文件头格式不正确,可能不是有效的PDF文件");
        }
    }
    catch (Exception ex)
    {
        // Diagnostics are best-effort: report the failure and let the other checks run.
        Console.WriteLine($"✗ 读取文件头失败: {ex.Message}");
    }

    Console.WriteLine();
}
/// <summary>
/// Tries to open the PDF in four different ways (plain, empty password, null password,
/// unethical-reading mode) and prints for each one whether a <c>PdfDocument</c> could be
/// created. Failures are reported on the console and never propagate.
/// </summary>
/// <param name="input">The PDF file to probe.</param>
/// <returns>An already-completed task; all work is done synchronously.</returns>
private static Task TryDifferentReadingMethods(FileInfo input)
{
    Console.WriteLine("=== 尝试不同的读取方式 ===");

    // Runs one probe: print the heading, open the reader and the document, report the
    // outcome. The reader factory runs inside the try so that setup errors count as failures.
    void Probe(string heading, string successMessage, string failurePrefix, Func<PdfReader> openReader)
    {
        Console.WriteLine(heading);
        try
        {
            using var reader = openReader();
            using var pdfDoc = new PdfDocument(reader);
            Console.WriteLine(successMessage);
        }
        catch (Exception ex)
        {
            Console.WriteLine($"{failurePrefix}: {ex.GetType().Name}: {ex.Message}");
        }
    }

    var path = input.FullName;

    // Method 1: standard read.
    Probe("1. 标准读取方式:", "✓ 标准读取成功", "✗ 标准读取失败",
        () => new PdfReader(path));

    // Method 2: empty password.
    Probe("2. 使用空密码:", "✓ 空密码读取成功", "✗ 空密码读取失败",
        () => new PdfReader(path, new ReaderProperties().SetPassword(new byte[0])));

    // Method 2.5: null password.
    Probe("2.5. 使用null密码:", "✓ null密码读取成功", "✗ null密码读取失败",
        () => new PdfReader(path, new ReaderProperties().SetPassword(null)));

    // Method 3: SetUnethicalReading(true).
    Probe("3. 使用SetUnethicalReading:", "✓ SetUnethicalReading读取成功", "✗ SetUnethicalReading读取失败",
        () => new PdfReader(path).SetUnethicalReading(true));

    Console.WriteLine();
    return Task.CompletedTask;
}
/// <summary>
/// Asynchronous wrapper around <c>ExtractToc</c>: queues the synchronous extraction to
/// the thread pool with <c>Task.Run</c>.
/// </summary>
/// <param name="pdfPath">Path of the PDF file to analyse.</param>
/// <returns>A task that yields the extracted top-level table-of-contents items.</returns>
public Task<List<TocItem>> ExtractTocAsync(string pdfPath)
    => Task.Run(() => ExtractToc(pdfPath));
1; 104 | 105 | var tocItem = new TocItem 106 | { 107 | Title = CleanHeadingText(heading.Text), 108 | Page = heading.PageNumber.ToString(), 109 | Level = level - 1, // 转换为0基索引 110 | Children = new List() 111 | }; 112 | 113 | // 建立层级关系 114 | while (levelStack.Count > 0 && levelStack.Peek().Level >= tocItem.Level) 115 | { 116 | levelStack.Pop(); 117 | } 118 | 119 | if (levelStack.Count == 0) 120 | { 121 | tocItems.Add(tocItem); 122 | } 123 | else 124 | { 125 | var parent = levelStack.Peek(); 126 | tocItem.Parent = parent; 127 | parent.Children.Add(tocItem); 128 | } 129 | 130 | levelStack.Push(tocItem); 131 | } 132 | 133 | return tocItems; 134 | } 135 | 136 | /// 137 | /// 清理标题文本 138 | /// 139 | private string CleanHeadingText(string text) 140 | { 141 | if (string.IsNullOrWhiteSpace(text)) 142 | return string.Empty; 143 | 144 | // 移除多余的空白字符 145 | text = text.Trim(); 146 | text = System.Text.RegularExpressions.Regex.Replace(text, @"\s+", " "); 147 | 148 | return text; 149 | } 150 | 151 | /// 152 | /// 获取配置描述 153 | /// 154 | private string GetOptionsDescription() 155 | { 156 | return $"置信度阈值: {_options.MinConfidenceThreshold:F2}, " + 157 | $"字体倍数: {_options.FontSizeMultiplier:F2}, " + 158 | $"跳过页面: [{string.Join(",", _options.SkipPages)}]"; 159 | } 160 | 161 | /// 162 | /// 打印目录预览 163 | /// 164 | private void PrintTocPreview(List items, int currentLevel, int maxItems) 165 | { 166 | var count = 0; 167 | foreach (var item in items) 168 | { 169 | if (count >= maxItems) 170 | { 171 | Console.WriteLine($"{new string(' ', currentLevel * 2)}... 
还有 {items.Count - count} 个项目"); 172 | break; 173 | } 174 | 175 | var indent = new string(' ', currentLevel * 2); 176 | Console.WriteLine($"{indent}- [{item.Level}] {item.Title} (第 {item.Page} 页)"); 177 | 178 | if (item.Children.Any() && currentLevel < 2) 179 | { 180 | PrintTocPreview(item.Children, currentLevel + 1, 3); 181 | } 182 | 183 | count++; 184 | } 185 | } 186 | 187 | /// 188 | /// 获取分析统计信息 189 | /// 190 | public SemanticAnalysisStatistics GetAnalysisStatistics(string pdfPath) 191 | { 192 | try 193 | { 194 | var fragments = _textExtractor.ExtractTextFragments(pdfPath, _options); 195 | var headings = _headingAnalyzer.AnalyzeHeadings(fragments); 196 | 197 | var stats = new SemanticAnalysisStatistics 198 | { 199 | TotalTextFragments = fragments.Count, 200 | IdentifiedHeadings = headings.Count, 201 | AverageConfidence = headings.Any() ? 202 | headings.Average(h => h.SemanticResult?.HeadingConfidence ?? 0) : 0, 203 | HeadingsByLevel = headings 204 | .GroupBy(h => h.SemanticResult?.EstimatedLevel ?? 0) 205 | .ToDictionary(g => g.Key, g => g.Count()), 206 | AverageFontSize = fragments.Any() ? 
fragments.Average(f => f.FontSize) : 0, 207 | BoldTextCount = fragments.Count(f => f.IsBold) 208 | }; 209 | 210 | return stats; 211 | } 212 | catch (Exception ex) 213 | { 214 | throw new InvalidOperationException($"获取分析统计信息时发生错误: {ex.Message}", ex); 215 | } 216 | } 217 | } 218 | 219 | /// 220 | /// 语义分析统计信息 221 | /// 222 | public class SemanticAnalysisStatistics 223 | { 224 | public int TotalTextFragments { get; set; } 225 | public int IdentifiedHeadings { get; set; } 226 | public float AverageConfidence { get; set; } 227 | public Dictionary HeadingsByLevel { get; set; } = new(); 228 | public float AverageFontSize { get; set; } 229 | public int BoldTextCount { get; set; } 230 | } 231 | -------------------------------------------------------------------------------- /tests/PdfTocExtractor.Tests/Models/TocItemTests.cs: -------------------------------------------------------------------------------- 1 | using FluentAssertions; 2 | using PdfTocExtractor.Models; 3 | using Xunit; 4 | 5 | namespace PdfTocExtractor.Tests.Models; 6 | 7 | public class TocItemTests 8 | { 9 | [Fact] 10 | public void Constructor_ShouldInitializeWithDefaultValues() 11 | { 12 | // Arrange & Act 13 | var tocItem = new TocItem(); 14 | 15 | // Assert 16 | tocItem.Title.Should().Be(string.Empty); 17 | tocItem.Page.Should().Be(string.Empty); 18 | tocItem.Level.Should().Be(0); 19 | tocItem.Children.Should().NotBeNull().And.BeEmpty(); 20 | tocItem.Parent.Should().BeNull(); 21 | tocItem.HasChildren.Should().BeFalse(); 22 | } 23 | 24 | [Fact] 25 | public void Properties_ShouldSetAndGetCorrectly() 26 | { 27 | // Arrange 28 | var tocItem = new TocItem(); 29 | var parent = new TocItem { Title = "Parent" }; 30 | var child = new TocItem { Title = "Child" }; 31 | 32 | // Act 33 | tocItem.Title = "Test Title"; 34 | tocItem.Page = "5"; 35 | tocItem.Level = 2; 36 | tocItem.Parent = parent; 37 | tocItem.Children.Add(child); 38 | 39 | // Assert 40 | tocItem.Title.Should().Be("Test Title"); 41 | 
tocItem.Page.Should().Be("5"); 42 | tocItem.Level.Should().Be(2); 43 | tocItem.Parent.Should().Be(parent); 44 | tocItem.Children.Should().HaveCount(1).And.Contain(child); 45 | tocItem.HasChildren.Should().BeTrue(); 46 | } 47 | 48 | [Theory] 49 | [InlineData("5", 5)] 50 | [InlineData("10", 10)] 51 | [InlineData("1", 1)] 52 | [InlineData("100", 100)] 53 | public void PageNumber_ShouldReturnCorrectIntegerValue_WhenPageIsValidNumber(string page, int expected) 54 | { 55 | // Arrange 56 | var tocItem = new TocItem { Page = page }; 57 | 58 | // Act 59 | var result = tocItem.PageNumber; 60 | 61 | // Assert 62 | result.Should().Be(expected); 63 | } 64 | 65 | [Theory] 66 | [InlineData("5 XYZ 123 456", 5)] 67 | [InlineData("10 ABC DEF", 10)] 68 | [InlineData("1 Some Additional Info", 1)] 69 | public void PageNumber_ShouldReturnFirstNumber_WhenPageContainsSpaces(string page, int expected) 70 | { 71 | // Arrange 72 | var tocItem = new TocItem { Page = page }; 73 | 74 | // Act 75 | var result = tocItem.PageNumber; 76 | 77 | // Assert 78 | result.Should().Be(expected); 79 | } 80 | 81 | [Theory] 82 | [InlineData("")] 83 | [InlineData("无页码")] 84 | [InlineData("N/A")] 85 | [InlineData("abc")] 86 | [InlineData("invalid")] 87 | public void PageNumber_ShouldReturnZero_WhenPageIsInvalid(string page) 88 | { 89 | // Arrange 90 | var tocItem = new TocItem { Page = page }; 91 | 92 | // Act 93 | var result = tocItem.PageNumber; 94 | 95 | // Assert 96 | result.Should().Be(0); 97 | } 98 | 99 | [Fact] 100 | public void HasChildren_ShouldReturnTrue_WhenChildrenExist() 101 | { 102 | // Arrange 103 | var tocItem = new TocItem(); 104 | var child = new TocItem { Title = "Child" }; 105 | 106 | // Act 107 | tocItem.Children.Add(child); 108 | 109 | // Assert 110 | tocItem.HasChildren.Should().BeTrue(); 111 | } 112 | 113 | [Fact] 114 | public void HasChildren_ShouldReturnFalse_WhenNoChildren() 115 | { 116 | // Arrange 117 | var tocItem = new TocItem(); 118 | 119 | // Act & Assert 120 | 
tocItem.HasChildren.Should().BeFalse(); 121 | } 122 | 123 | [Fact] 124 | public void GetAllDescendants_ShouldReturnEmptyCollection_WhenNoChildren() 125 | { 126 | // Arrange 127 | var tocItem = new TocItem { Title = "Root" }; 128 | 129 | // Act 130 | var descendants = tocItem.GetAllDescendants().ToList(); 131 | 132 | // Assert 133 | descendants.Should().BeEmpty(); 134 | } 135 | 136 | [Fact] 137 | public void GetAllDescendants_ShouldReturnDirectChildren_WhenOnlyDirectChildren() 138 | { 139 | // Arrange 140 | var root = new TocItem { Title = "Root" }; 141 | var child1 = new TocItem { Title = "Child1" }; 142 | var child2 = new TocItem { Title = "Child2" }; 143 | 144 | root.Children.Add(child1); 145 | root.Children.Add(child2); 146 | 147 | // Act 148 | var descendants = root.GetAllDescendants().ToList(); 149 | 150 | // Assert 151 | descendants.Should().HaveCount(2); 152 | descendants.Should().Contain(child1); 153 | descendants.Should().Contain(child2); 154 | } 155 | 156 | [Fact] 157 | public void GetAllDescendants_ShouldReturnAllDescendants_WhenNestedChildren() 158 | { 159 | // Arrange 160 | var root = new TocItem { Title = "Root" }; 161 | var child1 = new TocItem { Title = "Child1" }; 162 | var child2 = new TocItem { Title = "Child2" }; 163 | var grandchild1 = new TocItem { Title = "Grandchild1" }; 164 | var grandchild2 = new TocItem { Title = "Grandchild2" }; 165 | 166 | root.Children.Add(child1); 167 | root.Children.Add(child2); 168 | child1.Children.Add(grandchild1); 169 | child2.Children.Add(grandchild2); 170 | 171 | // Act 172 | var descendants = root.GetAllDescendants().ToList(); 173 | 174 | // Assert 175 | descendants.Should().HaveCount(4); 176 | descendants.Should().Contain(child1); 177 | descendants.Should().Contain(child2); 178 | descendants.Should().Contain(grandchild1); 179 | descendants.Should().Contain(grandchild2); 180 | } 181 | 182 | [Fact] 183 | public void GetFullPath_ShouldReturnSingleTitle_WhenNoParent() 184 | { 185 | // Arrange 186 | var tocItem = 
new TocItem { Title = "Root" }; 187 | 188 | // Act 189 | var path = tocItem.GetFullPath(); 190 | 191 | // Assert 192 | path.Should().Be("Root"); 193 | } 194 | 195 | [Fact] 196 | public void GetFullPath_ShouldReturnFullPath_WhenHasParents() 197 | { 198 | // Arrange 199 | var root = new TocItem { Title = "Root" }; 200 | var child = new TocItem { Title = "Child", Parent = root }; 201 | var grandchild = new TocItem { Title = "Grandchild", Parent = child }; 202 | 203 | // Act 204 | var path = grandchild.GetFullPath(); 205 | 206 | // Assert 207 | path.Should().Be("Root > Child > Grandchild"); 208 | } 209 | 210 | [Fact] 211 | public void GetFullPath_ShouldUseCustomSeparator_WhenProvided() 212 | { 213 | // Arrange 214 | var root = new TocItem { Title = "Root" }; 215 | var child = new TocItem { Title = "Child", Parent = root }; 216 | var grandchild = new TocItem { Title = "Grandchild", Parent = child }; 217 | 218 | // Act 219 | var path = grandchild.GetFullPath(" / "); 220 | 221 | // Assert 222 | path.Should().Be("Root / Child / Grandchild"); 223 | } 224 | 225 | [Fact] 226 | public void ToString_ShouldReturnFormattedString() 227 | { 228 | // Arrange 229 | var tocItem = new TocItem 230 | { 231 | Title = "Test Title", 232 | Page = "5", 233 | Level = 2 234 | }; 235 | 236 | // Act 237 | var result = tocItem.ToString(); 238 | 239 | // Assert 240 | result.Should().Be(" - Test Title (第 5 页)"); 241 | } 242 | 243 | [Fact] 244 | public void ToString_ShouldIndentCorrectly_BasedOnLevel() 245 | { 246 | // Arrange 247 | var level0 = new TocItem { Title = "Level 0", Page = "1", Level = 0 }; 248 | var level1 = new TocItem { Title = "Level 1", Page = "2", Level = 1 }; 249 | var level2 = new TocItem { Title = "Level 2", Page = "3", Level = 2 }; 250 | 251 | // Act & Assert 252 | level0.ToString().Should().Be("- Level 0 (第 1 页)"); 253 | level1.ToString().Should().Be(" - Level 1 (第 2 页)"); 254 | level2.ToString().Should().Be(" - Level 2 (第 3 页)"); 255 | } 256 | } 257 | 
-------------------------------------------------------------------------------- /tests/PdfTocExtractor.Tests/TestData/MockHelpers.cs: -------------------------------------------------------------------------------- 1 | using Moq; 2 | using PdfTocExtractor.Exporters; 3 | using PdfTocExtractor.Models; 4 | 5 | namespace PdfTocExtractor.Tests.TestData; 6 | 7 | /// 8 | /// 用于创建Mock对象的辅助类 9 | /// 10 | public static class MockHelpers 11 | { 12 | /// 13 | /// 创建Mock的IExporter 14 | /// 15 | public static Mock CreateMockExporter(string formatName = "Mock", string fileExtension = "mock") 16 | { 17 | var mock = new Mock(); 18 | 19 | mock.Setup(x => x.FormatName).Returns(formatName); 20 | mock.Setup(x => x.FileExtension).Returns(fileExtension); 21 | 22 | mock.Setup(x => x.Export(It.IsAny>(), It.IsAny())) 23 | .Returns((IEnumerable items, ExportOptions? options) => 24 | { 25 | var title = options?.CustomTitle ?? "Mock Export"; 26 | var itemCount = items.Count(); 27 | return $"{title}\nItems: {itemCount}"; 28 | }); 29 | 30 | mock.Setup(x => x.ExportToFileAsync( 31 | It.IsAny>(), 32 | It.IsAny(), 33 | It.IsAny())) 34 | .Returns((IEnumerable items, string filePath, ExportOptions? 
options) => 35 | { 36 | var content = mock.Object.Export(items, options); 37 | return File.WriteAllTextAsync(filePath, content); 38 | }); 39 | 40 | return mock; 41 | } 42 | 43 | /// 44 | /// 创建抛出异常的Mock IExporter 45 | /// 46 | public static Mock CreateFailingMockExporter(Exception exception) 47 | { 48 | var mock = new Mock(); 49 | 50 | mock.Setup(x => x.FormatName).Returns("FailingMock"); 51 | mock.Setup(x => x.FileExtension).Returns("fail"); 52 | 53 | mock.Setup(x => x.Export(It.IsAny>(), It.IsAny())) 54 | .Throws(exception); 55 | 56 | mock.Setup(x => x.ExportToFileAsync( 57 | It.IsAny>(), 58 | It.IsAny(), 59 | It.IsAny())) 60 | .ThrowsAsync(exception); 61 | 62 | return mock; 63 | } 64 | 65 | /// 66 | /// 创建验证调用的Mock IExporter 67 | /// 68 | public static Mock CreateVerifiableMockExporter() 69 | { 70 | var mock = new Mock(); 71 | 72 | mock.Setup(x => x.FormatName).Returns("Verifiable"); 73 | mock.Setup(x => x.FileExtension).Returns("verify"); 74 | 75 | mock.Setup(x => x.Export(It.IsAny>(), It.IsAny())) 76 | .Returns("Verified Export") 77 | .Verifiable(); 78 | 79 | mock.Setup(x => x.ExportToFileAsync( 80 | It.IsAny>(), 81 | It.IsAny(), 82 | It.IsAny())) 83 | .Returns(Task.CompletedTask) 84 | .Verifiable(); 85 | 86 | return mock; 87 | } 88 | 89 | /// 90 | /// 创建模拟文件系统操作的临时文件 91 | /// 92 | public static string CreateTempFile(string content = "") 93 | { 94 | var tempFile = Path.GetTempFileName(); 95 | if (!string.IsNullOrEmpty(content)) 96 | { 97 | File.WriteAllText(tempFile, content); 98 | } 99 | return tempFile; 100 | } 101 | 102 | /// 103 | /// 创建临时目录 104 | /// 105 | public static string CreateTempDirectory() 106 | { 107 | var tempDir = Path.Combine(Path.GetTempPath(), Guid.NewGuid().ToString()); 108 | Directory.CreateDirectory(tempDir); 109 | return tempDir; 110 | } 111 | 112 | /// 113 | /// 清理临时文件 114 | /// 115 | public static void CleanupTempFile(string filePath) 116 | { 117 | try 118 | { 119 | if (File.Exists(filePath)) 120 | { 121 | File.Delete(filePath); 122 | 
} 123 | } 124 | catch 125 | { 126 | // 忽略清理错误 127 | } 128 | } 129 | 130 | /// 131 | /// 清理临时目录 132 | /// 133 | public static void CleanupTempDirectory(string directoryPath) 134 | { 135 | try 136 | { 137 | if (Directory.Exists(directoryPath)) 138 | { 139 | Directory.Delete(directoryPath, true); 140 | } 141 | } 142 | catch 143 | { 144 | // 忽略清理错误 145 | } 146 | } 147 | 148 | /// 149 | /// 创建测试用的ExportOptions 150 | /// 151 | public static ExportOptions CreateTestExportOptions( 152 | string? customTitle = null, 153 | bool includePageNumbers = true, 154 | bool includeLinks = false, 155 | int maxDepth = 0, 156 | string indentString = " ", 157 | string pageNumberFormat = "第 {0} 页") 158 | { 159 | return new ExportOptions 160 | { 161 | CustomTitle = customTitle, 162 | IncludePageNumbers = includePageNumbers, 163 | IncludeLinks = includeLinks, 164 | MaxDepth = maxDepth, 165 | IndentString = indentString, 166 | PageNumberFormat = pageNumberFormat 167 | }; 168 | } 169 | 170 | /// 171 | /// 验证两个TocItem列表是否相等(用于测试) 172 | /// 173 | public static bool AreTocItemListsEqual(IEnumerable list1, IEnumerable list2) 174 | { 175 | var items1 = list1.ToList(); 176 | var items2 = list2.ToList(); 177 | 178 | if (items1.Count != items2.Count) 179 | return false; 180 | 181 | for (int i = 0; i < items1.Count; i++) 182 | { 183 | if (!AreTocItemsEqual(items1[i], items2[i])) 184 | return false; 185 | } 186 | 187 | return true; 188 | } 189 | 190 | /// 191 | /// 验证两个TocItem是否相等(递归比较) 192 | /// 193 | public static bool AreTocItemsEqual(TocItem item1, TocItem item2) 194 | { 195 | if (item1.Title != item2.Title || 196 | item1.Page != item2.Page || 197 | item1.Level != item2.Level || 198 | item1.Children.Count != item2.Children.Count) 199 | { 200 | return false; 201 | } 202 | 203 | for (int i = 0; i < item1.Children.Count; i++) 204 | { 205 | if (!AreTocItemsEqual(item1.Children[i], item2.Children[i])) 206 | return false; 207 | } 208 | 209 | return true; 210 | } 211 | 212 | /// 213 | /// 
using iText.Kernel.Geom;
using iText.Kernel.Pdf;
using iText.Kernel.Pdf.Canvas.Parser;
using iText.Kernel.Pdf.Canvas.Parser.Data;
using iText.Kernel.Pdf.Canvas.Parser.Listener;

namespace PdfTocExtractor.Semantic;

/// <summary>
/// Extracts text from a PDF as a list of <see cref="TextFragment"/>, one per text line.
/// </summary>
/// <remarks>
/// Extraction is plain-text only, so the style and layout fields of each fragment
/// (font, bold/italic, position, size) are fixed placeholders or estimates derived from
/// the line index and character count, not values read from the document.
/// </remarks>
public class PdfTextExtractor
{
    // Placeholder metrics assigned to every fragment (see the class remarks).
    private const float DefaultFontSize = 12f;
    private const int EstimatedLineSpacing = 15;
    private const int EstimatedCharWidth = 6;
    private const int EstimatedLineHeight = 12;

    // Tolerances used when merging fragments that sit on the same line.
    private const float MergeSameLineTolerance = 3f;
    private const float MergeMaxHorizontalGap = 15f;
    private const float MergeFontSizeTolerance = 0.5f;

    // Two fragments closer than this in Y are considered to share a line for IsStandalone.
    private const float StandaloneLineTolerance = 5f;

    /// <summary>
    /// Extracts the text fragments of every page that is not listed in
    /// <c>options.SkipPages</c>.
    /// </summary>
    /// <param name="pdfPath">Path of the PDF file.</param>
    /// <param name="options">Analysis options; supplies the skip list and the debug flag.</param>
    /// <returns>The fragments after same-line merging and spacing analysis.</returns>
    /// <exception cref="FileNotFoundException">The file does not exist.</exception>
    /// <exception cref="InvalidOperationException">
    /// The document could not be opened or processed; the cause is the inner exception.
    /// </exception>
    public List<TextFragment> ExtractTextFragments(string pdfPath, SemanticAnalysisOptions options)
    {
        if (!File.Exists(pdfPath))
        {
            throw new FileNotFoundException($"PDF文件不存在: {pdfPath}");
        }

        var fragments = new List<TextFragment>();

        try
        {
            using var reader = new PdfReader(pdfPath);
            using var pdfDoc = new PdfDocument(reader);

            int totalPages = pdfDoc.GetNumberOfPages();

            if (options.DebugMode)
            {
                // System.IO.Path is spelled out because iText.Kernel.Geom also declares a Path type.
                Console.WriteLine($"开始提取PDF文本: {System.IO.Path.GetFileName(pdfPath)}");
                Console.WriteLine($"总页数: {totalPages}");
                Console.WriteLine($"跳过页面: {string.Join(", ", options.SkipPages)}");
            }

            for (int pageNum = 1; pageNum <= totalPages; pageNum++)
            {
                // Skipped pages are typically the printed table-of-contents pages.
                if (options.SkipPages.Contains(pageNum))
                {
                    if (options.DebugMode)
                    {
                        Console.WriteLine($"跳过页面 {pageNum}");
                    }

                    continue;
                }

                try
                {
                    int countBeforePage = fragments.Count;

                    AppendPageFragments(pdfDoc.GetPage(pageNum), pageNum, fragments);

                    if (options.DebugMode)
                    {
                        // Each page is visited once, so the growth of the list is this page's count.
                        Console.WriteLine($"页面 {pageNum}: 提取到 {fragments.Count - countBeforePage} 个文本片段");
                    }
                }
                catch (Exception ex)
                {
                    // Best effort: a page that fails to parse is skipped so the rest of the
                    // document can still be analysed. The failure is only reported in debug mode.
                    if (options.DebugMode)
                    {
                        Console.WriteLine($"处理页面 {pageNum} 时出错: {ex.Message}");
                    }
                }
            }

            var mergedFragments = MergeAdjacentFragments(fragments, options);

            if (options.DebugMode)
            {
                Console.WriteLine($"文本提取完成: {fragments.Count} -> {mergedFragments.Count} 个片段");
            }

            return mergedFragments;
        }
        catch (Exception ex)
        {
            throw new InvalidOperationException($"提取PDF文本时发生错误: {ex.Message}", ex);
        }
    }

    /// <summary>
    /// Extracts the text of one page and appends one fragment per non-trivial line.
    /// </summary>
    /// <param name="page">The page to read.</param>
    /// <param name="pageNum">1-based page number stored on each fragment.</param>
    /// <param name="target">List receiving the new fragments.</param>
    private static void AppendPageFragments(PdfPage page, int pageNum, List<TextFragment> target)
    {
        // Fully qualified because this class has the same simple name as iText's extractor.
        string pageText = iText.Kernel.Pdf.Canvas.Parser.PdfTextExtractor.GetTextFromPage(page);

        if (string.IsNullOrWhiteSpace(pageText))
        {
            return;
        }

        var lines = pageText.Split('\n', StringSplitOptions.RemoveEmptyEntries);

        for (int lineIndex = 0; lineIndex < lines.Length; lineIndex++)
        {
            var line = lines[lineIndex].Trim();

            // Drop blank lines and single characters; they carry no heading information.
            if (string.IsNullOrWhiteSpace(line) || line.Length <= 1)
            {
                continue;
            }

            target.Add(new TextFragment
            {
                Text = line,
                FontSize = DefaultFontSize,
                FontName = "Unknown",
                IsBold = false,
                IsItalic = false,
                X = 0,
                // The line index (counted before filtering) stands in for the vertical position.
                Y = lineIndex * EstimatedLineSpacing,
                Width = line.Length * EstimatedCharWidth,
                Height = EstimatedLineHeight,
                PageNumber = pageNum,
                IsStandalone = true
            });
        }
    }

    /// <summary>
    /// Merges fragments that lie on the same line, are horizontally adjacent and share a
    /// font size, then runs <see cref="AnalyzeSpacing"/> on the result.
    /// </summary>
    /// <param name="fragments">Fragments of the whole document.</param>
    /// <param name="options">Analysis options (currently not consulted by the merge).</param>
    /// <returns>New fragment instances; the input fragments are not modified.</returns>
    private List<TextFragment> MergeAdjacentFragments(List<TextFragment> fragments, SemanticAnalysisOptions options)
    {
        var mergedFragments = new List<TextFragment>();

        foreach (var pageGroup in fragments.GroupBy(f => f.PageNumber))
        {
            var pageFragments = pageGroup.OrderBy(f => f.Y).ThenBy(f => f.X).ToList();

            int index = 0;
            while (index < pageFragments.Count)
            {
                var current = pageFragments[index];
                var merged = new TextFragment
                {
                    Text = current.Text,
                    FontSize = current.FontSize,
                    FontName = current.FontName,
                    IsBold = current.IsBold,
                    IsItalic = current.IsItalic,
                    X = current.X,
                    Y = current.Y,
                    Width = current.Width,
                    Height = current.Height,
                    PageNumber = current.PageNumber
                };

                // Absorb following fragments while each one continues the previous one.
                int nextIndex = index + 1;
                while (nextIndex < pageFragments.Count)
                {
                    var next = pageFragments[nextIndex];

                    bool sameLine = Math.Abs(next.Y - current.Y) <= MergeSameLineTolerance;
                    if (!sameLine)
                    {
                        break;
                    }

                    var expectedX = current.X + current.Width;
                    bool adjacent = Math.Abs(next.X - expectedX) <= MergeMaxHorizontalGap;
                    if (!adjacent)
                    {
                        break;
                    }

                    bool sameFontSize = Math.Abs(next.FontSize - current.FontSize) <= MergeFontSizeTolerance;
                    if (!sameFontSize)
                    {
                        break;
                    }

                    merged.Text += next.Text;
                    merged.Width = next.X + next.Width - merged.X;

                    // Adjacency is always measured against the most recently absorbed fragment.
                    current = next;
                    nextIndex++;
                }

                // Resume after the last absorbed fragment.
                index = nextIndex;

                if (!string.IsNullOrWhiteSpace(merged.Text) && merged.Text.Length > 1)
                {
                    mergedFragments.Add(merged);
                }
            }
        }

        AnalyzeSpacing(mergedFragments);

        return mergedFragments;
    }

    /// <summary>
    /// Sets <c>IsStandalone</c>, <c>VerticalSpaceBefore</c> and <c>VerticalSpaceAfter</c> on
    /// every fragment, page by page.
    /// </summary>
    /// <param name="fragments">Fragments to annotate in place.</param>
    private void AnalyzeSpacing(List<TextFragment> fragments)
    {
        foreach (var pageGroup in fragments.GroupBy(f => f.PageNumber))
        {
            var pageFragments = pageGroup.OrderBy(f => f.Y).ToList();

            for (int i = 0; i < pageFragments.Count; i++)
            {
                var current = pageFragments[i];
                bool hasPrevious = i > 0;
                bool hasNext = i < pageFragments.Count - 1;

                // The list is sorted by Y, so the closest fragment on either side is the
                // immediate neighbour. If neither neighbour is within the tolerance, no
                // other fragment on the page can be.
                float gapBefore = hasPrevious ? Math.Abs(current.Y - pageFragments[i - 1].Y) : 0f;
                float gapAfter = hasNext ? Math.Abs(pageFragments[i + 1].Y - current.Y) : 0f;

                bool sharesLine =
                    (hasPrevious && gapBefore < StandaloneLineTolerance) ||
                    (hasNext && gapAfter < StandaloneLineTolerance);
                current.IsStandalone = !sharesLine;

                if (hasPrevious)
                {
                    current.VerticalSpaceBefore = gapBefore;
                }

                if (hasNext)
                {
                    current.VerticalSpaceAfter = gapAfter;
                }
            }
        }
    }
}
31 | 32 | [Fact] 33 | public void Export_ShouldReturnEmptyDocumentWithTitle_WhenNoTocItems() 34 | { 35 | // Arrange 36 | var tocItems = new List(); 37 | var options = new ExportOptions { CustomTitle = "Test Document" }; 38 | 39 | // Act 40 | var result = _exporter.Export(tocItems, options); 41 | 42 | // Assert 43 | result.Should().Contain("# Test Document"); 44 | result.Should().NotContain("##"); // No sub-headers 45 | } 46 | 47 | [Fact] 48 | public void Export_ShouldUseDefaultTitle_WhenCustomTitleIsNull() 49 | { 50 | // Arrange 51 | var tocItems = new List(); 52 | var options = new ExportOptions { CustomTitle = null }; 53 | 54 | // Act 55 | var result = _exporter.Export(tocItems, options); 56 | 57 | // Assert 58 | result.Should().Contain("# PDF 目录"); 59 | } 60 | 61 | [Fact] 62 | public void Export_ShouldCreateCorrectMarkdownStructure_WithSingleItem() 63 | { 64 | // Arrange 65 | var tocItems = new List 66 | { 67 | new TocItem { Title = "Chapter 1", Page = "5", Level = 0 } 68 | }; 69 | 70 | // Act 71 | var result = _exporter.Export(tocItems); 72 | 73 | // Assert 74 | result.Should().Contain("# PDF 目录"); 75 | result.Should().Contain("- Chapter 1(第 5 页)"); 76 | } 77 | 78 | [Fact] 79 | public void Export_ShouldCreateCorrectHierarchy_WithNestedItems() 80 | { 81 | // Arrange 82 | var tocItems = new List 83 | { 84 | new TocItem 85 | { 86 | Title = "Chapter 1", 87 | Page = "5", 88 | Level = 0, 89 | Children = new List 90 | { 91 | new TocItem { Title = "Section 1.1", Page = "6", Level = 1 }, 92 | new TocItem { Title = "Section 1.2", Page = "10", Level = 1 } 93 | } 94 | } 95 | }; 96 | 97 | // Act 98 | var result = _exporter.Export(tocItems); 99 | 100 | // Assert 101 | result.Should().Contain("- Chapter 1(第 5 页)"); 102 | result.Should().Contain(" - Section 1.1(第 6 页)"); 103 | result.Should().Contain(" - Section 1.2(第 10 页)"); 104 | } 105 | 106 | [Fact] 107 | public void Export_ShouldRespectMaxDepth_WhenSpecified() 108 | { 109 | // Arrange 110 | var tocItems = new List 111 | { 
112 | new TocItem 113 | { 114 | Title = "Chapter 1", 115 | Page = "5", 116 | Level = 0, 117 | Children = new List 118 | { 119 | new TocItem 120 | { 121 | Title = "Section 1.1", 122 | Page = "6", 123 | Level = 1, 124 | Children = new List 125 | { 126 | new TocItem { Title = "Subsection 1.1.1", Page = "7", Level = 2 } 127 | } 128 | } 129 | } 130 | } 131 | }; 132 | var options = new ExportOptions { MaxDepth = 1 }; 133 | 134 | // Act 135 | var result = _exporter.Export(tocItems, options); 136 | 137 | // Assert 138 | result.Should().Contain("- Chapter 1(第 5 页)"); 139 | result.Should().Contain(" - Section 1.1(第 6 页)"); 140 | result.Should().NotContain("Subsection 1.1.1"); 141 | } 142 | 143 | [Fact] 144 | public void Export_ShouldExcludePageNumbers_WhenIncludePageNumbersIsFalse() 145 | { 146 | // Arrange 147 | var tocItems = new List 148 | { 149 | new TocItem { Title = "Chapter 1", Page = "5", Level = 0 } 150 | }; 151 | var options = new ExportOptions { IncludePageNumbers = false }; 152 | 153 | // Act 154 | var result = _exporter.Export(tocItems, options); 155 | 156 | // Assert 157 | result.Should().Contain("- Chapter 1"); 158 | result.Should().NotContain("第 5 页"); 159 | result.Should().NotContain("("); 160 | result.Should().NotContain(")"); 161 | } 162 | 163 | [Fact] 164 | public void Export_ShouldUseCustomIndentation_WhenSpecified() 165 | { 166 | // Arrange 167 | var tocItems = new List 168 | { 169 | new TocItem 170 | { 171 | Title = "Chapter 1", 172 | Page = "5", 173 | Level = 0, 174 | Children = new List 175 | { 176 | new TocItem { Title = "Section 1.1", Page = "6", Level = 1 } 177 | } 178 | } 179 | }; 180 | var options = new ExportOptions { IndentString = "\t" }; 181 | 182 | // Act 183 | var result = _exporter.Export(tocItems, options); 184 | 185 | // Assert 186 | result.Should().Contain("- Chapter 1(第 5 页)"); 187 | result.Should().Contain("\t- Section 1.1(第 6 页)"); 188 | } 189 | 190 | [Fact] 191 | public void Export_ShouldUseCustomPageNumberFormat_WhenSpecified() 
192 | { 193 | // Arrange 194 | var tocItems = new List 195 | { 196 | new TocItem { Title = "Chapter 1", Page = "5", Level = 0 } 197 | }; 198 | var options = new ExportOptions { PageNumberFormat = "Page {0}" }; 199 | 200 | // Act 201 | var result = _exporter.Export(tocItems, options); 202 | 203 | // Assert 204 | result.Should().Contain("- Chapter 1 (Page 5)"); 205 | } 206 | 207 | [Fact] 208 | public void Export_ShouldHandleEmptyPageNumbers() 209 | { 210 | // Arrange 211 | var tocItems = new List 212 | { 213 | new TocItem { Title = "Chapter 1", Page = "", Level = 0 }, 214 | new TocItem { Title = "Chapter 2", Page = "无页码", Level = 0 }, 215 | new TocItem { Title = "Chapter 3", Page = "N/A", Level = 0 } 216 | }; 217 | 218 | // Act 219 | var result = _exporter.Export(tocItems); 220 | 221 | // Assert 222 | result.Should().Contain("- Chapter 1"); 223 | result.Should().Contain("- Chapter 2"); 224 | result.Should().Contain("- Chapter 3"); 225 | result.Should().NotContain("第 页"); 226 | result.Should().NotContain("第 无页码 页"); 227 | result.Should().NotContain("第 N/A 页"); 228 | } 229 | 230 | [Fact] 231 | public async Task ExportToFileAsync_ShouldCreateFileWithCorrectContent() 232 | { 233 | // Arrange 234 | var tocItems = new List 235 | { 236 | new TocItem { Title = "Chapter 1", Page = "5", Level = 0 } 237 | }; 238 | var tempFile = Path.GetTempFileName(); 239 | var options = new ExportOptions { CustomTitle = "Test Export" }; 240 | 241 | try 242 | { 243 | // Act 244 | await _exporter.ExportToFileAsync(tocItems, tempFile, options); 245 | 246 | // Assert 247 | File.Exists(tempFile).Should().BeTrue(); 248 | var content = await File.ReadAllTextAsync(tempFile); 249 | content.Should().Contain("# Test Export"); 250 | content.Should().Contain("- Chapter 1(第 5 页)"); 251 | } 252 | finally 253 | { 254 | // Cleanup 255 | if (File.Exists(tempFile)) 256 | File.Delete(tempFile); 257 | } 258 | } 259 | 260 | [Fact] 261 | public async Task ExportToFileAsync_ShouldUseSpecifiedEncoding() 262 | { 263 | 
// Arrange 264 | var tocItems = new List 265 | { 266 | new TocItem { Title = "测试章节", Page = "5", Level = 0 } 267 | }; 268 | var tempFile = Path.GetTempFileName(); 269 | var options = new ExportOptions { Encoding = Encoding.Unicode }; 270 | 271 | try 272 | { 273 | // Act 274 | await _exporter.ExportToFileAsync(tocItems, tempFile, options); 275 | 276 | // Assert 277 | File.Exists(tempFile).Should().BeTrue(); 278 | var content = await File.ReadAllTextAsync(tempFile, Encoding.Unicode); 279 | content.Should().Contain("测试章节"); 280 | } 281 | finally 282 | { 283 | // Cleanup 284 | if (File.Exists(tempFile)) 285 | File.Delete(tempFile); 286 | } 287 | } 288 | 289 | [Fact] 290 | public void Export_ShouldHandleNullOptions() 291 | { 292 | // Arrange 293 | var tocItems = new List 294 | { 295 | new TocItem { Title = "Chapter 1", Page = "5", Level = 0 } 296 | }; 297 | 298 | // Act 299 | var result = _exporter.Export(tocItems, null); 300 | 301 | // Assert 302 | result.Should().NotBeNullOrEmpty(); 303 | result.Should().Contain("# PDF 目录"); 304 | result.Should().Contain("- Chapter 1(第 5 页)"); 305 | } 306 | } 307 | -------------------------------------------------------------------------------- /src/PdfTocExtractor.Cli/Commands/SemanticCommand.cs: -------------------------------------------------------------------------------- 1 | using System.CommandLine; 2 | using PdfTocExtractor.Exporters; 3 | using PdfTocExtractor.Semantic; 4 | 5 | namespace PdfTocExtractor.Cli.Commands; 6 | 7 | /// 8 | /// 语义分析命令 9 | /// 10 | public static class SemanticCommand 11 | { 12 | public static Command Create() 13 | { 14 | var inputOption = new Option( 15 | aliases: new[] { "--input", "-i" }, 16 | description: "输入PDF文件路径") 17 | { 18 | IsRequired = true 19 | }; 20 | 21 | var outputOption = new Option( 22 | aliases: new[] { "--output", "-o" }, 23 | description: "输出文件路径"); 24 | 25 | var formatOption = new Option( 26 | aliases: new[] { "--format", "-f" }, 27 | description: "输出格式 (markdown, json, xml, 
text)"); 28 | 29 | var maxDepthOption = new Option( 30 | aliases: new[] { "--max-depth", "-d" }, 31 | description: "最大层级深度", 32 | getDefaultValue: () => 0); 33 | 34 | var includePagesOption = new Option( 35 | aliases: new[] { "--include-pages", "-p" }, 36 | description: "包含页码信息", 37 | getDefaultValue: () => true); 38 | 39 | var customTitleOption = new Option( 40 | aliases: new[] { "--title", "-t" }, 41 | description: "自定义标题"); 42 | 43 | var verboseOption = new Option( 44 | aliases: new[] { "--verbose", "-v" }, 45 | description: "显示详细输出", 46 | getDefaultValue: () => false); 47 | 48 | var debugOption = new Option( 49 | aliases: new[] { "--debug" }, 50 | description: "启用调试模式", 51 | getDefaultValue: () => false); 52 | 53 | var modeOption = new Option( 54 | aliases: new[] { "--mode" }, 55 | description: "分析模式 (default, strict, relaxed)", 56 | getDefaultValue: () => "default"); 57 | 58 | var skipPagesOption = new Option( 59 | aliases: new[] { "--skip-pages" }, 60 | description: "跳过的页面(如 '1,2,3' 或 '1-3')", 61 | getDefaultValue: () => "1,2,3"); 62 | 63 | var confidenceOption = new Option( 64 | aliases: new[] { "--confidence" }, 65 | description: "最小置信度阈值 (0.0-1.0)", 66 | getDefaultValue: () => 0.3f); 67 | 68 | var fontMultiplierOption = new Option( 69 | aliases: new[] { "--font-multiplier" }, 70 | description: "字体大小倍数阈值", 71 | getDefaultValue: () => 1.1f); 72 | 73 | var command = new Command("semantic", "使用语义分析识别PDF目录结构") 74 | { 75 | inputOption, 76 | outputOption, 77 | formatOption, 78 | maxDepthOption, 79 | includePagesOption, 80 | customTitleOption, 81 | verboseOption, 82 | debugOption, 83 | modeOption, 84 | skipPagesOption, 85 | confidenceOption, 86 | fontMultiplierOption 87 | }; 88 | 89 | command.SetHandler(async (context) => 90 | { 91 | var input = context.ParseResult.GetValueForOption(inputOption)!; 92 | var output = context.ParseResult.GetValueForOption(outputOption); 93 | var format = context.ParseResult.GetValueForOption(formatOption); 94 | var maxDepth = 
context.ParseResult.GetValueForOption(maxDepthOption); 95 | var includePages = context.ParseResult.GetValueForOption(includePagesOption); 96 | var customTitle = context.ParseResult.GetValueForOption(customTitleOption); 97 | var verbose = context.ParseResult.GetValueForOption(verboseOption); 98 | var debug = context.ParseResult.GetValueForOption(debugOption); 99 | var mode = context.ParseResult.GetValueForOption(modeOption)!; 100 | var skipPages = context.ParseResult.GetValueForOption(skipPagesOption)!; 101 | var confidence = context.ParseResult.GetValueForOption(confidenceOption); 102 | var fontMultiplier = context.ParseResult.GetValueForOption(fontMultiplierOption); 103 | 104 | await ExecuteSemanticCommand(input, output, format, maxDepth, includePages, customTitle, 105 | verbose, debug, mode, skipPages, confidence, fontMultiplier); 106 | }); 107 | 108 | return command; 109 | } 110 | 111 | private static async Task ExecuteSemanticCommand( 112 | FileInfo input, 113 | FileInfo? output, 114 | string? format, 115 | int maxDepth, 116 | bool includePages, 117 | string? 
customTitle,
        bool verbose,
        bool debug,
        string mode,
        string skipPages,
        float confidence,
        float fontMultiplier)
    {
        // NOTE(review): this check runs before the try/catch below, so a missing input file
        // propagates as FileNotFoundException instead of being printed like other failures.
        if (!input.Exists)
        {
            throw new FileNotFoundException($"输入文件不存在: {input.FullName}");
        }

        if (verbose)
        {
            Console.WriteLine($"正在使用语义分析处理PDF文件: {input.FullName}");
            Console.WriteLine($"分析模式: {mode}");
            Console.WriteLine($"跳过页面: {skipPages}");
            Console.WriteLine($"置信度阈值: {confidence:F2}");
            Console.WriteLine($"字体倍数: {fontMultiplier:F2}");
        }

        try
        {
            // Translate the CLI arguments into analyzer options.
            var semanticOptions = CreateSemanticOptions(mode, skipPages, confidence, fontMultiplier, debug);

            var extractor = new PdfTocExtractor();
            var tocItems = await extractor.ExtractTocSemanticAsync(input.FullName, semanticOptions);

            if (verbose)
            {
                Console.WriteLine($"成功识别 {tocItems.Count} 个顶级目录项");
                var totalItems = tocItems.Sum(item => 1 + item.GetAllDescendants().Count());
                Console.WriteLine($"总共 {totalItems} 个目录项");
            }

            // Nothing recognised: print tuning hints and stop without exporting.
            if (tocItems.Count == 0)
            {
                Console.WriteLine("未识别到任何目录结构");
                Console.WriteLine("建议:");
                Console.WriteLine(" - 尝试使用 --mode relaxed 降低识别阈值");
                Console.WriteLine(" - 调整 --confidence 参数(如 0.2)");
                Console.WriteLine(" - 使用 --debug 查看详细分析过程");
                Console.WriteLine(" - 检查 --skip-pages 是否正确跳过了目录页");
                return;
            }

            // Export settings shared by the file and console paths.
            var exportOptions = new ExportOptions
            {
                MaxDepth = maxDepth,
                IncludePageNumbers = includePages,
                CustomTitle = customTitle ?? "语义分析提取的目录"
            };

            if (output != null)
            {
                // Write to the requested file; a null format is passed through to the extractor.
                await extractor.ExportToFileAsync(tocItems, output.FullName, format, exportOptions);

                if (verbose)
                {
                    Console.WriteLine($"目录已导出到: {output.FullName}");
                }
            }
            else
            {
                // No output file: print to the console, defaulting to the text format.
                var outputFormat = format ?? "text";
                var result = extractor.ExportToString(tocItems, outputFormat, exportOptions);
                Console.WriteLine(result);
            }
        }
        catch (Exception ex)
        {
            // Failures are reported on the console rather than rethrown; the inner
            // exception message is only shown when --debug is set.
            Console.WriteLine($"语义分析失败: {ex.Message}");
            if (debug && ex.InnerException != null)
            {
                Console.WriteLine($"详细错误: {ex.InnerException.Message}");
            }
        }
    }

    /// <summary>
    /// Builds the semantic analysis options for a run: picks the preset matching
    /// <paramref name="mode"/> and then applies the individual CLI values on top of it.
    /// </summary>
    /// <param name="mode">Preset name ("strict", "relaxed", "debug"); anything else selects the default preset. Case-insensitive.</param>
    /// <param name="skipPages">Comma-separated page list, parsed by <see cref="ParseSkipPages"/>.</param>
    /// <param name="confidence">Value assigned to <c>MinConfidenceThreshold</c>.</param>
    /// <param name="fontMultiplier">Value assigned to <c>FontSizeMultiplier</c>.</param>
    /// <param name="debug">Value assigned to <c>DebugMode</c>.</param>
    /// <returns>The configured options instance.</returns>
    private static SemanticAnalysisOptions CreateSemanticOptions(
        string mode,
        string skipPages,
        float confidence,
        float fontMultiplier,
        bool debug)
    {
        // NOTE(review): if these preset properties return shared instances, the assignments
        // below mutate them for every later caller — confirm they create a fresh object.
        SemanticAnalysisOptions options = mode.ToLowerInvariant() switch
        {
            "strict" => SemanticAnalysisOptions.Strict,
            "relaxed" => SemanticAnalysisOptions.Relaxed,
            "debug" => SemanticAnalysisOptions.Debug,
            _ => SemanticAnalysisOptions.Default
        };

        // NOTE(review): these assignments are unconditional, so the preset's own
        // confidence / font-multiplier values are always replaced by the CLI values
        // (including the CLI defaults) — confirm that this is intended.
        options.MinConfidenceThreshold = confidence;
        options.FontSizeMultiplier = fontMultiplier;
        options.DebugMode = debug;

        options.SkipPages = ParseSkipPages(skipPages);

        return options;
    }

    /// <summary>
    /// Largest number of pages a single "a-b" entry may expand to. Guards against
    /// unbounded allocation when a huge range is typed on the command line.
    /// </summary>
    private const int MaxSkipRangeLength = 100_000;

    /// <summary>
    /// Parses a page list such as "1,2,3" or "1-3,7" into page numbers.
    /// Entries that are not a number or a two-part numeric range are ignored.
    /// A range written backwards ("5-3") is treated as "3-5".
    /// </summary>
    /// <param name="skipPages">Comma-separated pages and/or ranges; null or blank yields an empty list.</param>
    /// <returns>The distinct page numbers in ascending order.</returns>
    /// <exception cref="ArgumentException">A range spans more than <see cref="MaxSkipRangeLength"/> pages.</exception>
    private static List<int> ParseSkipPages(string skipPages)
    {
        var result = new List<int>();

        if (string.IsNullOrWhiteSpace(skipPages))
            return result;

        foreach (var part in skipPages.Split(',', StringSplitOptions.RemoveEmptyEntries))
        {
            var trimmed = part.Trim();

            if (trimmed.Contains('-'))
            {
                // Range form "a-b"; anything that is not exactly two integers is skipped.
                var rangeParts = trimmed.Split('-');
                if (rangeParts.Length != 2 ||
                    !int.TryParse(rangeParts[0], out var first) ||
                    !int.TryParse(rangeParts[1], out var second))
                {
                    continue;
                }

                var low = Math.Min(first, second);
                var high = Math.Max(first, second);

                // Computed in long: an int counter running up to int.MaxValue would wrap
                // around and never satisfy the loop's exit condition.
                var length = (long)high - low + 1;
                if (length > MaxSkipRangeLength)
                {
                    throw new ArgumentException($"跳过页面范围过大: {trimmed}", nameof(skipPages));
                }

                result.AddRange(Enumerable.Range(low, (int)length));
            }
            else if (int.TryParse(trimmed, out var pageNum))
            {
                // Single page.
                result.Add(pageNum);
            }
        }

        return result.Distinct().OrderBy(x =>
x).ToList(); 263 | } 264 | } 265 | -------------------------------------------------------------------------------- /tests/PdfTocExtractor.Tests/TestData/TestDataBuilder.cs: -------------------------------------------------------------------------------- 1 | using PdfTocExtractor.Models; 2 | 3 | namespace PdfTocExtractor.Tests.TestData; 4 | 5 | /// 6 | /// 用于构建测试数据的辅助类 7 | /// 8 | public static class TestDataBuilder 9 | { 10 | /// 11 | /// 创建简单的单级目录项 12 | /// 13 | public static List CreateSimpleTocItems() 14 | { 15 | return new List 16 | { 17 | new TocItem { Title = "Chapter 1", Page = "1", Level = 0 }, 18 | new TocItem { Title = "Chapter 2", Page = "10", Level = 0 }, 19 | new TocItem { Title = "Chapter 3", Page = "20", Level = 0 } 20 | }; 21 | } 22 | 23 | /// 24 | /// 创建带有层级结构的目录项 25 | /// 26 | public static List CreateHierarchicalTocItems() 27 | { 28 | var chapter1 = new TocItem { Title = "Chapter 1: Introduction", Page = "1", Level = 0 }; 29 | var section11 = new TocItem { Title = "1.1 Overview", Page = "2", Level = 1, Parent = chapter1 }; 30 | var section12 = new TocItem { Title = "1.2 Objectives", Page = "5", Level = 1, Parent = chapter1 }; 31 | var subsection121 = new TocItem { Title = "1.2.1 Primary Goals", Page = "6", Level = 2, Parent = section12 }; 32 | var subsection122 = new TocItem { Title = "1.2.2 Secondary Goals", Page = "8", Level = 2, Parent = section12 }; 33 | 34 | chapter1.Children.AddRange(new[] { section11, section12 }); 35 | section12.Children.AddRange(new[] { subsection121, subsection122 }); 36 | 37 | var chapter2 = new TocItem { Title = "Chapter 2: Methodology", Page = "15", Level = 0 }; 38 | var section21 = new TocItem { Title = "2.1 Research Design", Page = "16", Level = 1, Parent = chapter2 }; 39 | var section22 = new TocItem { Title = "2.2 Data Collection", Page = "20", Level = 1, Parent = chapter2 }; 40 | 41 | chapter2.Children.AddRange(new[] { section21, section22 }); 42 | 43 | var chapter3 = new TocItem { Title = "Chapter 3: Results", 
Page = "30", Level = 0 }; 44 | 45 | return new List { chapter1, chapter2, chapter3 }; 46 | } 47 | 48 | /// 49 | /// 创建深层嵌套的目录项(用于测试最大深度限制) 50 | /// 51 | public static List CreateDeepNestedTocItems() 52 | { 53 | var level0 = new TocItem { Title = "Level 0", Page = "1", Level = 0 }; 54 | var level1 = new TocItem { Title = "Level 1", Page = "2", Level = 1, Parent = level0 }; 55 | var level2 = new TocItem { Title = "Level 2", Page = "3", Level = 2, Parent = level1 }; 56 | var level3 = new TocItem { Title = "Level 3", Page = "4", Level = 3, Parent = level2 }; 57 | var level4 = new TocItem { Title = "Level 4", Page = "5", Level = 4, Parent = level3 }; 58 | 59 | level0.Children.Add(level1); 60 | level1.Children.Add(level2); 61 | level2.Children.Add(level3); 62 | level3.Children.Add(level4); 63 | 64 | return new List { level0 }; 65 | } 66 | 67 | /// 68 | /// 创建包含特殊字符的目录项 69 | /// 70 | public static List CreateSpecialCharacterTocItems() 71 | { 72 | return new List 73 | { 74 | new TocItem { Title = "Chapter <1> & \"Introduction\"", Page = "1", Level = 0 }, 75 | new TocItem { Title = "Section with 'quotes' and symbols: @#$%", Page = "5", Level = 0 }, 76 | new TocItem { Title = "Unicode: 测试章节 🔍 📄", Page = "10", Level = 0 }, 77 | new TocItem { Title = "XML entities: <>&", Page = "15", Level = 0 } 78 | }; 79 | } 80 | 81 | /// 82 | /// 创建包含各种页码格式的目录项 83 | /// 84 | public static List CreateVariousPageFormatTocItems() 85 | { 86 | return new List 87 | { 88 | new TocItem { Title = "Normal Page", Page = "5", Level = 0 }, 89 | new TocItem { Title = "Complex Page", Page = "10 XYZ 123 456", Level = 0 }, 90 | new TocItem { Title = "Empty Page", Page = "", Level = 0 }, 91 | new TocItem { Title = "No Page Info", Page = "无页码", Level = 0 }, 92 | new TocItem { Title = "N/A Page", Page = "N/A", Level = 0 }, 93 | new TocItem { Title = "Invalid Page", Page = "abc", Level = 0 } 94 | }; 95 | } 96 | 97 | /// 98 | /// 创建空的目录项列表 99 | /// 100 | public static List CreateEmptyTocItems() 101 | { 102 | 
return new List(); 103 | } 104 | 105 | /// 106 | /// 创建大量目录项(用于性能测试) 107 | /// 108 | public static List CreateLargeTocItems(int count = 100) 109 | { 110 | var items = new List(); 111 | 112 | for (int i = 1; i <= count; i++) 113 | { 114 | var chapter = new TocItem 115 | { 116 | Title = $"Chapter {i}", 117 | Page = (i * 10).ToString(), 118 | Level = 0 119 | }; 120 | 121 | // 每5个章节添加一些子节 122 | if (i % 5 == 0) 123 | { 124 | for (int j = 1; j <= 3; j++) 125 | { 126 | var section = new TocItem 127 | { 128 | Title = $"{i}.{j} Section {j}", 129 | Page = (i * 10 + j).ToString(), 130 | Level = 1, 131 | Parent = chapter 132 | }; 133 | chapter.Children.Add(section); 134 | } 135 | } 136 | 137 | items.Add(chapter); 138 | } 139 | 140 | return items; 141 | } 142 | 143 | /// 144 | /// 创建模拟PDF书签数据(用于模拟iTextSharp的书签格式) 145 | /// 146 | public static List> CreateMockBookmarkData() 147 | { 148 | return new List> 149 | { 150 | new Dictionary 151 | { 152 | ["Title"] = "Chapter 1", 153 | ["Page"] = "1 XYZ 0 792 0", 154 | ["Kids"] = new List> 155 | { 156 | new Dictionary 157 | { 158 | ["Title"] = "Section 1.1", 159 | ["Page"] = "2 XYZ 0 792 0" 160 | }, 161 | new Dictionary 162 | { 163 | ["Title"] = "Section 1.2", 164 | ["Page"] = "5 XYZ 0 792 0" 165 | } 166 | } 167 | }, 168 | new Dictionary 169 | { 170 | ["Title"] = "Chapter 2", 171 | ["Page"] = "10 XYZ 0 792 0" 172 | } 173 | }; 174 | } 175 | 176 | /// 177 | /// 创建包含无效书签数据的模拟数据 178 | /// 179 | public static List> CreateInvalidMockBookmarkData() 180 | { 181 | return new List> 182 | { 183 | new Dictionary 184 | { 185 | // 缺少Title 186 | ["Page"] = "1 XYZ 0 792 0" 187 | }, 188 | new Dictionary 189 | { 190 | ["Title"] = "Valid Chapter", 191 | // 缺少Page 192 | }, 193 | new Dictionary 194 | { 195 | ["Title"] = "Chapter with Invalid Page", 196 | ["Page"] = "invalid page format" 197 | } 198 | }; 199 | } 200 | 201 | /// 202 | /// 创建复杂的嵌套书签数据 203 | /// 204 | public static List> CreateComplexMockBookmarkData() 205 | { 206 | return new List> 207 | { 208 
| new Dictionary 209 | { 210 | ["Title"] = "Part I: Introduction", 211 | ["Page"] = "1 XYZ 0 792 0", 212 | ["Kids"] = new List> 213 | { 214 | new Dictionary 215 | { 216 | ["Title"] = "Chapter 1: Overview", 217 | ["Page"] = "3 XYZ 0 792 0", 218 | ["Kids"] = new List> 219 | { 220 | new Dictionary 221 | { 222 | ["Title"] = "1.1 Background", 223 | ["Page"] = "4 XYZ 0 792 0" 224 | }, 225 | new Dictionary 226 | { 227 | ["Title"] = "1.2 Scope", 228 | ["Page"] = "7 XYZ 0 792 0", 229 | ["Kids"] = new List> 230 | { 231 | new Dictionary 232 | { 233 | ["Title"] = "1.2.1 Technical Scope", 234 | ["Page"] = "8 XYZ 0 792 0" 235 | }, 236 | new Dictionary 237 | { 238 | ["Title"] = "1.2.2 Business Scope", 239 | ["Page"] = "10 XYZ 0 792 0" 240 | } 241 | } 242 | } 243 | } 244 | }, 245 | new Dictionary 246 | { 247 | ["Title"] = "Chapter 2: Methodology", 248 | ["Page"] = "15 XYZ 0 792 0" 249 | } 250 | } 251 | }, 252 | new Dictionary 253 | { 254 | ["Title"] = "Part II: Implementation", 255 | ["Page"] = "25 XYZ 0 792 0" 256 | } 257 | }; 258 | } 259 | } 260 | -------------------------------------------------------------------------------- /tests/PdfTocExtractor.Tests/Exporters/JsonExporterTests.cs: -------------------------------------------------------------------------------- 1 | using FluentAssertions; 2 | using Newtonsoft.Json; 3 | using Newtonsoft.Json.Linq; 4 | using PdfTocExtractor.Exporters; 5 | using PdfTocExtractor.Models; 6 | using System.Text; 7 | using Xunit; 8 | 9 | namespace PdfTocExtractor.Tests.Exporters; 10 | 11 | public class JsonExporterTests 12 | { 13 | private readonly JsonExporter _exporter; 14 | 15 | public JsonExporterTests() 16 | { 17 | _exporter = new JsonExporter(); 18 | } 19 | 20 | [Fact] 21 | public void FormatName_ShouldReturnJSON() 22 | { 23 | // Act & Assert 24 | _exporter.FormatName.Should().Be("JSON"); 25 | } 26 | 27 | [Fact] 28 | public void FileExtension_ShouldReturnJson() 29 | { 30 | // Act & Assert 31 | _exporter.FileExtension.Should().Be("json"); 32 | 
} 33 | 34 | [Fact] 35 | public void Export_ShouldReturnValidJson_WhenNoTocItems() 36 | { 37 | // Arrange 38 | var tocItems = new List(); 39 | var options = new ExportOptions { CustomTitle = "Test Document" }; 40 | 41 | // Act 42 | var result = _exporter.Export(tocItems, options); 43 | 44 | // Assert 45 | result.Should().NotBeNullOrEmpty(); 46 | var json = JObject.Parse(result); 47 | json["title"]?.ToString().Should().Be("Test Document"); 48 | json["items"].Should().NotBeNull(); 49 | ((JArray)json["items"]!).Should().BeEmpty(); 50 | } 51 | 52 | [Fact] 53 | public void Export_ShouldUseDefaultTitle_WhenCustomTitleIsNull() 54 | { 55 | // Arrange 56 | var tocItems = new List(); 57 | var options = new ExportOptions { CustomTitle = null }; 58 | 59 | // Act 60 | var result = _exporter.Export(tocItems, options); 61 | 62 | // Assert 63 | var json = JObject.Parse(result); 64 | json["title"]?.ToString().Should().Be("PDF 目录"); 65 | } 66 | 67 | [Fact] 68 | public void Export_ShouldCreateCorrectJsonStructure_WithSingleItem() 69 | { 70 | // Arrange 71 | var tocItems = new List 72 | { 73 | new TocItem { Title = "Chapter 1", Page = "5", Level = 0 } 74 | }; 75 | 76 | // Act 77 | var result = _exporter.Export(tocItems); 78 | 79 | // Assert 80 | var json = JObject.Parse(result); 81 | var items = (JArray)json["items"]!; 82 | items.Should().HaveCount(1); 83 | 84 | var item = (JObject)items[0]; 85 | item["title"]?.ToString().Should().Be("Chapter 1"); 86 | item["level"]?.Value().Should().Be(0); 87 | item["page"]?.ToString().Should().Be("5"); 88 | } 89 | 90 | [Fact] 91 | public void Export_ShouldCreateCorrectHierarchy_WithNestedItems() 92 | { 93 | // Arrange 94 | var tocItems = new List 95 | { 96 | new TocItem 97 | { 98 | Title = "Chapter 1", 99 | Page = "5", 100 | Level = 0, 101 | Children = new List 102 | { 103 | new TocItem { Title = "Section 1.1", Page = "6", Level = 1 }, 104 | new TocItem { Title = "Section 1.2", Page = "10", Level = 1 } 105 | } 106 | } 107 | }; 108 | 109 | // Act 110 
| var result = _exporter.Export(tocItems); 111 | 112 | // Assert 113 | var json = JObject.Parse(result); 114 | var items = (JArray)json["items"]!; 115 | items.Should().HaveCount(1); 116 | 117 | var chapter = (JObject)items[0]; 118 | chapter["title"]?.ToString().Should().Be("Chapter 1"); 119 | 120 | var children = (JArray)chapter["children"]!; 121 | children.Should().HaveCount(2); 122 | children[0]["title"]?.ToString().Should().Be("Section 1.1"); 123 | children[1]["title"]?.ToString().Should().Be("Section 1.2"); 124 | } 125 | 126 | [Fact] 127 | public void Export_ShouldRespectMaxDepth_WhenSpecified() 128 | { 129 | // Arrange 130 | var tocItems = new List 131 | { 132 | new TocItem 133 | { 134 | Title = "Chapter 1", 135 | Page = "5", 136 | Level = 0, 137 | Children = new List 138 | { 139 | new TocItem 140 | { 141 | Title = "Section 1.1", 142 | Page = "6", 143 | Level = 1, 144 | Children = new List 145 | { 146 | new TocItem { Title = "Subsection 1.1.1", Page = "7", Level = 2 } 147 | } 148 | } 149 | } 150 | } 151 | }; 152 | var options = new ExportOptions { MaxDepth = 1 }; 153 | 154 | // Act 155 | var result = _exporter.Export(tocItems, options); 156 | 157 | // Assert 158 | var json = JObject.Parse(result); 159 | var items = (JArray)json["items"]!; 160 | var chapter = (JObject)items[0]; 161 | var children = (JArray)chapter["children"]!; 162 | 163 | children.Should().HaveCount(1); 164 | children[0]["title"]?.ToString().Should().Be("Section 1.1"); 165 | children[0]["children"].Should().BeNull(); 166 | } 167 | 168 | [Fact] 169 | public void Export_ShouldExcludePageNumbers_WhenIncludePageNumbersIsFalse() 170 | { 171 | // Arrange 172 | var tocItems = new List 173 | { 174 | new TocItem { Title = "Chapter 1", Page = "5", Level = 0 } 175 | }; 176 | var options = new ExportOptions { IncludePageNumbers = false }; 177 | 178 | // Act 179 | var result = _exporter.Export(tocItems, options); 180 | 181 | // Assert 182 | var json = JObject.Parse(result); 183 | var items = 
(JArray)json["items"]!; 184 | var item = (JObject)items[0]; 185 | 186 | item["title"]?.ToString().Should().Be("Chapter 1"); 187 | item["page"].Should().BeNull(); 188 | } 189 | 190 | [Fact] 191 | public void Export_ShouldHandleEmptyPageNumbers() 192 | { 193 | // Arrange 194 | var tocItems = new List 195 | { 196 | new TocItem { Title = "Chapter 1", Page = "", Level = 0 }, 197 | new TocItem { Title = "Chapter 2", Page = "无页码", Level = 0 }, 198 | new TocItem { Title = "Chapter 3", Page = "N/A", Level = 0 } 199 | }; 200 | 201 | // Act 202 | var result = _exporter.Export(tocItems); 203 | 204 | // Assert 205 | var json = JObject.Parse(result); 206 | var items = (JArray)json["items"]!; 207 | 208 | items.Should().HaveCount(3); 209 | items[0]["page"].Should().BeNull(); 210 | items[1]["page"].Should().BeNull(); 211 | items[2]["page"].Should().BeNull(); 212 | } 213 | 214 | [Fact] 215 | public void Export_ShouldIncludeGeneratedTimestamp() 216 | { 217 | // Arrange 218 | var tocItems = new List 219 | { 220 | new TocItem { Title = "Chapter 1", Page = "5", Level = 0 } 221 | }; 222 | 223 | // Act 224 | var result = _exporter.Export(tocItems); 225 | 226 | // Assert 227 | var json = JObject.Parse(result); 228 | json["generatedAt"].Should().NotBeNull(); 229 | 230 | var generatedAt = json["generatedAt"]?.ToString(); 231 | DateTime.TryParse(generatedAt, out var timestamp).Should().BeTrue(); 232 | timestamp.Should().BeCloseTo(DateTime.Now, TimeSpan.FromMinutes(1)); 233 | } 234 | 235 | [Fact] 236 | public void Export_ShouldFormatJsonWithIndentation() 237 | { 238 | // Arrange 239 | var tocItems = new List 240 | { 241 | new TocItem { Title = "Chapter 1", Page = "5", Level = 0 } 242 | }; 243 | 244 | // Act 245 | var result = _exporter.Export(tocItems); 246 | 247 | // Assert 248 | result.Should().Contain(" "); // Should contain indentation 249 | result.Should().Contain("\n"); // Should contain line breaks 250 | } 251 | 252 | [Fact] 253 | public async Task 
ExportToFileAsync_ShouldCreateFileWithCorrectContent() 254 | { 255 | // Arrange 256 | var tocItems = new List 257 | { 258 | new TocItem { Title = "Chapter 1", Page = "5", Level = 0 } 259 | }; 260 | var tempFile = Path.GetTempFileName(); 261 | var options = new ExportOptions { CustomTitle = "Test Export" }; 262 | 263 | try 264 | { 265 | // Act 266 | await _exporter.ExportToFileAsync(tocItems, tempFile, options); 267 | 268 | // Assert 269 | File.Exists(tempFile).Should().BeTrue(); 270 | var content = await File.ReadAllTextAsync(tempFile); 271 | var json = JObject.Parse(content); 272 | json["title"]?.ToString().Should().Be("Test Export"); 273 | } 274 | finally 275 | { 276 | // Cleanup 277 | if (File.Exists(tempFile)) 278 | File.Delete(tempFile); 279 | } 280 | } 281 | 282 | [Fact] 283 | public async Task ExportToFileAsync_ShouldUseSpecifiedEncoding() 284 | { 285 | // Arrange 286 | var tocItems = new List 287 | { 288 | new TocItem { Title = "测试章节", Page = "5", Level = 0 } 289 | }; 290 | var tempFile = Path.GetTempFileName(); 291 | var options = new ExportOptions { Encoding = Encoding.Unicode }; 292 | 293 | try 294 | { 295 | // Act 296 | await _exporter.ExportToFileAsync(tocItems, tempFile, options); 297 | 298 | // Assert 299 | File.Exists(tempFile).Should().BeTrue(); 300 | var content = await File.ReadAllTextAsync(tempFile, Encoding.Unicode); 301 | var json = JObject.Parse(content); 302 | json["items"]?[0]?["title"]?.ToString().Should().Be("测试章节"); 303 | } 304 | finally 305 | { 306 | // Cleanup 307 | if (File.Exists(tempFile)) 308 | File.Delete(tempFile); 309 | } 310 | } 311 | 312 | [Fact] 313 | public void Export_ShouldHandleNullOptions() 314 | { 315 | // Arrange 316 | var tocItems = new List 317 | { 318 | new TocItem { Title = "Chapter 1", Page = "5", Level = 0 } 319 | }; 320 | 321 | // Act 322 | var result = _exporter.Export(tocItems, null); 323 | 324 | // Assert 325 | result.Should().NotBeNullOrEmpty(); 326 | var json = JObject.Parse(result); 327 | 
json["title"]?.ToString().Should().Be("PDF 目录");
        json["items"].Should().NotBeNull();
    }

    [Fact]
    public void Export_ShouldHandleComplexPageNumbers()
    {
        // Arrange
        var tocItems = new List<TocItem>
        {
            new TocItem { Title = "Chapter 1", Page = "5 XYZ 123 456", Level = 0 }
        };

        // Act
        var result = _exporter.Export(tocItems);

        // Assert
        var json = JObject.Parse(result);
        var items = (JArray)json["items"]!;
        var item = (JObject)items[0];

        item["page"]?.ToString().Should().Be("5 XYZ 123 456");
    }
}

--------------------------------------------------------------------------------
/src/PdfTocExtractor/PdfTocExtractor.cs:
--------------------------------------------------------------------------------
using iText.Kernel.Pdf;
using PdfTocExtractor.Exporters;
using PdfTocExtractor.Models;
using PdfTocExtractor.Semantic;

namespace PdfTocExtractor;

/// <summary>
/// Extracts a table of contents from a PDF — from its bookmarks (outlines) or through
/// semantic analysis — and exports it via the registered <see cref="IExporter"/> instances.
/// </summary>
public class PdfTocExtractor
{
    /// <summary>
    /// Message of the exception thrown when a PDF carries no bookmarks.
    /// <see cref="ExtractTocSmart"/> matches on it to decide when to fall back,
    /// so the throw sites and the filter share this single definition.
    /// </summary>
    private const string NoBookmarksMessage = "此PDF文件没有目录(书签)信息";

    // Format name -> exporter; lookups ignore case.
    private readonly Dictionary<string, IExporter> _exporters;

    /// <summary>
    /// Creates an extractor with the built-in exporters registered under
    /// "markdown"/"md", "json", "xml" and "text"/"txt".
    /// </summary>
    public PdfTocExtractor()
    {
        _exporters = new Dictionary<string, IExporter>(StringComparer.OrdinalIgnoreCase)
        {
            ["markdown"] = new MarkdownExporter(),
            ["md"] = new MarkdownExporter(),
            ["json"] = new JsonExporter(),
            ["xml"] = new XmlExporter(),
            ["text"] = new TextExporter(),
            ["txt"] = new TextExporter()
        };
    }

    /// <summary>
    /// Extracts the table of contents from the PDF's bookmarks.
    /// </summary>
    /// <param name="pdfPath">Path of the PDF file.</param>
    /// <returns>The top-level items; nested bookmarks are attached as children.</returns>
    /// <exception cref="FileNotFoundException">The file does not exist.</exception>
    /// <exception cref="InvalidOperationException">The PDF has no bookmarks, or reading it failed (the original error is the inner exception).</exception>
    public List<TocItem> ExtractToc(string pdfPath)
    {
        if (!File.Exists(pdfPath))
            throw new FileNotFoundException($"PDF文件不存在: {pdfPath}");

        try
        {
            using var reader = new PdfReader(pdfPath);
            using var pdfDoc = new PdfDocument(reader);

            var outlines = pdfDoc.GetOutlines(false);
            if (outlines == null)
                throw new InvalidOperationException(NoBookmarksMessage);

            var bookmarks = outlines.GetAllChildren();
            if (bookmarks == null || bookmarks.Count == 0)
                throw new InvalidOperationException(NoBookmarksMessage);

            return ConvertBookmarksToTocItems(bookmarks, pdfDoc);
        }
        catch (Exception ex) when (ex is not (FileNotFoundException or InvalidOperationException))
        {
            // Everything else is wrapped so callers only deal with one failure type.
            throw new InvalidOperationException($"读取PDF文件时发生错误: {ex.Message}", ex);
        }
    }

    /// <summary>
    /// Runs <see cref="ExtractToc"/> on a thread-pool thread.
    /// </summary>
    /// <param name="pdfPath">Path of the PDF file.</param>
    /// <returns>A task producing the extracted items.</returns>
    public Task<List<TocItem>> ExtractTocAsync(string pdfPath)
    {
        return Task.Run(() => ExtractToc(pdfPath));
    }

    /// <summary>
    /// Renders the items to a string in the given format.
    /// </summary>
    /// <param name="tocItems">Items to export.</param>
    /// <param name="format">Registered format name (case-insensitive).</param>
    /// <param name="options">Export options, passed to the exporter as given.</param>
    /// <returns>The exported text.</returns>
    /// <exception cref="ArgumentException"><paramref name="format"/> is null or empty.</exception>
    /// <exception cref="NotSupportedException">No exporter is registered for <paramref name="format"/>.</exception>
    public string ExportToString(IEnumerable<TocItem> tocItems, string format, ExportOptions? options = null)
    {
        var exporter = GetExporter(format);
        return exporter.Export(tocItems, options);
    }

    /// <summary>
    /// Writes the items to a file.
    /// </summary>
    /// <param name="tocItems">Items to export.</param>
    /// <param name="outputPath">Destination file path.</param>
    /// <param name="format">Format name; when null it is taken from the extension of <paramref name="outputPath"/>.</param>
    /// <param name="options">Export options, passed to the exporter as given.</param>
    /// <exception cref="ArgumentException">No format was given and the path has no extension.</exception>
    /// <exception cref="NotSupportedException">No exporter is registered for the format.</exception>
    public async Task ExportToFileAsync(IEnumerable<TocItem> tocItems, string outputPath, string? format = null, ExportOptions? options = null)
    {
        format ??= Path.GetExtension(outputPath).TrimStart('.');
        var exporter = GetExporter(format);
        await exporter.ExportToFileAsync(tocItems, outputPath, options);
    }

    /// <summary>
    /// Extracts the table of contents through semantic analysis, delegating to
    /// <see cref="SemanticTocExtractor"/>.
    /// </summary>
    /// <param name="pdfPath">Path of the PDF file.</param>
    /// <param name="options">Analysis options, passed to the semantic extractor as given.</param>
    /// <returns>The items produced by the semantic extractor.</returns>
    public List<TocItem> ExtractTocSemantic(string pdfPath, SemanticAnalysisOptions? options = null)
    {
        var extractor = new SemanticTocExtractor(options);
        return extractor.ExtractToc(pdfPath);
    }

    /// <summary>
    /// Runs <see cref="ExtractTocSemantic"/> on a thread-pool thread.
    /// </summary>
    /// <param name="pdfPath">Path of the PDF file.</param>
    /// <param name="options">Analysis options.</param>
    /// <returns>A task producing the extracted items.</returns>
    public Task<List<TocItem>> ExtractTocSemanticAsync(string pdfPath, SemanticAnalysisOptions? options = null)
    {
        return Task.Run(() => ExtractTocSemantic(pdfPath, options));
    }

    /// <summary>
    /// Tries the bookmarks first and falls back to semantic analysis when the PDF has none.
    /// Any other failure of the bookmark path is not caught here.
    /// </summary>
    /// <param name="pdfPath">Path of the PDF file.</param>
    /// <param name="semanticOptions">Options for the fallback analysis.</param>
    /// <returns>The extracted items.</returns>
    /// <exception cref="FileNotFoundException">The file does not exist.</exception>
    /// <exception cref="InvalidOperationException">Reading the PDF failed for a reason other than missing bookmarks.</exception>
    public List<TocItem> ExtractTocSmart(string pdfPath, SemanticAnalysisOptions? semanticOptions = null)
    {
        try
        {
            return ExtractToc(pdfPath);
        }
        catch (InvalidOperationException ex) when (ex.Message.Contains(NoBookmarksMessage))
        {
            // Only the "no bookmarks" case triggers the fallback.
            return ExtractTocSemantic(pdfPath, semanticOptions);
        }
    }

    /// <summary>
    /// Runs <see cref="ExtractTocSmart"/> on a thread-pool thread.
    /// </summary>
    /// <param name="pdfPath">Path of the PDF file.</param>
    /// <param name="semanticOptions">Options for the fallback analysis.</param>
    /// <returns>A task producing the extracted items.</returns>
    public Task<List<TocItem>> ExtractTocSmartAsync(string pdfPath, SemanticAnalysisOptions? semanticOptions = null)
    {
        return Task.Run(() => ExtractTocSmart(pdfPath, semanticOptions));
    }

    /// <summary>
    /// Extracts the bookmarks and writes them straight to a file.
    /// </summary>
    /// <param name="pdfPath">Path of the PDF file.</param>
    /// <param name="outputPath">Destination file path.</param>
    /// <param name="format">Format name; when null it is taken from the output extension.</param>
    /// <param name="options">Export options.</param>
    public async Task ExtractAndExportAsync(string pdfPath, string outputPath, string? format = null, ExportOptions? options = null)
    {
        var tocItems = await ExtractTocAsync(pdfPath);
        await ExportToFileAsync(tocItems, outputPath, format, options);
    }

    /// <summary>
    /// Runs the smart extraction and writes the result straight to a file.
    /// </summary>
    /// <param name="pdfPath">Path of the PDF file.</param>
    /// <param name="outputPath">Destination file path.</param>
    /// <param name="format">Format name; when null it is taken from the output extension.</param>
    /// <param name="exportOptions">Export options.</param>
    /// <param name="semanticOptions">Options for the fallback analysis.</param>
    public async Task ExtractSmartAndExportAsync(string pdfPath, string outputPath, string? format = null,
        ExportOptions? exportOptions = null, SemanticAnalysisOptions? semanticOptions = null)
    {
        var tocItems = await ExtractTocSmartAsync(pdfPath, semanticOptions);
        await ExportToFileAsync(tocItems, outputPath, format, exportOptions);
    }

    /// <summary>
    /// Lists the names of all registered export formats, aliases included.
    /// </summary>
    public IEnumerable<string> GetSupportedFormats()
    {
        return _exporters.Keys.Distinct(StringComparer.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Registers an exporter under a format name, replacing any exporter already
    /// registered under that name. The name is stored in lower case.
    /// </summary>
    /// <param name="format">Format name.</param>
    /// <param name="exporter">Exporter instance.</param>
    /// <exception cref="ArgumentException"><paramref name="format"/> is null or empty.</exception>
    /// <exception cref="ArgumentNullException"><paramref name="exporter"/> is null.</exception>
    public void RegisterExporter(string format, IExporter exporter)
    {
        if (string.IsNullOrEmpty(format))
            throw new ArgumentException("Format cannot be null or empty.", nameof(format));

        if (exporter == null)
            throw new ArgumentNullException(nameof(exporter));

        _exporters[format.ToLowerInvariant()] = exporter;
    }

    /// <summary>
    /// Looks up the exporter for a format name.
    /// </summary>
    private IExporter GetExporter(string format)
    {
        if (string.IsNullOrEmpty(format))
            throw new ArgumentException("导出格式不能为空", nameof(format));

        if (!_exporters.TryGetValue(format, out var exporter))
            throw new NotSupportedException($"不支持的导出格式: {format}。支持的格式: {string.Join(", ", GetSupportedFormats())}");

        return exporter;
    }

    /// <summary>
    /// Converts iText outlines into <see cref="TocItem"/> objects, recursing into child
    /// outlines and recording each item's parent and nesting level.
    /// </summary>
    private List<TocItem> ConvertBookmarksToTocItems(IList<PdfOutline> bookmarks, PdfDocument pdfDoc, TocItem? parent = null, int level = 0)
    {
        var tocItems = new List<TocItem>();

        foreach (var bookmark in bookmarks)
        {
            var tocItem = new TocItem
            {
                Title = GetBookmarkTitle(bookmark),
                Page = GetBookmarkPage(bookmark, pdfDoc),
                Level = level,
                Parent = parent
            };

            // Children are converted one level deeper, with this item as their parent.
            var children = bookmark.GetAllChildren();
            if (children != null && children.Count > 0)
            {
                tocItem.Children = ConvertBookmarksToTocItems(children, pdfDoc, tocItem, level + 1);
            }

            tocItems.Add(tocItem);
        }

        return tocItems;
    }

    /// <summary>
    /// Returns the outline title, or a placeholder when the title is null or empty.
    /// </summary>
    private static string GetBookmarkTitle(PdfOutline bookmark)
    {
        var title = bookmark.GetTitle();
        return string.IsNullOrEmpty(title) ? "无标题" : title;
    }

    /// <summary>
    /// Returns the 1-based page number an outline points to, as text, or a placeholder
    /// when it cannot be determined. Only destinations given as an array whose first
    /// entry is the target page are resolved.
    /// </summary>
    private static string GetBookmarkPage(PdfOutline bookmark, PdfDocument pdfDoc)
    {
        try
        {
            if (bookmark.GetDestination()?.GetPdfObject() is PdfArray target && target.Size() > 0)
            {
                // PdfArray.Get(int) hands back the referenced object itself (per the iText
                // API), so the entry is matched as a dictionary; an IsIndirectReference()
                // test on that result would not succeed. The indirect-reference guard keeps
                // a direct (non-page) dictionary away from the page lookup.
                if (target.Get(0) is PdfDictionary pageDict && pageDict.GetIndirectReference() != null)
                {
                    // Single lookup instead of loading and comparing every page per bookmark.
                    var pageNumber = pdfDoc.GetPageNumber(pageDict);
                    if (pageNumber > 0)
                        return pageNumber.ToString();
                }
            }
        }
        catch
        {
            // Deliberate best effort: a destination that cannot be resolved must not abort
            // the whole extraction, so the placeholder below is returned instead.
        }

        return "无页码";
    }
}

--------------------------------------------------------------------------------
/tests/PdfTocExtractor.Tests/PdfTocExtractorTests.cs:
--------------------------------------------------------------------------------
using FluentAssertions;
using Moq;
using PdfTocExtractor.Exporters;
using PdfTocExtractor.Models;
using PdfTocExtractor.Tests.TestData;
using Xunit;

namespace PdfTocExtractor.Tests;

/// <summary>
/// Tests for the exporter registry, argument validation and export paths of the extractor.
/// Temp files created through <see cref="CreateTrackedTempFile"/> are removed in <see cref="Dispose"/>.
/// </summary>
public class PdfTocExtractorTests : IDisposable
{
    private readonly PdfTocExtractor _extractor;
    private readonly List<string> _tempFiles;

    public PdfTocExtractorTests()
    {
        _extractor = new PdfTocExtractor();
        _tempFiles = new List<string>();
    }

    /// <summary>Removes every temp file registered during the test.</summary>
    public void Dispose()
    {
        foreach (var file in _tempFiles)
        {
            MockHelpers.CleanupTempFile(file);
        }
    }

    /// <summary>
    /// Creates a temp file and registers it for cleanup. When <paramref name="extension"/> is given,
    /// the returned path carries that extension and BOTH paths are tracked: Path.GetTempFileName
    /// creates the ".tmp" file on disk, which would otherwise be left behind.
    /// </summary>
    private string CreateTrackedTempFile(string? extension = null)
    {
        var created = Path.GetTempFileName();
        _tempFiles.Add(created);

        if (extension is null)
            return created;

        var renamed = Path.ChangeExtension(created, extension);
        _tempFiles.Add(renamed);
        return renamed;
    }

    [Fact]
    public void Constructor_ShouldInitializeWithDefaultExporters()
    {
        // Arrange & Act
        var extractor = new PdfTocExtractor();

        // Assert
        var supportedFormats = extractor.GetSupportedFormats().ToList();
        supportedFormats.Should().Contain("markdown");
        supportedFormats.Should().Contain("md");
        supportedFormats.Should().Contain("json");
        supportedFormats.Should().Contain("xml");
        supportedFormats.Should().Contain("text");
        supportedFormats.Should().Contain("txt");
    }

    [Fact]
    public void GetSupportedFormats_ShouldReturnAllRegisteredFormats()
    {
        // Act
        var formats = _extractor.GetSupportedFormats().ToList();

        // Assert
        formats.Should().HaveCountGreaterThan(0);
        formats.Should().OnlyHaveUniqueItems();
    }

    [Fact]
    public void RegisterExporter_ShouldAddNewExporter()
    {
        // Arrange
        var mockExporter = MockHelpers.CreateMockExporter("custom", "cst");

        // Act
        _extractor.RegisterExporter("custom", mockExporter.Object);

        // Assert
        _extractor.GetSupportedFormats().Should().Contain("custom");
    }

    [Fact]
    public void RegisterExporter_ShouldReplaceExistingExporter()
    {
        // Arrange
        var mockExporter = MockHelpers.CreateMockExporter("markdown", "md");

        // Act
        _extractor.RegisterExporter("markdown", mockExporter.Object);

        // Assert: re-registering must succeed without throwing and keep the format listed
        _extractor.GetSupportedFormats().Should().Contain("markdown");
    }

    [Theory]
    [InlineData("")]
    [InlineData(null)]
    public void RegisterExporter_ShouldThrowArgumentException_WhenFormatIsNullOrEmpty(string? format)
    {
        // Arrange
        var mockExporter = MockHelpers.CreateMockExporter();

        // Act & Assert
        Assert.Throws<ArgumentException>(() => _extractor.RegisterExporter(format!, mockExporter.Object));
    }

    [Fact]
    public void RegisterExporter_ShouldThrowArgumentNullException_WhenExporterIsNull()
    {
        // Act & Assert
        Assert.Throws<ArgumentNullException>(() => _extractor.RegisterExporter("test", null!));
    }

    [Fact]
    public void ExtractToc_ShouldThrowFileNotFoundException_WhenFileDoesNotExist()
    {
        // Arrange
        var nonExistentPath = MockHelpers.CreateNonExistentPdfPath();

        // Act & Assert
        Assert.Throws<FileNotFoundException>(() => _extractor.ExtractToc(nonExistentPath));
    }

    [Fact]
    public async Task ExtractTocAsync_ShouldThrowFileNotFoundException_WhenFileDoesNotExist()
    {
        // Arrange
        var nonExistentPath = MockHelpers.CreateNonExistentPdfPath();

        // Act & Assert
        await Assert.ThrowsAsync<FileNotFoundException>(() => _extractor.ExtractTocAsync(nonExistentPath));
    }

    [Fact]
    public void ExportToString_ShouldReturnCorrectFormat_WithMarkdown()
    {
        // Arrange
        var tocItems = TestDataBuilder.CreateSimpleTocItems();

        // Act
        var result = _extractor.ExportToString(tocItems, "markdown");

        // Assert
        result.Should().NotBeNullOrEmpty();
        result.Should().Contain("# PDF 目录");
        result.Should().Contain("- Chapter 1(第 1 页)");
    }

    [Fact]
    public void ExportToString_ShouldReturnCorrectFormat_WithJson()
    {
        // Arrange
        var tocItems = TestDataBuilder.CreateSimpleTocItems();

        // Act
        var result = _extractor.ExportToString(tocItems, "json");

        // Assert
        result.Should().NotBeNullOrEmpty();
        MockHelpers.IsValidJson(result).Should().BeTrue();
    }

    [Fact]
    public void ExportToString_ShouldReturnCorrectFormat_WithXml()
    {
        // Arrange
        var tocItems = TestDataBuilder.CreateSimpleTocItems();

        // Act
        var result = _extractor.ExportToString(tocItems, "xml");

        // Assert
        result.Should().NotBeNullOrEmpty();
        MockHelpers.IsValidXml(result).Should().BeTrue();
    }

    [Fact]
    public void ExportToString_ShouldReturnCorrectFormat_WithText()
    {
        // Arrange
        var tocItems = TestDataBuilder.CreateSimpleTocItems();

        // Act
        var result = _extractor.ExportToString(tocItems, "text");

        // Assert
        result.Should().NotBeNullOrEmpty();
        result.Should().Contain("PDF 目录");
        result.Should().Contain("- Chapter 1(第 1 页)");
    }

    [Fact]
    public void ExportToString_ShouldThrowArgumentException_WhenFormatNotSupported()
    {
        // Arrange
        var tocItems = TestDataBuilder.CreateSimpleTocItems();

        // Act & Assert
        // NOTE(review): the expected exception type is taken from the test name. The extractor's
        // GetExporter throws NotSupportedException for unknown formats - confirm which is intended.
        Assert.Throws<ArgumentException>(() => _extractor.ExportToString(tocItems, "unsupported"));
    }

    [Fact]
    public void ExportToString_ShouldUseOptions_WhenProvided()
    {
        // Arrange
        var tocItems = TestDataBuilder.CreateSimpleTocItems();
        var options = MockHelpers.CreateTestExportOptions(customTitle: "Custom Title");

        // Act
        var result = _extractor.ExportToString(tocItems, "markdown", options);

        // Assert
        result.Should().Contain("# Custom Title");
    }

    [Fact]
    public async Task ExportToFileAsync_ShouldCreateFile_WithCorrectContent()
    {
        // Arrange
        var tocItems = TestDataBuilder.CreateSimpleTocItems();
        var tempFile = CreateTrackedTempFile();

        // Act
        await _extractor.ExportToFileAsync(tocItems, tempFile, "markdown");

        // Assert
        File.Exists(tempFile).Should().BeTrue();
        var content = await File.ReadAllTextAsync(tempFile);
        content.Should().Contain("# PDF 目录");
    }

    [Fact]
    public async Task ExportToFileAsync_ShouldInferFormat_FromFileExtension()
    {
        // Arrange
        var tocItems = TestDataBuilder.CreateSimpleTocItems();
        var tempFile = CreateTrackedTempFile(".json");

        // Act
        await _extractor.ExportToFileAsync(tocItems, tempFile);

        // Assert
        File.Exists(tempFile).Should().BeTrue();
        var content = await File.ReadAllTextAsync(tempFile);
        MockHelpers.IsValidJson(content).Should().BeTrue();
    }

    [Fact]
    public async Task ExportToFileAsync_ShouldUseSpecifiedFormat_OverFileExtension()
    {
        // Arrange
        var tocItems = TestDataBuilder.CreateSimpleTocItems();
        var tempFile = CreateTrackedTempFile(".json");

        // Act
        await _extractor.ExportToFileAsync(tocItems, tempFile, "markdown");

        // Assert
        File.Exists(tempFile).Should().BeTrue();
        var content = await File.ReadAllTextAsync(tempFile);
        content.Should().Contain("# PDF 目录"); // Markdown format, not JSON
    }

    [Fact]
    public async Task ExportToFileAsync_ShouldThrowArgumentException_WhenFormatNotSupported()
    {
        // Arrange
        var tocItems = TestDataBuilder.CreateSimpleTocItems();
        var tempFile = CreateTrackedTempFile();

        // Act & Assert
        // NOTE(review): exception type taken from the test name - see the ExportToString variant.
        await Assert.ThrowsAsync<ArgumentException>(() =>
            _extractor.ExportToFileAsync(tocItems, tempFile, "unsupported"));
    }

    [Fact]
    public async Task ExtractAndExportAsync_ShouldThrowFileNotFoundException_WhenPdfNotExists()
    {
        // Arrange
        var nonExistentPdf = MockHelpers.CreateNonExistentPdfPath();
        var tempFile = CreateTrackedTempFile();

        // Act & Assert
        await Assert.ThrowsAsync<FileNotFoundException>(() =>
            _extractor.ExtractAndExportAsync(nonExistentPdf, tempFile));
    }

    [Theory]
    [InlineData("markdown")]
    [InlineData("md")]
    [InlineData("json")]
    [InlineData("xml")]
    [InlineData("text")]
    [InlineData("txt")]
    public void ExportToString_ShouldWorkWithAllSupportedFormats(string format)
    {
        // Arrange
        var tocItems = TestDataBuilder.CreateSimpleTocItems();

        // Act
        var result = _extractor.ExportToString(tocItems, format);

        // Assert
        result.Should().NotBeNullOrEmpty();
    }

    [Fact]
    public void ExportToString_ShouldHandleEmptyTocItems()
    {
        // Arrange
        var tocItems = TestDataBuilder.CreateEmptyTocItems();

        // Act
        var result = _extractor.ExportToString(tocItems, "markdown");

        // Assert
        result.Should().NotBeNullOrEmpty();
        result.Should().Contain("# PDF 目录");
    }

    [Fact]
    public void ExportToString_ShouldHandleHierarchicalTocItems()
    {
        // Arrange
        var tocItems = TestDataBuilder.CreateHierarchicalTocItems();

        // Act
        var result = _extractor.ExportToString(tocItems, "markdown");

        // Assert
        result.Should().NotBeNullOrEmpty();
        result.Should().Contain("Chapter 1: Introduction");
        result.Should().Contain("1.1 Overview");
        result.Should().Contain("1.2.1 Primary Goals");
    }

    [Fact]
    public void ExportToString_ShouldHandleSpecialCharacters()
{
        // Arrange
        var tocItems = TestDataBuilder.CreateSpecialCharacterTocItems();

        // Act
        var result = _extractor.ExportToString(tocItems, "markdown");

        // Assert
        result.Should().NotBeNullOrEmpty();
        result.Should().Contain("Chapter <1> & \"Introduction\"");
    }
}
--------------------------------------------------------------------------------
/src/PdfTocExtractor/Semantic/SemanticHeadingAnalyzer.cs:
--------------------------------------------------------------------------------
using System.Text.RegularExpressions;

namespace PdfTocExtractor.Semantic;

/// <summary>
/// Recognises headings among extracted text fragments using lexical cues (numbering, keywords,
/// action verbs), typography (font size, bold) and layout (standalone line, vertical spacing).
/// </summary>
public class SemanticHeadingAnalyzer
{
    private readonly SemanticAnalysisOptions _options;

    // Action verbs: a fragment containing any of them is treated as an instruction, not a heading.
    // NOTE(review): "设置" and "配置" also appear in _headingKeywords; the exclusion wins.
    private readonly HashSet<string> _actionWords = new()
    {
        "点击", "选择", "输入", "打开", "关闭", "设置", "配置", "进入", "退出",
        "保存", "删除", "修改", "查看", "操作", "执行", "运行", "启动", "停止",
        "添加", "移除", "编辑", "更新", "刷新", "重置", "清除", "导入", "导出",
        "上传", "下载", "发送", "接收", "连接", "断开", "登录", "登出", "注册",
        "提交", "取消", "确认", "拒绝", "同意", "申请", "审批", "通过", "驳回"
    };

    // Words that commonly occur in headings.
    private readonly HashSet<string> _headingKeywords = new()
    {
        "管理", "系统", "功能", "概述", "介绍", "说明", "中心", "平台", "工具",
        "环境", "配置", "设置", "模块", "组件", "服务", "接口", "协议", "标准",
        "规范", "流程", "方案", "策略", "政策", "制度", "规则", "原则", "方法",
        "技术", "架构", "框架", "结构", "设计", "开发", "部署", "运维", "监控",
        "安全", "权限", "认证", "授权", "加密", "解密", "备份", "恢复", "容灾",
        "性能", "优化", "调优", "测试", "验证", "评估", "分析", "统计", "报告"
    };

    // Chapter indicator words (not referenced by any method of this class at present).
    private readonly HashSet<string> _chapterIndicators = new()
    {
        "章", "节", "部分", "篇", "卷", "册", "编", "辑", "集", "段", "条", "款", "项"
    };

    // The patterns below are built once. Matching them by pattern string on every call would
    // re-parse them whenever the static Regex cache (15 entries by default) overflows.
    private static readonly Regex[] ChapterNumberingPatterns = BuildPatterns(RegexOptions.None,
        @"^第[一二三四五六七八九十\d]+[章节部分篇]",   // 第一章, 第二节
        @"^\d+(\.\d+)*[\.、]",                        // 1.2.3. or 1.2.3、
        @"^[一二三四五六七八九十]+[、..]",             // 一、 二、
        @"^\([一二三四五六七八九十\d]+\)",             // (1) (一)
        @"^[A-Z]\.");                                 // A. B.

    private static readonly Regex[] NonHeadingPatterns = BuildPatterns(RegexOptions.IgnoreCase,
        @"^\d+$",                          // digits only
        @"^第\s*\d+\s*页$",                // page number
        @"^Page\s+\d+$",                   // English page number
        @"[\.]{5,}",                       // dot leaders (table-of-contents page)
        @"\d+\.\d+\.\d+\.\d+",             // IP address
        @":\d+",                           // port number
        @"^www\.|^http|@",                 // URL / e-mail
        @"^\d{4}[-/]\d{1,2}[-/]\d{1,2}",   // date
        @"^[\d\s\-\+\(\)]{10,}$");         // long digit string

    // "1.2.3." / "1.2.3、": the separator must not be followed by a digit, otherwise
    // "1.2 Title" would backtrack to "1." and be reported as level 1.
    private static readonly Regex TerminatedNumberPattern =
        new(@"^(\d+(?:\.\d+)*)[\.、](?!\d)", RegexOptions.Compiled);

    // "1.2 Title": multi-part number without a trailing separator.
    private static readonly Regex DottedNumberPattern =
        new(@"^(\d+(?:\.\d+)+)", RegexOptions.Compiled);

    private static readonly Regex ChapterLevelPattern =
        new(@"^第\s*[一二三四五六七八九十\d]+\s*章", RegexOptions.Compiled);

    private static readonly Regex SectionLevelPattern =
        new(@"^第\s*[一二三四五六七八九十\d]+\s*节", RegexOptions.Compiled);

    /// <summary>
    /// Creates an analyzer; falls back to <see cref="SemanticAnalysisOptions.Default"/> when no options are given.
    /// </summary>
    public SemanticHeadingAnalyzer(SemanticAnalysisOptions? options = null)
    {
        _options = options ?? SemanticAnalysisOptions.Default;
    }

    /// <summary>
    /// Analyses every fragment, stores the outcome in its <c>SemanticResult</c>, and returns the
    /// fragments judged to be headings, ordered by page and then by Y, with estimated levels set.
    /// </summary>
    /// <param name="fragments">Fragments to analyse; each element's <c>SemanticResult</c> is overwritten.</param>
    /// <returns>The heading fragments; an empty list when the input is empty.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="fragments"/> is null.</exception>
    public List<TextFragment> AnalyzeHeadings(List<TextFragment> fragments)
    {
        if (fragments is null)
            throw new ArgumentNullException(nameof(fragments));

        if (fragments.Count == 0)
            return new List<TextFragment>();

        // 1. Document-wide statistics (average font size etc.)
        var stats = CalculateStatistics(fragments);

        // 2. Per-fragment analysis
        foreach (var fragment in fragments)
        {
            fragment.SemanticResult = AnalyzeFragment(fragment, stats);
        }

        // 3. Keep the likely headings in reading order
        var potentialHeadings = fragments
            .Where(f => f.SemanticResult?.IsLikelyHeading == true)
            .OrderBy(f => f.PageNumber)
            .ThenBy(f => f.Y)
            .ToList();

        // 4. Assign levels
        DetermineHeadingLevels(potentialHeadings);

        if (_options.DebugMode)
        {
            Console.WriteLine($"语义分析完成: 从 {fragments.Count} 个片段中识别出 {potentialHeadings.Count} 个标题");
            foreach (var heading in potentialHeadings.Take(10))
            {
                Console.WriteLine($" [{heading.SemanticResult?.EstimatedLevel}] {heading.Text} (置信度: {heading.SemanticResult?.HeadingConfidence:F2})");
            }
        }

        return potentialHeadings;
    }

    /// <summary>
    /// Evaluates one fragment and records the reasons for and against it being a heading.
    /// Any exclusion reason vetoes the fragment regardless of the positive reasons.
    /// </summary>
    private SemanticAnalysisResult AnalyzeFragment(TextFragment fragment, TextStatistics stats)
    {
        var result = new SemanticAnalysisResult();
        var reasons = new List<string>();
        var exclusions = new List<string>();

        // Null text is treated like blank text, matching the filter in CalculateStatistics.
        var text = (fragment.Text ?? string.Empty).Trim();
        if (text.Length == 0)
        {
            exclusions.Add("空文本");
            result.ExclusionReasons = exclusions;
            return result;
        }

        // 1. Length
        if (text.Length < _options.MinHeadingLength)
        {
            exclusions.Add($"文本太短 ({text.Length} < {_options.MinHeadingLength})");
        }
        else if (text.Length > _options.MaxHeadingLength)
        {
            exclusions.Add($"文本太长 ({text.Length} > {_options.MaxHeadingLength})");
        }
        else
        {
            reasons.Add("长度适中");
        }

        // 2. Action verbs
        if (ContainsActionWords(text))
        {
            exclusions.Add("包含操作动词");
        }
        else
        {
            reasons.Add("不包含操作动词");
        }

        // 3. Heading keywords
        if (ContainsHeadingKeywords(text))
        {
            reasons.Add("包含标题关键词");
        }

        // 4. Chapter numbering
        if (HasChapterNumbering(text))
        {
            reasons.Add("包含章节编号");
        }

        // 5. Typography
        var fontSizeThreshold = stats.AverageFontSize * _options.FontSizeMultiplier;
        if (fragment.FontSize > fontSizeThreshold)
        {
            reasons.Add($"字体较大 ({fragment.FontSize:F1} > {fontSizeThreshold:F1})");
        }

        if (fragment.IsBold && _options.ConsiderBoldAsHeading)
        {
            reasons.Add("粗体文本");
        }

        // 6. Layout
        if (fragment.IsStandalone)
        {
            reasons.Add("独立成行");
        }

        if (fragment.VerticalSpaceBefore > _options.MinVerticalSpacing ||
            fragment.VerticalSpaceAfter > _options.MinVerticalSpacing)
        {
            reasons.Add("前后有足够间距");
        }

        // 7. Obvious non-headings (page numbers, URLs, dates, ...)
        if (IsObviouslyNotHeading(text))
        {
            exclusions.Add("明显的非标题内容");
        }

        // 8. Confidence
        float confidence = CalculateConfidence(reasons, exclusions, fragment, stats);

        result.IsLikelyHeading = confidence >= _options.MinConfidenceThreshold && !exclusions.Any();
        result.HeadingConfidence = confidence;
        result.Reasons = reasons;
        result.ExclusionReasons = exclusions;

        return result;
    }

    /// <summary>Returns true when the text contains any action verb.</summary>
    private bool ContainsActionWords(string text)
    {
        return _actionWords.Any(word => text.Contains(word));
    }

    /// <summary>Returns true when the text contains any heading keyword.</summary>
    private bool ContainsHeadingKeywords(string text)
    {
        return _headingKeywords.Any(word => text.Contains(word));
    }

    /// <summary>Returns true when the text starts with a recognised chapter or section numbering.</summary>
    private bool HasChapterNumbering(string text)
    {
        return ChapterNumberingPatterns.Any(pattern => pattern.IsMatch(text));
    }

    /// <summary>Returns true when the text matches a pattern that is never a heading.</summary>
    private bool IsObviouslyNotHeading(string text)
    {
        return NonHeadingPatterns.Any(pattern => pattern.IsMatch(text));
    }

    /// <summary>
    /// Sums a base score and one weight per positive reason, capped at 1.0.
    /// Any exclusion yields 0.
    /// </summary>
    private float CalculateConfidence(List<string> reasons, List<string> exclusions,
        TextFragment fragment, TextStatistics stats)
    {
        if (exclusions.Any())
            return 0f;

        // Base score
        float confidence = 0.1f;

        // Weights are looked up by the marker text inside each reason string.
        foreach (var reason in reasons)
        {
            confidence += reason switch
            {
                var r when r.Contains("章节编号") => 0.4f,
                var r when r.Contains("标题关键词") => 0.3f,
                var r when r.Contains("字体较大") => 0.2f,
                var r when r.Contains("粗体文本") => 0.15f,
                var r when r.Contains("独立成行") => 0.1f,
                var r when r.Contains("足够间距") => 0.1f,
                var r when r.Contains("不包含操作动词") => 0.05f,
                _ => 0.02f
            };
        }

        return Math.Min(1.0f, confidence);
    }

    /// <summary>
    /// Sets <c>EstimatedLevel</c> on each heading: from its numbering when present, otherwise from
    /// the rank of its font size among all headings (largest = 1), capped at <c>MaxHeadingLevels</c>.
    /// </summary>
    private void DetermineHeadingLevels(List<TextFragment> headings)
    {
        if (headings.Count == 0)
            return;

        // The ranking is the same for every heading, so compute it once.
        var fontSizesDescending = headings
            .Select(h => h.FontSize)
            .Distinct()
            .OrderByDescending(size => size)
            .ToList();

        foreach (var heading in headings)
        {
            var semantic = heading.SemanticResult!;

            // Trim first: AnalyzeFragment detected the numbering on the trimmed text,
            // and the patterns are anchored at the start.
            var numberedLevel = GetLevelFromNumbering(heading.Text.Trim());
            if (numberedLevel > 0)
            {
                semantic.EstimatedLevel = numberedLevel;
                continue;
            }

            var fontSizeRank = fontSizesDescending.IndexOf(heading.FontSize) + 1;
            semantic.EstimatedLevel = Math.Min(fontSizeRank, _options.MaxHeadingLevels);
        }
    }

    /// <summary>
    /// Derives the level from a leading number: "1." / "1、" gives 1, "1.2" / "1.2." gives 2,
    /// "第X章" gives 1, "第X节" gives 2. Returns 0 when no numbering is recognised.
    /// </summary>
    private int GetLevelFromNumbering(string text)
    {
        var match = TerminatedNumberPattern.Match(text);
        if (!match.Success)
            match = DottedNumberPattern.Match(text);

        if (match.Success)
            return match.Groups[1].Value.Split('.').Length;

        if (ChapterLevelPattern.IsMatch(text))
            return 1;

        if (SectionLevelPattern.IsMatch(text))
            return 2;

        return 0;
    }

    /// <summary>
    /// Aggregates font and length statistics over the fragments with non-blank text.
    /// Returns a zeroed instance when there are none.
    /// </summary>
    private TextStatistics CalculateStatistics(List<TextFragment> fragments)
    {
        var validFragments = fragments.Where(f => !string.IsNullOrWhiteSpace(f.Text)).ToList();

        if (validFragments.Count == 0)
            return new TextStatistics();

        return new TextStatistics
        {
            TotalFragments = validFragments.Count,
            AverageFontSize = validFragments.Average(f => f.FontSize),
            MaxFontSize = validFragments.Max(f => f.FontSize),
            MinFontSize = validFragments.Min(f => f.FontSize),
            BoldTextCount = validFragments.Count(f => f.IsBold),
            AverageTextLength = validFragments.Average(f => f.Text.Length)
        };
    }

    /// <summary>Compiles each pattern once with the given options.</summary>
    private static Regex[] BuildPatterns(RegexOptions options, params string[] patterns)
    {
        return patterns.Select(p => new Regex(p, options | RegexOptions.Compiled)).ToArray();
    }

    /// <summary>Aggregate figures over the non-blank fragments of one analysis run.</summary>
    private class TextStatistics
    {
        public int TotalFragments { get; set; }
        public float AverageFontSize { get; set; }
        public float MaxFontSize { get; set; }
        public float MinFontSize { get; set; }
        public int BoldTextCount { get; set; }
        public double AverageTextLength { get; set; }
    }
}
--------------------------------------------------------------------------------
/tests/PdfTocExtractor.Tests/Exporters/TextExporterTests.cs:
--------------------------------------------------------------------------------
using FluentAssertions;
using PdfTocExtractor.Exporters;
using PdfTocExtractor.Models;
using System.Text;
using Xunit;

namespace PdfTocExtractor.Tests.Exporters;

/// <summary>
/// Tests for <see cref="TextExporter"/>: metadata, title block, hierarchy rendering,
/// export options and file output.
/// </summary>
public class TextExporterTests
{
    private readonly TextExporter _exporter;

    public TextExporterTests()
    {
        _exporter = new TextExporter();
    }

    /// <summary>Builds a TOC item; <c>Children</c> is only assigned when children are supplied.</summary>
    private static TocItem Item(string title, string page, int level, params TocItem[] children)
    {
        var item = new TocItem { Title = title, Page = page, Level = level };
        if (children.Length > 0)
            item.Children = children.ToList();
        return item;
    }

    /// <summary>The fixture most tests use: one top-level "Chapter 1" on page 5.</summary>
    private static List<TocItem> SingleChapter() => new() { Item("Chapter 1", "5", 0) };

    /// <summary>Runs <paramref name="body"/> against a fresh temp file and always deletes it afterwards.</summary>
    private static async Task WithTempFileAsync(Func<string, Task> body)
    {
        var tempFile = Path.GetTempFileName();
        try
        {
            await body(tempFile);
        }
        finally
        {
            if (File.Exists(tempFile))
                File.Delete(tempFile);
        }
    }

    [Fact]
    public void FormatName_ShouldReturnText()
    {
        // Act & Assert
        _exporter.FormatName.Should().Be("Text");
    }

    [Fact]
    public void FileExtension_ShouldReturnTxt()
    {
        // Act & Assert
        _exporter.FileExtension.Should().Be("txt");
    }

    [Fact]
    public void Export_ShouldReturnDocumentWithTitle_WhenNoTocItems()
    {
        // Arrange
        var tocItems = new List<TocItem>();
        var options = new ExportOptions { CustomTitle = "Test Document" };

        // Act
        var result = _exporter.Export(tocItems, options);

        // Assert
        result.Should().Contain("Test Document");
        result.Should().Contain("============="); // Title underline
    }

    [Fact]
    public void Export_ShouldUseDefaultTitle_WhenCustomTitleIsNull()
    {
        // Arrange
        var tocItems = new List<TocItem>();
        var options = new ExportOptions { CustomTitle = null };

        // Act
        var result = _exporter.Export(tocItems, options);

        // Assert
        result.Should().Contain("PDF 目录");
        result.Should().Contain("======"); // Title underline
    }

    [Fact]
    public void Export_ShouldCreateCorrectTextStructure_WithSingleItem()
    {
        // Arrange
        var tocItems = SingleChapter();

        // Act
        var result = _exporter.Export(tocItems);

        // Assert
        result.Should().Contain("PDF 目录");
        result.Should().Contain("- Chapter 1(第 5 页)");
    }

    [Fact]
    public void Export_ShouldCreateCorrectHierarchy_WithNestedItems()
    {
        // Arrange
        var tocItems = new List<TocItem>
        {
            Item("Chapter 1", "5", 0,
                Item("Section 1.1", "6", 1),
                Item("Section 1.2", "10", 1))
        };

        // Act
        var result = _exporter.Export(tocItems);

        // Assert
        result.Should().Contain("- Chapter 1(第 5 页)");
        result.Should().Contain(" - Section 1.1(第 6 页)");
        result.Should().Contain(" - Section 1.2(第 10 页)");
    }

    [Fact]
    public void Export_ShouldRespectMaxDepth_WhenSpecified()
    {
        // Arrange
        var tocItems = new List<TocItem>
        {
            Item("Chapter 1", "5", 0,
                Item("Section 1.1", "6", 1,
                    Item("Subsection 1.1.1", "7", 2)))
        };
        var options = new ExportOptions { MaxDepth = 1 };

        // Act
        var result = _exporter.Export(tocItems, options);

        // Assert
        result.Should().Contain("- Chapter 1(第 5 页)");
        result.Should().Contain(" - Section 1.1(第 6 页)");
        result.Should().NotContain("Subsection 1.1.1");
    }

    [Fact]
    public void Export_ShouldExcludePageNumbers_WhenIncludePageNumbersIsFalse()
    {
        // Arrange
        var tocItems = SingleChapter();
        var options = new ExportOptions { IncludePageNumbers = false };

        // Act
        var result = _exporter.Export(tocItems, options);

        // Assert
        result.Should().Contain("- Chapter 1");
        result.Should().NotContain("第 5 页");
        result.Should().NotContain("(");
        result.Should().NotContain(")");
    }

    [Fact]
    public void Export_ShouldUseCustomIndentation_WhenSpecified()
    {
        // Arrange
        var tocItems = new List<TocItem>
        {
            Item("Chapter 1", "5", 0,
                Item("Section 1.1", "6", 1))
        };
        var options = new ExportOptions { IndentString = "\t" };

        // Act
        var result = _exporter.Export(tocItems, options);

        // Assert
        result.Should().Contain("- Chapter 1(第 5 页)");
        result.Should().Contain("\t- Section 1.1(第 6 页)");
    }

    [Fact]
    public void Export_ShouldUseCustomPageNumberFormat_WhenSpecified()
    {
        // Arrange
        var tocItems = SingleChapter();
        var options = new ExportOptions { PageNumberFormat = "Page {0}" };

        // Act
        var result = _exporter.Export(tocItems, options);

        // Assert
        result.Should().Contain("- Chapter 1 (Page 5)");
    }

    [Fact]
    public void Export_ShouldHandleEmptyPageNumbers()
    {
        // Arrange
        var tocItems = new List<TocItem>
        {
            Item("Chapter 1", "", 0),
            Item("Chapter 2", "无页码", 0),
            Item("Chapter 3", "N/A", 0)
        };

        // Act
        var result = _exporter.Export(tocItems);

        // Assert
        result.Should().Contain("- Chapter 1");
        result.Should().Contain("- Chapter 2");
        result.Should().Contain("- Chapter 3");
        result.Should().NotContain("第 页");
        result.Should().NotContain("第 无页码 页");
        result.Should().NotContain("第 N/A 页");
    }

    [Fact]
    public void Export_ShouldCreateTitleUnderline_WithCorrectLength()
    {
        // Arrange
        var tocItems = new List<TocItem>();
        var options = new ExportOptions { CustomTitle = "Test" };

        // Act
        var result = _exporter.Export(tocItems, options);

        // Assert
        // NOTE(review): Contain("====") also passes for a longer underline, so the exact
        // length is not pinned here - consider asserting the full underline line.
        result.Should().Contain("Test");
        result.Should().Contain("===="); // 4 equals for "Test"
    }

    [Fact]
    public void Export_ShouldHandleLongTitles()
    {
        // Arrange
        var tocItems = new List<TocItem>();
        var longTitle = "This is a very long title for testing purposes";
        var options = new ExportOptions { CustomTitle = longTitle };

        // Act
        var result = _exporter.Export(tocItems, options);

        // Assert
        result.Should().Contain(longTitle);
        var expectedUnderline = new string('=', longTitle.Length);
        result.Should().Contain(expectedUnderline);
    }

    [Fact]
    public async Task ExportToFileAsync_ShouldCreateFileWithCorrectContent()
    {
        // Arrange
        var tocItems = SingleChapter();
        var options = new ExportOptions { CustomTitle = "Test Export" };

        await WithTempFileAsync(async tempFile =>
        {
            // Act
            await _exporter.ExportToFileAsync(tocItems, tempFile, options);

            // Assert
            File.Exists(tempFile).Should().BeTrue();
            var content = await File.ReadAllTextAsync(tempFile);
            content.Should().Contain("Test Export");
            content.Should().Contain("- Chapter 1(第 5 页)");
        });
    }

    [Fact]
    public async Task ExportToFileAsync_ShouldUseSpecifiedEncoding()
    {
        // Arrange
        var tocItems = new List<TocItem> { Item("测试章节", "5", 0) };
        var options = new ExportOptions { Encoding = Encoding.Unicode };

        await WithTempFileAsync(async tempFile =>
        {
            // Act
            await _exporter.ExportToFileAsync(tocItems, tempFile, options);

            // Assert
            File.Exists(tempFile).Should().BeTrue();
            var content = await File.ReadAllTextAsync(tempFile, Encoding.Unicode);
            content.Should().Contain("测试章节");
        });
    }

    [Fact]
    public void Export_ShouldHandleNullOptions()
    {
        // Arrange
        var tocItems = SingleChapter();

        // Act
        var result = _exporter.Export(tocItems, null);

        // Assert
        result.Should().NotBeNullOrEmpty();
        result.Should().Contain("PDF 目录");
        result.Should().Contain("- Chapter 1(第 5 页)");
    }

    [Fact]
    public void Export_ShouldHandleMultipleTopLevelItems()
    {
        // Arrange
        var tocItems = new List<TocItem>
        {
            Item("Chapter 1", "5", 0),
            Item("Chapter 2", "15", 0),
            Item("Chapter 3", "25", 0)
        };

        // Act
        var result = _exporter.Export(tocItems);

        // Assert
        result.Should().Contain("- Chapter 1(第 5 页)");
        result.Should().Contain("- Chapter 2(第 15 页)");
        result.Should().Contain("- Chapter 3(第 25 页)");
    }
}
--------------------------------------------------------------------------------