├── .idea
└── .idea.PdfTocExtractor
│ └── .idea
│ ├── .name
│ ├── encodings.xml
│ ├── vcs.xml
│ ├── indexLayout.xml
│ └── .gitignore
├── tests
└── PdfTocExtractor.Tests
│ ├── GlobalUsings.cs
│ ├── PdfTocExtractor.Tests.csproj
│ ├── TestResults.md
│ ├── DebugTest.cs
│ ├── Exporters
│ ├── ExportOptionsTests.cs
│ ├── MarkdownExporterTests.cs
│ ├── JsonExporterTests.cs
│ └── TextExporterTests.cs
│ ├── Models
│ └── TocItemTests.cs
│ ├── TestData
│ ├── MockHelpers.cs
│ └── TestDataBuilder.cs
│ └── PdfTocExtractorTests.cs
├── src
├── PdfTocExtractor.Example
│ ├── PdfTocExtractor.Example.csproj
│ └── Program.cs
├── PdfTocExtractor.Cli
│ ├── Program.cs
│ ├── PdfTocExtractor.Cli.csproj
│ ├── README.md
│ └── Commands
│ │ ├── ExtractCommand.cs
│ │ ├── SmartCommand.cs
│ │ ├── DiagnoseCommand.cs
│ │ └── SemanticCommand.cs
└── PdfTocExtractor
│ ├── PdfTocExtractor.csproj
│ ├── Exporters
│ ├── IExporter.cs
│ ├── TextExporter.cs
│ ├── JsonExporter.cs
│ ├── MarkdownExporter.cs
│ └── XmlExporter.cs
│ ├── Models
│ └── TocItem.cs
│ ├── Semantic
│ ├── TextFragment.cs
│ ├── SemanticAnalysisOptions.cs
│ ├── SemanticTocExtractor.cs
│ ├── PdfTextExtractor.cs
│ └── SemanticHeadingAnalyzer.cs
│ ├── README.md
│ └── PdfTocExtractor.cs
├── docs
├── CHANGELOG.md
└── UPGRADE_SUMMARY.md
├── .github
└── workflows
│ ├── ci.yaml
│ ├── README.md
│ └── publish.yaml
└── PdfTocExtractor.sln
/.idea/.idea.PdfTocExtractor/.idea/.name:
--------------------------------------------------------------------------------
1 | PdfTocExtractor
--------------------------------------------------------------------------------
/.idea/.idea.PdfTocExtractor/.idea/encodings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/.idea/.idea.PdfTocExtractor/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/tests/PdfTocExtractor.Tests/GlobalUsings.cs:
--------------------------------------------------------------------------------
1 | global using Xunit;
2 | global using FluentAssertions;
3 | global using Moq;
4 | global using PdfTocExtractor.Models;
5 | global using PdfTocExtractor.Exporters;
6 | global using PdfTocExtractor.Tests.TestData;
7 |
--------------------------------------------------------------------------------
/.idea/.idea.PdfTocExtractor/.idea/indexLayout.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/.idea.PdfTocExtractor/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | # Rider ignored files
5 | /contentModel.xml
6 | /projectSettingsUpdater.xml
7 | /.idea.PdfTocExtractor.iml
8 | /modules.xml
9 | # Editor-based HTTP Client requests
10 | /httpRequests/
11 | # Datasource local storage ignored files
12 | /dataSources/
13 | /dataSources.local.xml
14 |
--------------------------------------------------------------------------------
/src/PdfTocExtractor.Example/PdfTocExtractor.Example.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Exe
5 | net8.0
6 | enable
7 | enable
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
--------------------------------------------------------------------------------
/tests/PdfTocExtractor.Tests/PdfTocExtractor.Tests.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | net8.0
5 | enable
6 | enable
7 | false
8 | true
9 |
10 |
11 |
12 |
13 |
14 |
15 | runtime; build; native; contentfiles; analyzers; buildtransitive
16 | all
17 |
18 |
19 | runtime; build; native; contentfiles; analyzers; buildtransitive
20 | all
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
--------------------------------------------------------------------------------
/src/PdfTocExtractor.Cli/Program.cs:
--------------------------------------------------------------------------------
1 | using System.CommandLine;
2 | using PdfTocExtractor.Cli.Commands;
3 |
4 | namespace PdfTocExtractor.Cli;
5 |
6 | class Program
7 | {
8 |     static async Task<int> Main(string[] args)
9 |     {
10 |         var rootCommand = new RootCommand("PDF Table of Contents Extractor - 从PDF文件提取目录并导出为多种格式")
11 |         {
12 |             ExtractCommand.Create(),
13 |             SmartCommand.Create(),
14 |             SemanticCommand.Create(),
15 |             DiagnoseCommand.Create()
16 |         };
17 |         // Handler for the bare root command (no sub-command given): print a short usage banner.
18 |         rootCommand.SetHandler(() =>
19 |         {
20 |             Console.WriteLine("PDF Table of Contents Extractor");
21 |             Console.WriteLine("使用 --help 查看可用命令");
22 |             Console.WriteLine();
23 |             Console.WriteLine("示例:");
24 |             Console.WriteLine(" pdftoc extract input.pdf -o output.md # 提取PDF书签");
25 |             Console.WriteLine(" pdftoc semantic input.pdf -o output.md # 语义分析提取");
26 |             Console.WriteLine(" pdftoc smart input.pdf -o output.md # 智能提取(推荐)");
27 |             Console.WriteLine(" pdftoc extract input.pdf -o output.json -f json");
28 |             Console.WriteLine(" pdftoc semantic input.pdf --mode strict --debug # 严格模式+调试");
29 |             Console.WriteLine(" pdftoc diagnose input.pdf # 诊断PDF文件问题");
30 |         });
31 |         // The awaited InvokeAsync result is returned to the caller, so Main must be declared Task<int>.
32 |         return await rootCommand.InvokeAsync(args);
33 |     }
34 | }
35 |
--------------------------------------------------------------------------------
/src/PdfTocExtractor/PdfTocExtractor.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | net8.0
5 | enable
6 | enable
7 | true
8 |
9 |
10 | PdfTocExtractor
11 | 2.0.0
12 | DealiAxy
13 | A powerful library for extracting table of contents (TOC) from PDF files with advanced semantic analysis capabilities. Supports both bookmark extraction and intelligent structure recognition with multiple output formats.
14 | pdf;toc;table-of-contents;extraction;bookmark;semantic-analysis;nlp;DealiAxy;itext
15 | https://github.com/star-plan/pdf-toc-extractor
16 | https://github.com/star-plan/pdf-toc-extractor
17 | MIT
18 | README.md
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
--------------------------------------------------------------------------------
/docs/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | # 更新日志
2 |
3 | 本文档记录了PdfTocExtractor项目的所有重要更改。
4 |
5 | 格式基于 [Keep a Changelog](https://keepachangelog.com/zh-CN/1.0.0/),
6 | 并且本项目遵循 [语义化版本](https://semver.org/lang/zh-CN/)。
7 |
8 | ## [2.0.0] - 2025-08-03
9 |
10 | ### 🎉 重大更新:语义分析功能
11 |
12 | #### 新增
13 | - **🧠 语义分析引擎**:全新的基于NLP的智能章节标题识别系统
14 | - **📝 新增 `semantic` 命令**:专门用于语义分析的命令行工具
15 | - **🎯 智能文本分类**:能够区分真正的标题和操作步骤、列表项
16 | - **🔍 上下文分析**:基于文本位置、字体、间距等特征进行综合判断
17 | - **⚙️ 可配置置信度**:支持调整识别精度的置信度阈值
18 | - **📊 多种分析模式**:默认、严格、宽松、调试模式可选
19 | - **🚫 智能过滤**:自动排除页码、IP地址、操作步骤等非标题内容
20 |
21 | #### 改进
22 | - **🔄 重构 `smart` 命令**:现在会自动尝试语义分析作为备选方案
23 | - **📈 大幅提升识别准确率**:从识别1000+错误标题降低到59个精确标题
24 | - **🎨 优化用户界面**:更清晰的调试输出和进度显示
25 |
26 | #### 技术架构
27 | - **🏗️ 模块化设计**:新增 `PdfTocExtractor.Semantic` 命名空间
28 | - **🔧 可扩展框架**:支持自定义语义规则和分析策略
29 | - **⚡ 高性能处理**:优化的文本提取和合并算法
30 |
31 | #### 移除
32 | - **🧹 清理过时代码**:移除了基于规则的复杂分析代码
33 |
34 | ## [未发布]
35 |
36 | ### 新增
37 | - 初始版本发布
38 | - PDF目录提取核心功能
39 | - 支持多种输出格式:Markdown、JSON、XML、纯文本
40 | - 命令行工具 `pdftoc`
41 | - NuGet包发布:核心库和CLI工具
42 | - AOT编译支持,生成原生可执行文件
43 | - 跨平台支持:Windows、Linux、macOS
44 | - GitHub Actions CI/CD流程
45 | - 可扩展的导出器架构
46 |
47 | ### 技术特性
48 | - 基于 .NET 8.0
49 | - 使用 iText 9.2.0 进行PDF处理
50 | - 支持异步操作
51 | - 完整的单元测试覆盖
52 | - 代码质量检查和格式化
53 |
54 | ## [1.0.0] - 即将发布
55 |
56 | ### 新增
57 | - 🎉 首次正式发布
58 | - 📖 完整的PDF目录提取功能
59 | - 🛠️ 命令行工具和库支持
60 | - 🚀 AOT编译和多平台支持
61 | - 📚 完整的文档和示例
62 |
63 | ---
64 |
65 | ## 版本说明
66 |
67 | - **主版本号**:当你做了不兼容的 API 修改
68 | - **次版本号**:当你做了向下兼容的功能性新增
69 | - **修订号**:当你做了向下兼容的问题修正
70 |
71 | ## 贡献指南
72 |
73 | 如果您想为此项目做出贡献,请:
74 |
75 | 1. 查看 [Issues](https://github.com/star-plan/pdf-toc-extractor/issues) 了解当前的问题和功能请求
76 | 2. 提交 Pull Request 时请更新此更新日志
77 | 3. 遵循 [语义化版本](https://semver.org/lang/zh-CN/) 规范
78 |
--------------------------------------------------------------------------------
/.github/workflows/ci.yaml:
--------------------------------------------------------------------------------
1 | name: 持续集成
2 | run-name: ${{ github.actor }} 正在运行CI检查 🔍
3 |
4 | on:
5 |   push:
6 |     branches: [ master, develop ]
7 |   pull_request:
8 |     branches: [ master, develop ]
9 |
10 | jobs:
11 |   # Basic build and test
12 |   test:
13 |     runs-on: ubuntu-latest
14 |
15 |     steps:
16 |       - uses: actions/checkout@v4
17 |
18 |       - name: Setup .NET
19 |         uses: actions/setup-dotnet@v4
20 |         with:
21 |           dotnet-version: 8.0.x
22 |       # Key on lock files and project files so the cache refreshes when package references change
23 |       - name: 缓存NuGet包
24 |         uses: actions/cache@v4
25 |         with:
26 |           path: ~/.nuget/packages
27 |           key: ${{ runner.os }}-nuget-${{ hashFiles('**/packages.lock.json', '**/*.csproj') }}
28 |           restore-keys: |
29 |             ${{ runner.os }}-nuget-
30 |
31 |       - name: 恢复依赖
32 |         run: dotnet restore
33 |
34 |       - name: 构建项目
35 |         run: dotnet build --no-restore --configuration Release
36 |
37 |       - name: 运行测试
38 |         run: dotnet test --no-build --configuration Release --verbosity normal
39 |
40 |   # Simplified AOT compilation test (Linux only)
41 |   aot-test:
42 |     runs-on: ubuntu-latest
43 |     needs: test # Run the AOT test only after the basic tests have passed
44 |
45 |     steps:
46 |       - uses: actions/checkout@v4
47 |
48 |       - name: Setup .NET
49 |         uses: actions/setup-dotnet@v4
50 |         with:
51 |           dotnet-version: 8.0.x
52 |
53 |       - name: 安装Linux依赖
54 |         run: |
55 |           sudo apt-get update
56 |           sudo apt-get install -y clang zlib1g-dev
57 |
58 |       - name: 恢复依赖
59 |         run: dotnet restore
60 |
61 |       - name: AOT编译测试
62 |         run: dotnet publish src/PdfTocExtractor.Cli/PdfTocExtractor.Cli.csproj -c Release -r linux-x64 --self-contained true -p:PublishAot=true -o ./test-publish
63 |
64 |       - name: 测试可执行文件
65 |         run: ./test-publish/pdftoc --help
66 |
--------------------------------------------------------------------------------
/src/PdfTocExtractor/Exporters/IExporter.cs:
--------------------------------------------------------------------------------
1 | using PdfTocExtractor.Models;
2 |
3 | namespace PdfTocExtractor.Exporters;
4 |
5 | /// <summary>
6 | /// Contract implemented by every table-of-contents exporter.
7 | /// </summary>
8 | public interface IExporter
9 | {
10 |     /// <summary>
11 |     /// Name of the export format.
12 |     /// </summary>
13 |     string FormatName { get; }
14 |
15 |     /// <summary>
16 |     /// File extension of the format, without the leading dot.
17 |     /// </summary>
18 |     string FileExtension { get; }
19 |
20 |     /// <summary>
21 |     /// Exports the TOC items to a string.
22 |     /// </summary>
23 |     /// <param name="tocItems">The TOC items to export.</param>
24 |     /// <param name="options">Optional export options (defaults to <c>null</c>).</param>
25 |     /// <returns>The exported content as a string.</returns>
26 |     string Export(IEnumerable<TocItem> tocItems, ExportOptions? options = null);
27 |
28 |     /// <summary>
29 |     /// Asynchronously exports the TOC items to a file.
30 |     /// </summary>
31 |     /// <param name="tocItems">The TOC items to export.</param>
32 |     /// <param name="filePath">Path of the output file.</param>
33 |     /// <param name="options">Optional export options (defaults to <c>null</c>).</param>
34 |     Task ExportToFileAsync(IEnumerable<TocItem> tocItems, string filePath, ExportOptions? options = null);
35 | }
36 |
37 | /// <summary>
38 | /// Options that control how an exporter renders the table of contents.
39 | /// </summary>
40 | public class ExportOptions
41 | {
42 |     /// <summary>
43 |     /// Indentation string (default: two spaces).
44 |     /// </summary>
45 |     public string IndentString { get; set; } = "  ";
46 |
47 |     /// <summary>
48 |     /// Whether page numbers are included in the output.
49 |     /// </summary>
50 |     public bool IncludePageNumbers { get; set; } = true;
51 |
52 |     /// <summary>
53 |     /// Whether links are included (if the format supports them).
54 |     /// </summary>
55 |     public bool IncludeLinks { get; set; } = false;
56 |
57 |     /// <summary>
58 |     /// Maximum hierarchy depth (0 means unlimited).
59 |     /// </summary>
60 |     public int MaxDepth { get; set; } = 0;
61 |
62 |     /// <summary>
63 |     /// Format string applied to the page number; <c>{0}</c> is replaced by the page.
64 |     /// </summary>
65 |     public string PageNumberFormat { get; set; } = "第 {0} 页";
66 |
67 |     /// <summary>
68 |     /// Custom title (used as the document title by some formats).
69 |     /// </summary>
70 |     public string? CustomTitle { get; set; }
71 |
72 |     /// <summary>
73 |     /// Text encoding (default: UTF-8).
74 |     /// </summary>
75 |     public System.Text.Encoding Encoding { get; set; } = System.Text.Encoding.UTF8;
76 | }
77 |
--------------------------------------------------------------------------------
/.github/workflows/README.md:
--------------------------------------------------------------------------------
1 | # GitHub Actions 工作流说明
2 |
3 | 本项目包含两个主要的GitHub Actions工作流:
4 |
5 | ## 📋 工作流概览
6 |
7 | ### 1. 持续集成 (CI) - `ci.yaml`
8 | **触发条件**:推送到 `master` 或 `develop` 分支,或创建针对这些分支的Pull Request
9 |
10 | **功能**:
11 | - ✅ 代码格式检查
12 | - 🔨 多平台构建测试
13 | - 🧪 单元测试执行
14 | - 📊 代码覆盖率收集
15 | - 🚀 AOT编译测试(仅Linux)
16 |
17 | ### 2. 发布流程 (Publish) - `publish.yaml`
18 | **触发条件**:推送版本标签(格式:`v*.*.*`,如 `v1.0.0`)
19 |
20 | **功能**:
21 | - 🧪 运行完整测试套件
22 | - 📦 发布NuGet包(核心库 + CLI工具)
23 | - 🔨 多平台AOT编译(Windows、Linux、macOS)
24 | - 📋 创建GitHub Release
25 | - 📁 上传可执行文件到Release
26 |
27 | ## 🚀 发布新版本
28 |
29 | ### 步骤1:准备发布
30 | 1. 确保所有测试通过
31 | 2. 更新版本号(在项目文件中)
32 | 3. 更新 `docs/CHANGELOG.md`
33 |
34 | ### 步骤2:创建并推送标签
35 | ```bash
36 | # 创建标签
37 | git tag v1.0.0
38 |
39 | # 推送标签到远程仓库
40 | git push origin v1.0.0
41 | ```
42 |
43 | ### 步骤3:自动化流程
44 | 推送标签后,GitHub Actions将自动:
45 | 1. 运行测试
46 | 2. 发布NuGet包
47 | 3. 编译多平台可执行文件
48 | 4. 创建GitHub Release
49 |
50 | ## 🔧 配置要求
51 |
52 | ### 必需的Secrets
53 | 在GitHub仓库设置中添加以下Secrets:
54 |
55 | - `NUGET_GALLERY_TOKEN`:NuGet.org的API密钥
56 | - 获取方式:登录 [NuGet.org](https://www.nuget.org) → Account Settings → API Keys
57 |
58 | ### 权限设置
59 | 工作流需要以下权限(已在YAML中配置):
60 | - `contents: write` - 创建Release
61 | - `id-token: write` - 身份验证
62 | - `issues: write` - 更新Issue
63 |
64 | ## 📦 发布产物
65 |
66 | ### NuGet包
67 | - `PdfTocExtractor` - 核心库
68 | - `PdfTocExtractor.Cli` - CLI工具包
69 |
70 | ### 可执行文件
71 | - `PdfTocExtractor-windows-{version}.zip` - Windows可执行文件
72 | - `PdfTocExtractor-linux-{version}.tar.gz` - Linux可执行文件
73 | - `PdfTocExtractor-macOS-{version}.tar.gz` - macOS可执行文件
74 |
75 | ## 🔍 监控和调试
76 |
77 | ### 查看工作流状态
78 | 1. 访问仓库的 "Actions" 标签页
79 | 2. 选择相应的工作流运行
80 | 3. 查看详细日志
81 |
82 | ### 常见问题
83 | 1. **AOT编译失败**:检查代码是否AOT兼容
84 | 2. **NuGet发布失败**:验证API密钥是否正确
85 | 3. **测试失败**:确保所有测试在本地通过
86 |
87 | ## 📝 工作流特性
88 |
89 | ### 优化特性
90 | - ✅ NuGet包缓存,提升构建速度
91 | - ✅ 并行构建多个平台
92 | - ✅ 失败时不影响其他平台构建
93 | - ✅ 自动生成Release说明
94 |
95 | ### 安全特性
96 | - ✅ 最小权限原则
97 | - ✅ 安全的密钥管理
98 | - ✅ 构建产物验证
99 |
100 | ## 🎯 下一步优化
101 |
102 | 可考虑的改进:
103 | - 添加安全扫描
104 | - 集成代码质量检查工具
105 | - 添加性能基准测试
106 | - 支持预发布版本
107 |
--------------------------------------------------------------------------------
/src/PdfTocExtractor/Models/TocItem.cs:
--------------------------------------------------------------------------------
1 | namespace PdfTocExtractor.Models;
2 |
3 | /// <summary>
4 | /// Represents a single entry in a PDF table of contents.
5 | /// </summary>
6 | public class TocItem
7 | {
8 |     /// <summary>
9 |     /// Title of the entry.
10 |     /// </summary>
11 |     public string Title { get; set; } = string.Empty;
12 |
13 |     /// <summary>
14 |     /// Page label as text.
15 |     /// </summary>
16 |     public string Page { get; set; } = string.Empty;
17 |
18 |     /// <summary>
19 |     /// Hierarchy level (starting at 0).
20 |     /// </summary>
21 |     public int Level { get; set; }
22 |
23 |     /// <summary>
24 |     /// Child entries.
25 |     /// </summary>
26 |     public List<TocItem> Children { get; set; } = new();
27 |
28 |     /// <summary>
29 |     /// Parent entry (used to build the hierarchy); <c>null</c> when none is set.
30 |     /// </summary>
31 |     public TocItem? Parent { get; set; }
32 |
33 |     /// <summary>
34 |     /// Numeric part of <see cref="Page"/>; 0 when the page is empty, a no-page placeholder, or not an integer.
35 |     /// </summary>
36 |     public int PageNumber
37 |     {
38 |         get
39 |         {
40 |             if (string.IsNullOrEmpty(Page) || Page == "无页码" || Page == "N/A")
41 |                 return 0;
42 |
43 |             // Handle the "5 XYZ ..." form: only the part before the first space is the page number
44 |             var pageStr = Page.Contains(' ') ? Page.Split(' ')[0] : Page;
45 |             return int.TryParse(pageStr, out var pageNum) ? pageNum : 0;
46 |         }
47 |     }
48 |
49 |     /// <summary>
50 |     /// Whether this entry has at least one child.
51 |     /// </summary>
52 |     public bool HasChildren => Children.Count > 0;
53 |
54 |     /// <summary>
55 |     /// Lazily enumerates all descendants recursively: each child first, then that child's own descendants.
56 |     /// </summary>
57 |     public IEnumerable<TocItem> GetAllDescendants()
58 |     {
59 |         foreach (var child in Children)
60 |         {
61 |             yield return child;
62 |             foreach (var descendant in child.GetAllDescendants())
63 |             {
64 |                 yield return descendant;
65 |             }
66 |         }
67 |     }
68 |
69 |     /// <summary>
70 |     /// Builds the title path from the root down to this entry, joined with <paramref name="separator"/>.
71 |     /// </summary>
72 |     public string GetFullPath(string separator = " > ")
73 |     {
74 |         var path = new List<string>();
75 |         var current = this;
76 |         // Walk up the Parent chain, prepending each title so the root ends up first
77 |         while (current != null)
78 |         {
79 |             path.Insert(0, current.Title);
80 |             current = current.Parent;
81 |         }
82 |
83 |         return string.Join(separator, path);
84 |     }
85 |
86 |     /// <summary>Formats the entry as an indented "- Title (第 Page 页)" line, two spaces per level.</summary>
87 |     public override string ToString() =>
88 |         // Clamp the level so a negative Level cannot make new string(...) throw
89 |         $"{new string(' ', Math.Max(0, Level) * 2)}- {Title} (第 {Page} 页)";
90 | }
91 |
--------------------------------------------------------------------------------
/src/PdfTocExtractor.Cli/PdfTocExtractor.Cli.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Exe
5 | net8.0
6 | enable
7 | enable
8 | pdftoc
9 |
10 |
11 | true
12 | pdftoc
13 | ./nupkg
14 |
15 |
16 | PdfTocExtractor.Cli
17 | 2.0.0
18 | DealiAxy
19 | A powerful command-line tool for extracting table of contents (TOC) from PDF files with advanced semantic analysis capabilities. Supports bookmark extraction, intelligent structure recognition, and multiple output formats.
20 | pdf;toc;table-of-contents;extraction;bookmark;semantic-analysis;nlp;cli;tool;dotnet-tool;DealiAxy
21 | https://github.com/star-plan/pdf-toc-extractor
22 | https://github.com/star-plan/pdf-toc-extractor
23 | MIT
24 | README.md
25 | true
26 |
27 |
28 | true
29 |
30 |
31 |
32 |
33 | true
34 | partial
35 | true
36 | false
37 | Size
38 | true
39 | true
40 |
41 | true
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
--------------------------------------------------------------------------------
/src/PdfTocExtractor/Semantic/TextFragment.cs:
--------------------------------------------------------------------------------
1 | namespace PdfTocExtractor.Semantic;
2 |
3 | /// <summary>
4 | /// A piece of text taken from a PDF together with its contextual information.
5 | /// </summary>
6 | public class TextFragment
7 | {
8 |     /// <summary>
9 |     /// The text content.
10 |     /// </summary>
11 |     public string Text { get; set; } = "";
12 |
13 |     /// <summary>
14 |     /// Font size.
15 |     /// </summary>
16 |     public float FontSize { get; set; }
17 |
18 |     /// <summary>
19 |     /// Font name.
20 |     /// </summary>
21 |     public string FontName { get; set; } = "";
22 |
23 |     /// <summary>
24 |     /// Whether the text is bold.
25 |     /// </summary>
26 |     public bool IsBold { get; set; }
27 |
28 |     /// <summary>
29 |     /// Whether the text is italic.
30 |     /// </summary>
31 |     public bool IsItalic { get; set; }
32 |
33 |     /// <summary>
34 |     /// X coordinate of the text.
35 |     /// </summary>
36 |     public float X { get; set; }
37 |
38 |     /// <summary>
39 |     /// Y coordinate of the text.
40 |     /// </summary>
41 |     public float Y { get; set; }
42 |
43 |     /// <summary>
44 |     /// Page number.
45 |     /// </summary>
46 |     public int PageNumber { get; set; }
47 |
48 |     /// <summary>
49 |     /// Width of the text.
50 |     /// </summary>
51 |     public float Width { get; set; }
52 |
53 |     /// <summary>
54 |     /// Height of the text.
55 |     /// </summary>
56 |     public float Height { get; set; }
57 |
58 |     /// <summary>
59 |     /// Whether the text stands alone on its own line.
60 |     /// </summary>
61 |     public bool IsStandalone { get; set; }
62 |
63 |     /// <summary>Vertical spacing before the text.</summary>
64 |     public float VerticalSpaceBefore { get; set; }
65 |
66 |     /// <summary>Vertical spacing after the text.</summary>
67 |     public float VerticalSpaceAfter { get; set; }
68 |
69 |     /// <summary>
70 |     /// Result of the semantic analysis; <c>null</c> until one is assigned.
71 |     /// </summary>
72 |     public SemanticAnalysisResult? SemanticResult { get; set; }
73 |
74 |     /// <summary>
75 |     /// One-line description: page number, quoted text, font name, font size and bold flag.
76 |     /// </summary>
77 |     public override string ToString() => $"[Page {PageNumber}] \"{Text}\" - Font: {FontName}, Size: {FontSize}, Bold: {IsBold}";
78 | }
79 |
80 | /// <summary>
81 | /// Outcome of the semantic analysis of a text fragment.
82 | /// </summary>
83 | public class SemanticAnalysisResult
84 | {
85 |     /// <summary>
86 |     /// Whether the fragment is likely to be a heading.
87 |     /// </summary>
88 |     public bool IsLikelyHeading { get; set; }
89 |
90 |     /// <summary>
91 |     /// Heading confidence (0-1).
92 |     /// </summary>
93 |     public float HeadingConfidence { get; set; }
94 |
95 |     /// <summary>
96 |     /// Estimated heading level (1-6).
97 |     /// </summary>
98 |     public int EstimatedLevel { get; set; }
99 |
100 |     /// <summary>
101 |     /// Reasons produced by the analysis.
102 |     /// </summary>
103 |     public List<string> Reasons { get; set; } = new();
104 |
105 |     /// <summary>
106 |     /// Exclusion reasons (when the fragment is not a heading).
107 |     /// </summary>
108 |     public List<string> ExclusionReasons { get; set; } = new();
109 | }
110 |
--------------------------------------------------------------------------------
/src/PdfTocExtractor/Semantic/SemanticAnalysisOptions.cs:
--------------------------------------------------------------------------------
1 | namespace PdfTocExtractor.Semantic;
2 |
3 | /// <summary>
4 | /// Configuration options for the semantic analysis.
5 | /// </summary>
6 | public class SemanticAnalysisOptions
7 | {
8 |     /// <summary>
9 |     /// Minimum length of a heading.
10 |     /// </summary>
11 |     public int MinHeadingLength { get; set; } = 3;
12 |
13 |     /// <summary>
14 |     /// Maximum length of a heading.
15 |     /// </summary>
16 |     public int MaxHeadingLength { get; set; } = 100;
17 |
18 |     /// <summary>
19 |     /// Font-size multiplier threshold (relative to the average font size).
20 |     /// </summary>
21 |     public float FontSizeMultiplier { get; set; } = 1.1f;
22 |
23 |     /// <summary>
24 |     /// Whether bold text is treated as a heading indicator.
25 |     /// </summary>
26 |     public bool ConsiderBoldAsHeading { get; set; } = true;
27 |
28 |     /// <summary>
29 |     /// Minimum vertical spacing.
30 |     /// </summary>
31 |     public float MinVerticalSpacing { get; set; } = 5f;
32 |
33 |     /// <summary>
34 |     /// Minimum confidence threshold.
35 |     /// </summary>
36 |     public float MinConfidenceThreshold { get; set; } = 0.3f;
37 |
38 |     /// <summary>
39 |     /// Maximum number of heading levels.
40 |     /// </summary>
41 |     public int MaxHeadingLevels { get; set; } = 6;
42 |
43 |     /// <summary>
44 |     /// Whether debug mode is enabled.
45 |     /// </summary>
46 |     public bool DebugMode { get; set; } = false;
47 |
48 |     /// <summary>
49 |     /// Pages to skip (typically the table-of-contents pages); defaults to pages 1, 2 and 3.
50 |     /// </summary>
51 |     public List<int> SkipPages { get; set; } = new() { 1, 2, 3 };
52 |
53 |     /// <summary>
54 |     /// Whether headers and footers are ignored.
55 |     /// </summary>
56 |     public bool IgnoreHeaderFooter { get; set; } = true;
57 |
58 |     /// <summary>
59 |     /// Header height.
60 |     /// </summary>
61 |     public float HeaderHeight { get; set; } = 50f;
62 |
63 |     /// <summary>
64 |     /// Footer height.
65 |     /// </summary>
66 |     public float FooterHeight { get; set; } = 50f;
67 |
68 |     /// <summary>
69 |     /// Default configuration; every access creates a new instance.
70 |     /// </summary>
71 |     public static SemanticAnalysisOptions Default => new();
72 |
73 |     /// <summary>
74 |     /// Strict-mode configuration; every access creates a new instance.
75 |     /// </summary>
76 |     public static SemanticAnalysisOptions Strict => new()
77 |     {
78 |         MinHeadingLength = 5,
79 |         MaxHeadingLength = 80,
80 |         FontSizeMultiplier = 1.3f,
81 |         MinConfidenceThreshold = 0.5f,
82 |         MinVerticalSpacing = 8f
83 |     };
84 |
85 |     /// <summary>
86 |     /// Relaxed-mode configuration; every access creates a new instance.
87 |     /// </summary>
88 |     public static SemanticAnalysisOptions Relaxed => new()
89 |     {
90 |         MinHeadingLength = 2,
91 |         MaxHeadingLength = 150,
92 |         FontSizeMultiplier = 1.05f,
93 |         MinConfidenceThreshold = 0.2f,
94 |         MinVerticalSpacing = 2f
95 |     };
96 |
97 |     /// <summary>
98 |     /// Debug-mode configuration; every access creates a new instance.
99 |     /// </summary>
100 |     public static SemanticAnalysisOptions Debug => new()
101 |     {
102 |         DebugMode = true,
103 |         MinConfidenceThreshold = 0.1f
104 |     };
105 | }
106 |
--------------------------------------------------------------------------------
/src/PdfTocExtractor/Exporters/TextExporter.cs:
--------------------------------------------------------------------------------
1 | using System.Text;
2 | using PdfTocExtractor.Models;
3 |
4 | namespace PdfTocExtractor.Exporters;
5 |
6 | /// <summary>
7 | /// Exporter that renders the table of contents as plain text.
8 | /// </summary>
9 | public class TextExporter : IExporter
10 | {
11 |     public string FormatName => "Text";
12 |     public string FileExtension => "txt";
13 |     /// <summary>Renders a title, an '=' underline, a blank line, then one indented line per TOC item.</summary>
14 |     public string Export(IEnumerable<TocItem> tocItems, ExportOptions? options = null)
15 |     {
16 |         options ??= new ExportOptions();
17 |         var sb = new StringBuilder();
18 |
19 |         // Document title, underlined with as many '=' as the title has characters
20 |         var title = options.CustomTitle ?? "PDF 目录";
21 |         sb.AppendLine(title);
22 |         sb.AppendLine(new string('=', title.Length));
23 |         sb.AppendLine();
24 |
25 |         // Apply the depth filter, then write the items
26 |         var filteredItems = FilterByDepth(tocItems, options.MaxDepth);
27 |         ExportItems(filteredItems, sb, options);
28 |
29 |         return sb.ToString();
30 |     }
31 |     /// <summary>Writes the result of <see cref="Export"/> to <paramref name="filePath"/> using the configured encoding.</summary>
32 |     public async Task ExportToFileAsync(IEnumerable<TocItem> tocItems, string filePath, ExportOptions? options = null)
33 |     {
34 |         var content = Export(tocItems, options);
35 |         options ??= new ExportOptions();
36 |         await File.WriteAllTextAsync(filePath, content, options.Encoding);
37 |     }
38 |     /// <summary>Appends one "- title" line per item, indented by level, then recurses into the item's children.</summary>
39 |     private void ExportItems(IEnumerable<TocItem> items, StringBuilder sb, ExportOptions options)
40 |     {
41 |         foreach (var item in items)
42 |         {
43 |             // Build the indentation; a negative level is clamped to zero
44 |             var indentLevel = Math.Max(0, item.Level);
45 |             var indent = string.Concat(Enumerable.Repeat(options.IndentString, indentLevel));
46 |
47 |             // Assemble the text of this entry
48 |             var itemText = new StringBuilder();
49 |             itemText.Append($"{indent}- {item.Title}");
50 |
51 |             // Append the page label unless the page is empty or a no-page placeholder
52 |             if (options.IncludePageNumbers && !string.IsNullOrEmpty(item.Page) && item.Page != "无页码" && item.Page != "N/A")
53 |             {
54 |                 var pageText = string.Format(options.PageNumberFormat, item.Page);
55 |                 // The default (Chinese) page format uses full-width brackets; custom formats use ASCII brackets
56 |                 var isDefaultFormat = options.PageNumberFormat == "第 {0} 页";
57 |                 var brackets = isDefaultFormat ? ("（", "）") : ("(", ")");
58 |                 // Full-width brackets need no leading space; ASCII brackets get one
59 |                 var spacing = isDefaultFormat ? "" : " ";
60 |                 itemText.Append($"{spacing}{brackets.Item1}{pageText}{brackets.Item2}");
61 |             }
62 |
63 |             sb.AppendLine(itemText.ToString());
64 |
65 |             // Recurse into the children
66 |             if (item.HasChildren)
67 |             {
68 |                 ExportItems(item.Children, sb, options);
69 |             }
70 |         }
71 |     }
72 |     /// <summary>Returns the items as-is when maxDepth is 0 or less; otherwise keeps items whose Level is at most maxDepth, recursively.</summary>
73 |     private IEnumerable<TocItem> FilterByDepth(IEnumerable<TocItem> items, int maxDepth)
74 |     {
75 |         if (maxDepth <= 0) return items;
76 |         // Kept items are copied; each copy still references the original Parent object
77 |         return items.Where(item => item.Level <= maxDepth).Select(item => new TocItem
78 |         {
79 |             Title = item.Title,
80 |             Page = item.Page,
81 |             Level = item.Level,
82 |             Parent = item.Parent,
83 |             Children = FilterByDepth(item.Children, maxDepth).ToList()
84 |         });
85 |     }
86 | }
87 |
--------------------------------------------------------------------------------
/src/PdfTocExtractor/Exporters/JsonExporter.cs:
--------------------------------------------------------------------------------
1 | using System.Text.Json;
2 | using System.Text.Json.Serialization;
3 | using PdfTocExtractor.Models;
4 |
5 | namespace PdfTocExtractor.Exporters;
6 |
7 | ///
8 | /// JSON导出数据结构
9 | ///
10 | public class JsonExportData
11 | {
12 | public string Title { get; set; } = string.Empty;
13 | public DateTime GeneratedAt { get; set; }
14 | public IEnumerable