├── example.pdf
├── example
│   ├── cover.jpg
│   ├── qrcode.jpg
│   ├── back-cover.jpg
│   ├── SUMMARY.md
│   ├── chapter0-preface
│   │   └── index.md
│   ├── _index.md
│   ├── chapter4-mono-code-block
│   │   └── index.md
│   ├── chapter3-advanced-formatting
│   │   └── index.md
│   ├── chapter1-basics
│   │   └── index.md
│   └── chapter2-code-and-tables
│       └── index.md
├── example-screenshot.jpg
├── .gitignore
├── Makefile
├── filters
│   ├── number-lines.lua
│   ├── simple-image-attr-cleanup.lua
│   ├── README.md
│   ├── fix-lstinline.lua
│   ├── minted-filter.lua
│   ├── image-attr-cleanup.lua
│   ├── ansi-cleanup.lua
│   ├── cleanup-filter.lua
│   ├── symbol-fallback-filter.lua
│   ├── table-filter.lua
│   ├── table-wrap.lua
│   └── emoji-passthrough.lua
├── diagnose_env.sh
├── cli.py
├── install_pdf_dependencies.sh
├── tree.py
├── table-filter.lua
├── emoji_support.py
├── emoji-commands.tex
├── cache_utils.py
├── frontmatter.py
├── LICENSE
├── table-wrap.lua
├── validate_lua_dependencies.py
├── README.md
└── image_utils.py
/example.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rootsongjc/pdf-book-exporter/main/example.pdf
--------------------------------------------------------------------------------
/example/cover.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rootsongjc/pdf-book-exporter/main/example/cover.jpg
--------------------------------------------------------------------------------
/example/qrcode.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rootsongjc/pdf-book-exporter/main/example/qrcode.jpg
--------------------------------------------------------------------------------
/example-screenshot.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rootsongjc/pdf-book-exporter/main/example-screenshot.jpg
--------------------------------------------------------------------------------
/example/back-cover.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rootsongjc/pdf-book-exporter/main/example/back-cover.jpg
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode
2 | example/image-caches/
3 | __pycache__/
4 | .pytest_cache/
5 | .ipynb_checkpoints/
6 | .DS_Store
7 |
--------------------------------------------------------------------------------
/example/SUMMARY.md:
--------------------------------------------------------------------------------
1 | - [说明](chapter0-preface/index.md)
2 | - [基础格式测试](chapter1-basics/index.md)
3 | - [代码块和表格测试](chapter2-code-and-tables/index.md)
4 | - [高级格式和特殊内容测试](chapter3-advanced-formatting/index.md)
5 | - [SourceHanMono 字体测试](chapter4-mono-code-block/index.md)
6 |
--------------------------------------------------------------------------------
/example/chapter0-preface/index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 说明
3 | weight: 1
4 | ---
5 |
6 | 本书是专门用于测试 PDF 电子书导出功能的综合示例文档,包含了基础格式、代码表格、高级格式等各种 Markdown 内容,用于验证 PDF 导出工具的渲染效果。
7 |
8 | ## 测试目标
9 |
10 | 本测试文档涵盖以下内容:
11 |
12 | ### 基础格式测试
13 |
14 | - ✅ 中英文混排显示
15 | - ✅ Emoji 表情符号渲染
16 | - ✅ 基础文本格式(粗体、斜体、删除线等)
17 | - ✅ 列表和引用格式
18 | - ✅ 链接和特殊字符
19 |
20 | ### 代码和表格测试
21 |
22 | - ✅ 多语言代码块语法高亮
23 | - ✅ 超宽代码块处理
24 | - ✅ 各种表格格式
25 | - ✅ 代码与表格混合内容
26 |
27 | ### 高级格式测试
28 |
29 | - ✅ 数学公式渲染
30 | - ✅ 任务列表和复选框
31 | - ✅ 定义列表
32 | - ✅ 复杂嵌套结构
33 | - ✅ 特殊布局样式
34 |
35 | **开始全面测试!** 🚀
36 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | # Makefile for PDF book exporter tools
2 | # Contains utilities for the PDF export system
3 |
4 | .PHONY: help clean install diagnostics
5 |
6 | # Default target
7 | help:
8 | @echo "PDF Book Exporter - Available targets:"
9 | @echo " install - Install required dependencies"
10 | @echo " diagnostics - Run system diagnostics for emoji support"
11 | @echo " clean - Clean generated files"
12 | @echo ""
13 | @echo "Usage: python cli.py [book_directory] -o [output.pdf] [options]"
14 | @echo "Run 'python cli.py --help' for more information"
15 |
16 | # Install dependencies
17 | install:
18 | @echo "Installing PDF export dependencies..."
19 | @chmod +x install_pdf_dependencies.sh
20 | @./install_pdf_dependencies.sh
21 |
22 | # Run system diagnostics
23 | diagnostics:
24 | @echo "Running PDF export diagnostics..."
25 | @python3 cli.py --diagnostics .
26 |
27 | # Clean generated files
28 | clean:
29 | @echo "Cleaning generated files..."
30 | @rm -f *.pdf
31 | @rm -f emoji-font-config.tex
32 | @rm -rf test-output
33 | @echo "Clean complete."
34 |
--------------------------------------------------------------------------------
/filters/number-lines.lua:
--------------------------------------------------------------------------------
1 | -- number-lines.lua
2 | -- A Pandoc Lua filter to add line numbers to all code blocks
3 | -- This filter adds the 'numberLines' attribute to CodeBlock elements
4 | -- so that Pandoc's built-in syntax highlighter will output a two-column table
5 | -- with line numbers in the left column
6 |
7 | function CodeBlock(elem)
8 | -- Add the numberLines attribute to enable line numbering
9 | elem.attributes.numberLines = ""
10 |
11 | -- Set the starting line number (defaults to 1 if not specified)
12 | if not elem.attributes.startFrom then
13 | elem.attributes.startFrom = "1"
14 | end
15 |
16 | -- Force table format for line numbers in HTML output
17 | elem.attributes["number-lines"] = ""
18 |
19 | -- Return the modified code block
20 | return elem
21 | end
22 |
23 | -- Post-process to ensure line numbers are displayed as a table
24 | function Div(elem)
25 | if elem.classes and elem.classes:includes("sourceCode") then
26 | -- Add additional styling class for better CSS targeting
27 | elem.classes:insert("numbered-code")
28 | end
29 | return elem
30 | end
31 |
--------------------------------------------------------------------------------
/filters/simple-image-attr-cleanup.lua:
--------------------------------------------------------------------------------
1 | --[[
2 | Simple image attribute cleanup filter
3 | This filter removes standalone paragraphs that contain only image attributes
4 | like {width=1486 height=518}
5 | ]]
6 |
7 | function Para(elem)
8 | -- Get the text content of the paragraph
9 | local content_str = pandoc.utils.stringify(elem)
10 |
11 | -- Check if this paragraph contains only image attribute syntax
12 | -- Patterns to match various formats:
13 | -- {width=123 height=456}
14 | -- {height=456 width=123}
15 | -- {width=123}
16 | -- {height=456}
17 | local patterns = {
18 | "^%s*{%s*width%s*=%s*%d+%s+height%s*=%s*%d+%s*}%s*$",
19 | "^%s*{%s*height%s*=%s*%d+%s+width%s*=%s*%d+%s*}%s*$",
20 | "^%s*{%s*width%s*=%s*%d+%s*}%s*$",
21 | "^%s*{%s*height%s*=%s*%d+%s*}%s*$"
22 | }
23 |
24 | -- Check if content matches any image attribute pattern
25 | for _, pattern in ipairs(patterns) do
26 | if content_str:match(pattern) then
27 | return {} -- Remove this paragraph completely
28 | end
29 | end
30 |
31 | -- Return unchanged if not an image attribute
32 | return elem
33 | end
34 |
--------------------------------------------------------------------------------
/example/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: PDF 导出功能完整测试
3 | author: PDF Book Exporter
4 | date: '2025-08-06'
5 | description: 全面测试 PDF 导出工具的各种格式渲染效果
6 | language: zh-hans
7 | weight: 1
8 | book:
9 | title: PDF 导出功能完整测试
10 | description: 全面测试 PDF 导出工具的各种格式渲染效果
11 | language: zh-hans
12 | author: PDF Book Exporter
13 | website: https://jimmysong.io/book/pdf-book-exporter/
14 | appendix: false
15 | cover: cover.jpg
16 | backcover_image: back-cover.jpg
17 | backcover_text: 「几米宋」微信公众号
18 | backcover_link_text: jimmysong.io
19 | backcover_link_url: https://jimmysong.io
20 | backcover_text_color: '#FFFFFF'
21 | backcover_link_color: '#1d09d8'
22 | cover_title_text: PDF 导出功能测试
23 | cover_author_text: PDF Book Exporter
24 | cover_subtitle_text: 完整测试版本,全面测试 PDF 导出工具的各种格式渲染效果
25 | cover_title_color: '#FFFFFF'
26 | cover_author_color: '#E0E0E0'
27 | cover_subtitle_color: '#C0C0C0'
28 | cover_title_font_size: 42
29 | cover_author_font_size: 28
30 | cover_subtitle_font_size: 20
31 | cover_title_position: center
32 | cover_author_position: bottom
33 | cover_overlay_enabled: true
34 | cover_text_shadow: false
35 | cover_background_overlay: false
36 | cover_overlay_opacity: 0
37 | body_color: '#333333'
38 | heading_color: '#2C3E50'
39 | link_color: '#3498DB'
40 | code_color: '#E74C3C'
41 | quote_color: '#7F8C8D'
42 | caption_color: '#95A5A6'
43 | lastmod: '2025-08-06'
44 | ---
45 |
46 |
--------------------------------------------------------------------------------
/filters/README.md:
--------------------------------------------------------------------------------
1 | # PDF Book Exporter Filters
2 |
3 | A collection of Pandoc Lua filters for enhanced PDF book generation.
4 |
5 | ## Available Filters
6 |
7 | ### emoji-passthrough.lua
8 |
9 | Handles emoji characters for LaTeX output with proper font switching.
10 |
11 | ### minted-filter.lua
12 |
13 | Converts fenced code blocks to minted environments for better syntax highlighting.
14 |
15 | ### table-wrap.lua
16 |
17 | Converts pipe tables to longtable format for better page wrapping.
18 |
19 | ### cleanup-filter.lua
20 |
21 | Cleans up problematic characters and formatting issues.
22 |
23 | ### ansi-cleanup.lua
24 |
25 | Removes ANSI escape codes from content.
26 |
27 | ### fix-lstinline.lua
28 |
29 | Fixes inline code styling issues with CJK characters.
30 |
31 | ### symbol-fallback-filter.lua
32 |
33 | Provides fallback handling for special symbols and characters.
34 |
35 | ## Usage
36 |
37 | These filters are automatically applied by the `cli.py` script. They are located in the `filters/` directory and are applied in the correct order during PDF generation.
38 |
39 | ## Requirements
40 |
41 | - Pandoc with Lua support
42 | - LaTeX distribution with required packages (see main documentation)
43 |
44 | ## Filter Details
45 |
46 | Each filter serves a specific purpose in the PDF generation pipeline:
47 |
48 | - **emoji-passthrough.lua**: Ensures proper emoji rendering with font switching
49 | - **minted-filter.lua**: Provides enhanced syntax highlighting for code blocks
50 | - **table-wrap.lua**: Improves table formatting and page breaks
51 | - **cleanup-filter.lua**: Removes problematic characters that can break LaTeX compilation
52 | - **ansi-cleanup.lua**: Strips ANSI escape sequences from content
53 | - **fix-lstinline.lua**: Fixes inline code rendering issues with CJK characters
54 | - **symbol-fallback-filter.lua**: Handles special symbols and provides fallbacks
55 |
--------------------------------------------------------------------------------
/example/chapter4-mono-code-block/index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: SourceHanMono 字体测试
3 | weight: 40
4 | ---
5 |
6 | 本文档用于验证 SourceHanMono 字体在代码块中的应用效果。
7 |
8 | ## Python 代码示例
9 |
10 | ```python
11 | def hello_world():
12 | """
13 | 一个简单的函数,展示 SourceHanMono 字体效果
14 | """
15 | print("Hello, 世界!") # 中文注释
16 | print("こんにちは、世界!") # 日文注释
17 | print("안녕하세요, 세계!") # 韩文注释
18 |
19 | # 数字和符号测试
20 | numbers = [1, 2, 3, 4, 5]
21 | symbols = ['!', '@', '#', '$', '%', '^', '&', '*']
22 |
23 | return "测试成功"
24 | ```
25 |
26 | ## Bash 脚本示例
27 |
28 | ```bash
29 | #!/bin/bash
30 | # 这是一个包含中文的脚本
31 |
32 | echo "开始执行脚本..."
33 | echo "正在处理中文文件名:测试文档.txt"
34 |
35 | # 创建包含中文路径的目录
36 | mkdir -p "./测试目录/子目录"
37 | ls -la "./测试目录"
38 |
39 | # 函数定义
40 | function 显示消息() {
41 | echo "函数名也可以是中文:$1"
42 | }
43 |
44 | 显示消息 "这是一个测试消息"
45 | ```
46 |
47 | ## JavaScript 代码示例
48 |
49 | ```javascript
50 | // JavaScript 中的中文变量和注释
51 | const 问候语 = "你好,世界!";
52 | const 数字列表 = [1, 2, 3, 4, 5];
53 |
54 | function 显示问候(名字) {
55 | console.log(`${问候语} ${名字}`);
56 | // 这里展示中文字符在等宽字体中的效果
57 | console.log("中文字符测试:测试");
58 | console.log("English text: test");
59 | console.log("混合文本:mix 测试 test");
60 | }
61 |
62 | 显示问候("张三");
63 | ```
64 |
65 | ## 内联代码测试
66 |
67 | 以下是内联代码的测试:
68 |
69 | - Python 变量:`变量名 = "中文值"`
70 | - 文件路径:`/home/用户/文档/测试文件.txt`
71 | - 命令示例:`ls -la 中文目录`
72 | - 混合内容:`hello世界test`
73 |
74 | ## 等宽字符对齐测试
75 |
76 | ```
77 | ASCII字符: ABCDEFGHIJKLMNOPQRSTUVWXYZ
78 | 中文字符: 你好世界测试字体显示效果验证
79 | 日文字符: こんにちはテストフォント
80 | 韩文字符: 안녕하세요테스트폰트
81 | 混合内容: Test测试テストテスト
82 | ```
83 |
84 | ## 代码注释多语言测试
85 |
86 | ```go
87 | package main
88 |
89 | import "fmt"
90 |
91 | // 主函数 - 中文注释
92 | // メイン関数 - 日文注释
93 | // 메인 함수 - 韩文注释
94 | func main() {
95 | // 变量声明
96 | message := "多语言字体测试"
97 |
98 | fmt.Println(message)
99 | fmt.Println("Hello, 世界!") // 英文 + 中文
100 | fmt.Println("こんにちは、世界!") // 日文
101 | fmt.Println("안녕하세요, 세계!") // 韩文
102 | }
103 | ```
104 |
105 | 此文档将帮助验证 SourceHanMono 字体是否正确应用于所有代码块和内联代码。
106 |
--------------------------------------------------------------------------------
/filters/fix-lstinline.lua:
--------------------------------------------------------------------------------
1 | -- Enhanced fix-lstinline filter
2 | -- This filter fixes the issue where Pandoc generates \passthrough{\lstinline!...!}
3 | -- and ensures inline code uses our template's table-aware wrapping logic
4 |
5 | function RawInline(elem)
6 | -- Only process LaTeX output
7 | if not FORMAT:match 'latex' then
8 | return elem
9 | end
10 |
11 | -- Check if this is a passthrough lstinline command
12 | local content = elem.text
13 | if content:match("^\\passthrough{\\lstinline!.*!}$") then
14 | -- Extract the content between the exclamation marks
15 | local inline_content = content:match("\\passthrough{\\lstinline!(.-)}$")
16 | if inline_content then
17 | -- Remove the trailing exclamation mark
18 | inline_content = inline_content:gsub("!$", "")
19 | -- Use our template's texttt command which has table-aware line breaking
20 | return pandoc.RawInline('latex', '\\texttt{' .. inline_content .. '}')
21 | end
22 | end
23 |
24 | return elem
25 | end
26 |
27 | -- Function to escape LaTeX special characters for safe inclusion in texttt
28 | function escape_latex_special_chars(text)
29 | if not text then return "" end
30 | -- Escape characters that could cause issues in LaTeX
31 | text = text:gsub("\\", "\\textbackslash{}")
32 | text = text:gsub("{", "\\{")
33 | text = text:gsub("}", "\\}")
34 | text = text:gsub("%$", "\\$")
35 | text = text:gsub("&", "\\&")
36 | text = text:gsub("%%", "\\%%")
37 | text = text:gsub("#", "\\#")
38 | text = text:gsub("%^", "\\textasciicircum{}")
39 | text = text:gsub("_", "\\_")
40 | text = text:gsub("~", "\\textasciitilde{}")
41 | return text
42 | end
43 |
44 | -- Handle Code elements directly - use template's texttt command
45 | function Code(elem)
46 | -- Don't process if we're not generating LaTeX
47 | if not FORMAT:match 'latex' then
48 | return elem
49 | end
50 |
51 | -- Escape special characters and use our template's texttt command
52 | local escaped_text = escape_latex_special_chars(elem.text)
53 | return pandoc.RawInline('latex', '\\texttt{' .. escaped_text .. '}')
54 | end
55 |
--------------------------------------------------------------------------------
/filters/minted-filter.lua:
--------------------------------------------------------------------------------
1 | --[[
2 | Pandoc Lua filter to use minted for code blocks instead of listings
3 | This filter converts fenced code blocks to minted environments
4 | ]]
5 |
6 | function CodeBlock(elem)
7 | -- Get the language from the first class
8 | local lang = elem.classes[1] or "text"
9 |
10 | -- Convert common language aliases and unsupported languages
11 | local lang_map = {
12 | ["sh"] = "bash",
13 | ["shell"] = "bash",
14 | ["js"] = "javascript",
15 | ["ts"] = "typescript",
16 | ["py"] = "python",
17 | ["yml"] = "yaml",
18 | ["dockerfile"] = "docker",
19 | ["rs"] = "rust",
20 | ["go-html-template"] = "html",
21 | ["gotemplate"] = "html",
22 | ["go-template"] = "html"
23 | }
24 |
25 | if lang_map[lang] then
26 | lang = lang_map[lang]
27 | end
28 |
29 | -- List of known supported Pygments lexers for common languages
30 | local supported_lexers = {
31 | "text", "bash", "javascript", "typescript", "python", "yaml", "docker", "rust",
32 | "html", "css", "json", "xml", "go", "java", "c", "cpp", "sql", "markdown",
33 | "latex", "php", "ruby", "perl", "r", "scala", "swift", "kotlin", "dart"
34 | }
35 |
36 | -- Check if the language is supported, fallback to text if not
37 | local function is_supported(language)
38 | for _, supported in ipairs(supported_lexers) do
39 | if supported == language then
40 | return true
41 | end
42 | end
43 | return false
44 | end
45 |
46 | if not is_supported(lang) then
47 | lang = "text"
48 | end
49 |
50 | -- Use minted with mdframed and add spacing before and after
51 | local minted_begin = "\\vspace{0.5em}\n\\begin{mdframed}[style=codeblockstyle]\n\\begin{minted}{" .. lang .. "}"
52 | local minted_end = "\\end{minted}\n\\end{mdframed}\n\\vspace{0.5em}"
53 |
54 | -- Return raw LaTeX block
55 | return pandoc.RawBlock("latex", minted_begin .. "\n" .. elem.text .. "\n" .. minted_end)
56 | end
57 |
58 | -- Also handle inline code (if needed)
59 | function Code(elem)
60 | -- For inline code, we can use \mintinline or just keep it as is
61 | return elem
62 | end
63 |
--------------------------------------------------------------------------------
/diagnose_env.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | echo "=== 终端环境诊断脚本 ==="
4 | echo "运行时间: $(date)"
5 | echo "用户: $(whoami)"
6 | echo "Shell: $0 ($SHELL)"
7 | echo "工作目录: $(pwd)"
8 | echo ""
9 |
10 | echo "=== PATH信息 ==="
11 | echo "PATH长度: ${#PATH} 字符"
12 | echo "TeX路径:"
13 | echo "$PATH" | tr ':' '\n' | grep -E "(tex|latex)" | head -5
14 | echo ""
15 |
16 | echo "=== LaTeX环境 ==="
17 | echo "LuaLaTeX位置: $(which lualatex)"
18 | echo "LuaLaTeX版本:"
19 | lualatex --version 2>/dev/null | head -2 || echo "❌ LuaLaTeX不可用"
20 | echo ""
21 |
22 | echo "=== Pandoc环境 ==="
23 | echo "Pandoc位置: $(which pandoc)"
24 | echo "Pandoc版本:"
25 | pandoc --version 2>/dev/null | head -1 || echo "❌ Pandoc不可用"
26 | echo ""
27 |
28 | echo "=== 字体环境 ==="
29 | echo "字体缓存:"
30 | fc-list 2>/dev/null | grep -E "(Source Han|PingFang|Noto)" | wc -l | awk '{print $1 " 个相关字体"}'
31 | echo ""
32 |
33 | echo "=== LaTeX包状态 ==="
34 | echo "检查关键包:"
35 | for pkg in fontspec xeCJK luatexja; do
36 | if kpsewhich $pkg.sty >/dev/null 2>&1; then
37 | echo "✅ $pkg.sty 已安装"
38 | else
39 | echo "❌ $pkg.sty 未找到"
40 | fi
41 | done
42 | echo ""
43 |
44 | echo "=== 工作目录文件 ==="
45 | echo "关键文件存在性:"
46 | for file in ../template.tex ../cli.py ../filters/emoji-passthrough.lua; do
47 | if [ -f "$file" ]; then
48 | echo "✅ $(basename $file) 存在 ($(stat -f%z "$file") bytes)"
49 | else
50 | echo "❌ $(basename $file) 缺失"
51 | fi
52 | done
53 | echo ""
54 |
55 | echo "=== 临时目录权限 ==="
56 | temp_dir=$(python3 -c "import tempfile; print(tempfile.gettempdir())")
57 | echo "临时目录: $temp_dir"
58 | echo "权限: $(ls -ld "$temp_dir" 2>/dev/null || echo '无法访问')"
59 | echo ""
60 |
61 | echo "=== 环境变量 ==="
62 | echo "相关环境变量:"
63 | env | grep -E "(TEXMF|LUA|PANDOC|LANG|LC_)" | sort
64 | echo ""
65 |
66 | echo "=== 测试简单命令 ==="
67 | echo "测试LaTeX字体命令:"
68 | echo '\documentclass{article}\usepackage{fontspec}\begin{document}测试\end{document}' > /tmp/test_font.tex
69 | if lualatex -interaction=nonstopmode -output-directory=/tmp /tmp/test_font.tex >/dev/null 2>&1; then
70 | echo "✅ LuaLaTeX字体测试成功"
71 | else
72 | echo "❌ LuaLaTeX字体测试失败"
73 | fi
74 | rm -f /tmp/test_font.*
75 | echo ""
76 |
77 | echo "=== 诊断建议 ==="
78 | echo "如果在新终端窗口中遇到问题,请:"
79 | echo "1. 复制以下命令到新窗口运行此诊断脚本"
80 | echo "2. 对比两个窗口的输出结果"
81 | echo "3. 特别注意PATH、字体和LaTeX包的差异"
82 | echo ""
83 | echo "命令: cd $(pwd) && ./diagnose_env.sh"
84 |
--------------------------------------------------------------------------------
/cli.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import os
3 | from tree import build_tree
4 | from frontmatter import parse_front_matter, should_include, load_book_config
5 | from pdf_builder import build_pdf
6 | from cache_utils import clean_cache, show_cache_info
7 |
8 | def main():
9 | parser = argparse.ArgumentParser(description='Export a Hugo book to PDF')
10 | parser.add_argument('book_dir', help='Path to the book directory')
11 | parser.add_argument('-o', '--output', help='Output PDF file path')
12 | parser.add_argument('--generate-summary', action='store_true', help='Generate summary.md file')
13 | parser.add_argument('--template', default=None, help='Custom LaTeX template path (XeLaTeX only)')
14 | parser.add_argument('--clean-cache', type=int, nargs='?', const=30, help='Clean cache files older than specified days (default: 30)')
15 | parser.add_argument('--cache-info', action='store_true', help='Show cache directory information')
16 | parser.add_argument('--appendix', default=None, help='Path to appendix markdown file')
17 | parser.add_argument('--emoji', action='store_true', help='Enable emoji support')
18 | parser.add_argument('--include-drafts', action='store_true', help='Include draft content')
19 | parser.add_argument('--diagnostics', action='store_true', help='Diagnostics')
20 | parser.add_argument('--generate-troubleshooting-guide', action='store_true', help='Generate troubleshooting guide')
21 | parser.add_argument('--max-table-width', type=float, default=0.85,
22 | help='Maximum table width as fraction of text width (default: 0.85)')
23 |
24 | args = parser.parse_args()
25 |
26 | if args.clean_cache is not None:
27 | clean_cache(args.book_dir, args.clean_cache)
28 | return
29 | if args.cache_info:
30 | show_cache_info(args.book_dir)
31 | return
32 |
33 | root_node = build_tree(args.book_dir, args.include_drafts, parse_front_matter, should_include)
34 | if not root_node:
35 | return
36 |
37 | # Load book configuration from _index.md
38 | book_config = load_book_config(args.book_dir)
39 |
40 | output_pdf = args.output or os.path.join(args.book_dir, 'book.pdf')
41 | ok = build_pdf(args.book_dir, root_node, output_pdf, book_config, args.template, args.appendix, args.emoji, args.max_table_width)
42 | if not ok:
43 | # Non-zero exit so callers (e.g., Makefile) can detect failure
44 | raise SystemExit(1)
45 |
46 | if __name__ == '__main__':
47 | main()
48 |
--------------------------------------------------------------------------------
/example/chapter3-advanced-formatting/index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "高级格式和特殊内容测试"
3 | weight: 30
4 | ---
5 |
6 | 本章节测试更复杂的格式,包括数学公式、图表、复杂布局等。
7 |
8 | ## 数学公式测试
9 |
10 | ### 内联数学公式
11 |
12 | 在文本中,我们可以使用内联公式,比如 $E = mc^2$ 或者 $\pi \approx 3.14159$。
13 |
14 | 中文文本中的数学公式:圆的面积公式是 $A = \pi r^2$,其中 $r$ 是半径。
15 |
16 | ### 块级数学公式
17 |
18 | $$
19 | \int_{-\infty}^{\infty} e^{-x^2} dx = \sqrt{\pi}
20 | $$
21 |
22 | $$
23 | \sum_{n=1}^{\infty} \frac{1}{n^2} = \frac{\pi^2}{6}
24 | $$
25 |
26 | ### 复杂数学公式
27 |
28 | \begin{align}
29 | \nabla \times \vec{\mathbf{B}} -\, \frac1c\, \frac{\partial\vec{\mathbf{E}}}{\partial t} &= \frac{4\pi}{c}\vec{\mathbf{j}} \\
30 | \nabla \cdot \vec{\mathbf{E}} &= 4 \pi \rho \\
31 | \nabla \times \vec{\mathbf{E}}\, +\, \frac1c\, \frac{\partial\vec{\mathbf{B}}}{\partial t} &= \vec{\mathbf{0}} \\
32 | \nabla \cdot \vec{\mathbf{B}} &= 0
33 | \end{align}
34 |
35 | ## 任务列表和复选框
36 |
37 | ### 项目进度跟踪
38 |
39 | - [x] 需求分析 ✅
40 | - [x] 技术选型 ✅
41 | - [x] 架构设计 ✅
42 | - [ ] 前端开发 🔄
43 | - [x] 页面设计
44 | - [x] 组件开发
45 | - [ ] 接口对接
46 | - [ ] 测试验证
47 | - [ ] 后端开发 ⏳
48 | - [x] 数据库设计
49 | - [ ] API 开发
50 | - [ ] 业务逻辑
51 | - [ ] 性能优化
52 | - [ ] 测试阶段 ⏳
53 | - [ ] 部署上线 ⏳
54 |
55 | ### 学习计划
56 |
57 | - [x] **第一周**:基础知识
58 | - [x] HTML/CSS 复习
59 | - [x] JavaScript ES6+
60 | - [x] React 基础
61 | - [ ] **第二周**:进阶内容
62 | - [x] React Hooks
63 | - [ ] 状态管理 (Redux/Zustand)
64 | - [ ] 路由管理
65 | - [ ] **第三周**:实战项目
66 | - [ ] 项目搭建
67 | - [ ] 功能开发
68 | - [ ] 测试部署
69 |
70 | ## 定义列表
71 |
72 | ### 技术术语
73 |
74 | API
75 | : Application Programming Interface,应用程序编程接口,是不同软件组件之间通信的规范。
76 |
77 | REST
78 | : Representational State Transfer,表现层状态转换,是一种软件架构风格。
79 |
80 | GraphQL
81 | : 一种用于 API 的查询语言和运行时,由 Facebook 开发。
82 |
83 | ### 编程概念
84 |
85 | 函数式编程
86 | : 一种编程范式,将计算视为数学函数的求值,避免状态变化和可变数据。
87 |
88 | 面向对象编程
89 | : 基于"对象"概念的编程范式,对象包含数据(属性)和代码(方法)。
90 |
91 | ## 复杂嵌套结构
92 |
93 | ### 多层嵌套列表
94 |
95 | 1. **前端技术栈**
96 | 1. **框架选择**
97 | - React
98 | - 优点:
99 | - 组件化开发
100 | - 虚拟 DOM 性能优化
101 | - 丰富的生态系统
102 | - 缺点:
103 | - 学习曲线陡峭
104 | - 需要额外的状态管理库
105 | 2. **构建工具**
106 | - Webpack
107 | - Vite
108 | - Parcel
109 | 2. **后端技术栈**
110 | 1. **语言选择**
111 | - Node.js
112 | - 优点:JavaScript 全栈
113 | - 缺点:单线程限制
114 | - Python
115 | - 优点:简洁易读
116 | - 缺点:性能相对较低
117 |
118 | ## 特殊布局测试
119 |
120 | ### 警告框样式
121 | >
122 | > ⚠️ **警告**
123 | >
124 | > 这是一个警告信息,用于提醒用户注意重要事项。
125 |
126 | > ℹ️ **信息**
127 | >
128 | > 这是一个信息提示,用于提供额外的说明。
129 |
130 | > ✅ **成功**
131 | >
132 | > 操作已成功完成!
133 |
134 | > ❌ **错误**
135 | >
136 | > 发生了错误,请检查输入并重试。
137 |
138 | ### 键盘快捷键
139 |
140 | - 复制:Ctrl + C
141 | - 粘贴:Ctrl + V
142 | - 撤销:Ctrl + Z
143 | - 保存:Ctrl + S
144 | - 查找:Ctrl + F
145 |
146 | ---
147 |
148 | ## 章节总结
149 |
150 | 本章节测试了以下高级格式:
151 |
152 | - ✅ 数学公式(内联和块级)
153 | - ✅ 任务列表和复选框
154 | - ✅ 定义列表
155 | - ✅ 复杂嵌套结构
156 | - ✅ 特殊布局和样式
157 |
158 | *测试内容涵盖了 PDF 电子书可能遇到的各种复杂格式,为导出工具提供了全面的测试用例。*
159 |
--------------------------------------------------------------------------------
/filters/image-attr-cleanup.lua:
--------------------------------------------------------------------------------
1 | --[[
2 | Image attribute cleanup filter
3 | This filter removes image attribute lines like {width=1486 height=518}
4 | that appear immediately after images in markdown or LaTeX figure environments
5 | ]]
6 |
7 | -- Track the type of the previous element
8 | local previous_element_type = nil
9 |
10 | function Image(elem)
11 | -- Mark that we just processed an image
12 | previous_element_type = "image"
13 | return elem
14 | end
15 |
16 | function RawBlock(elem)
17 | -- Check if this is a LaTeX figure environment
18 | if elem.format == "latex" and (elem.text:match("\\begin{figure}") or elem.text:match("\\includegraphics")) then
19 | previous_element_type = "latex_figure"
20 | else
21 | previous_element_type = "other"
22 | end
23 | return elem
24 | end
25 |
26 | function Para(elem)
27 | -- Check if this paragraph contains only image attributes
28 | local content_str = pandoc.utils.stringify(elem)
29 |
30 | -- More comprehensive patterns to match image attributes
31 | local patterns = {
32 | "^%s*{%s*width%s*=%s*%d+%s+height%s*=%s*%d+%s*}%s*$", -- {width=123 height=456}
33 | "^%s*{%s*height%s*=%s*%d+%s+width%s*=%s*%d+%s*}%s*$", -- {height=456 width=123}
34 | "^%s*{%s*width%s*=%s*%d+%s*}%s*$", -- {width=123}
35 | "^%s*{%s*height%s*=%s*%d+%s*}%s*$", -- {height=456}
36 | "^%s*{width%s*=%s*%d+%s+height%s*=%s*%d+}%s*$", -- {width=123 height=456} (no spaces around =)
37 | "^%s*{height%s*=%s*%d+%s+width%s*=%s*%d+}%s*$" -- {height=456 width=123} (no spaces around =)
38 | }
39 |
40 | -- Check if content matches any of the image attribute patterns
41 | local is_image_attr = false
42 | for _, pattern in ipairs(patterns) do
43 | if content_str:match(pattern) then
44 | is_image_attr = true
45 | break
46 | end
47 | end
48 |
49 | -- If previous element was image/figure and this paragraph contains only image attributes, remove it
50 | if (previous_element_type == "image" or previous_element_type == "latex_figure") and is_image_attr then
51 | previous_element_type = nil -- Reset flag
52 | return {} -- Remove this paragraph
53 | end
54 |
55 | -- Reset flag for non-image-attribute paragraphs
56 | previous_element_type = nil
57 | return elem
58 | end
59 |
60 | -- Reset flag for other block elements
61 | function Header(elem)
62 | previous_element_type = nil
63 | return elem
64 | end
65 |
66 | function CodeBlock(elem)
67 | previous_element_type = nil
68 | return elem
69 | end
70 |
71 | function BlockQuote(elem)
72 | previous_element_type = nil
73 | return elem
74 | end
75 |
76 | function OrderedList(elem)
77 | previous_element_type = nil
78 | return elem
79 | end
80 |
81 | function BulletList(elem)
82 | previous_element_type = nil
83 | return elem
84 | end
85 |
86 | function Table(elem)
87 | previous_element_type = nil
88 | return elem
89 | end
90 |
91 | function Div(elem)
92 | previous_element_type = nil
93 | return elem
94 | end
95 |
--------------------------------------------------------------------------------
/install_pdf_dependencies.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Install dependencies for PDF export script
4 | # This script installs the necessary tools for converting Hugo books to PDF
5 |
6 | echo "Installing PDF export dependencies..."
7 |
8 | # --- OS Detection and Installation ---
9 | if [[ "$OSTYPE" == "darwin"* ]]; then
10 | # macOS
11 | echo "Detected macOS, installing with Homebrew..."
12 |
13 | if ! command -v brew &> /dev/null; then
14 | echo "Homebrew not found. Please install it first."
15 | exit 1
16 | fi
17 |
18 | echo "Installing system dependencies: pandoc, librsvg, imagemagick, basictex..."
19 | brew install pandoc librsvg imagemagick
20 | brew install --cask basictex
21 |
22 | echo "Installing LaTeX packages via tlmgr..."
23 | sudo tlmgr update --self
24 | sudo tlmgr install ctex fancyhdr titlesec fontspec geometry chngcntr booktabs caption float framed hyperref listings parskip fvextra
25 |
26 | elif [[ "$OSTYPE" == "linux-gnu"* ]]; then
27 | # Linux
28 | echo "Detected Linux..."
29 |
30 | if command -v apt-get &> /dev/null; then
31 | # Debian/Ubuntu
32 | echo "Using apt-get..."
33 | sudo apt-get update
34 | sudo apt-get install -y \
35 | pandoc \
36 | librsvg2-bin \
37 | imagemagick \
38 | texlive-latex-base \
39 | texlive-latex-recommended \
40 | texlive-latex-extra \
41 | texlive-xetex \
42 | texlive-lang-chinese \
43 | texlive-font-utils \
44 | texlive-luatex
45 |
46 | elif command -v dnf &> /dev/null || command -v yum &> /dev/null; then
47 | # Fedora/CentOS/RHEL
48 | echo "Using dnf/yum..."
49 | if command -v dnf &> /dev/null; then
50 | PKG_MANAGER=dnf
51 | else
52 | PKG_MANAGER=yum
53 | fi
54 | sudo $PKG_MANAGER install -y \
55 | pandoc \
56 | librsvg2-tools \
57 | ImageMagick \
58 | texlive-latex-base \
59 | texlive-latex \
60 | texlive-collection-latexrecommended \
61 | texlive-collection-latexextra \
62 | texlive-collection-langchinese \
63 | texlive-xetex \
64 | texlive-font-utils \
65 | texlive-luatex
66 |
67 | elif command -v pacman &> /dev/null; then
68 | # Arch Linux
69 | echo "Using pacman..."
70 | sudo pacman -S --noconfirm --needed \
71 | pandoc \
72 | librsvg \
73 | imagemagick \
74 | texlive-core \
75 | texlive-latexextra \
76 | texlive-langchinese \
77 | texlive-xetex
78 |
79 | else
80 | echo "Unsupported Linux distribution. Please install dependencies manually."
81 | exit 1
82 | fi
83 |
84 | else
85 | echo "Unsupported operating system: $OSTYPE"
86 | exit 1
87 | fi
88 |
89 | echo ""
90 | echo "✅ Dependencies installation completed!"
91 | echo ""
92 | echo "To verify installation, run:"
93 | echo " pandoc --version"
94 | echo " rsvg-convert --version"
95 | echo " convert --version"
96 | echo " xelatex --version"
97 | echo ""
98 | echo "You can now use the PDF export script."
99 |
--------------------------------------------------------------------------------
/example/chapter1-basics/index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "基础格式测试"
3 | weight: 10
4 | ---
5 |
6 | ## 文本格式测试
7 |
8 | ### 中英文混排测试
9 |
10 | 这是一段包含**中文粗体**和*English italic*的混合文本。我们来测试一下`inline code`的效果,以及~~删除线~~的显示。
11 |
12 | Here's some English text with **bold formatting** and *italic formatting*. Let's also test `inline code` and ~~strikethrough~~ text.
13 |
14 | 在 Hugo 中,函数是在模板动作中使用的代码片段,它们接收一个或多个参数并返回一个值。与方法不同,函数不与特定的对象关联。Hugo 作为一个用 Go 语言编写的静态站点生成器,其函数体系充分利用了 Go 语言的特性,提供了高性能和丰富的功能。
15 |
16 | 只包含中文的段落:这部电影还有一个隐而不宣的主题:我们到底是在打自己的比赛,还是在西方的评分标准中争头名?郎朗在德国、仙台、柯蒂斯拿第一名;回国却被国内老师打成第三,说他“要摆正位置”;他用西方技法打败西方人,却始终无法获得完全的文化认同。
17 |
18 | 《Creep》是英国摇滚乐队 Radiohead 的首支单曲,发行于 1992 年,后收录于专辑《Pablo Honey》中。以下是其创作背景和表达含义:
19 |
20 | - **创作背景**:根据乐队贝斯手 Colin Greenwood 回忆,这首歌由主唱 Thom Yorke 在大学时期创作。吉他手 Jonny Greenwood 曾表示,歌曲灵感来源于 Thom Yorke 喜欢的一个女孩,她突然出现在乐队的某次演出中。另有说法称,Thom Yorke 从未与那个女孩正式交谈过,只是偶尔在酒吧里看到她,当他终于喝得烂醉鼓起勇气表白时,她却被吓跑了。
21 | - **表达含义**:这首歌主要表达了爱情中的自卑、自我怀疑以及对被接纳和理解的渴望。歌曲将喜欢的人比作“天使”,形容其如羽毛般轻盈美好,凸显出对方的完美与独特,而自己则是“a creep”“a weirdo”,觉得与周围世界格格不入,强烈的自卑感油然而生。歌词“I wish I was special, you're so fuckin' special”体现了主人公对自身平凡的不满,渴望变得特别,能与心仪之人相配,却又深知彼此差距,充满了无奈与辛酸。同时,歌曲也反映了主人公内心的矛盾与挣扎,他渴望拥有完美的身体和灵魂,希望能得到对方的注意,却又因自我认知而陷入痛苦,在爱情面前犹豫不决。整首歌通过主人公的内心独白,不仅是对个人情感的深刻剖析,也是对当代社会中人们在人际关系中常感到迷失和孤独这一现象的反思。
22 |
23 | ### Emoji 表情符号测试 😊
24 |
25 | - 基础表情:😀 😃 😄 😁 😆 😅 😂 🤣
26 | - 手势表情:👍 👎 👌 🤝 👏 🙏 💪 ✊
27 | - 心形表情:❤️ 💙 💚 💛 🧡 💜 🖤 🤍
28 | - 动物表情:🐶 🐱 🐭 🐹 🐰 🦊 🐻 🐼
29 | - 食物表情:🍎 🍌 🍇 🍓 🥝 🍒 🥑 🥕
30 | - 技术相关:💻 📱 ⌨️ 🖥️ 🖨️ 📡 🔌 💾
31 |
32 | ## 列表格式测试
33 |
34 | ### 无序列表
35 |
36 | - 第一级列表项
37 | - 第二级列表项
38 | - 第三级列表项
39 | - 第四级列表项
40 | - 另一个第一级项目
41 | - 包含**粗体**的列表项
42 | - 包含*斜体*的列表项
43 | - 包含`代码`的列表项
44 |
45 | ### 有序列表
46 |
47 | 1. 第一个有序项目
48 | 1. 嵌套的有序项目
49 | 2. 另一个嵌套项目
50 | 1. 更深层的嵌套
51 | 2. 继续嵌套测试
52 | 2. 第二个主要项目
53 | 3. 第三个主要项目
54 |
55 | ### 混合列表
56 |
57 | 1. 有序列表开始
58 | - 嵌套的无序列表
59 | - 另一个无序项目
60 | 1. 再次嵌套有序列表
61 | 2. 继续有序项目
62 | 2. 回到主要有序列表
63 |
64 | ## 引用和分隔线测试
65 |
66 | ### 普通引用
67 | >
68 | > 这是一个普通的引用块。引用块通常用于突出显示重要的文本或者引用他人的话语。引用块通常用于突出显示重要的文本或者引用他人的话语。
69 |
70 | ### 包含格式的引用
71 | >
72 | > **重要提示**:这个引用块包含了多种格式
73 | >
74 | > - 列表项目 1
75 | > - 列表项目 2
76 | >
77 | > `代码示例`和*斜体文本*也可以在引用中使用。
78 |
79 | ---
80 |
81 | ### 分隔线测试
82 |
83 | 上面是一条分隔线,下面还有一条:
84 |
85 | ***
86 |
87 | ## 链接和图片测试
88 |
89 | ### 链接测试
90 |
91 | - 普通链接:[Google](https://www.google.com)
92 | - 中文链接:[百度搜索](https://www.baidu.com)
93 | - 邮箱链接:<contact@example.com>
94 | - 自动链接:<https://www.example.com>
95 |
96 | ### 图片测试(占位符)
97 |
98 | 
99 | {width=1000 height=600}
100 |
101 | *注意:实际 PDF 导出时,网络图片可能需要特殊处理*
102 |
103 | ## 脚注测试
104 |
105 | 这里有一个脚注引用[^1],还有另一个脚注[^note2]。
106 |
107 | 脚注可以帮助提供额外的信息而不打断正文的流畅性[^3]。
108 |
109 | [^1]: 这是第一个脚注的内容
110 | [^note2]: 这是第二个脚注,使用了自定义标识符
111 | [^3]: 第三个脚注,包含**格式化文本**和`代码`
112 |
113 | ## 特殊字符测试
114 |
115 | ### 数学符号
116 |
117 | α β γ δ ε ζ η θ ι κ λ μ ν ξ ο π ρ σ τ υ φ χ ψ ω
118 |
119 | ∑ ∏ ∫ ∂ ∇ ∞ ± × ÷ ≤ ≥ ≠ ≈ ∝ ∈ ∉ ⊂ ⊃ ∪ ∩
120 |
121 | ### 货币符号
122 |
123 | $ € £ ¥ ₹ ₽ ₩ ₪ ₫ ₡ ₢ ₣ ₤ ₥ ₦ ₧ ₨
124 |
125 | ### 其他特殊符号
126 |
127 | © ® ™ § ¶ † ‡ • ‰ ′ ″ ‴ ※ ‼ ⁇ ⁈ ⁉ ⁏
128 |
129 | ---
130 |
131 | *本章节完成了基础格式的测试,接下来的章节将测试更复杂的格式。*
132 |
--------------------------------------------------------------------------------
/example/chapter2-code-and-tables/index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: 代码块和表格测试
3 | weight: 20
4 | lastmod: '2025-08-04'
5 | ---
6 |
7 | 本章节专门测试代码块、表格等复杂格式在 PDF 中的渲染效果。
8 |
9 | ## 代码块测试
10 |
11 | ### 内联代码
12 |
13 | 在文本中使用 `console.log()` 或者 `print()` 这样的内联代码。中文文本中的 `代码片段` 测试。
14 |
15 | ### Python 代码块
16 |
17 | ```python
18 | # Python 代码示例 - 数据处理
19 | import pandas as pd
20 | import numpy as np
21 | from datetime import datetime
22 |
23 | def process_data(filename):
24 | """
25 | 处理 CSV 数据文件
26 | Args:
27 | filename (str): 文件名
28 | Returns:
29 | pd.DataFrame: 处理后的数据
30 | """
31 | # 读取数据
32 | df = pd.read_csv(filename, encoding='utf-8')
33 |
34 | # 数据清洗
35 | df = df.dropna()
36 | df['timestamp'] = pd.to_datetime(df['timestamp'])
37 |
38 | # 数据转换
39 | df['value'] = df['value'].astype(float)
40 |
41 | return df
42 |
43 | # 使用示例
44 | if __name__ == "__main__":
45 | data = process_data('sample.csv')
46 | print(f"处理了 {len(data)} 条记录 😄")
47 | ```
48 |
49 | 这是代码块下面的文字,用来说明代码的用途,比如 Java 代码中的 interface 的实现 theory。
50 |
51 | ### JavaScript 代码块
52 |
53 | ```javascript
54 | // JavaScript 代码示例 - React 组件
55 | import React, { useState, useEffect } from 'react';
56 | import axios from 'axios';
57 |
58 | const DataFetcher = ({ apiUrl }) => {
59 | const [data, setData] = useState(null);
60 | const [loading, setLoading] = useState(true);
61 | const [error, setError] = useState(null);
62 |
63 | useEffect(() => {
64 | const fetchData = async () => {
65 | try {
66 | setLoading(true);
67 | const response = await axios.get(apiUrl);
68 | setData(response.data);
69 | } catch (err) {
70 | setError(err.message);
71 | } finally {
72 | setLoading(false);
73 | }
74 | };
75 |
76 | fetchData();
77 | }, [apiUrl]);
78 |
79 | if (loading) return '加载中...';
80 | if (error) return `错误:${error}`;
81 |
82 | return [
83 | '数据展示',
84 | JSON.stringify(data, null, 2)
85 | ];
86 |
87 | export default DataFetcher;
88 | ```
89 |
90 | 这是代码块下方的文字,不应该与代码块重叠,并且要保证上方的代码块可以正确的分页。
91 |
92 | 弹性云服务器(Elastic Cloud Server)是一种可随时自助获取、可弹性伸缩的云服务器,可帮助您打造可靠、安全、灵活、高效的应用环境,确保服务持久稳定运行,提升运维效率。根据业务发展需要,您可以随时变更规格、切换操作系统、配置安全组规则或调整配额。除此之外,您还可以实时查看监控指标及审计日志,以便及时了解弹性云服务器的健康状态。
93 |
94 | ### 超宽代码块测试
95 |
96 | ```bash
97 | # 这是一个非常长的命令行示例,用来测试PDF中超宽代码块的处理效果
98 | docker run -d --name my-container --restart=unless-stopped -p 8080:80 -v /host/path/to/data:/container/data -e ENV_VAR_1=value1 -e ENV_VAR_2=value2 -e ENV_VAR_3=value3 --network=my-network --memory=2g --cpus=1.5 my-image:latest
99 | ```
100 |
101 | ## 表格测试
102 |
103 | ### 基础表格
104 |
105 | | 姓名 | 年龄 | 职业 | 城市 |
106 | |------|------|------|------|
107 | | 张三 | 28 | 工程师 | 北京 |
108 | | 李四 | 32 | 设计师 | 上海 |
109 | | 王五 | 25 | 产品经理 | 深圳 |
110 |
111 | ### 包含格式的表格
112 |
113 | | 功能 | 状态 | 优先级 | 负责人 | 备注 |
114 | |------|------|--------|--------|------|
115 | | **用户登录** | ✅ 完成 | 🔴 高 | @张三 | 已上线 |
116 | | *数据导出* | 🔄 进行中 | 🟡 中 | @李四 | 预计下周完成 |
117 | | ~~旧功能~~ | ❌ 废弃 | 🔵 低 | - | 不再维护 |
118 | | `API接口` | ⏳ 计划中 | 🟠 中 | @王五 | 需求评审中 |
119 |
120 | ### 复杂表格(包含代码和链接)
121 |
122 | | 技术栈 | 版本 | 用途 | 示例代码 | 文档链接 |
123 | |--------|------|------|----------|----------|
124 | | React | 18.2.0 | 前端框架 | `<App />` | [官方文档](https://react.dev) |
125 | | Node.js | 18.17.0 | 后端运行时 | `require('express')` | [Node.js](https://nodejs.org) |
126 | | Python | 3.11 | 数据处理 | `import pandas` | [Python.org](https://python.org) |
127 | | Docker | 24.0 | 容器化,开发者友好,支持多平台,可以在 Linux、Windows 和 macOS 上运行,可以用 Orbstack 替代 | `docker build .` | [Docker Hub](https://hub.docker.com) |
128 |
129 | ### 包含内联代码和中文混排的表格
130 |
131 | | 匹配方式 | 描述 | 示例 |
132 | |----------|------|------|
133 | | `prefix` | 前缀必须与 `:path` 头的开头匹配 | `/hello` 匹配 `/hello`、`/helloworld`、`/hello/v1` |
134 | | `path` | 路径必须与 `:path` 头完全匹配 | `/hello` 只匹配 `/hello`,不匹配 `/helloworld` |
135 | | `safe_regex` | 使用正则表达式匹配 `:path` 头 | `/\d{3}` 匹配三位数字路径 |
136 | | `connect_matcher` | 只匹配 CONNECT 请求 | 用于 HTTP CONNECT 方法 |
137 |
138 | ---
139 |
140 | *本章节完成了代码块和表格的测试,下一章将测试更多高级格式。*
141 |
--------------------------------------------------------------------------------
/filters/ansi-cleanup.lua:
--------------------------------------------------------------------------------
1 | --[[
2 | ANSI Cleanup Filter for LaTeX PDF Generation
3 | This filter comprehensively removes ANSI escape codes from all content
4 | to prevent LaTeX compilation errors
5 | ]]
6 |
-- Comprehensive ANSI escape code removal.
-- Strips ANSI escape sequences from a string in several spellings
-- (literal ESC, octal "\033", hex "\x1b"/"\x1B", short "\e") so the
-- text cannot break LaTeX compilation. Non-string or nil input is
-- returned unchanged. The substitution order below is significant:
-- later passes assume earlier ones already consumed full sequences.
local function strip_ansi_codes(text)
    if not text or type(text) ~= "string" then
        return text
    end

    -- Remove various forms of ANSI escape sequences

    -- Standard ANSI color codes (ESC[...m)
    text = text:gsub("\27%[[%d;]*m", "") -- Literal ESC character
    text = text:gsub("\\033%[[%d;]*m", "") -- Octal escape \033
    text = text:gsub("\\x1[bB]%[[%d;]*m", "") -- Hex escape \x1b or \x1B
    text = text:gsub("\\e%[[%d;]*m", "") -- Short escape \e

    -- ANSI escape sequences with other terminators (cursor movement etc.)
    text = text:gsub("\27%[[%d;]*[A-Za-z]", "") -- Literal ESC
    text = text:gsub("\\033%[[%d;]*[A-Za-z]", "") -- Octal
    text = text:gsub("\\x1[bB]%[[%d;]*[A-Za-z]", "") -- Hex
    text = text:gsub("\\e%[[%d;]*[A-Za-z]", "") -- Short

    -- Handle shell variable assignments containing ANSI codes,
    -- e.g. RED='\033[0;31m' or RED="\033[0;31m": keep the assignment
    -- but empty out its value.
    text = text:gsub("='\\033%[[%d;]*m'", "=''")
    text = text:gsub('="\\033%[[%d;]*m"', '=""')
    text = text:gsub("='\\e%[[%d;]*m'", "=''")
    text = text:gsub('="\\e%[[%d;]*m"', '=""')
    text = text:gsub("='\\x1[bB]%[[%d;]*m'", "=''")
    text = text:gsub('="\\x1[bB]%[[%d;]*m"', '=""')

    -- Handle assignments without quotes
    text = text:gsub("=\\033%[[%d;]*m", "=")
    text = text:gsub("=\\e%[[%d;]*m", "=")
    text = text:gsub("=\\x1[bB]%[[%d;]*m", "=")

    -- Handle more complex variable assignments with color codes
    -- (the opening quote is captured and matched again via %2).
    text = text:gsub("([%w_]+)=(['\"])\\\\*033%[[%d;]*m%2", "%1=%2%2")
    text = text:gsub("([%w_]+)=(['\"])\\\\*e%[[%d;]*m%2", "%1=%2%2")
    text = text:gsub("([%w_]+)=(['\"])\\\\*x1[bB]%[[%d;]*m%2", "%1=%2%2")

    -- Remove any remaining problematic backslash sequences.
    -- This handles cases where \033 might be interpreted as a LaTeX command.
    text = text:gsub("\\033", "\\\\textbackslash{}033")
    text = text:gsub("\\x1[bB]", "\\\\textbackslash{}x1b")

    -- Handle raw octal sequences that might cause issues.
    -- NOTE(review): the gsub above already rewrote every remaining
    -- "\033", so the digits == "033" branch below appears unreachable
    -- for inputs that reach this point — confirm before relying on it.
    text = text:gsub("\\(%d%d%d)%[", function(digits)
        if digits == "033" then
            return "[ESC_" .. digits .. "]["
        else
            return "\\\\" .. digits .. "["
        end
    end)

    return text
end
62 |
-- Fenced code blocks: scrub ANSI sequences from the block text.
function CodeBlock(elem)
    if not elem.text then
        return elem
    end
    elem.text = strip_ansi_codes(elem.text)
    return elem
end
70 |
-- Inline code spans: scrub ANSI sequences from the span text.
function Code(elem)
    if not elem.text then
        return elem
    end
    elem.text = strip_ansi_codes(elem.text)
    return elem
end
78 |
-- Raw blocks (e.g. embedded LaTeX/HTML): scrub ANSI sequences.
function RawBlock(elem)
    if not elem.text then
        return elem
    end
    elem.text = strip_ansi_codes(elem.text)
    return elem
end
86 |
-- Raw inline fragments: scrub ANSI sequences.
function RawInline(elem)
    if not elem.text then
        return elem
    end
    elem.text = strip_ansi_codes(elem.text)
    return elem
end
94 |
-- Plain text strings: scrub ANSI sequences.
function Str(elem)
    if not elem.text then
        return elem
    end
    elem.text = strip_ansi_codes(elem.text)
    return elem
end
102 |
-- Links: scrub ANSI sequences from both the target URL and the title.
function Link(elem)
    if elem.target then elem.target = strip_ansi_codes(elem.target) end
    if elem.title then elem.title = strip_ansi_codes(elem.title) end
    return elem
end
113 |
-- Images: scrub ANSI sequences from both the source path and the title.
function Image(elem)
    if elem.src then elem.src = strip_ansi_codes(elem.src) end
    if elem.title then elem.title = strip_ansi_codes(elem.title) end
    return elem
end
124 |
--------------------------------------------------------------------------------
/tree.py:
--------------------------------------------------------------------------------
1 |
2 | import os
3 | import re
4 | from dataclasses import dataclass, field
5 | from typing import List, Optional
6 |
@dataclass
class Node:
    """Represents a chapter or section in the book."""
    # Human-readable heading; build_tree falls back to the directory
    # name when the front matter provides no title.
    title: str
    # Filesystem path of the section's _index.md / index.md file.
    path: str
    # Sort key: siblings are ordered by ascending weight.
    weight: int
    # Direct subsections, kept sorted by weight.
    children: List["Node"] = field(default_factory=list)
14 |
def find_asset(book_dir, names):
    """Return the first existing path among book_dir/<name>, or None."""
    candidates = (os.path.join(book_dir, name) for name in names)
    return next((path for path in candidates if os.path.exists(path)), None)
21 |
def adjust_heading_levels(content: str, base_level: int) -> str:
    """Shift Markdown ATX heading levels so content nests under base_level.

    Headings inside fenced code blocks (```) are left untouched. With
    base_level == 1 headings keep their original level; otherwise each
    heading is demoted by (base_level - 1), capped at level 6. If the
    content contains no headings it is returned completely unchanged.
    """
    lines = content.split('\n')
    adjusted_lines = []
    has_headings = False
    inside_code_block = False
    for line in lines:
        stripped = line.strip()
        # Toggle fence state so '#' lines in code samples survive intact.
        if stripped.startswith('```'):
            inside_code_block = not inside_code_block
            adjusted_lines.append(line)
            continue
        if inside_code_block:
            adjusted_lines.append(line)
            continue
        if stripped.startswith('#'):
            has_headings = True
            # Count '#' on the stripped line: the raw line may carry
            # leading spaces, which previously made the count come out
            # as 0 and mangled indented headings.
            original_level = len(stripped) - len(stripped.lstrip('#'))
            if base_level == 1:
                new_level = original_level
            else:
                new_level = original_level + base_level - 1
            new_level = min(new_level, 6)
            heading_text = stripped.lstrip('#').strip()
            adjusted_lines.append('#' * new_level + ' ' + heading_text)
        else:
            adjusted_lines.append(line)
    if not has_headings:
        return content
    return '\n'.join(adjusted_lines)
50 |
51 |
def build_tree(directory: str, include_drafts: bool = False, parse_front_matter=None, should_include=None) -> Optional[Node]:
    """Build a Node tree rooted at directory, or None if it is excluded.

    The directory must contain _index.md or index.md; the supplied
    parse_front_matter / should_include callbacks decide whether the
    section belongs in the book. Subdirectories are recursed into and
    the resulting children are sorted by weight.
    """
    index_path = next(
        (candidate for candidate in
         (os.path.join(directory, name) for name in ("_index.md", "index.md"))
         if os.path.exists(candidate)),
        None,
    )
    if not index_path or not parse_front_matter or not should_include:
        return None

    metadata = parse_front_matter(index_path)
    title, weight, _draft, _publish, _export_pdf = metadata
    if not should_include(index_path, metadata, include_drafts):
        return None

    node = Node(
        title=title or os.path.basename(directory),
        path=index_path,
        weight=weight,
    )
    for entry in sorted(os.listdir(directory)):
        child_dir = os.path.join(directory, entry)
        if not os.path.isdir(child_dir):
            continue
        child = build_tree(child_dir, include_drafts, parse_front_matter, should_include)
        if child:
            node.children.append(child)
    node.children.sort(key=lambda child: child.weight)
    return node
75 |
def flatten_tree(node: Node, result: List[Node]) -> None:
    """Append node and all descendants to result in depth-first pre-order."""
    pending = [node]
    while pending:
        current = pending.pop(0)
        result.append(current)
        # Prepend children so traversal stays depth-first pre-order.
        pending[:0] = current.children
80 |
def write_hierarchical_content(tmp, node: Node, book_dir: str, temp_dir: str, temp_pngs: list, level: int = 1, cache_dir: Optional[str] = None, process_images_in_content=None, adjust_heading_levels_func=None) -> None:
    """Write node's content, then its children, into tmp as Markdown.

    The node title becomes a heading at min(level, 4). The file body has
    its leading YAML front matter, '---' rule lines and a leading H1
    stripped, and its own headings demoted to nest under the title.
    A LaTeX \\newpage is emitted after each top-level section.
    """
    # Cap heading depth at 4 so deep nesting still yields valid headings.
    heading_level = min(level, 4)
    heading = '#' * heading_level
    tmp.write(f'{heading} {node.title}\n\n')
    with open(node.path, "r", encoding="utf-8") as f:
        content = f.read()
    if process_images_in_content:
        content = process_images_in_content(content, book_dir, temp_dir, temp_pngs, node.path, cache_dir)
    # Strip a YAML front matter block (--- ... ---) at the very start.
    content = re.sub(r'^---\n.*?\n---\n', '', content, flags=re.DOTALL)
    # Drop lines that are just '---'. NOTE(review): this also removes
    # intentional Markdown horizontal rules from the body.
    content = re.sub(r'^\s*---\s*$', '', content, flags=re.MULTILINE)
    # Remove an H1 at the very start of the body (no MULTILINE flag, so
    # only a leading '# ' line matches); the node title replaces it.
    content = re.sub(r'^# .*\n', '', content, count=1)
    if adjust_heading_levels_func:
        content = adjust_heading_levels_func(content, heading_level)
    else:
        content = adjust_heading_levels(content, heading_level)
    tmp.write(content + "\n\n")
    # Children are written one heading level deeper.
    for child in node.children:
        write_hierarchical_content(tmp, child, book_dir, temp_dir, temp_pngs, level + 1, cache_dir, process_images_in_content, adjust_heading_levels_func)
    if level == 1:
        tmp.write('\\newpage\n\n')
101 |
--------------------------------------------------------------------------------
/filters/cleanup-filter.lua:
--------------------------------------------------------------------------------
1 | --[[
2 | Cleanup filter to fix common LaTeX line break issues
3 | This filter addresses problems with empty lines that have trailing backslashes
4 | ]]
5 |
-- Clean up code blocks: strip ANSI escape codes and trailing blank lines.
function CodeBlock(elem)
  -- Remove ANSI escape codes (literal ESC and octal-escaped forms).
  local clean_text = elem.text:gsub("\27%[[%d;]+m", "")
  clean_text = clean_text:gsub("\\033%[[%d;]+m", "")

  -- Normalize line endings, then split into lines. The previous
  -- gmatch("[^\r\n]*") loop yielded a spurious empty match after every
  -- line, which doubled interior blank lines when re-joined with "\n".
  clean_text = clean_text:gsub("\r\n", "\n"):gsub("\r", "\n")
  local lines = {}
  for line in (clean_text .. "\n"):gmatch("(.-)\n") do
    table.insert(lines, line)
  end

  -- Drop trailing empty / whitespace-only lines.
  while #lines > 0 and lines[#lines]:match("^%s*$") do
    table.remove(lines)
  end

  elem.text = table.concat(lines, "\n")
  return elem
end
26 |
-- Drop paragraphs whose stringified content is nothing but whitespace.
function Para(elem)
  local text = pandoc.utils.stringify(elem)
  if not text:match("^%s*$") then
    return elem
  end
  -- Returning an empty list removes the paragraph from the document.
  return {}
end
37 |
-- Sanitize raw LaTeX blocks: strip ANSI colour codes and repair
-- line-break artifacts (stray "\\" on empty lines, empty *Tok tokens).
function RawBlock(elem)
  if elem.format ~= "latex" then
    return elem
  end

  -- Ordered pattern/replacement pairs. Order matters: they are applied
  -- in exactly the sequence the individual fixes were written.
  local fixes = {
    -- ANSI escape codes (literal ESC and octal-escaped forms)
    { "\27%[[%d;]+m", "" },
    { "\\033%[[%d;]+m", "" },
    -- Empty lines that carry a trailing "\\"
    { "\n%s*\\\\%s*\n", "\n\n" },
    -- Whitespace-only lines ending in "\\"
    { "\n%s+\\\\%s*\n", "\n" },
    -- A dangling "\\" at the very end of the block
    { "\\\\%s*$", "" },
    -- Empty NormalTok tokens followed by "\\" (highlighting leftovers)
    { "\\NormalTok{%s*}\\\\%s*\n", "\n" },
    -- Empty highlight tokens at line endings
    { "\\[%a]+Tok{%s*}\\\\%s*$", "" },
    -- Collapse doubled newlines right after a token group
    { "}\\\\%s*\n%s*\n", "}\n\n" },
  }

  local cleaned = elem.text
  for _, fix in ipairs(fixes) do
    cleaned = cleaned:gsub(fix[1], fix[2])
  end

  elem.text = cleaned
  return elem
end
71 |
-- Clean up table cells to prevent empty line break issues
function Table(tbl)
  -- Drop break elements (SoftBreak/LineBreak) and empty Str elements
  -- from a cell's content list; everything else is kept in order.
  -- NOTE(review): this treats cell.contents as a list of inline
  -- elements (Str/Space/...). In recent pandoc versions Cell.contents
  -- is a list of Blocks, in which case none of these tags match and
  -- the cell passes through unchanged — confirm against the pandoc
  -- version actually in use.
  local function clean_cell_contents(contents)
    if not contents then return {} end

    local cleaned = {}
    for i, element in ipairs(contents) do
      if element.t == "Str" then
        -- Keep non-empty strings only
        if element.text and element.text ~= "" then
          table.insert(cleaned, element)
        end
      elseif element.t == "Space" then
        -- Keep spaces (trailing spaces included)
        table.insert(cleaned, element)
      elseif element.t ~= "SoftBreak" and element.t ~= "LineBreak" then
        -- Keep every other element kind except breaks
        table.insert(cleaned, element)
      end
    end
    return cleaned
  end

  -- Clean header rows
  if tbl.head and tbl.head.rows then
    for _, row in ipairs(tbl.head.rows) do
      for _, cell in ipairs(row.cells) do
        cell.contents = clean_cell_contents(cell.contents)
      end
    end
  end

  -- Clean body rows
  if tbl.bodies then
    for _, body in ipairs(tbl.bodies) do
      if body.body then
        for _, row in ipairs(body.body) do
          for _, cell in ipairs(row.cells) do
            cell.contents = clean_cell_contents(cell.contents)
          end
        end
      end
    end
  end

  return tbl
end
120 |
--------------------------------------------------------------------------------
/filters/symbol-fallback-filter.lua:
--------------------------------------------------------------------------------
1 | -- tools/pdf-book-exporter/filters/symbol-fallback-filter.lua
2 | -- Symbol fallback filter for LaTeX PDF generation
3 | --
4 | -- This filter handles special Unicode symbols that may not be available in all fonts
5 | -- by replacing them with LaTeX commands that provide proper fallbacks.
6 | --
7 | -- Key functionality:
8 | -- 1. Replaces currency symbols with LaTeX currency commands
9 | -- 2. Handles special punctuation and technical symbols
10 | -- 3. Provides font-independent symbol rendering
11 | -- 4. Avoids processing code contexts where symbols should be preserved
12 | --
13 | -- Implementation approach:
14 | -- - Simple and reliable processing of only Str elements
15 | -- - Avoids complex AST traversal that could cause issues
16 | -- - Uses direct string replacement with LaTeX commands
17 | -- - Preserves code blocks and inline code unchanged
18 |
19 | -- Define the symbol mapping table
20 | --
21 | -- Maps Unicode symbols to LaTeX commands that provide reliable fallbacks.
22 | -- Each command should be defined in the LaTeX template or through packages.
23 | --
24 | -- Categories:
25 | -- 1. Currency symbols - Various international currencies
26 | -- 2. Special punctuation - Rare punctuation marks
27 | -- 3. Technical symbols - UI/UX and interface symbols
28 | --
29 | -- Note: LaTeX commands like \rupee{} should be defined in the template
30 | -- with appropriate fallback mechanisms for missing fonts
local symbol_map = {
    -- NOTE(review): replacements are applied with string.gsub, so the
    -- commands must not contain '%' (magic in gsub replacements); none
    -- currently do.
    -- Currency symbols
    ["₹"] = "\\rupee{}", -- Indian Rupee
    ["₽"] = "\\ruble{}", -- Russian Ruble
    ["₪"] = "\\shekel{}", -- Israeli Shekel
    -- NOTE(review): \colon clashes with the amsmath math-mode command
    -- of the same name; the template must define a text-mode variant.
    ["₡"] = "\\colon{}", -- Costa Rican Colon
    ["₢"] = "\\cruzeiro{}", -- Brazilian Cruzeiro
    ["₣"] = "\\franc{}", -- French Franc
    ["₤"] = "\\lira{}", -- Italian Lira
    ["₥"] = "\\mill{}", -- Mill symbol
    ["₦"] = "\\naira{}", -- Nigerian Naira
    ["₧"] = "\\peseta{}", -- Spanish Peseta
    ["₨"] = "\\rupeeold{}", -- Old Rupee symbol

    -- Special punctuation and symbols
    ["‴"] = "\\tripleprime{}", -- Triple prime
    ["⁏"] = "\\reversedSemicolon{}", -- Reversed semicolon
    ["⏳"] = "\\hourglass{}", -- Hourglass
    ["ℹ"] = "\\infoSymbol{}", -- Information symbol
    ["✊"] = "\\raisedFist{}", -- Raised fist
    ["⌨"] = "\\keyboardSymbol{}", -- Keyboard symbol
}
53 |
54 | -- Simple and reliable approach: only process Str elements
55 | -- This avoids complex data structure issues with pandoc.walk_inline/walk_block
56 | --
57 | -- Processing strategy:
58 | -- 1. Only handle Str (string) elements to avoid AST complexity
59 | -- 2. Use simple string replacement for reliability
60 | -- 3. Return RawInline LaTeX when changes are made
61 | -- 4. Explicitly avoid processing Code and CodeBlock elements
62 | -- 5. Only process LaTeX output format
63 |
64 | -- Process Str elements (inline text)
65 | --
66 | -- Main processing function for regular text content.
67 | -- Scans through the symbol mapping table and replaces any found symbols
68 | -- with their corresponding LaTeX commands.
69 | --
70 | -- Process:
71 | -- 1. Check if generating LaTeX output
72 | -- 2. Apply string replacements for each symbol in the mapping
73 | -- 3. Return RawInline LaTeX if any changes were made
74 | -- 4. Otherwise return the original element unchanged
function Str(elem)
    -- Only process LaTeX output; other formats keep the raw symbols.
    if not FORMAT:match 'latex' then
        return elem
    end

    local text = elem.text
    local changed = false

    -- Replace each mapped symbol with its LaTeX fallback command.
    for symbol, command in pairs(symbol_map) do
        local new_text = text:gsub(symbol, command)
        if new_text ~= text then
            text = new_text
            changed = true
        end
    end

    -- If any changes were made, emit the whole string as raw LaTeX.
    -- NOTE(review): RawInline bypasses pandoc's LaTeX escaping for the
    -- entire string, so other special characters (%, &, _, #) occurring
    -- in the same Str would reach LaTeX unescaped — confirm inputs.
    if changed then
        return pandoc.RawInline('latex', text)
    end

    -- Otherwise return original element
    return elem
end
101 |
102 | -- Don't process inline code
103 | --
104 | -- Inline code should preserve symbols exactly as written.
105 | -- Users may intentionally include Unicode symbols in code examples.
function Code(elem)
  -- Intentionally a no-op: inline code keeps its symbols verbatim.
  return elem
end
109 |
110 | -- Don't process code blocks
111 | --
112 | -- Code blocks should preserve all original characters.
113 | -- Symbol replacement could break code syntax or meaning.
function CodeBlock(elem)
  -- Intentionally a no-op: code blocks keep all original characters.
  return elem
end
117 |
--------------------------------------------------------------------------------
/filters/table-filter.lua:
--------------------------------------------------------------------------------
1 | -- table-filter.lua
2 | -- Pandoc filter to generate tables with black outer border and gray inner lines
3 |
-- Render pandoc tables as raw LaTeX longtables with a black outer
-- border, gray inner rules, bold gray-shaded header rows, and equal
-- left-aligned columns spanning 90% of the text width.
function Table(elem)
  -- Leave tables alone unless the target format is LaTeX.
  if FORMAT ~= "latex" then
    return elem
  end

  local column_count = #elem.colspecs
  local out = {}
  local function emit(line) table.insert(out, line) end

  -- Render one cell's contents to a single-line LaTeX fragment.
  local function render_cell(cell)
    local fragment = pandoc.write(pandoc.Pandoc(cell.contents), 'latex')
    fragment = fragment:gsub("\\par\n", "")
    fragment = fragment:gsub("\n", " ")
    fragment = fragment:gsub("^%s+", "")
    fragment = fragment:gsub("%s+$", "")
    return fragment
  end

  -- Custom vertical rules: G = gray separator, B = black border.
  emit("\\newcolumntype{G}{!{\\color{gray!40}\\vrule}}")
  emit("\\newcolumntype{B}{!{\\color{black}\\vrule}}")

  -- Column spec: black outer borders, equal-width left-aligned columns,
  -- gray separators between columns.
  local pieces = { "B" }
  for index = 1, column_count do
    table.insert(pieces, ">{\\raggedright\\arraybackslash}p{"
      .. string.format("%.2f", 0.9 / column_count) .. "\\textwidth}")
    if index < column_count then
      table.insert(pieces, "G")
    end
  end
  table.insert(pieces, "B")

  emit("\\sloppy")
  emit("\\arrayrulecolor{black}")
  emit("\\begin{longtable}{" .. table.concat(pieces) .. "}")
  emit("\\tablefontsize")
  emit("\\hline")

  -- Header rows: bold text on a light gray background, gray rule below.
  if elem.head and elem.head.rows and #elem.head.rows > 0 then
    for _, header_row in ipairs(elem.head.rows) do
      local rendered = {}
      for _, cell in ipairs(header_row.cells) do
        table.insert(rendered, "\\textbf{" .. render_cell(cell) .. "}")
      end
      emit("\\rowcolor{gray!10}")
      emit(table.concat(rendered, " & ") .. " \\\\")
      emit("\\arrayrulecolor{gray!40}")
      emit("\\hline")
    end
  end

  -- Body rows: gray rules between rows, black rule after the last row.
  if elem.bodies and #elem.bodies > 0 then
    for _, body in ipairs(elem.bodies) do
      for row_index, body_row in ipairs(body.body) do
        local rendered = {}
        for _, cell in ipairs(body_row.cells) do
          table.insert(rendered, render_cell(cell))
        end
        emit(table.concat(rendered, " & ") .. " \\\\")
        if row_index < #body.body then
          emit("\\arrayrulecolor{gray!40}")
        else
          emit("\\arrayrulecolor{black}")
        end
        emit("\\hline")
      end
    end
  end

  emit("\\end{longtable}")
  emit("\\relax")
  emit("\\arrayrulecolor{black}")

  return pandoc.RawBlock("latex", table.concat(out, "\n"))
end
99 |
--------------------------------------------------------------------------------
/table-filter.lua:
--------------------------------------------------------------------------------
1 | -- table-filter.lua
2 | -- Pandoc filter to generate tables with black outer border and gray inner lines
3 |
-- Render pandoc tables as raw LaTeX longtables: black outer border,
-- gray inner rules, bold shaded header rows, and equal-width
-- left-aligned columns over the full text width.
function Table(elem)
  -- Rewrite tables only when targeting LaTeX output.
  if FORMAT ~= "latex" then
    return elem
  end

  local total_cols = #elem.colspecs
  local lines = {}

  -- Render a cell's contents as a one-line LaTeX fragment.
  local function cell_to_latex(cell)
    local rendered = pandoc.write(pandoc.Pandoc(cell.contents), 'latex')
    rendered = rendered:gsub("\\par\n", "")
    rendered = rendered:gsub("\n", " ")
    rendered = rendered:gsub("^%s+", "")
    rendered = rendered:gsub("%s+$", "")
    return rendered
  end

  -- Custom vertical rules: G = gray separator, B = black border.
  table.insert(lines, "\\newcolumntype{G}{!{\\color{gray!40}\\vrule}}")
  table.insert(lines, "\\newcolumntype{B}{!{\\color{black}\\vrule}}")

  -- Equal-width, left-aligned columns over 98% of \textwidth; the
  -- remaining 2% absorbs borders/separators to prevent overflow.
  local per_column = 0.98 / total_cols
  local spec_parts = { "B" }
  for col = 1, total_cols do
    spec_parts[#spec_parts + 1] =
      ">{\\raggedright\\arraybackslash}p{" .. string.format("%.3f", per_column) .. "\\textwidth}"
    if col < total_cols then
      spec_parts[#spec_parts + 1] = "G"
    end
  end
  spec_parts[#spec_parts + 1] = "B"

  table.insert(lines, "\\sloppy")
  table.insert(lines, "\\arrayrulecolor{black}")
  table.insert(lines, "\\begin{longtable}{" .. table.concat(spec_parts) .. "}")
  table.insert(lines, "\\tablefontsize")
  table.insert(lines, "\\hline")

  -- Header rows: bold text, light gray background, gray rule underneath.
  if elem.head and elem.head.rows and #elem.head.rows > 0 then
    for _, header_row in ipairs(elem.head.rows) do
      local cells = {}
      for _, cell in ipairs(header_row.cells) do
        cells[#cells + 1] = "\\textbf{" .. cell_to_latex(cell) .. "}"
      end
      table.insert(lines, "\\rowcolor{gray!10}")
      table.insert(lines, table.concat(cells, " & ") .. " \\\\")
      table.insert(lines, "\\arrayrulecolor{gray!40}")
      table.insert(lines, "\\hline")
    end
  end

  -- Body rows: gray rules between rows, black rule after the final row.
  if elem.bodies and #elem.bodies > 0 then
    for _, body in ipairs(elem.bodies) do
      for row_number, body_row in ipairs(body.body) do
        local cells = {}
        for _, cell in ipairs(body_row.cells) do
          cells[#cells + 1] = cell_to_latex(cell)
        end
        table.insert(lines, table.concat(cells, " & ") .. " \\\\")
        if row_number < #body.body then
          table.insert(lines, "\\arrayrulecolor{gray!40}")
        else
          table.insert(lines, "\\arrayrulecolor{black}")
        end
        table.insert(lines, "\\hline")
      end
    end
  end

  table.insert(lines, "\\end{longtable}")
  table.insert(lines, "\\relax")
  table.insert(lines, "\\arrayrulecolor{black}")

  return pandoc.RawBlock("latex", table.concat(lines, "\n"))
end
105 |
--------------------------------------------------------------------------------
/emoji_support.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 |
def detect_emoji_fonts():
    """Probe the system font list (via fc-list) for usable emoji fonts.

    Returns:
        dict with keys:
            'primary'   - best available emoji font name (never None),
            'fallbacks' - remaining detected fonts in priority order,
            'available' - every detected font, primary first.

    When fc-list is missing or fails, falls back to a hardcoded,
    macOS-flavoured guess rather than raising.
    """
    # Candidate fonts, best first.
    priority_order = (
        'Apple Color Emoji',
        'Noto Color Emoji',
        'Segoe UI Emoji',
        'Arial Unicode MS',
        'Symbola',
        'DejaVu Sans',
    )
    fonts = {'primary': None, 'fallbacks': [], 'available': []}
    try:
        listing = subprocess.run(
            ['fc-list', ':', 'family'],
            capture_output=True, text=True, check=True,
        ).stdout.split('\n')
        for candidate in priority_order:
            if not any(candidate in entry for entry in listing):
                continue
            fonts['available'].append(candidate)
            if fonts['primary'] is None:
                fonts['primary'] = candidate
            else:
                fonts['fallbacks'].append(candidate)
        if not fonts['available']:
            # Nothing matched: fall back to the CJK main font.
            fonts['primary'] = 'Source Han Sans SC'
            fonts['available'].append('Source Han Sans SC')
    except (subprocess.CalledProcessError, FileNotFoundError):
        # fc-list unavailable or failed: assume a typical set.
        fonts['primary'] = 'Apple Color Emoji'
        fonts['available'] = ['Apple Color Emoji', 'Noto Color Emoji', 'Segoe UI Emoji']
        fonts['fallbacks'] = ['Noto Color Emoji', 'Segoe UI Emoji']
    return fonts
36 |
def generate_emoji_font_config(emoji_fonts_info):
    """Render the nested \\IfFontExistsTF cascade that selects \\emojifont.

    Args:
        emoji_fonts_info: dict with 'primary', 'fallbacks', 'available'
            as produced by detect_emoji_fonts().

    Returns:
        LaTeX source that tries the primary font, then each fallback,
        finally degrading to \\rmfamily with a \\typeout warning.
    """
    if not emoji_fonts_info['available']:
        return "% No emoji fonts detected\n\\let\\emojifont\\rmfamily\n"

    primary = emoji_fonts_info['primary']
    fallbacks = emoji_fonts_info['fallbacks']

    lines = ["% Enhanced emoji font detection and configuration"]
    # Outermost conditional: the primary font.
    lines.append(f"\\IfFontExistsTF{{{primary}}}{{")
    lines.append(f" \\newfontfamily\\emojifont{{{primary}}}[Renderer=HarfBuzz]")
    lines.append(f" \\typeout{{Using {primary} for emoji rendering}}")
    lines.append("}{")

    # Each fallback opens one more nested else-branch.
    indent = " "
    for fallback in fallbacks:
        lines.append(f"{indent}\\IfFontExistsTF{{{fallback}}}{{")
        lines.append(f"{indent} \\newfontfamily\\emojifont{{{fallback}}}[Renderer=HarfBuzz]")
        lines.append(f"{indent} \\typeout{{Using {fallback} for emoji rendering}}")
        lines.append(f"{indent}}}{{")
        indent += " "

    # Innermost branch: no usable emoji font at all.
    lines.append(f"{indent}\\let\\emojifont\\rmfamily")
    lines.append(f"{indent}\\typeout{{Warning: No suitable emoji font found, using main font}}")

    # Close every open else-branch (one for the primary + one per fallback).
    for _ in range(len(fallbacks) + 1):
        indent = indent[:-2]
        lines.append(f"{indent}}}")
    return "\n".join(lines)
61 |
def configure_emoji_fonts_for_template(template_vars):
    """Detect emoji fonts and inject the results into ``template_vars``.

    Mutates and returns the same dict, adding the generated LaTeX font
    configuration plus the raw detection results.
    """
    info = detect_emoji_fonts()
    template_vars['emoji_font_config'] = generate_emoji_font_config(info)
    template_vars['primary_emoji_font'] = info['primary']
    template_vars['emoji_fonts_available'] = info['available']
    template_vars['emoji_fallback_fonts'] = info['fallbacks']
    return template_vars
70 |
def validate_emoji_support_requirements(emoji: bool, diagnostics_mode: bool = False) -> dict:
    """Return a validation summary for emoji support.

    Simplified version: detailed diagnostics are omitted; only the engine
    choice and the detected emoji fonts are recorded.
    """
    return {
        'valid': True,
        'warnings': [],
        'errors': [],
        # LuaLaTeX is selected when emoji rendering is requested.
        'engine': 'lualatex' if emoji else 'xelatex',
        'emoji_fonts': detect_emoji_fonts(),
        'diagnostics': [],
        'system_info': {},
    }
83 |
def _analyze_pandoc_error(error, emoji, pdf_engine, emoji_validation):
    """Analyze a Pandoc failure and suggest whether/how to retry.

    Args:
        error: the raised exception (typically subprocess.CalledProcessError).
        emoji: whether emoji support was enabled for this run.
        pdf_engine: name of the PDF engine in use (kept for signature
            compatibility; not consulted here).
        emoji_validation: emoji validation result dict (currently unused).

    Returns:
        dict with 'retry_recommended' (bool), 'retry_reason' (str) and
        'suggested_fixes' (list of identifiers understood by
        _apply_error_fixes).
    """
    analysis = {
        'retry_recommended': False,
        'retry_reason': '',
        'suggested_fixes': []
    }

    # Bug fix: CalledProcessError always *has* a .stderr attribute, but it
    # is None when output was not captured (and bytes when captured in
    # binary mode). The old hasattr() check therefore analyzed the literal
    # string 'None' (or a bytes repr) instead of the real message. Prefer a
    # truthy stderr, decode bytes, and otherwise fall back to str(error).
    stderr = getattr(error, 'stderr', None)
    if isinstance(stderr, bytes):
        stderr = stderr.decode('utf-8', errors='replace')
    error_message = stderr if stderr else str(error)

    lowered = error_message.lower()
    # Known failure patterns and their recovery strategies.
    if 'xelatex' in lowered and 'not found' in lowered:
        analysis['retry_recommended'] = True
        analysis['retry_reason'] = 'XeLaTeX engine issue detected, trying basic fallback'
        analysis['suggested_fixes'] = ['remove_shell_escape']
    elif 'timeout' in lowered:
        analysis['retry_recommended'] = True
        analysis['retry_reason'] = 'Timeout detected, retrying with extended timeout'
    elif emoji and 'lua' in lowered:
        analysis['retry_recommended'] = True
        analysis['retry_reason'] = 'Emoji/Lua filter issue, trying without emoji filters'
        analysis['suggested_fixes'] = ['remove_emoji_filters']

    return analysis
108 |
109 | def _apply_error_fixes(cmd, suggested_fixes):
110 | """根据错误分析应用修复建议"""
111 | new_cmd = cmd.copy()
112 |
113 | for fix in suggested_fixes:
114 | if fix == 'remove_shell_escape':
115 | # 移除 shell-escape 选项
116 | new_cmd = [arg for arg in new_cmd if '--pdf-engine-opt=-shell-escape' not in arg]
117 | elif fix == 'remove_emoji_filters':
118 | # 移除emoji相关的lua过滤器
119 | new_cmd = [arg for arg in new_cmd if 'emoji' not in arg.lower()]
120 |
121 | return new_cmd
122 |
123 | def _handle_final_pandoc_failure(error, emoji, pdf_engine, emoji_validation, tmp_path, template_path, emoji_filter_path):
124 | """处理最终的 Pandoc 失败"""
125 | print("❌ PDF generation failed after all retry attempts")
126 | print("\n🔍 Error Analysis:")
127 |
128 | if hasattr(error, 'stderr') and error.stderr:
129 | print(f" Pandoc stderr: {error.stderr[:500]}...")
130 |
131 | print(f" PDF Engine: {pdf_engine}")
132 | print(f" Emoji enabled: {emoji}")
133 | print(f" Template: {template_path}")
134 |
135 | print("\n💡 Troubleshooting suggestions:")
136 | print(" 1. Check if XeLaTeX/LuaLaTeX is properly installed")
137 | print(" 2. Verify template.tex syntax")
138 | print(" 3. Try without emoji support (remove --emoji flag)")
139 | print(" 4. Check intermediate markdown file for issues")
140 | print(f" 5. Debug file: {tmp_path}")
141 |
142 | print("\n📋 For detailed diagnostics, run:")
143 | print(" python cli.py --diagnostics ")
144 |
--------------------------------------------------------------------------------
/emoji-commands.tex:
--------------------------------------------------------------------------------
%% Emoji-specific LaTeX commands and environments
%% This file provides comprehensive emoji support with sizing, coloring, and context-aware rendering

% --- Core Emoji Commands ---

% Enhanced \emoji{} command with fallback support.
% #1: the emoji glyph. Renders with \emojifont when the font-detection
% preamble defined that family; otherwise emits the glyph as-is and
% logs a warning to the transcript.
\newcommand{\customemoji}[1]{%
\ifcsname emojifont\endcsname%
{{\emojifont #1}}%
\else%
\typeout{Warning: Emoji font not available, using fallback}%
#1%
\fi%
}

% --- Emoji Sizing Commands ---

% Define emoji size presets.
% #1: dimension used for both the size and the baselineskip.
\newcommand{\emojisize}[1]{\fontsize{#1}{#1}\selectfont}

% Predefined emoji sizes. Each is wrapped in \begingroup/\endgroup so
% the size change never leaks into surrounding text.
\newcommand{\emojitiny}[1]{\begingroup\emojisize{8pt}\emoji{#1}\endgroup}
\newcommand{\emojismall}[1]{\begingroup\emojisize{10pt}\emoji{#1}\endgroup}
\newcommand{\emojinormal}[1]{\begingroup\emojisize{12pt}\emoji{#1}\endgroup}
\newcommand{\emojilarge}[1]{\begingroup\emojisize{14pt}\emoji{#1}\endgroup}
\newcommand{\emojiLarge}[1]{\begingroup\emojisize{17pt}\emoji{#1}\endgroup}
\newcommand{\emojiLARGE}[1]{\begingroup\emojisize{20pt}\emoji{#1}\endgroup}
\newcommand{\emojihuge}[1]{\begingroup\emojisize{25pt}\emoji{#1}\endgroup}
\newcommand{\emojiHuge}[1]{\begingroup\emojisize{30pt}\emoji{#1}\endgroup}

% Custom size emoji command. #1: emoji glyph, #2: dimension (e.g. 16pt).
\newcommand{\emojiwithsize}[2]{\begingroup\emojisize{#2}\emoji{#1}\endgroup}

% --- Emoji Color Commands ---

% Note: Most emoji fonts are color fonts and don't respond to LaTeX color commands
% These commands are provided for fallback scenarios or special cases

% #1: color name (xcolor syntax), #2: content to colorize.
\newcommand{\emojicolored}[2]{%
\ifcsname emojifont\endcsname%
{{\emojifont\textcolor{#1}{#2}}}%
\else%
\textcolor{#1}{#2}%
\fi%
}
46 |
% --- Context-Aware Emoji Commands ---

% Emoji in headings - automatically sized for the current heading level.
% The LaTeX internal \@currsize (which records the active size command)
% is compared against the standard size macros to pick a matching size.
%
% Fix: \@currsize contains '@', which only parses as one control word
% while @ has letter catcode (11). When this file is \input from a
% document preamble - the usual case for a pandoc template - @ is
% catcode 12 and the original definition mis-tokenized as \@ followed
% by the literal text 'currsize'. Save the current catcode of @, switch
% to letter, define, then restore so surrounding files are unaffected.
\chardef\emojiheadingcatcode=\catcode`\@
\catcode`\@=11
\newcommand{\emojiheading}[2][]{%
\ifx\@currsize\Huge%
\emojihuge{#2}% For \chapter
\else\ifx\@currsize\LARGE%
\emojiLARGE{#2}% For \section
\else\ifx\@currsize\Large%
\emojilarge{#2}% For \subsection
\else\ifx\@currsize\large%
\emojinormal{#2}% For \subsubsection
\else%
\emoji{#2}% Default size
\fi\fi\fi\fi%
}
\catcode`\@=\emojiheadingcatcode
63 |
% Emoji in tables - optimized for table context.
% #1: emoji glyph, rendered at 10pt to fit table rows.
\newcommand{\emojiintable}[1]{%
\begingroup%
\emojisize{10pt}% Smaller size for tables
\emoji{#1}%
\endgroup%
}

% Emoji in lists - consistent with list text.
% #1: emoji glyph, rendered at 11pt.
\newcommand{\emojiinlist}[1]{%
\begingroup%
\emojisize{11pt}% Slightly smaller than normal text
\emoji{#1}%
\endgroup%
}

% Emoji in captions - matching caption font size.
% #1: emoji glyph, rendered at 10pt.
\newcommand{\emojicaption}[1]{%
\begingroup%
\emojisize{10pt}% Caption size
\emoji{#1}%
\endgroup%
}
87 |
% --- Fallback Text Representations ---

% Define fallback text for common emojis when emoji font is not available.
% #1: emoji glyph, #2: plain-text stand-in (typeset in monospace).
\newcommand{\emojifallback}[2]{%
\ifcsname emojifont\endcsname%
\emoji{#1}%
\else%
\texttt{#2}% Use monospace for fallback text
\fi%
}

% Common emoji fallbacks (only define if not already defined).
% Fix: the list previously mixed \providecommand and \newcommand; the
% \newcommand entries contradicted the "only define if not already
% defined" intent and would raise "Command already defined" if a
% template pre-declared e.g. \emojiheart. All entries now consistently
% use \providecommand.
\providecommand{\emojicheck}{\emojifallback{✅}{[CHECK]}}
\providecommand{\emojicross}{\emojifallback{❌}{[X]}}
\providecommand{\emojiwarning}{\emojifallback{⚠️}{[WARNING]}}
\providecommand{\emojinote}{\emojifallback{📝}{[NOTE]}}
\providecommand{\emojitool}{\emojifallback{🔧}{[TOOL]}}
\providecommand{\emojiidea}{\emojifallback{💡}{[IDEA]}}
\providecommand{\emojirocket}{\emojifallback{🚀}{[ROCKET]}}
\providecommand{\emojichart}{\emojifallback{📊}{[CHART]}}
\providecommand{\emojitarget}{\emojifallback{🎯}{[TARGET]}}
\providecommand{\emojistar}{\emojifallback{⭐}{[STAR]}}
\providecommand{\emojiheart}{\emojifallback{❤️}{[HEART]}}
\providecommand{\emojithumbsup}{\emojifallback{👍}{[THUMBS-UP]}}
\providecommand{\emojithumbsdown}{\emojifallback{👎}{[THUMBS-DOWN]}}
\providecommand{\emojifire}{\emojifallback{🔥}{[FIRE]}}
\providecommand{\emojiparty}{\emojifallback{🎉}{[PARTY]}}
\providecommand{\emojiclock}{\emojifallback{🕐}{[CLOCK]}}
\providecommand{\emojiphone}{\emojifallback{📱}{[PHONE]}}
\providecommand{\emojicomputer}{\emojifallback{💻}{[COMPUTER]}}
\providecommand{\emojibook}{\emojifallback{📚}{[BOOK]}}
\providecommand{\emojiemail}{\emojifallback{📧}{[EMAIL]}}
\providecommand{\emojilink}{\emojifallback{🔗}{[LINK]}}
\providecommand{\emojikey}{\emojifallback{🔑}{[KEY]}}
\providecommand{\emojilock}{\emojifallback{🔒}{[LOCK]}}
\providecommand{\emojiunlock}{\emojifallback{🔓}{[UNLOCK]}}
\providecommand{\emojisearch}{\emojifallback{🔍}{[SEARCH]}}
\providecommand{\emojidownload}{\emojifallback{⬇️}{[DOWNLOAD]}}
\providecommand{\emojiupload}{\emojifallback{⬆️}{[UPLOAD]}}
127 |
% --- Emoji Environments ---

% Environment for emoji-rich content with optimized spacing.
% Locally redefines \emoji inside a group; the \endgroup emitted by the
% end-code restores the original definition.
\newenvironment{emojitext}{%
\begingroup%
\setlength{\parskip}{4pt plus 1pt minus 1pt}% Slightly more space between paragraphs
\renewcommand{\emoji}[1]{%
\ifcsname emojifont\endcsname%
{{\emojifont ##1}}%
\else%
##1%
\fi%
}%
}{%
\endgroup%
}

% Environment for emoji lists with proper alignment.
% NOTE(review): the [leftmargin=...] key-value options on itemize require
% the enumitem package - confirm the main template loads it.
\newenvironment{emojiitemize}{%
\begin{itemize}[leftmargin=2em,itemsep=2pt,parsep=0pt]%
\renewcommand{\labelitemi}{\emoji{•}}% Use emoji bullet if available
}{%
\end{itemize}%
}

% Environment for emoji tables with optimized rendering.
% Saves the current \emoji as \originalemoji, then wraps it at 9pt for
% the duration of the environment.
\newenvironment{emojitabular}{%
\begingroup%
\let\originalemoji\emoji%
\renewcommand{\emoji}[1]{%
\begingroup%
\emojisize{9pt}% Smaller size for table context
\originalemoji{##1}%
\endgroup%
}%
}{%
\endgroup%
}
166 |
% --- Advanced Emoji Commands ---

% Emoji with tooltip-like fallback (for accessibility).
% #1: emoji glyph, #2: textual description, added as a footnote only
% when no emoji font is available.
\newcommand{\emojiwithalt}[2]{%
\emoji{#1}%
\ifcsname emojifont\endcsname%
\else%
\footnote{#2}% Add footnote with description if no emoji font
\fi%
}

% Inline emoji with automatic spacing adjustment.
\newcommand{\emojiinline}[1]{%
\,\emoji{#1}\,% Add thin spaces around emoji
}

% Emoji sequence for complex emoji combinations (multi-codepoint
% sequences rendered as a single run by the emoji font).
\newcommand{\emojisequence}[1]{%
\ifcsname emojifont\endcsname%
{{\emojifont #1}}%
\else%
#1%
\fi%
}

% --- Emoji Debugging and Diagnostics ---

% Command to test emoji rendering; logs the outcome to the transcript
% and typesets either the glyph or a [NO-EMOJI-FONT] marker.
\newcommand{\emojitestrender}[1]{%
\typeout{Testing emoji rendering for: #1}%
\ifcsname emojifont\endcsname%
\typeout{Emoji font available: \meaning\emojifont}%
{{\emojifont #1}}%
\else%
\typeout{Warning: No emoji font available}%
\texttt{[NO-EMOJI-FONT]}%
\fi%
}

% Command to show emoji font information (transcript + inline text).
\newcommand{\emojifontinfo}{%
\ifcsname emojifont\endcsname%
\typeout{Emoji font family: \meaning\emojifont}%
\texttt{Emoji font available}%
\else%
\typeout{No emoji font configured}%
\texttt{No emoji font}%
\fi%
}

% --- Compatibility Commands ---

% Ensure compatibility with existing emoji usage (aliases for the
% fallback commands defined above).
\providecommand{\greencheckmark}{\emojicheck}
\providecommand{\redcrossmark}{\emojicross}
\providecommand{\orangewarningmark}{\emojiwarning}

% Note: Legacy emoji commands are defined in the main template to avoid conflicts

% --- End of Emoji Commands ---
--------------------------------------------------------------------------------
/cache_utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import shutil
4 | import hashlib
5 | import time
6 | from pathlib import Path
7 |
def get_file_hash(file_path):
    """Return the SHA-256 hex digest of a file, or None on any read error."""
    digest = hashlib.sha256()
    try:
        with open(file_path, 'rb') as fh:
            # Stream in 4 KiB chunks so large images do not load into memory.
            while True:
                chunk = fh.read(4096)
                if not chunk:
                    break
                digest.update(chunk)
    except Exception as e:
        print(f"Error calculating hash for {file_path}: {e}")
        return None
    return digest.hexdigest()
18 |
def get_cache_dir(book_dir):
    """Locate (and create) the image cache directory for a book.

    Walks up from ``book_dir`` looking for an ancestor whose *parent*
    directory is named ``book``; that ancestor is treated as the book
    root. Defaults to ``book_dir`` itself when no such ancestor exists.
    Returns the path of an existing ``image-caches`` directory inside
    the chosen root.
    """
    book_dir = os.path.abspath(book_dir)
    root = book_dir
    node = book_dir
    # Stop at the filesystem root (where dirname(x) == x).
    while node and node != os.path.dirname(node):
        parent = os.path.dirname(node)
        if os.path.basename(parent) == "book":
            root = node
            break
        node = parent
    cache_dir = os.path.join(root, 'image-caches')
    os.makedirs(cache_dir, exist_ok=True)
    return cache_dir
33 |
def get_cache_metadata_path(cache_dir):
    """Return the path of the metadata JSON file inside ``cache_dir``."""
    return os.path.join(cache_dir, 'cache_metadata.json')
36 |
def load_cache_metadata(cache_dir):
    """Read the cache metadata JSON; return {} when missing or unreadable."""
    path = get_cache_metadata_path(cache_dir)
    if os.path.exists(path):
        try:
            with open(path, 'r', encoding='utf-8') as fh:
                return json.load(fh)
        except Exception as e:
            print(f"Error loading cache metadata: {e}")
    return {}
46 |
def save_cache_metadata(cache_dir, metadata):
    """Persist cache metadata as pretty-printed UTF-8 JSON (best effort)."""
    target = get_cache_metadata_path(cache_dir)
    try:
        with open(target, 'w', encoding='utf-8') as fh:
            json.dump(metadata, fh, indent=2, ensure_ascii=False)
    except Exception as e:
        print(f"Error saving cache metadata: {e}")
54 |
def get_cached_image(source_path, cache_dir, target_extension='.png'):
    """Return the cached conversion of ``source_path`` if still valid.

    A cache entry is valid when its recorded source hash matches the
    current hash of the source file. A stale entry (hash mismatch) is
    deleted along with its metadata. Returns the cache file path on a
    hit, None on a miss.
    """
    if not os.path.exists(source_path):
        return None
    current_hash = get_file_hash(source_path)
    if not current_hash:
        return None
    # Cache names embed the first 12 hex chars of the source hash.
    stem = os.path.splitext(os.path.basename(source_path))[0]
    cache_filename = f"{stem}_{current_hash[:12]}{target_extension}"
    cache_path = os.path.join(cache_dir, cache_filename)
    metadata = load_cache_metadata(cache_dir)
    if os.path.exists(cache_path):
        recorded = metadata.get(cache_filename, {}).get('source_hash')
        if recorded == current_hash:
            print(f"Using cached image: {cache_path}")
            return cache_path
        # Hash mismatch: the source changed since caching.
        print(f"Cache invalid for {source_path}, removing old cache")
        try:
            os.remove(cache_path)
            if cache_filename in metadata:
                del metadata[cache_filename]
            save_cache_metadata(cache_dir, metadata)
        except Exception as e:
            print(f"Error removing old cache: {e}")
    return None
82 |
def save_to_cache(source_path, converted_path, cache_dir):
    """Copy a converted image into the cache, keyed by the source hash.

    Returns the cache path on success, None when either input file is
    missing, hashing fails, or the copy/metadata update raises.
    """
    if not (os.path.exists(source_path) and os.path.exists(converted_path)):
        return None
    source_hash = get_file_hash(source_path)
    if not source_hash:
        return None
    # Same naming scheme as get_cached_image: stem + 12-char hash + ext.
    stem = os.path.splitext(os.path.basename(source_path))[0]
    ext = os.path.splitext(converted_path)[1]
    cache_filename = f"{stem}_{source_hash[:12]}{ext}"
    cache_path = os.path.join(cache_dir, cache_filename)
    try:
        shutil.copy2(converted_path, cache_path)
        metadata = load_cache_metadata(cache_dir)
        metadata[cache_filename] = {
            'source_path': os.path.abspath(source_path),
            'source_hash': source_hash,
            'cached_at': time.time(),
            'cache_path': cache_path,
        }
        save_cache_metadata(cache_dir, metadata)
        print(f"Saved to cache: {cache_path}")
        return cache_path
    except Exception as e:
        print(f"Error saving to cache: {e}")
        return None
109 |
def get_cached_image_by_key(cache_key, cache_dir, extension='.png'):
    """Look up a cached file by its logical cache key.

    Returns the first metadata entry whose 'cache_key' matches and whose
    file still exists, else None.

    NOTE(review): the ``extension`` parameter is accepted but never used
    by the lookup; kept for interface compatibility.
    """
    try:
        for info in load_cache_metadata(cache_dir).values():
            if info.get('cache_key') != cache_key:
                continue
            path = info.get('cache_path')
            if path and os.path.exists(path):
                return path
    except Exception as e:
        print(f"Error checking cache: {e}")
    return None
121 |
def save_to_cache_with_key(cache_key, file_path, cache_dir):
    """Store ``file_path`` in the cache under an explicit key.

    The cached copy is always named ``<cache_key>.png`` regardless of the
    source file's extension. Returns the cache path, or None on failure.
    """
    try:
        cache_filename = f"{cache_key}.png"
        cache_path = os.path.join(cache_dir, cache_filename)
        shutil.copy2(file_path, cache_path)
        metadata = load_cache_metadata(cache_dir)
        metadata[cache_filename] = {
            'cache_key': cache_key,
            'cached_at': time.time(),
            'cache_path': cache_path,
        }
        save_cache_metadata(cache_dir, metadata)
        print(f"Cached enhanced image: {cache_path}")
        return cache_path
    except Exception as e:
        print(f"Error saving to cache: {e}")
        return None
139 |
def clean_cache(book_dir, days_old=30):
    """Delete cache entries older than ``days_old`` days or whose file is
    gone, updating the metadata file when anything was removed."""
    cache_dir = get_cache_dir(book_dir)
    if not os.path.exists(cache_dir):
        print("No cache directory found.")
        return
    metadata = load_cache_metadata(cache_dir)
    cutoff = time.time() - days_old * 24 * 60 * 60
    removed = 0
    # Snapshot items so entries can be deleted from the dict mid-loop.
    for name, info in list(metadata.items()):
        path = info.get('cache_path', os.path.join(cache_dir, name))
        # Keep entries that are both fresh and still on disk.
        if info.get('cached_at', 0) >= cutoff and os.path.exists(path):
            continue
        try:
            if os.path.exists(path):
                os.remove(path)
            del metadata[name]
            removed += 1
            print(f"Removed cache: {name}")
        except Exception as e:
            print(f"Error removing cache {name}: {e}")
    if removed > 0:
        save_cache_metadata(cache_dir, metadata)
        print(f"Cleaned {removed} cache files older than {days_old} days.")
    else:
        print("No cache files to clean.")
166 |
def show_cache_info(book_dir):
    """Print a summary of the book's image cache contents."""
    cache_dir = get_cache_dir(book_dir)
    if not os.path.exists(cache_dir):
        print("No cache directory found.")
        return
    metadata = load_cache_metadata(cache_dir)
    print(f"Cache directory: {cache_dir}")
    print(f"Cache files: {len(metadata)}")
    if not metadata:
        return
    total_size = 0
    for name, info in metadata.items():
        path = info.get('cache_path', os.path.join(cache_dir, name))
        if not os.path.exists(path):
            continue
        size = os.path.getsize(path)
        total_size += size
        age_days = (time.time() - info.get('cached_at', 0)) / (24 * 60 * 60)
        print(f" {name}: {size/1024:.1f}KB, {age_days:.1f} days old")
    print(f"Total cache size: {total_size/1024/1024:.2f}MB")
186 |
--------------------------------------------------------------------------------
/frontmatter.py:
--------------------------------------------------------------------------------
1 | import yaml
2 |
def parse_front_matter(path):
    """Extract (title, weight, draft, publish, export_pdf) from a markdown
    file's YAML front matter (the block between '---' delimiter lines).

    Only a handful of scalar keys are parsed, line by line; everything
    else is ignored. Missing keys fall back to: title=None, weight=9999,
    draft=False, publish=True, export_pdf=True.
    """
    TRUTHY = ('true', '1', 'yes')
    title, weight = None, 9999
    draft, publish, export_pdf = False, True, True
    in_front_matter = False
    with open(path, 'r', encoding='utf-8') as fh:
        for raw in fh:
            if raw.strip() == '---':
                if in_front_matter:
                    break  # closing delimiter: stop scanning
                in_front_matter = True
                continue
            if not in_front_matter:
                continue
            if raw.startswith('title:'):
                title = raw.split(':', 1)[1].strip().strip('"')
            elif raw.startswith('weight:'):
                try:
                    weight = int(raw.split(':', 1)[1].strip())
                except ValueError:
                    pass  # non-numeric weight: keep the default
            elif raw.startswith('draft:'):
                draft = raw.split(':', 1)[1].strip().lower() in TRUTHY
            elif raw.startswith('publish:'):
                publish = raw.split(':', 1)[1].strip().lower() in TRUTHY
            elif raw.startswith('export_pdf:'):
                export_pdf = raw.split(':', 1)[1].strip().lower() in TRUTHY
            elif raw.startswith('pdf:'):
                # 'pdf:' is an accepted alias for 'export_pdf:'.
                export_pdf = raw.split(':', 1)[1].strip().lower() in TRUTHY
    return title, weight, draft, publish, export_pdf
39 |
def should_include(path: str, metadata: tuple = None, include_drafts: bool = False) -> bool:
    """Decide whether a markdown file belongs in the PDF build.

    Args:
        path: file whose front matter is parsed when ``metadata`` is None.
        metadata: optional pre-parsed (title, weight, draft, publish,
            export_pdf) tuple, as returned by parse_front_matter.
        include_drafts: when True, drafts are not excluded.

    Returns:
        False when the page is a draft (unless include_drafts), is not
        published, or opts out of PDF export; True otherwise.
    """
    # Fix: removed the dead `import frontmatter` that pointlessly
    # self-imported this module without using it.
    if metadata is None:
        # Parse on demand; callers normally pass cached metadata.
        _, _, draft, publish, export_pdf = parse_front_matter(path)
    else:
        _, _, draft, publish, export_pdf = metadata
    if draft and not include_drafts:
        return False
    return bool(publish and export_pdf)
53 |
def load_book_config(book_dir):
    """Load book configuration from the _index.md front matter.

    Looks for _index.md (preferred) or index.md inside ``book_dir``,
    parses the YAML front matter, and flattens the nested 'book:' mapping
    into the flat dict shape the PDF template expects. Returns {} when no
    index file exists, and a minimal default config when parsing fails.

    Fix: removed the unused `from tree import find_asset` import, which
    made every call depend on the `tree` module being importable even
    though find_asset was never referenced.
    """
    import os

    # Prefer the Hugo section index over a plain page index.
    index_path = None
    for name in ("_index.md", "index.md"):
        candidate = os.path.join(book_dir, name)
        if os.path.exists(candidate):
            index_path = candidate
            break

    if not index_path:
        return {}

    config = {}
    with open(index_path, 'r', encoding='utf-8') as f:
        content = f.read()

    # Extract front matter (delimited by '---' lines at the top of the file)
    if content.startswith('---'):
        try:
            # Find the end of front matter
            end_pos = content.find('\n---\n', 3)
            if end_pos != -1:
                front_matter = content[3:end_pos]
                metadata = yaml.safe_load(front_matter)

                # Extract book configuration if it exists
                if isinstance(metadata, dict) and 'book' in metadata:
                    book_config = metadata['book']
                    # Map book config to expected format
                    config = {
                        'title': book_config.get('title', metadata.get('title', 'Book')),
                        'author': book_config.get('author', 'Author'),
                        # Dates may be datetimes; keep only the date part.
                        'date': str(book_config.get('date', metadata.get('date', '2024'))).split('T')[0],
                        'description': book_config.get('description', metadata.get('description', '')),
                        'language': book_config.get('language', 'zh-hans'),
                        'cover': book_config.get('cover', None),
                        'website': book_config.get('website', ''),
                        'appendix': book_config.get('appendix', False),
                        'subject': book_config.get('subject', book_config.get('description', metadata.get('description', ''))),
                        'keywords': book_config.get('keywords', ''),
                        'creator': book_config.get('creator', 'LaTeX with hyperref'),
                        'producer': book_config.get('producer', 'XeLaTeX'),

                        # Enhanced cover configuration
                        'cover_config': {
                            'image': book_config.get('cover', None),
                            'title_text': book_config.get('cover_title_text', book_config.get('title', metadata.get('title', 'Book'))),
                            'author_text': book_config.get('cover_author_text', book_config.get('author', 'Author')),
                            'subtitle_text': book_config.get('cover_subtitle_text', ''),
                            'title_color': book_config.get('cover_title_color', '#000000'),
                            'author_color': book_config.get('cover_author_color', '#333333'),
                            'subtitle_color': book_config.get('cover_subtitle_color', '#666666'),
                            'title_font_size': book_config.get('cover_title_font_size', 48),
                            'author_font_size': book_config.get('cover_author_font_size', 24),
                            'subtitle_font_size': book_config.get('cover_subtitle_font_size', 18),
                            'title_position': book_config.get('cover_title_position', 'center'),
                            'author_position': book_config.get('cover_author_position', 'bottom'),
                            'overlay_enabled': book_config.get('cover_overlay_enabled', True),
                            'text_shadow': book_config.get('cover_text_shadow', True),
                            'background_overlay': book_config.get('cover_background_overlay', False),
                            'overlay_opacity': book_config.get('cover_overlay_opacity', 0.7)
                        },

                        # Back-cover configuration (enhanced with text, QR code, and link)
                        'backcover_image': book_config.get('backcover_image', None),
                        'backcover_text': book_config.get('backcover_text', None),
                        'qrcode_image': book_config.get('qrcode_image', None),
                        'backcover_link_text': book_config.get('backcover_link_text', None),
                        'backcover_link_url': book_config.get('backcover_link_url', None),

                        # Back-cover styling options
                        'backcover_text_color': book_config.get('backcover_text_color', '#000000'),
                        'backcover_link_color': book_config.get('backcover_link_color', '#0066CC'),
                        'backcover_text_font_size': book_config.get('backcover_text_font_size', 16),
                        'backcover_link_font_size': book_config.get('backcover_link_font_size', 14),
                        'qrcode_size': book_config.get('qrcode_size', '0.15\\paperwidth'),
                        'backcover_top_margin': book_config.get('backcover_top_margin', '0.2\\textheight'),
                        'backcover_bottom_margin': book_config.get('backcover_bottom_margin', '0.2\\textheight'),
                        'backcover_spacing_1': book_config.get('backcover_spacing_1', '1.5cm'),
                        'backcover_spacing_2': book_config.get('backcover_spacing_2', '1cm'),

                        # Typography and styling configuration
                        'typography': {
                            'body_color': book_config.get('body_color', '#000000'),
                            'heading_color': book_config.get('heading_color', '#000000'),
                            'link_color': book_config.get('link_color', '#0066cc'),
                            'code_color': book_config.get('code_color', '#d14'),
                            'quote_color': book_config.get('quote_color', '#666666'),
                            'caption_color': book_config.get('caption_color', '#666666')
                        }
                    }
                else:
                    # Fallback to using main front matter
                    config = {
                        'title': metadata.get('title', 'Book'),
                        'author': metadata.get('author', 'Author'),
                        'date': str(metadata.get('date', '2024')).split('T')[0] if metadata.get('date') else '2024',
                        'description': metadata.get('description', ''),
                        'language': 'zh-hans',
                        'cover_config': {
                            'overlay_enabled': True,
                            'title_color': '#000000',
                            'author_color': '#333333'
                        },
                        'typography': {
                            'body_color': '#000000',
                            'heading_color': '#000000'
                        }
                    }
        except Exception as e:
            # Malformed YAML: warn and fall back to a minimal config.
            print(f"Warning: Failed to parse front matter: {e}")
            config = {
                'title': 'Book',
                'author': 'Author',
                'date': '2024',
                'language': 'zh-hans',
                'cover_config': {'overlay_enabled': True},
                'typography': {'body_color': '#000000'}
            }

    # Print config for debugging
    print(f"Loaded book configuration with backcover: {config.get('backcover_image', 'None')}")
    return config
180 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/table-wrap.lua:
--------------------------------------------------------------------------------
1 | -- tools/pdf-book-exporter/filters/table-wrap.lua
2 | -- Advanced Pandoc Lua filter for converting markdown tables to LaTeX longtable format
3 | --
4 | -- This filter provides comprehensive table processing capabilities:
5 | -- 1. Converts pipe tables to longtable for automatic page breaking
6 | -- 2. Handles long text content with intelligent wrapping and hyphenation
7 | -- 3. Processes inline code within table cells using flexcode command
8 | -- 4. Supports configurable table widths via metadata
9 | -- 5. Provides safe LaTeX character escaping
10 | --
11 | -- Key enhancements:
12 | -- - Responsive column sizing based on max_table_width parameter
13 | -- - URL and long text detection with seqsplit wrapping
14 | -- - Proper handling of CJK text and technical content
15 | -- - Integration with minted/listings code highlighting
16 |
17 | -- Function to add hyphenation penalties for long words
18 | -- This function helps LaTeX break long words and technical terms appropriately
19 | -- by inserting spaces after certain characters to enable natural line breaking
20 | --
21 | -- Enhanced for technical documentation with:
22 | -- - Namespace separators (::)
23 | -- - File paths (/)
24 | -- - Variable names (_)
25 | -- - Hyphenated terms (-)
-- Hook for inserting hyphenation break opportunities into long words.
-- Currently an identity transform: underscore (and other separator) handling
-- is deliberately left to latex_escape, so the text passes through untouched.
--
-- @param text string|nil  raw cell text
-- @return string          the input unchanged, or "" for a missing value
function add_hyphenation_penalties(text)
    return text or ""
end
32 |
33 | -- Function to check if text contains URLs or long sequences that need seqsplit
34 | --
35 | -- This function identifies content that requires special LaTeX wrapping:
36 | -- 1. URLs (http/https/ftp protocols)
37 | -- 2. Domain names and web addresses
38 | -- 3. Long unbroken sequences (>30 characters)
39 | --
40 | -- The seqsplit package allows LaTeX to break these sequences at any character
41 | -- when normal hyphenation fails, preventing table overflow
-- Decide whether cell text must be wrapped in \seqsplit{} so LaTeX may break
-- it at arbitrary characters when normal hyphenation cannot.
--
-- Triggers on:
--   * URL-like content (http/https/ftp schemes, "www.", dotted domains,
--     domain/path forms)
--   * any whitespace-delimited token longer than 30 characters
--
-- @param text string|nil
-- @return boolean  true when the text needs character-level breaking
function needs_seqsplit(text)
    if not text then
        return false
    end

    -- Lua patterns that flag URL-ish content prone to overflowing a cell.
    local url_patterns = {
        "https?://",
        "ftp://",
        "www%.",
        "%w+%.%w+%.%w+", -- domain.subdomain.tld
        "%w+%.%w+/",     -- domain.tld/path
    }
    for _, pattern in ipairs(url_patterns) do
        if text:find(pattern) then
            return true
        end
    end

    -- Any unbroken run of more than 30 non-space characters also qualifies.
    for token in text:gmatch("%S+") do
        if #token > 30 then
            return true
        end
    end

    return false
end
63 |
64 | -- Function to safely convert cell content to LaTeX while preserving commands
65 | -- and handling inline code specially for table cells
66 | --
67 | -- This is the core function for table cell processing:
68 | -- 1. Handles different Pandoc AST element types (Str, Code, RawInline, etc.)
69 | -- 2. Applies appropriate LaTeX escaping for special characters
70 | -- 3. Converts inline code to flexcode commands for better table formatting
71 | -- 4. Processes emphasis, strong text, and strikeout formatting
72 | -- 5. Ensures no unescaped line breaks that would break LaTeX compilation
73 | --
74 | -- Special handling for table context:
75 | -- - Uses flexcode instead of texttt for better code wrapping
76 | -- - Applies seqsplit for URLs and long sequences
77 | -- - Replaces line breaks with spaces to maintain table structure
-- Convert a list of Pandoc inline elements (one table cell) to a LaTeX string.
--
-- Dispatches on the element tag:
--   * Str        -> latex_escape()d text, wrapped in \seqsplit when it looks
--                   like a URL or an overlong unbroken sequence
--   * Code       -> \flexcode{...} (wraps better than \texttt in narrow cells)
--   * RawInline  -> passed through; a bare \texttt{...} is rewritten to
--                   \flexcode{...} for consistent cell wrapping
--   * Space / SoftBreak / LineBreak -> a single space (a raw newline inside a
--                   longtable row would break LaTeX compilation)
--   * Emph / Strong / Strikeout -> \emph / \textbf / \sout, recursing on content
--   * Link       -> \href with a URL escaped for LaTeX
--   * Image      -> \includegraphics constrained to fit a table cell
--   * Plain      -> recursive processing of its children
--   * anything else -> pandoc.utils.stringify + escape (best effort)
--
-- @param cell_contents table|nil  list of Pandoc inline/Plain elements
-- @return string                  LaTeX source for the cell ("" when empty),
--                                 guaranteed to contain no raw newlines
function cell_to_latex(cell_contents)
    if not cell_contents or #cell_contents == 0 then
        return ""
    end

    local result = {}

    for _, element in ipairs(cell_contents) do
        if element.t == "Str" then
            -- Regular text: escape LaTeX specials, then allow character-level
            -- breaking for URLs / long sequences via \seqsplit.
            local escaped_text = latex_escape(element.text)
            escaped_text = add_hyphenation_penalties(escaped_text)
            if needs_seqsplit(element.text) then
                result[#result + 1] = "\\seqsplit{" .. escaped_text .. "}"
            else
                result[#result + 1] = escaped_text
            end

        elseif element.t == "Code" then
            -- Inline code in a table: \flexcode wraps better than \texttt.
            local escaped_code = latex_escape(element.text)
            result[#result + 1] = "\\flexcode{" .. escaped_code .. "}"

        elseif element.t == "RawInline" and element.format == "latex" then
            local latex_text = element.text
            if latex_text:match("^\\texttt{.*}$") then
                -- Extract content from \texttt and use \flexcode instead.
                local code_content = latex_text:match("^\\texttt{(.*)}$")
                if code_content then
                    result[#result + 1] = "\\flexcode{" .. code_content .. "}"
                else
                    result[#result + 1] = latex_text
                end
            else
                -- Other raw LaTeX passes through untouched.
                result[#result + 1] = latex_text
            end

        elseif element.t == "Space" then
            result[#result + 1] = " "

        elseif element.t == "SoftBreak" or element.t == "LineBreak" then
            -- Replace line breaks with spaces so the row stays on one line.
            result[#result + 1] = " "

        elseif element.t == "Emph" then
            result[#result + 1] = "\\emph{" .. cell_to_latex(element.content) .. "}"

        elseif element.t == "Strong" then
            result[#result + 1] = "\\textbf{" .. cell_to_latex(element.content) .. "}"

        elseif element.t == "Strikeout" then
            result[#result + 1] = "\\sout{" .. cell_to_latex(element.content) .. "}"

        elseif element.t == "Link" then
            local link_text = cell_to_latex(element.content)
            local link_url = element.target

            -- Escape URL characters that are special inside \href's first argument.
            link_url = string.gsub(link_url, "#", "\\#")
            link_url = string.gsub(link_url, "%$", "\\$")
            link_url = string.gsub(link_url, "&", "\\&")
            link_url = string.gsub(link_url, "%%", "\\%%")
            link_url = string.gsub(link_url, "_", "\\_")

            result[#result + 1] = "\\href{" .. link_url .. "}{" .. link_text .. "}"

        elseif element.t == "Image" then
            -- Alt text is intentionally dropped: a table cell only has room
            -- for the image itself. (Previously it was computed into an
            -- unused local via a pointless recursive call.)
            local image_src = element.src

            -- Escape image path characters that are special to LaTeX.
            image_src = string.gsub(image_src, "#", "\\#")
            image_src = string.gsub(image_src, "%$", "\\$")
            image_src = string.gsub(image_src, "&", "\\&")
            image_src = string.gsub(image_src, "%%", "\\%%")
            image_src = string.gsub(image_src, "_", "\\_")

            -- Constrain size so the image cannot blow up the table cell.
            result[#result + 1] = "\\includegraphics[width=0.8\\linewidth,height=2cm,keepaspectratio]{" .. image_src .. "}"

        elseif element.t == "Plain" then
            -- Plain wraps other inline elements - recurse into them.
            result[#result + 1] = cell_to_latex(element.content)

        else
            -- Unknown element: stringify via pandoc, then escape as plain text.
            local stringified = pandoc.utils.stringify({element})
            local escaped_text = latex_escape(stringified)

            escaped_text = add_hyphenation_penalties(escaped_text)
            if needs_seqsplit(stringified) then
                result[#result + 1] = "\\seqsplit{" .. escaped_text .. "}"
            else
                result[#result + 1] = escaped_text
            end
        end
    end

    -- Final safety net: strip any literal newlines that slipped through,
    -- since an unescaped line break inside a row breaks LaTeX compilation.
    local final_result = table.concat(result)
    final_result = string.gsub(final_result, "\n", " ")
    final_result = string.gsub(final_result, "\r", " ")

    return final_result
end
198 |
199 | -- Function to escape LaTeX special characters (backup method)
-- Escape LaTeX special characters in plain text.
--
-- Bug fix: the previous version replaced "\" with "\textbackslash{}" FIRST,
-- and the subsequent "{" / "}" passes then re-escaped the braces that the
-- replacement itself introduced, emitting the broken sequence
-- "\textbackslash\{\}". Literal backslashes are now stashed behind a
-- sentinel byte (\1, never expected in document text) until after the brace
-- passes have run.
--
-- @param text string|nil  raw text (e.g. from a table cell)
-- @return string          LaTeX-safe text ("" when text is nil)
function latex_escape(text)
    if not text then return "" end
    -- Hide literal backslashes so the brace passes below cannot mangle the
    -- "{}" inside the eventual \textbackslash{} replacement.
    text = string.gsub(text, "\\", "\1")
    text = string.gsub(text, "{", "\\{")
    text = string.gsub(text, "}", "\\}")
    text = string.gsub(text, "\1", "\\textbackslash{}")
    text = string.gsub(text, "%$", "\\$")
    text = string.gsub(text, "&", "\\&")
    text = string.gsub(text, "%%", "\\%%")
    text = string.gsub(text, "#", "\\#")
    -- The braces inserted by the replacements below are safe: the brace
    -- escaping passes have already run.
    text = string.gsub(text, "%^", "\\textasciicircum{}")
    text = string.gsub(text, "_", "\\_")
    text = string.gsub(text, "~", "\\textasciitilde{}")
    return text
end
214 |
215 | -- Main table processing function - converts Pandoc tables to LaTeX longtable
216 | --
217 | -- This function implements comprehensive table processing:
218 | -- 1. Extracts table width configuration from Pandoc metadata
219 | -- 2. Calculates optimal column widths with safety margins
220 | -- 3. Generates LaTeX column specifications with proper alignment
221 | -- 4. Processes header and body content with cell_to_latex
222 | -- 5. Creates longtable environment with page-break support
223 | --
224 | -- Key features:
225 | -- - Configurable max_table_width (default 0.98 of text width)
226 | -- - Support for left, right, and center column alignment
227 | -- - Automatic header repetition on page breaks (\endfirsthead/\endhead)
228 | -- - Proper spacing calculation accounting for vertical borders
229 | -- - Safety factor to prevent LaTeX dimension errors
-- Pandoc filter entry point: replaces a Table AST node with a raw LaTeX
-- longtable block (returned as pandoc.RawBlock("latex", ...)).
function Table(tbl)
    -- Get the number of columns from the table
    local num_cols = #tbl.colspecs

    -- Extract maximum table width from Pandoc metadata (passed via -V max_table_width)
    -- Use more aggressive default to maximize page width utilization
    -- NOTE(review): PANDOC_STATE is pandoc's CommonState, which does not
    -- appear to expose a `meta` field in the documented Lua API (document
    -- metadata normally arrives via a Meta/Pandoc filter function) -- confirm
    -- this branch is ever taken; otherwise the 0.98 default always wins.
    local max_table_width = 0.98 -- More aggressive default for better content display
    if PANDOC_STATE.meta and PANDOC_STATE.meta.max_table_width then
        local meta_value = PANDOC_STATE.meta.max_table_width
        if type(meta_value) == "table" and meta_value.t == "MetaInlines" then
            -- Extract from MetaInlines format
            max_table_width = tonumber(pandoc.utils.stringify(meta_value)) or 0.98
        elseif type(meta_value) == "number" then
            max_table_width = meta_value
        elseif type(meta_value) == "string" then
            max_table_width = tonumber(meta_value) or 0.98
        end
    end

    -- Apply conservative width reduction only for very wide tables
    -- Allow better utilization of page width for better content display
    if num_cols >= 6 then
        max_table_width = max_table_width * 0.96 -- 4% reduction for very wide tables
    elseif num_cols >= 5 then
        max_table_width = max_table_width * 0.98 -- 2% reduction for wide tables
    end

    -- Extract minimum column width from metadata if provided (table-level override)
    local min_col_width = nil
    if tbl.attr and tbl.attr.attributes and tbl.attr.attributes.min_col_width then
        min_col_width = tonumber(tbl.attr.attributes.min_col_width)
    end

    -- Compute usable width by subtracting vertical borders (arrayrulewidth)
    -- Use max_table_width to limit overall table width, then subtract borders
    -- Formula: (max_table_width * textwidth) - arrayrulewidth * (num_cols + 1)
    -- NOTE(review): usable_width_expr is computed but never referenced below;
    -- column widths are derived from width_ratio instead. Dead-code candidate.
    local usable_width_expr = string.format("\\dimexpr%.4f\\textwidth-\\arrayrulewidth*%d\\relax", max_table_width, num_cols + 1)

    -- Calculate width ratio per column based on max_table_width
    local base_width_ratio = max_table_width / num_cols
    -- Safety factor leaves slack so the summed column widths cannot exceed
    -- the line width (lowered from 0.95 to 0.92 for a larger margin).
    local safety_factor = 0.92 -- Lowered from 0.95 to 0.92 for more conservative calculation
    local width_ratio = base_width_ratio * safety_factor

    -- Apply minimum column width override if specified (but respect max_table_width)
    if min_col_width then
        width_ratio = math.max(width_ratio, math.min(min_col_width, max_table_width / num_cols))
    end

    -- Format the column width as LaTeX dimexpr
    local col_width = string.format("\\dimexpr%.4f\\textwidth\\relax", width_ratio)

    -- Create column specification with proper alignment and line breaking support;
    -- every column uses a fixed-width p{} so cell text can wrap.
    local column_spec = "|"
    for i = 1, num_cols do
        local alignment_spec = ""

        -- Check column alignment from colspecs
        if tbl.colspecs and tbl.colspecs[i] then
            local align = tbl.colspecs[i][1] -- Alignment is first element of colspec
            if align == "AlignCenter" then
                alignment_spec = ">{\\centering\\arraybackslash}p{" .. col_width .. "}"
            elseif align == "AlignRight" then
                alignment_spec = ">{\\raggedleft\\arraybackslash}p{" .. col_width .. "}"
            else -- AlignLeft or AlignDefault
                -- \hspace{0pt} allows the first word in the cell to hyphenate
                alignment_spec = ">{\\raggedright\\arraybackslash\\hspace{0pt}}p{" .. col_width .. "}"
            end
        else
            -- Default to left-aligned with line breaking support
            alignment_spec = ">{\\raggedright\\arraybackslash\\hspace{0pt}}p{" .. col_width .. "}"
        end

        column_spec = column_spec .. alignment_spec .. "|"
    end

    -- Accumulate the longtable source line by line, then join with newlines.
    local latex_content = {}

    -- Start longtable environment
    latex_content[#latex_content + 1] = "\\begin{longtable}" .. "{" .. column_spec .. "}"
    latex_content[#latex_content + 1] = "\\hline"

    -- Process header if it exists; header cells are wrapped in \textbf.
    if tbl.head and tbl.head.rows and #tbl.head.rows > 0 then
        -- First-page header (terminated by \endfirsthead).
        for _, row in ipairs(tbl.head.rows) do
            local row_content = {}
            for j, cell in ipairs(row.cells) do
                -- Use safe cell conversion function
                local cell_latex = cell_to_latex(cell.contents)
                row_content[#row_content + 1] = "\\textbf{" .. cell_latex .. "}"
            end
            latex_content[#latex_content + 1] = table.concat(row_content, " & ") .. " \\\\"
        end
        latex_content[#latex_content + 1] = "\\hline"
        latex_content[#latex_content + 1] = "\\endfirsthead"

        -- Repeat header on subsequent pages (terminated by \endhead).
        latex_content[#latex_content + 1] = "\\hline"
        for _, row in ipairs(tbl.head.rows) do
            local row_content = {}
            for j, cell in ipairs(row.cells) do
                -- Use safe cell conversion function
                local cell_latex = cell_to_latex(cell.contents)
                row_content[#row_content + 1] = "\\textbf{" .. cell_latex .. "}"
            end
            latex_content[#latex_content + 1] = table.concat(row_content, " & ") .. " \\\\"
        end
        latex_content[#latex_content + 1] = "\\hline"
        latex_content[#latex_content + 1] = "\\endhead"
    end

    -- Process body rows
    if tbl.bodies and #tbl.bodies > 0 then
        for _, body in ipairs(tbl.bodies) do
            if body.body then
                for _, row in ipairs(body.body) do
                    local row_content = {}
                    for j, cell in ipairs(row.cells) do
                        -- Use safe cell conversion function
                        local cell_latex = cell_to_latex(cell.contents)
                        row_content[#row_content + 1] = cell_latex
                    end
                    latex_content[#latex_content + 1] = table.concat(row_content, " & ") .. " \\\\"
                    latex_content[#latex_content + 1] = "\\hline" -- Add horizontal line after each row
                end
            end
        end
    end

    -- End longtable environment
    latex_content[#latex_content + 1] = "\\hline"
    latex_content[#latex_content + 1] = "\\end{longtable}"

    -- Return as RawBlock; pandoc substitutes it for the original Table node.
    return pandoc.RawBlock("latex", table.concat(latex_content, "\n"))
end
366 |
367 |
--------------------------------------------------------------------------------
/filters/table-wrap.lua:
--------------------------------------------------------------------------------
1 | -- tools/pdf-book-exporter/filters/table-wrap.lua
2 | -- Advanced Pandoc Lua filter for converting markdown tables to LaTeX longtable format
3 | --
4 | -- This filter provides comprehensive table processing capabilities:
5 | -- 1. Converts pipe tables to longtable for automatic page breaking
6 | -- 2. Handles long text content with intelligent wrapping and hyphenation
7 | -- 3. Processes inline code within table cells using flexcode command
8 | -- 4. Supports configurable table widths via metadata
9 | -- 5. Provides safe LaTeX character escaping
10 | --
11 | -- Key enhancements:
12 | -- - Responsive column sizing based on max_table_width parameter
13 | -- - URL and long text detection with seqsplit wrapping
14 | -- - Proper handling of CJK text and technical content
15 | -- - Integration with minted/listings code highlighting
16 |
17 | -- Function to add hyphenation penalties for long words
18 | -- This function helps LaTeX break long words and technical terms appropriately
19 | -- by inserting spaces after certain characters to enable natural line breaking
20 | --
21 | -- Enhanced for technical documentation with:
22 | -- - Namespace separators (::)
23 | -- - File paths (/)
24 | -- - Variable names (_)
25 | -- - Hyphenated terms (-)
-- Hyphenation-penalty hook for long words; at present a pure pass-through.
-- Underscore and separator handling is deferred to latex_escape, so this
-- function only normalizes a missing value to the empty string.
--
-- @param text string|nil
-- @return string  the input unchanged, or "" for a missing value
function add_hyphenation_penalties(text)
    return text or ""
end
32 |
33 | -- Function to check if text contains URLs or long sequences that need seqsplit
34 | --
35 | -- This function identifies content that requires special LaTeX wrapping:
36 | -- 1. URLs (http/https/ftp protocols)
37 | -- 2. Domain names and web addresses
38 | -- 3. Long unbroken sequences (>30 characters)
39 | --
40 | -- The seqsplit package allows LaTeX to break these sequences at any character
41 | -- when normal hyphenation fails, preventing table overflow
-- Report whether cell text should be wrapped in \seqsplit{} so that LaTeX
-- may break it at any character when ordinary hyphenation fails.
--
-- True for URL-like content (http/https/ftp schemes, "www.", dotted domains,
-- domain/path forms) and for any whitespace-delimited token of more than 30
-- characters.
--
-- @param text string|nil
-- @return boolean
function needs_seqsplit(text)
    if not text then
        return false
    end

    -- URL-ish shapes that routinely overflow a narrow table column.
    local url_patterns = {
        "https?://",
        "ftp://",
        "www%.",
        "%w+%.%w+%.%w+", -- domain.subdomain.tld
        "%w+%.%w+/",     -- domain.tld/path
    }
    for _, pattern in ipairs(url_patterns) do
        if text:find(pattern) then
            return true
        end
    end

    -- Long unbroken runs of non-space characters also need \seqsplit.
    for token in text:gmatch("%S+") do
        if #token > 30 then
            return true
        end
    end

    return false
end
63 |
64 | -- Function to safely convert cell content to LaTeX while preserving commands
65 | -- and handling inline code specially for table cells
66 | --
67 | -- This is the core function for table cell processing:
68 | -- 1. Handles different Pandoc AST element types (Str, Code, RawInline, etc.)
69 | -- 2. Applies appropriate LaTeX escaping for special characters
70 | -- 3. Converts inline code to flexcode commands for better table formatting
71 | -- 4. Processes emphasis, strong text, and strikeout formatting
72 | -- 5. Ensures no unescaped line breaks that would break LaTeX compilation
73 | --
74 | -- Special handling for table context:
75 | -- - Uses flexcode instead of texttt for better code wrapping
76 | -- - Applies seqsplit for URLs and long sequences
77 | -- - Replaces line breaks with spaces to maintain table structure
-- Convert a list of Pandoc inline elements (one table cell) to a LaTeX string.
--
-- Dispatches on the element tag:
--   * Str        -> latex_escape()d text, wrapped in \seqsplit when it looks
--                   like a URL or an overlong unbroken sequence
--   * Code       -> \flexcode{...} (wraps better than \texttt in narrow cells)
--   * RawInline  -> passed through; a bare \texttt{...} is rewritten to
--                   \flexcode{...} for consistent cell wrapping
--   * Space / SoftBreak / LineBreak -> a single space (a raw newline inside a
--                   longtable row would break LaTeX compilation)
--   * Emph / Strong / Strikeout -> \emph / \textbf / \sout, recursing on content
--   * Link       -> \href with a URL escaped for LaTeX
--   * Image      -> \includegraphics constrained to fit a table cell
--   * Plain      -> recursive processing of its children
--   * anything else -> pandoc.utils.stringify + escape (best effort)
--
-- @param cell_contents table|nil  list of Pandoc inline/Plain elements
-- @return string                  LaTeX source for the cell ("" when empty),
--                                 guaranteed to contain no raw newlines
function cell_to_latex(cell_contents)
    if not cell_contents or #cell_contents == 0 then
        return ""
    end

    local result = {}

    for _, element in ipairs(cell_contents) do
        if element.t == "Str" then
            -- Regular text: escape LaTeX specials, then allow character-level
            -- breaking for URLs / long sequences via \seqsplit.
            local escaped_text = latex_escape(element.text)
            escaped_text = add_hyphenation_penalties(escaped_text)
            if needs_seqsplit(element.text) then
                result[#result + 1] = "\\seqsplit{" .. escaped_text .. "}"
            else
                result[#result + 1] = escaped_text
            end

        elseif element.t == "Code" then
            -- Inline code in a table: \flexcode wraps better than \texttt.
            local escaped_code = latex_escape(element.text)
            result[#result + 1] = "\\flexcode{" .. escaped_code .. "}"

        elseif element.t == "RawInline" and element.format == "latex" then
            local latex_text = element.text
            if latex_text:match("^\\texttt{.*}$") then
                -- Extract content from \texttt and use \flexcode instead.
                local code_content = latex_text:match("^\\texttt{(.*)}$")
                if code_content then
                    result[#result + 1] = "\\flexcode{" .. code_content .. "}"
                else
                    result[#result + 1] = latex_text
                end
            else
                -- Other raw LaTeX passes through untouched.
                result[#result + 1] = latex_text
            end

        elseif element.t == "Space" then
            result[#result + 1] = " "

        elseif element.t == "SoftBreak" or element.t == "LineBreak" then
            -- Replace line breaks with spaces so the row stays on one line.
            result[#result + 1] = " "

        elseif element.t == "Emph" then
            result[#result + 1] = "\\emph{" .. cell_to_latex(element.content) .. "}"

        elseif element.t == "Strong" then
            result[#result + 1] = "\\textbf{" .. cell_to_latex(element.content) .. "}"

        elseif element.t == "Strikeout" then
            result[#result + 1] = "\\sout{" .. cell_to_latex(element.content) .. "}"

        elseif element.t == "Link" then
            local link_text = cell_to_latex(element.content)
            local link_url = element.target

            -- Escape URL characters that are special inside \href's first argument.
            link_url = string.gsub(link_url, "#", "\\#")
            link_url = string.gsub(link_url, "%$", "\\$")
            link_url = string.gsub(link_url, "&", "\\&")
            link_url = string.gsub(link_url, "%%", "\\%%")
            link_url = string.gsub(link_url, "_", "\\_")

            result[#result + 1] = "\\href{" .. link_url .. "}{" .. link_text .. "}"

        elseif element.t == "Image" then
            -- Alt text is intentionally dropped: a table cell only has room
            -- for the image itself. (Previously it was computed into an
            -- unused local via a pointless recursive call.)
            local image_src = element.src

            -- Escape image path characters that are special to LaTeX.
            image_src = string.gsub(image_src, "#", "\\#")
            image_src = string.gsub(image_src, "%$", "\\$")
            image_src = string.gsub(image_src, "&", "\\&")
            image_src = string.gsub(image_src, "%%", "\\%%")
            image_src = string.gsub(image_src, "_", "\\_")

            -- Constrain size so the image cannot blow up the table cell.
            result[#result + 1] = "\\includegraphics[width=0.8\\linewidth,height=2cm,keepaspectratio]{" .. image_src .. "}"

        elseif element.t == "Plain" then
            -- Plain wraps other inline elements - recurse into them.
            result[#result + 1] = cell_to_latex(element.content)

        else
            -- Unknown element: stringify via pandoc, then escape as plain text.
            local stringified = pandoc.utils.stringify({element})
            local escaped_text = latex_escape(stringified)

            escaped_text = add_hyphenation_penalties(escaped_text)
            if needs_seqsplit(stringified) then
                result[#result + 1] = "\\seqsplit{" .. escaped_text .. "}"
            else
                result[#result + 1] = escaped_text
            end
        end
    end

    -- Final safety net: strip any literal newlines that slipped through,
    -- since an unescaped line break inside a row breaks LaTeX compilation.
    local final_result = table.concat(result)
    final_result = string.gsub(final_result, "\n", " ")
    final_result = string.gsub(final_result, "\r", " ")

    return final_result
end
198 |
-- Escape LaTeX special characters in plain text (backup method).
--
-- Bug fix: the previous version replaced "\" with "\textbackslash{}" first
-- and then escaped "{" and "}", which corrupted the braces of the
-- replacement itself (producing "\textbackslash\{\}" in the output). We now
-- swap backslashes for a sentinel byte, escape the braces, and only then
-- expand the sentinel into its final form.
--
-- @tparam string|nil text raw text (nil is treated as empty)
-- @treturn string LaTeX-safe text
function latex_escape(text)
    if not text then return "" end
    -- "\1" cannot occur in Pandoc-provided text, so it is a safe sentinel.
    text = string.gsub(text, "\\", "\1")
    text = string.gsub(text, "{", "\\{")
    text = string.gsub(text, "}", "\\}")
    text = string.gsub(text, "\1", "\\textbackslash{}")
    text = string.gsub(text, "%$", "\\$")
    text = string.gsub(text, "&", "\\&")
    text = string.gsub(text, "%%", "\\%%")
    text = string.gsub(text, "#", "\\#")
    text = string.gsub(text, "%^", "\\textasciicircum{}")
    text = string.gsub(text, "_", "\\_")
    text = string.gsub(text, "~", "\\textasciitilde{}")
    return text
end
214 |
215 | -- Main table processing function - converts Pandoc tables to LaTeX longtable
216 | --
217 | -- This function implements comprehensive table processing:
218 | -- 1. Extracts table width configuration from Pandoc metadata
219 | -- 2. Calculates optimal column widths with safety margins
220 | -- 3. Generates LaTeX column specifications with proper alignment
221 | -- 4. Processes header and body content with cell_to_latex
222 | -- 5. Creates longtable environment with page-break support
223 | --
224 | -- Key features:
225 | -- - Configurable max_table_width (default 0.85 of text width)
226 | -- - Support for left, right, and center column alignment
227 | -- - Automatic header repetition on page breaks (\endfirsthead/\endhead)
228 | -- - Proper spacing calculation accounting for vertical borders
229 | -- - Safety factor to prevent LaTeX dimension errors
function Table(tbl)
    -- Number of columns drives every width calculation below.
    local num_cols = #tbl.colspecs

    -- Extract maximum table width from metadata (passed via -V max_table_width).
    -- NOTE(review): this reads PANDOC_STATE.meta — confirm the Pandoc version
    -- in use actually exposes metadata there; otherwise the 0.85 default wins.
    local max_table_width = 0.85 -- conservative default for reliable margin control
    if PANDOC_STATE.meta and PANDOC_STATE.meta.max_table_width then
        local meta_value = PANDOC_STATE.meta.max_table_width
        if type(meta_value) == "table" and meta_value.t == "MetaInlines" then
            -- Extract from MetaInlines format; fall back to 0.92 on parse failure.
            max_table_width = tonumber(pandoc.utils.stringify(meta_value)) or 0.92
        elseif type(meta_value) == "number" then
            max_table_width = meta_value
        elseif type(meta_value) == "string" then
            max_table_width = tonumber(meta_value) or 0.92
        end
    end

    -- Tables with more columns get an extra conservative width reduction
    -- to guard against margin overflow.
    if num_cols >= 5 then
        max_table_width = max_table_width * 0.95 -- 5% additional reduction for wide tables
    elseif num_cols >= 4 then
        max_table_width = max_table_width * 0.97 -- 3% additional reduction
    end

    -- Optional table-level override for minimum column width.
    local min_col_width = nil
    if tbl.attr and tbl.attr.attributes and tbl.attr.attributes.min_col_width then
        min_col_width = tonumber(tbl.attr.attributes.min_col_width)
    end

    -- Per-column width ratio: spread max_table_width evenly, then apply a
    -- safety factor (reduced from 0.95 to 0.92 for a more conservative fit).
    -- (The previously computed `usable_width_expr` was never used and has
    -- been removed.)
    local base_width_ratio = max_table_width / num_cols
    local safety_factor = 0.92
    local width_ratio = base_width_ratio * safety_factor

    -- Honor the minimum-width override, but never exceed an even share of
    -- max_table_width.
    if min_col_width then
        width_ratio = math.max(width_ratio, math.min(min_col_width, max_table_width / num_cols))
    end

    -- Format the column width as a LaTeX dimexpr.
    local col_width = string.format("\\dimexpr%.4f\\textwidth\\relax", width_ratio)

    -- Build the column specification; p{} columns give line-breaking support,
    -- and the >{...} prefix sets per-column alignment.
    local column_spec = "|"
    for i = 1, num_cols do
        local align = tbl.colspecs and tbl.colspecs[i] and tbl.colspecs[i][1]
        local alignment_spec
        if align == "AlignCenter" then
            alignment_spec = ">{\\centering\\arraybackslash}p{" .. col_width .. "}"
        elseif align == "AlignRight" then
            alignment_spec = ">{\\raggedleft\\arraybackslash}p{" .. col_width .. "}"
        else
            -- AlignLeft, AlignDefault, or missing colspec: left-aligned with
            -- line breaking support (\hspace{0pt} permits a break at the start).
            alignment_spec = ">{\\raggedright\\arraybackslash\\hspace{0pt}}p{" .. col_width .. "}"
        end
        column_spec = column_spec .. alignment_spec .. "|"
    end

    -- Render one table row as "cell & cell & ... \\"; header rows get their
    -- cells wrapped in \textbf{}. Shared by the first-page header, the
    -- repeated header, and body rows (previously duplicated inline).
    local function render_row(row, bold)
        local cells = {}
        for _, cell in ipairs(row.cells) do
            -- Use the safe cell conversion function defined above.
            local cell_latex = cell_to_latex(cell.contents)
            if bold then
                cell_latex = "\\textbf{" .. cell_latex .. "}"
            end
            cells[#cells + 1] = cell_latex
        end
        return table.concat(cells, " & ") .. " \\\\"
    end

    -- Assemble the longtable environment line by line.
    local latex_content = {}
    latex_content[#latex_content + 1] = "\\begin{longtable}" .. "{" .. column_spec .. "}"
    latex_content[#latex_content + 1] = "\\hline"

    -- Header: emit once for the first page (\endfirsthead) and once for
    -- repetition on subsequent pages (\endhead).
    if tbl.head and tbl.head.rows and #tbl.head.rows > 0 then
        for _, row in ipairs(tbl.head.rows) do
            latex_content[#latex_content + 1] = render_row(row, true)
        end
        latex_content[#latex_content + 1] = "\\hline"
        latex_content[#latex_content + 1] = "\\endfirsthead"

        latex_content[#latex_content + 1] = "\\hline"
        for _, row in ipairs(tbl.head.rows) do
            latex_content[#latex_content + 1] = render_row(row, true)
        end
        latex_content[#latex_content + 1] = "\\hline"
        latex_content[#latex_content + 1] = "\\endhead"
    end

    -- Body rows, each followed by a horizontal rule.
    if tbl.bodies and #tbl.bodies > 0 then
        for _, body in ipairs(tbl.bodies) do
            if body.body then
                for _, row in ipairs(body.body) do
                    latex_content[#latex_content + 1] = render_row(row, false)
                    latex_content[#latex_content + 1] = "\\hline"
                end
            end
        end
    end

    latex_content[#latex_content + 1] = "\\hline"
    latex_content[#latex_content + 1] = "\\end{longtable}"

    -- Hand the finished LaTeX back to Pandoc as a raw block.
    return pandoc.RawBlock("latex", table.concat(latex_content, "\n"))
end
366 |
367 |
--------------------------------------------------------------------------------
/validate_lua_dependencies.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | Lua Dependencies Validation for PDF Emoji Support
4 |
5 | This module validates Lua filter dependencies and provides comprehensive
6 | error handling for Lua-related issues in the PDF export process.
7 | """
8 |
import json
import os
import subprocess
import tempfile
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple
15 |
16 |
@dataclass
class LuaValidationResult:
    """Result of Lua dependency validation.

    Aggregates the outcome of one validation step (or of the whole
    comprehensive run): a pass/fail flag plus human-readable error and
    warning messages and a free-form details payload.

    Fix: the ``details`` annotation used the builtin ``any`` function
    instead of ``typing.Any``.
    """
    # False when at least one hard error was recorded in `errors`.
    valid: bool
    # Fatal problems that make the validation fail.
    errors: List[str]
    # Non-fatal findings; validation may still pass with warnings.
    warnings: List[str]
    # Step-specific structured data (paths, subprocess output, test stats).
    details: Dict[str, Any]
24 |
25 |
class LuaDependencyValidator:
    """Validator for Lua filter dependencies and functionality.

    Runs a series of external-tool checks (``lua``/``luac`` and ``pandoc``)
    against the emoji pass-through Lua filter shipped in ``filters/``.
    Each check returns a LuaValidationResult rather than raising, so a
    missing tool degrades to warnings instead of aborting the caller.
    """

    def __init__(self, script_dir: str):
        # Root directory of the exporter; the emoji filter is expected at
        # <script_dir>/filters/emoji-passthrough.lua.
        self.script_dir = script_dir
        self.emoji_filter_path = os.path.join(script_dir, 'filters', 'emoji-passthrough.lua')

    def validate_lua_filter_syntax(self, filter_path: str) -> LuaValidationResult:
        """Validate Lua filter syntax using lua command.

        A missing file or a syntax error marks the result invalid; an
        unavailable Lua toolchain only adds a warning instead of failing.
        """
        result = LuaValidationResult(
            valid=True,
            errors=[],
            warnings=[],
            details={'filter_path': filter_path}
        )

        if not os.path.exists(filter_path):
            result.valid = False
            result.errors.append(f"Lua filter not found: {filter_path}")
            return result

        try:
            # Check if lua command is available
            lua_check = subprocess.run(['lua', '-v'],
                                       capture_output=True, text=True, timeout=5)

            if lua_check.returncode != 0:
                result.warnings.append("Lua interpreter not available for syntax checking")
                return result

            # Test syntax using luac (Lua compiler) which only checks syntax
            # (-p parses the file without writing bytecode).
            syntax_check = subprocess.run([
                'luac', '-p', filter_path
            ], capture_output=True, text=True, timeout=10)

            if syntax_check.returncode != 0:
                result.valid = False
                result.errors.append(f"Lua syntax error in {filter_path}")
                result.details['syntax_error'] = syntax_check.stderr
            else:
                result.details['syntax_valid'] = True

        except subprocess.TimeoutExpired:
            result.warnings.append("Lua syntax check timed out")
        except FileNotFoundError:
            # Neither `lua` nor `luac` on PATH — skip rather than fail.
            result.warnings.append("Lua interpreter not found - skipping syntax validation")
        except Exception as e:
            result.warnings.append(f"Lua syntax check failed: {str(e)}")

        return result

    def validate_emoji_filter_functions(self) -> LuaValidationResult:
        """Validate that emoji filter contains required functions.

        This is a plain-text scan of the filter source (substring checks),
        not a Lua parse: it looks for `function <name>` definitions, the
        data tables the current filter relies on, and a return statement.
        Missing functions are errors; missing data tables only warn.
        """
        result = LuaValidationResult(
            valid=True,
            errors=[],
            warnings=[],
            details={'filter_path': self.emoji_filter_path}
        )

        if not os.path.exists(self.emoji_filter_path):
            result.valid = False
            result.errors.append(f"Emoji filter not found: {self.emoji_filter_path}")
            return result

        try:
            with open(self.emoji_filter_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # Required functions for emoji processing
            required_functions = [
                'process_text',
                'is_emoji',
                'Str',
                'Code',
                'CodeBlock'
            ]

            # Required data structures for current emoji filter
            required_data = [
                'emoji_map',
                'fallback_map'
            ]

            missing_functions = []
            missing_data = []

            # Substring match: also accepts `local function <name>` forms.
            for func in required_functions:
                if f'function {func}' not in content:
                    missing_functions.append(func)

            for data in required_data:
                if data not in content:
                    missing_data.append(data)

            if missing_functions:
                result.valid = False
                result.errors.append(f"Missing required functions: {', '.join(missing_functions)}")

            if missing_data:
                result.warnings.append(f"Missing data structures: {', '.join(missing_data)}")

            # Check for return statement (filter must return filter table)
            if 'return {' not in content and 'return' not in content:
                result.warnings.append("Filter may not return proper filter table")

            result.details.update({
                'file_size': len(content),
                'missing_functions': missing_functions,
                'missing_data': missing_data,
                'has_return': 'return' in content
            })

        except Exception as e:
            result.valid = False
            result.errors.append(f"Error reading emoji filter: {str(e)}")

        return result

    def test_emoji_filter_with_pandoc(self) -> LuaValidationResult:
        """Test emoji filter functionality with Pandoc.

        Converts small Markdown snippets to LaTeX through the emoji filter
        and checks the output for the expected \\emoji{...} macro. Only a
        total failure (0 tests passing) marks the result invalid; partial
        failures produce a warning.
        """
        result = LuaValidationResult(
            valid=True,
            errors=[],
            warnings=[],
            details={'test_type': 'pandoc_integration'}
        )

        if not os.path.exists(self.emoji_filter_path):
            result.valid = False
            result.errors.append("Emoji filter not found for testing")
            return result

        # Test cases with different emoji types
        test_cases = [
            {
                'name': 'basic_emoji',
                'input': 'Hello 😀 World!',
                'expected_pattern': r'\\emoji\{'
            },
            {
                'name': 'keycap_sequence',
                'input': 'Press 1️⃣ to continue',
                'expected_pattern': r'\\emoji\{1'
            },
            {
                'name': 'flag_sequence',
                'input': 'Flag: 🇺🇸',
                'expected_pattern': r'\\emoji\{'
            },
            {
                'name': 'skin_tone_modifier',
                'input': 'Wave 👋🏻 hello',
                'expected_pattern': r'\\emoji\{'
            }
        ]

        try:
            with tempfile.TemporaryDirectory() as temp_dir:
                test_results = {}

                for test_case in test_cases:
                    test_result = self._run_single_pandoc_test(
                        temp_dir, test_case['name'],
                        test_case['input'], test_case['expected_pattern']
                    )
                    test_results[test_case['name']] = test_result

                # Analyze results
                passed_tests = sum(1 for r in test_results.values() if r['success'])
                total_tests = len(test_cases)

                result.details.update({
                    'test_results': test_results,
                    'passed_tests': passed_tests,
                    'total_tests': total_tests,
                    'success_rate': passed_tests / total_tests if total_tests > 0 else 0
                })

                if passed_tests == 0:
                    result.valid = False
                    result.errors.append("All emoji filter tests failed")
                elif passed_tests < total_tests:
                    result.warnings.append(f"Some emoji filter tests failed ({passed_tests}/{total_tests} passed)")

        except Exception as e:
            result.warnings.append(f"Emoji filter testing failed: {str(e)}")

        return result

    def _run_single_pandoc_test(self, temp_dir: str, test_name: str,
                                input_text: str, expected_pattern: str) -> Dict:
        """Run a single Pandoc test with the emoji filter.

        Writes `input_text` to a temporary .md file, converts it to LaTeX
        with the emoji Lua filter, and reports whether `expected_pattern`
        (a regex) appears in the output. Never raises: all failures are
        returned as {'success': False, ...} dicts.
        """
        import re

        try:
            input_file = os.path.join(temp_dir, f'{test_name}.md')
            output_file = os.path.join(temp_dir, f'{test_name}.tex')

            with open(input_file, 'w', encoding='utf-8') as f:
                f.write(input_text)

            # Run pandoc with emoji filter
            cmd = [
                'pandoc',
                input_file,
                '-o', output_file,
                '--to=latex',
                f'--lua-filter={self.emoji_filter_path}'
            ]

            result = subprocess.run(cmd, capture_output=True, text=True, timeout=15)

            if result.returncode == 0 and os.path.exists(output_file):
                with open(output_file, 'r', encoding='utf-8') as f:
                    output_content = f.read()

                # Check if expected pattern is found
                pattern_found = bool(re.search(expected_pattern, output_content))

                return {
                    'success': pattern_found,
                    'output_content': output_content,
                    'pattern_found': pattern_found,
                    'expected_pattern': expected_pattern
                }
            else:
                return {
                    'success': False,
                    'error': result.stderr,
                    'returncode': result.returncode
                }

        except Exception as e:
            return {
                'success': False,
                'error': str(e)
            }

    def validate_pandoc_lua_support(self) -> LuaValidationResult:
        """Validate that Pandoc supports Lua filters.

        Confirms `pandoc` is runnable, records its version string, then
        exercises --lua-filter with a minimal identity filter to prove the
        Lua bridge works end to end.
        """
        result = LuaValidationResult(
            valid=True,
            errors=[],
            warnings=[],
            details={'test_type': 'pandoc_lua_support'}
        )

        try:
            # Check Pandoc version and Lua filter support
            version_result = subprocess.run(['pandoc', '--version'],
                                            capture_output=True, text=True, timeout=10)

            if version_result.returncode != 0:
                result.valid = False
                result.errors.append("Pandoc not available")
                return result

            version_output = version_result.stdout
            # First line of `pandoc --version` carries the version number.
            result.details['pandoc_version'] = version_output.split('\n')[0]

            # Test basic Lua filter support with a minimal filter
            with tempfile.TemporaryDirectory() as temp_dir:
                # Create a minimal test filter
                test_filter = os.path.join(temp_dir, 'test.lua')
                with open(test_filter, 'w') as f:
                    f.write('''
function Str(elem)
    return elem
end
''')

                # Create test input
                test_input = os.path.join(temp_dir, 'test.md')
                test_output = os.path.join(temp_dir, 'test.tex')

                with open(test_input, 'w') as f:
                    f.write('Test content')

                # Test Lua filter execution
                test_cmd = [
                    'pandoc',
                    test_input,
                    '-o', test_output,
                    '--to=latex',
                    f'--lua-filter={test_filter}'
                ]

                test_result = subprocess.run(test_cmd, capture_output=True, text=True, timeout=15)

                if test_result.returncode == 0:
                    result.details['lua_filter_support'] = True
                else:
                    result.valid = False
                    result.errors.append("Pandoc Lua filter support not working")
                    result.details['test_error'] = test_result.stderr

        except subprocess.TimeoutExpired:
            result.warnings.append("Pandoc Lua support test timed out")
        except FileNotFoundError:
            result.valid = False
            result.errors.append("Pandoc not found")
        except Exception as e:
            result.warnings.append(f"Pandoc Lua support test failed: {str(e)}")

        return result

    def run_comprehensive_lua_validation(self) -> LuaValidationResult:
        """Run comprehensive Lua dependency validation.

        Executes the individual checks in order (Pandoc Lua support,
        filter syntax, filter functions) and merges their errors/warnings.
        The slower Pandoc integration test only runs when all prior checks
        passed, and its failures are reported as warnings, not errors.
        """
        overall_result = LuaValidationResult(
            valid=True,
            errors=[],
            warnings=[],
            details={'validation_type': 'comprehensive'}
        )

        # Test 1: Pandoc Lua support
        pandoc_test = self.validate_pandoc_lua_support()
        overall_result.details['pandoc_lua_test'] = pandoc_test.details

        if not pandoc_test.valid:
            overall_result.valid = False
            overall_result.errors.extend(pandoc_test.errors)
        overall_result.warnings.extend(pandoc_test.warnings)

        # Test 2: Emoji filter syntax
        syntax_test = self.validate_lua_filter_syntax(self.emoji_filter_path)
        overall_result.details['syntax_test'] = syntax_test.details

        if not syntax_test.valid:
            overall_result.valid = False
            overall_result.errors.extend(syntax_test.errors)
        overall_result.warnings.extend(syntax_test.warnings)

        # Test 3: Emoji filter functions
        functions_test = self.validate_emoji_filter_functions()
        overall_result.details['functions_test'] = functions_test.details

        if not functions_test.valid:
            overall_result.valid = False
            overall_result.errors.extend(functions_test.errors)
        overall_result.warnings.extend(functions_test.warnings)

        # Test 4: Integration test (only if previous tests pass)
        if overall_result.valid:
            integration_test = self.test_emoji_filter_with_pandoc()
            overall_result.details['integration_test'] = integration_test.details

            if not integration_test.valid:
                overall_result.warnings.append("Emoji filter integration tests failed")
                overall_result.warnings.extend(integration_test.warnings)
            else:
                overall_result.warnings.extend(integration_test.warnings)

        return overall_result
381 |
382 |
def main():
    """CLI entry point: validate Lua dependencies and print a report."""
    import argparse

    cli = argparse.ArgumentParser(description='Validate Lua dependencies for emoji support')
    cli.add_argument('--script-dir', default='.',
                     help='Directory containing the emoji filter')
    cli.add_argument('--verbose', '-v', action='store_true',
                     help='Show detailed validation results')
    opts = cli.parse_args()

    # Run the full validation suite against the given directory.
    report = LuaDependencyValidator(opts.script_dir).run_comprehensive_lua_validation()

    print("Lua Dependencies Validation Report")
    print("=" * 40)
    print("✅ All Lua dependencies are valid" if report.valid
          else "❌ Lua dependency validation failed")

    if report.errors:
        print(f"\n❌ Errors ({len(report.errors)}):")
        for error in report.errors:
            print(f"  • {error}")

    if report.warnings:
        print(f"\n⚠️  Warnings ({len(report.warnings)}):")
        for warning in report.warnings:
            print(f"  • {warning}")

    if opts.verbose:
        print(f"\n📋 Detailed Results:")
        print(json.dumps(report.details, indent=2, default=str))


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PDF Book Exporter
2 |
3 | A comprehensive tool for exporting Hugo book directories to professional PDF files, with enhanced support for multilingual content, emoji rendering, and customizable styling.
4 |
5 | ## ✨ Features
6 |
7 | - 📚 **Hugo Book Structure Support** - Automatically processes `_index.md` and `index.md` files with weight-based ordering
8 | - 🌍 **Multilingual Support** - CJK (Chinese, Japanese, Korean) character rendering with font auto-detection
9 | - 🎨 **Enhanced Cover System** - Dynamic text overlay on cover images with customizable fonts, colors, and positioning
10 | - 💻 **Advanced Code Highlighting** - Syntax highlighting for 20+ programming languages with customizable themes
11 | - 📊 **Smart Table Processing** - Auto-wrapping tables with enhanced formatting and break handling
12 | - 🎉 **Comprehensive Emoji Support** - Unicode emoji rendering with fallback font chains
13 | - 🎨 **Customizable Color Palettes** - Full typography and styling customization
14 | - ⚡ **Intelligent Caching** - Image processing cache with automatic invalidation
15 | - 🔧 **Flexible PDF Engines** - XeLaTeX and LuaLaTeX support with automatic engine selection
16 | - 📱 **Cross-Platform** - Works on macOS, Linux, and Windows
17 |
18 | ## 🚀 Quick Start
19 |
20 | ### Prerequisites
21 |
22 | ```bash
23 | # Install Pandoc (required)
24 | # macOS
25 | brew install pandoc
26 |
27 | # Ubuntu/Debian
28 | sudo apt-get install pandoc
29 |
30 | # Windows
31 | # Download from https://pandoc.org/installing.html
32 |
33 | # Install LaTeX distribution (TeX Live recommended)
34 | # macOS
35 | brew install --cask mactex
36 |
37 | # Ubuntu/Debian
38 | sudo apt-get install texlive-full
39 |
40 | # Windows
41 | # Download TeX Live or MiKTeX
42 | ```
43 |
44 | ### Basic Usage
45 |
46 | ```bash
47 | # Export a Hugo book to PDF
48 | python cli.py /path/to/book/directory -o output.pdf
49 |
50 | # Generate with cover and summary
51 | python cli.py /path/to/book/directory \
52 | -o professional-book.pdf \
53 | --generate-summary
54 |
55 | # Enable emoji support with automatic engine optimization
56 | python cli.py /path/to/book/directory \
57 | -o emoji-book.pdf \
58 | --emoji
59 | ```
60 |
61 | ## 📖 Examples
62 |
63 | You can export the built-in example book to PDF with the following command:
64 |
65 | ```bash
66 | python cli.py example -o example.pdf --emoji
67 | ```
68 |
69 | 
70 |
71 | ### Example 1: Basic Book Export
72 |
73 | ```bash
74 | # Directory structure
75 | content/zh/book/my-handbook/
76 | ├── _index.md # Book metadata and config
77 | ├── chapter1/
78 | │ └── index.md # Chapter 1 content
79 | ├── chapter2/
80 | │ └── index.md # Chapter 2 content
81 | └── images/
82 | └── cover.jpg # Optional cover image
83 |
84 | # Export command
85 | python cli.py content/zh/book/my-handbook \
86 | -o static/files/my-handbook.pdf \
87 | --generate-summary
88 | ```
89 |
90 | ### Example 2: Emoji-Enabled Technical Documentation
91 |
92 | ```bash
93 | # Export with emoji support and diagnostics
94 | python cli.py content/zh/book/tech-guide \
95 | -o tech-guide-with-emoji.pdf \
96 | --emoji \
97 | --diagnostics
98 |
99 | # Example output
100 | ✅ Emoji fonts detected: Apple Color Emoji, Noto Color Emoji
101 | 🎨 Using emoji font: Apple Color Emoji
102 | 🚀 PDF generated successfully at tech-guide-with-emoji.pdf
103 | ```
104 |
105 | ### Example 3: Custom Template and Appendix
106 |
107 | ```bash
108 | # Use custom template with appendix
109 | python cli.py content/zh/book/research-paper \
110 | -o research-paper.pdf \
111 | --template custom-template.tex \
112 | --appendix bibliography.md
113 | ```
114 |
115 | ### Example 4: Draft Content and Cache Management
116 |
117 | ```bash
118 | # Include draft content
119 | python cli.py content/zh/book/work-in-progress \
120 | -o draft-book.pdf \
121 | --include-drafts
122 |
123 | # Manage cache
124 | python cli.py --cache-info content/zh/book/my-book
125 | python cli.py --clean-cache 30 content/zh/book/my-book
126 | ```
127 |
128 | ## 🎨 Customizable Palettes and Styling
129 |
130 | ### Color Palette Configuration
131 |
132 | Configure document-wide color schemes in your book's `_index.md`:
133 |
134 | ```yaml
135 | ---
136 | title: "My Professional Book"
137 | book:
138 | # Typography color palette
139 | body_color: "#2C3E50" # Main text color
140 | heading_color: "#34495E" # All heading levels
141 | link_color: "#3498DB" # Hyperlinks and references
142 | code_color: "#E74C3C" # Inline code snippets
143 | quote_color: "#7F8C8D" # Blockquotes and citations
144 | caption_color: "#95A5A6" # Figure and table captions
145 |
146 | # Enhanced cover customization
147 | cover: "professional-cover.jpg"
148 | cover_title_text: "Advanced Data Science"
149 | cover_author_text: "Dr. Jane Smith"
150 | cover_subtitle_text: "A Comprehensive Guide"
151 |
152 | # Cover color scheme
153 | cover_title_color: "#FFFFFF"
154 | cover_author_color: "#ECF0F1"
155 | cover_subtitle_color: "#BDC3C7"
156 |
157 | # Font sizes (points)
158 | cover_title_font_size: 48
159 | cover_author_font_size: 24
160 | cover_subtitle_font_size: 18
161 |
162 | # Layout positioning
163 | cover_title_position: "center" # top, center, bottom
164 | cover_author_position: "bottom" # top, center, bottom
165 | ---
166 | ```
167 |
168 | ### Predefined Color Themes
169 |
170 | #### Professional Theme
171 |
172 | ```yaml
173 | book:
174 | body_color: "#2C3E50"
175 | heading_color: "#34495E"
176 | link_color: "#3498DB"
177 | code_color: "#E74C3C"
178 | quote_color: "#7F8C8D"
179 | ```
180 |
181 | #### Academic Theme
182 |
183 | ```yaml
184 | book:
185 | body_color: "#2E3440"
186 | heading_color: "#5E81AC"
187 | link_color: "#88C0D0"
188 | code_color: "#BF616A"
189 | quote_color: "#4C566A"
190 | ```
191 |
192 | #### Warm Theme
193 |
194 | ```yaml
195 | book:
196 | body_color: "#3E2723"
197 | heading_color: "#5D4037"
198 | link_color: "#FF5722"
199 | code_color: "#D84315"
200 | quote_color: "#6D4C41"
201 | ```
202 |
203 | ### Advanced Cover Customization
204 |
205 | ```yaml
206 | book:
207 | # Visual effects
208 | cover_overlay_enabled: true
209 | cover_text_shadow: true
210 | cover_background_overlay: true
211 | cover_overlay_opacity: 0.6
212 |
213 | # Advanced positioning
214 | cover_title_position: "center"
215 | cover_author_position: "bottom"
216 |
217 | # Custom export date format
218 | cover_export_date: "2024 年 1 月"
219 | ```
220 |
221 | ### Code Block Styling
222 |
223 | The tool automatically applies syntax highlighting with customizable color schemes:
224 |
225 | ````markdown
226 | ```python
227 | def hello_world():
228 |     print("Hello, World! 🌍")
229 |     return {"status": "success", "emoji": "✅"}
230 | ```
231 | 
232 | # SQL example with automatic highlighting
233 | 
234 | ```sql
235 | SELECT user_name, COUNT(*) as total_orders
236 | FROM orders
237 | WHERE created_date >= '2024-01-01'
238 | GROUP BY user_name;
239 | ```
240 | 
241 | ````
242 |
243 | Supported languages: Python, JavaScript, Go, Rust, Java, C++, SQL, YAML, JSON, Bash, HTML, CSS, and more.
244 |
245 | ## 🔧 API Reference
246 |
247 | ### Core Functions
248 |
249 | #### `build_pdf(book_dir, root_node, output_pdf, metadata, **options)`
250 |
251 | Main PDF generation function.
252 |
253 | **Parameters:**
254 | - `book_dir` (str): Path to Hugo book directory
255 | - `root_node` (Node): Parsed book structure tree
256 | - `output_pdf` (str): Output PDF file path
257 | - `metadata` (dict): Book configuration and metadata
258 | - `template_path` (str, optional): Custom LaTeX template path
259 | - `appendix_path` (str, optional): Additional content to append
260 | - `emoji` (bool): Enable comprehensive emoji support
261 |
262 | **Example:**
263 | ```python
264 | from export_book_pdf import build_pdf, build_tree, load_config
265 |
266 | # Parse book structure
267 | root_node = build_tree("content/zh/book/my-book")
268 | config = load_config("content/zh/book/my-book")
269 |
270 | # Generate PDF
271 | build_pdf(
272 | book_dir="content/zh/book/my-book",
273 | root_node=root_node,
274 | output_pdf="output.pdf",
275 | metadata=config,
276 | emoji=True
277 | )
278 | ```
279 |
280 | ## 🛠️ CLI Reference
281 |
282 | ### Basic Commands
283 |
284 | ```bash
285 | # Core export command
286 | python cli.py [OPTIONS]
287 | ```
288 |
289 | ### Command Line Options
290 |
291 | | Flag | Description | Example |
292 | |------|-------------|---------|
293 | | `-o, --output` | Output PDF file path | `-o my-book.pdf` |
294 | | `--generate-summary` | Create GitBook-style summary.md | `--generate-summary` |
295 | | `--template` | Custom LaTeX template path (XeLaTeX only) | `--template custom.tex` |
296 | | `--appendix` | Append additional content | `--appendix refs.md` |
297 | | `--emoji` | Enable emoji support with automatic engine selection | `--emoji` |
298 | | `--include-drafts` | Include draft content | `--include-drafts` |
299 | | `--diagnostics` | Run system compatibility diagnostics | `--diagnostics` |
300 | | `--clean-cache` | Clean cache files (optional: days) | `--clean-cache 30` |
301 | | `--cache-info` | Display cache information | `--cache-info` |
302 | | `--generate-troubleshooting-guide` | Generate troubleshooting guide | `--generate-troubleshooting-guide` |
303 | | `--max-table-width` | Maximum table width as fraction of text width | `--max-table-width 0.95` |
304 |
305 | ### Advanced Usage Examples
306 |
307 | #### Comprehensive Diagnostics
308 |
309 | ```bash
310 | # Run full system diagnostics
311 | python cli.py --diagnostics
312 | # Output:
313 | # 🔍 System validation: ✅ PASSED
314 | # ✅ LuaLaTeX Engine
315 | # ✅ Emoji Fonts (Apple Color Emoji)
316 | # ✅ Pandoc Available
317 | # ✅ Required LaTeX Packages
318 | ```
319 |
320 | #### Cache Management
321 |
322 | ```bash
323 | # View cache statistics
324 | python cli.py --cache-info content/zh/book/handbook
325 | # Output:
326 | # Cache directory: /path/to/cache
327 | # Cache files: 15
328 | # Total cache size: 12.34MB
329 |
330 | # Clean old cache files
331 | python cli.py --clean-cache 30 content/zh/book/handbook
332 | # Output: Cleaned 8 cache files older than 30 days.
333 | ```
334 |
335 | #### Draft Content Processing
336 |
337 | ```bash
338 | # Include draft chapters (with draft: true in front matter)
339 | python cli.py content/zh/book/work-in-progress \
340 | -o draft-version.pdf \
341 | --include-drafts
342 | ```
343 |
344 | ## 📄 Workflow Integration
345 |
346 | ### Hugo Integration
347 |
348 | The tool seamlessly integrates with Hugo book themes:
349 |
350 | ```yaml
351 | # hugo.yaml or config.yaml
352 | params:
353 | book:
354 | pdf_export: true
355 | pdf_output_dir: "static/files"
356 | ```
357 |
358 | ### Automated Build Integration
359 |
360 | #### GitHub Actions
361 |
362 | ```yaml
363 | name: Generate PDF Books
364 | on:
365 | push:
366 | paths: ['content/zh/book/**']
367 |
368 | jobs:
369 | pdf-export:
370 | runs-on: ubuntu-latest
371 | steps:
372 | - uses: actions/checkout@v3
373 |
374 | - name: Install LaTeX
375 | run: sudo apt-get install texlive-full
376 |
377 | - name: Install Pandoc
378 | run: sudo apt-get install pandoc
379 |
380 | - name: Generate PDF
381 | run: |
382 | python tools/pdf-book-exporter/cli.py \
383 | content/zh/book/my-handbook \
384 | -o static/files/my-handbook.pdf \
385 | --emoji --generate-summary
386 |
387 | - name: Upload PDF
388 | uses: actions/upload-artifact@v3
389 | with:
390 | name: generated-pdfs
391 | path: static/files/*.pdf
392 | ```
393 |
394 | ## ⚠️ Known Limitations
395 |
396 | ### Current Limitations
397 |
398 | - **Template Compatibility**: Custom templates (`--template`) only work with XeLaTeX engine
399 | - **Emoji Font Dependencies**: Emoji support requires system-installed emoji fonts (Apple Color Emoji, Noto Color Emoji, etc.)
400 | - **LaTeX Engine Switching**: The tool automatically selects between XeLaTeX and LuaLaTeX based on emoji requirements
401 | - **Table Width Processing**: Very wide tables may require manual adjustment of `--max-table-width` parameter
402 | - **Image Format Support**: WebP images are automatically converted to PNG, which may increase processing time
403 | - **Cache Dependencies**: Image processing cache is tied to file modification times and may require manual clearing after system changes
404 |
405 | ### Performance Considerations
406 |
407 | - **Large Books**: Processing time increases significantly for books with 200+ pages
408 | - **Image-Heavy Content**: Books with many images may require substantial cache space
409 | - **Font Loading**: First-time emoji font detection adds ~2-3 seconds to processing time
410 | - **Memory Usage**: Large tables and complex formatting may require 2GB+ RAM
411 |
412 | ### Compatibility Notes
413 |
414 | - **Operating Systems**: Full emoji support requires macOS 10.12+, Ubuntu 18.04+, or Windows 10+
415 | - **LaTeX Distributions**: TeX Live 2020+ recommended for best compatibility
416 | - **Pandoc Version**: Requires Pandoc 2.14+ for optimal table processing
417 |
418 | ## 📋 Content Guidelines
419 |
420 | ### Book Structure
421 |
422 | ```
423 | content/zh/book/my-handbook/
424 | ├── _index.md # Required: Book metadata
425 | ├── chapter1-introduction/
426 | │ └── index.md # Chapter content
427 | ├── chapter2-basics/
428 | │ └── index.md
429 | ├── chapter3-advanced/
430 | │ ├── index.md
431 | │ └── images/ # Chapter-specific images
432 | │ └── diagram.png
433 | ├── images/ # Book-level images
434 | │ ├── cover.jpg # Optional: Book cover
435 | │ └── backcover.jpg # Optional: Back cover
436 | └── SUMMARY.md # Generated automatically
437 | ```
438 |
439 | ### Front Matter Configuration
440 |
441 | #### Book-level (`_index.md`)
442 |
443 | ```yaml
444 | ---
445 | title: "Complete Programming Guide"
446 | weight: 1
447 | book:
448 | title: "Complete Programming Guide"
449 | author: "Jane Developer"
450 | date: "2024-01-15"
451 | description: "A comprehensive guide to modern programming"
452 | language: "zh-hans"
453 |
454 | # PDF-specific settings
455 | cover: "cover.jpg"
456 | appendix: true
457 |
458 | # Back-cover configuration (optional)
459 | backcover_image: "back_qr.png" # relative path, any raster img
460 | backcover_text: |
461 | **扫码关注公众号**
462 | https://my.site/book
463 |
464 | # Color customization
465 | body_color: "#2C3E50"
466 | heading_color: "#34495E"
467 | link_color: "#3498DB"
468 |
469 | # Cover customization
470 | cover_title_text: "完整编程指南"
471 | cover_author_text: "张三"
472 | cover_title_color: "#FFFFFF"
473 | cover_author_color: "#ECF0F1"
474 | ---
475 |
476 | # Introduction
477 |
478 | This book covers comprehensive programming concepts...
479 | ```
480 |
481 | #### Chapter-level (`index.md`)
482 |
483 | ```yaml
484 | ---
485 | title: "Getting Started"
486 | weight: 10
487 | draft: false # Set to true to exclude from PDF
488 | publish: true # Set to false to exclude from PDF
489 | export_pdf: true # Set to false to exclude from PDF
490 | ---
491 |
492 | # Getting Started
493 |
494 | Welcome to the first chapter...
495 | ```
496 |
497 | ### Content Exclusion
498 |
499 | Control which content appears in PDF exports:
500 |
501 | ```yaml
502 | ---
503 | title: "Work in Progress Chapter"
504 | weight: 99
505 | draft: true # Excluded unless --include-drafts
506 | publish: false # Always excluded
507 | export_pdf: false # Excluded from PDF only
508 | pdf: false # Alternative to export_pdf
509 | ---
510 | ```
511 |
512 | ## 🎯 Advanced Features
513 |
514 | ### Multi-Language Support
515 |
516 | ```yaml
517 | book:
518 | language: "zh-hans" # Chinese Simplified
519 | # Automatic font selection:
520 | # - Source Han Sans SC (preferred)
521 | # - Noto Sans CJK SC
522 | # - PingFang SC (macOS)
523 | ```
524 |
525 | ### Emoji Rendering
526 |
527 | ```bash
528 | # Enable comprehensive emoji support
529 | python cli.py content/book/emoji-guide \
530 | -o emoji-guide.pdf \
531 | --emoji
532 |
533 | # The tool automatically:
534 | # ✅ Detects system emoji fonts
535 | # 🎨 Configures optimal rendering engine
536 | # 🚀 Provides fallback options
537 | ```
538 |
539 | ### Image Processing
540 |
541 | The tool automatically handles:
542 |
543 | - **Format conversion**: WebP → PNG, SVG → PNG
544 | - **Remote images**: Downloads and caches URLs
545 | - **Smart caching**: Avoids reprocessing unchanged images
546 | - **Size optimization**: Maintains quality while reducing file size
547 |
548 | ### Table Enhancement
549 |
550 | Advanced table processing includes:
551 |
552 | - **Auto-wrapping**: Long content automatically wraps
553 | - **Responsive sizing**: Tables adapt to page width
554 | - **Break handling**: Smart page breaks for long tables
555 | - **Styling**: Professional borders and spacing
556 |
557 | ## 🐛 Troubleshooting
558 |
559 | ### Common Issues
560 |
561 | #### 1. LaTeX Engine Not Found
562 |
563 | ```bash
564 | # Check if LaTeX is installed
565 | xelatex --version
566 | lualatex --version
567 |
568 | # Install TeX Live (recommended)
569 | # macOS: brew install --cask mactex
570 | # Ubuntu: sudo apt-get install texlive-full
571 | ```
572 |
573 | #### 2. Emoji Not Rendering
574 |
575 | ```bash
576 | # Run diagnostics to identify issues
577 | python cli.py --diagnostics
578 |
579 | # Install emoji fonts if needed
580 | # macOS: Already included (Apple Color Emoji)
581 | # Ubuntu: sudo apt-get install fonts-noto-color-emoji
582 | # Windows: Available in Windows 10+
583 | ```
584 |
585 | #### 3. Chinese Characters Not Displaying
586 |
587 | ```bash
588 | # Install CJK fonts
589 | # macOS: brew install font-source-han-sans
590 | # Ubuntu: sudo apt-get install fonts-noto-cjk
591 | ```
592 |
593 | #### 4. Memory Issues with Large Books
594 |
595 | ```bash
596 | # Process in smaller chunks or increase system memory
597 | # Use cache to avoid reprocessing images
598 | python cli.py --clean-cache 0 # Clear cache if needed
599 | ```
600 |
601 | ### Getting Help
602 |
603 | 1. **Run diagnostics**: `python cli.py --diagnostics`
604 | 2. **Generate troubleshooting guide**: `--generate-troubleshooting-guide`
605 | 3. **Check logs**: Enable verbose output in the script
606 | 4. **Community support**: Create an issue with diagnostic output
607 |
608 | ## 📊 Performance and Statistics
609 |
610 | ### Example Performance Metrics
611 |
612 | ```bash
613 | # Typical processing times
614 | Small book (5 chapters, 20 pages): ~15 seconds
615 | Medium book (15 chapters, 100 pages): ~45 seconds
616 | Large book (30 chapters, 300 pages): ~2 minutes
617 |
618 | # With caching enabled:
619 | Subsequent runs: ~5-10 seconds (cache hit rate: 85%+)
620 | ```
621 |
622 | ### Cache Management
623 |
624 | ```bash
625 | # Monitor cache usage
626 | python cli.py --cache-info content/book/handbook
627 | # Output:
628 | # Cache directory: /path/to/cache
629 | # Cache files: 25
630 | # image1_a1b2c3d4.png: 125.3KB, 2.5 days old
631 | # image2_e5f6g7h8.png: 89.7KB, 1.2 days old
632 | # Total cache size: 15.67MB
633 |
634 | # Clean old cache files
635 | python cli.py --clean-cache 7 # Remove files older than 7 days
636 | ```
637 |
638 | ## 🤝 Contributing
639 |
640 | Contributions are welcome! Please feel free to submit issues, feature requests, or pull requests.
641 |
642 | ### Development Setup
643 |
644 | ```bash
645 | # Clone the repository
646 | git clone https://github.com/rootsongjc/pdf-book-exporter.git
647 | cd pdf-book-exporter
648 |
649 | # Install dependencies
650 | ./install_pdf_dependencies.sh
651 | ```
652 |
653 | ---
654 |
655 | **Professional PDF generation for Hugo books with comprehensive multilingual and emoji support.**
656 |
--------------------------------------------------------------------------------
/image_utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import shutil
4 | import hashlib
5 | import subprocess
6 | from pathlib import Path
7 | import cache_utils
8 |
def latex_escape(s):
    """Make a filesystem path safe for use inside a LaTeX argument.

    Backslashes are normalized to forward slashes first, then the LaTeX
    special characters '_', '#', '%', '&' and spaces are backslash-escaped.
    """
    # Order matters: the backslash -> slash normalization must run first,
    # because every later substitution introduces new backslashes.
    replacements = (
        ('\\', '/'),
        ('_', '\\_'),
        ('#', '\\#'),
        ('%', '\\%'),
        ('&', '\\&'),
        (' ', '\\ '),
    )
    for old, new in replacements:
        s = s.replace(old, new)
    return s
11 |
def download_image(url, output_path, timeout=30):
    """Download ``url`` to ``output_path``.

    A browser-like User-Agent is sent because some image hosts reject the
    default urllib agent.  The response is streamed to disk rather than
    buffered in memory, and the request aborts after ``timeout`` seconds
    (the original code had no timeout and could hang a build forever).

    Returns True on success, False on any failure (error is printed).
    """
    import urllib.request
    try:
        req = urllib.request.Request(
            url,
            headers={
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }
        )
        with urllib.request.urlopen(req, timeout=timeout) as response:
            with open(output_path, 'wb') as f:
                # Stream in chunks instead of response.read() to avoid
                # holding large images fully in memory.
                shutil.copyfileobj(response, f)
        print(f"Downloaded {url} to {output_path}")
        return True
    except Exception as e:
        print(f"Failed to download image {url}: {e}")
        return False
29 |
def find_image_file_recursive(book_dir, img_name, current_file_path):
    """Locate an image referenced from a markdown file.

    Search order: the referencing file's own directory, a handful of
    conventional fixed locations, then a recursive walk of the book
    directory followed by the global static folders.  Any URL query
    string or fragment on the image name is stripped first.

    Returns the path of the first match, or None when nothing is found.
    """
    # Drop "?query" and "#fragment" suffixes, if present.
    img_name = img_name.split('?')[0].split('#')[0]

    # 1. Relative to the file that references the image.
    referencing_dir = os.path.dirname(current_file_path)
    relative_candidate = os.path.abspath(os.path.join(referencing_dir, img_name))
    if os.path.exists(relative_candidate):
        return relative_candidate

    # 2. Conventional fixed locations.
    for base in (book_dir,
                 os.path.join(book_dir, 'images'),
                 os.path.join('static', 'images'),
                 os.path.join('static', 'files')):
        candidate = os.path.join(base, img_name)
        if os.path.exists(candidate):
            return candidate

    # 3. Recursive search: book directory first, then static dirs.
    #    os.walk on a missing directory simply yields nothing.
    for search_root in (book_dir, 'static/images', 'static/files'):
        for root, _dirs, files in os.walk(search_root):
            if img_name in files:
                return os.path.join(root, img_name)

    return None
53 |
def convert_svg_to_png(svg_path, output_dir, cache_dir=None):
    """Convert an SVG file to PNG using the project's svg2png.sh helper.

    When ``cache_dir`` is given, a previously converted PNG is copied out
    of the cache instead of re-running the conversion, and fresh results
    are saved back into the cache.

    Returns the PNG path on success, None on failure (errors printed).
    """
    if cache_dir:
        cached_path = cache_utils.get_cached_image(svg_path, cache_dir, '.png')
        if cached_path:
            output_name = os.path.splitext(os.path.basename(svg_path))[0] + '.png'
            output_path = os.path.join(output_dir, output_name)
            shutil.copy2(cached_path, output_path)
            return output_path
    svg2png_script = os.path.abspath(os.path.join(os.path.dirname(__file__), '../../scripts/svg2png.sh'))
    if not os.path.exists(svg2png_script):
        print(f"svg2png.sh not found at {svg2png_script}")
        return None
    # splitext strips only the final extension; the previous
    # str.replace('.svg', '.png') corrupted names containing '.svg'
    # anywhere else (e.g. 'icon.svg.old.svg').
    png_name = os.path.splitext(os.path.basename(svg_path))[0] + '.png'
    png_path = os.path.join(output_dir, png_name)
    try:
        subprocess.run([svg2png_script, svg_path, png_path], check=True)
        if os.path.exists(png_path):
            print(f"Converted {svg_path} to {png_path} (via svg2png.sh)")
            if cache_dir:
                cache_utils.save_to_cache(svg_path, png_path, cache_dir)
            return png_path
        else:
            print(f"svg2png.sh did not produce {png_path}")
            return None
    except Exception as e:
        print(f"Error using svg2png.sh: {e}")
        return None
83 |
def convert_webp_to_png(webp_path, output_dir, cache_dir=None):
    """Convert a WebP image to PNG using ImageMagick's ``magick`` tool.

    When ``cache_dir`` is given, a previously converted PNG is copied out
    of the cache instead of re-running the conversion, and fresh results
    are saved back into the cache.

    Returns the PNG path on success, None when the conversion fails or
    ImageMagick is not installed (a warning is printed).
    """
    if cache_dir:
        cached_path = cache_utils.get_cached_image(webp_path, cache_dir, '.png')
        if cached_path:
            output_name = os.path.splitext(os.path.basename(webp_path))[0] + '.png'
            output_path = os.path.join(output_dir, output_name)
            shutil.copy2(cached_path, output_path)
            return output_path
    try:
        # splitext strips only the final extension; the previous
        # str.replace('.webp', '.png') corrupted names containing
        # '.webp' anywhere else.
        png_name = os.path.splitext(os.path.basename(webp_path))[0] + '.png'
        png_path = os.path.join(output_dir, png_name)
        cmd = ['magick', webp_path, png_path]
        subprocess.run(cmd, check=True, capture_output=True)
        print(f"Converted {webp_path} to {png_path}")
        if cache_dir:
            cache_utils.save_to_cache(webp_path, png_path, cache_dir)
        return png_path
    except (subprocess.CalledProcessError, FileNotFoundError):
        print(f"Warning: Could not convert {webp_path} to PNG. Install ImageMagick.")
        return None
106 |
def process_images_in_content(content, book_dir, temp_dir, temp_pngs, current_file_path, cache_dir=None):
    """Rewrite markdown image references in ``content`` as LaTeX figures.

    Local images are resolved via find_image_file_recursive and copied into
    ``temp_dir`` (SVG/WebP converted to PNG first).  Remote http(s) images
    are downloaded, converted to PNG, and cached in ``cache_dir`` when one
    is given.  Every file staged in ``temp_dir`` is appended to
    ``temp_pngs`` so the caller can clean up.  Mermaid code fences,
    ``{:...}``-only lines outside code blocks, and Hugo template syntax
    (``{{< >}}`` / ``{{% %}}``) are stripped as a side effect.

    Returns the transformed markdown string.
    """
    os.makedirs(temp_dir, exist_ok=True)
    # abs source path -> escaped staged path, so duplicate references
    # reuse the first staged copy.
    processed_images = {}

    # Only remove mermaid code blocks, not other types
    content = re.sub(r'```mermaid[\s\S]*?```', '', content)

    # Track code block boundaries to avoid processing content inside them
    def is_inside_code_block(text, position):
        # Count ``` fences before `position`; an odd count means the
        # position falls between an opening and a closing fence.
        before_text = text[:position]
        code_blocks = re.finditer(r'```[\w]*', before_text)
        count = len(list(code_blocks))
        return count % 2 == 1  # Odd count means we're inside a code block

    def replace_image(match):
        # re.sub callback for one markdown image `![alt](path)`.
        # Check if this image is inside a code block
        if is_inside_code_block(content, match.start()):
            return match.group(0)  # Don't process images inside code blocks

        alt_text = match.group(1)
        img_path = match.group(2)
        if img_path.startswith('http://') or img_path.startswith('https://'):
            # Remote image: derive a stable cache filename from the URL.
            url_hash = hashlib.md5(img_path.encode()).hexdigest()[:12]
            original_filename = os.path.basename(img_path.split('?')[0])
            base_name = os.path.splitext(original_filename)[0]
            ext = os.path.splitext(original_filename)[1].lower()
            cached_filename = f"{base_name}_{url_hash}.png"
            cached_path = os.path.join(cache_dir, cached_filename) if cache_dir else None
            metadata = {}
            if cache_dir:
                import cache_utils
                metadata = cache_utils.load_cache_metadata(cache_dir)
            if cache_dir and cached_filename in metadata and os.path.exists(cached_path):
                # Cache hit: reuse the previously downloaded/converted PNG.
                print(f"Using cached remote image: {cached_path}")
                abs_path = cached_path
            else:
                # Cache miss: download, then normalize to PNG by format.
                # NOTE(review): when cache_dir is None, cached_path is None
                # and the copy/convert targets below would fail — presumably
                # callers always pass a cache_dir; verify.
                temp_download_path = os.path.join(temp_dir, f"download_{url_hash}{ext}")
                if download_image(img_path, temp_download_path):
                    if ext == '.webp':
                        png_path = convert_webp_to_png(temp_download_path, temp_dir, cache_dir)
                        if png_path:
                            shutil.copy2(png_path, cached_path)
                            if cache_dir:
                                metadata[cached_filename] = {
                                    'source_url': img_path,
                                    'cached_at': __import__('time').time(),
                                    'cache_path': cached_path
                                }
                                cache_utils.save_cache_metadata(cache_dir, metadata)
                            abs_path = cached_path
                        else:
                            print(f"Warning: Failed to convert downloaded WebP to PNG: {img_path}")
                            return match.group(0)
                    elif ext == '.svg':
                        png_path = convert_svg_to_png(temp_download_path, temp_dir, cache_dir)
                        if png_path:
                            shutil.copy2(png_path, cached_path)
                            if cache_dir:
                                metadata[cached_filename] = {
                                    'source_url': img_path,
                                    'cached_at': __import__('time').time(),
                                    'cache_path': cached_path
                                }
                                cache_utils.save_cache_metadata(cache_dir, metadata)
                            abs_path = cached_path
                        else:
                            print(f"Warning: Failed to convert downloaded SVG to PNG: {img_path}")
                            return match.group(0)
                    else:
                        try:
                            if ext == '.gif':
                                # '[0]' selects the first GIF frame only.
                                cmd = ['magick', temp_download_path + '[0]', cached_path]
                            else:
                                cmd = ['magick', temp_download_path, cached_path]
                            subprocess.run(cmd, check=True, capture_output=True)
                            if cache_dir:
                                metadata[cached_filename] = {
                                    'source_url': img_path,
                                    'cached_at': __import__('time').time(),
                                    'cache_path': cached_path
                                }
                                cache_utils.save_cache_metadata(cache_dir, metadata)
                            abs_path = cached_path
                        except (subprocess.CalledProcessError, FileNotFoundError):
                            # ImageMagick unavailable/failed: cache the raw
                            # download unconverted as a best effort.
                            print(f"Warning: Could not convert downloaded image {img_path} to PNG")
                            shutil.copy2(temp_download_path, cached_path)
                            if cache_dir:
                                metadata[cached_filename] = {
                                    'source_url': img_path,
                                    'cached_at': __import__('time').time(),
                                    'cache_path': cached_path
                                }
                                cache_utils.save_cache_metadata(cache_dir, metadata)
                            abs_path = cached_path
                    if os.path.exists(temp_download_path):
                        os.remove(temp_download_path)
                else:
                    # Download failed: the image reference is replaced by a
                    # blank line, i.e. silently dropped from the PDF.
                    print(f"Warning: Failed to download image: {img_path}")
                    return f"\n\n"
        else:
            # Local image: resolve relative to the book tree.
            abs_path = find_image_file_recursive(book_dir, img_path, current_file_path)
            if not abs_path:
                print(f"Warning: Image not found: {img_path} in {current_file_path}")
                return match.group(0)
        if abs_path in processed_images:
            # Already staged once: reuse the escaped path, new caption.
            escaped_path = processed_images[abs_path]
            latex = ('\n\\begin{figure}[htbp]\n' +
                     '    \\centering\n' +
                     f'    \\includegraphics[width=0.8\\textwidth]{{{escaped_path}}}\n' +
                     f'    \\caption{{{alt_text}}}\n' +
                     '\\end{figure}\n')
            return latex
        # Stage the (possibly converted) image into temp_dir.
        ext = os.path.splitext(abs_path)[1].lower()
        target_path = ''
        if ext == '.svg':
            png_path = convert_svg_to_png(abs_path, temp_dir, cache_dir)
            if not png_path or not os.path.exists(png_path):
                print(f"Warning: Failed to convert SVG to PNG: {abs_path}")
                return match.group(0)
            base_name = os.path.splitext(os.path.basename(abs_path))[0]
            unique_name = f"{base_name}.png"
            target_path = os.path.join(temp_dir, unique_name)
            if png_path != target_path:
                shutil.copy(png_path, target_path)
            temp_pngs.append(target_path)
        elif ext == '.webp':
            png_path = convert_webp_to_png(abs_path, temp_dir, cache_dir)
            if not png_path or not os.path.exists(png_path):
                print(f"Warning: Failed to convert WEBP to PNG: {abs_path}")
                return match.group(0)
            base_name = os.path.splitext(os.path.basename(abs_path))[0]
            unique_name = f"{base_name}.png"
            target_path = os.path.join(temp_dir, unique_name)
            if png_path != target_path:
                shutil.copy(png_path, target_path)
            temp_pngs.append(target_path)
        else:
            unique_name = os.path.basename(abs_path)
            target_path = os.path.join(temp_dir, unique_name)
            shutil.copy(abs_path, target_path)
            temp_pngs.append(target_path)
        escaped_path = latex_escape(target_path)
        processed_images[abs_path] = escaped_path
        latex = ('\n\\begin{figure}[htbp]\n' +
                 '    \\centering\n' +
                 f'    \\includegraphics[width=0.8\\textwidth]{{{escaped_path}}}\n' +
                 f'    \\caption{{{alt_text}}}\n' +
                 '\\end{figure}\n')
        return latex
    content = re.sub(r'!\[(.*?)\]\((.*?)\)', replace_image, content)

    # Remove Hugo shortcodes and HTML comments, but avoid processing content inside code blocks
    lines = content.split('\n')
    processed_lines = []
    inside_code_block = False

    for line in lines:
        # Check if we're entering or leaving a code block
        if line.strip().startswith('```'):
            inside_code_block = not inside_code_block
            processed_lines.append(line)
            continue

        # If we're inside a code block, preserve the line as-is
        if inside_code_block:
            processed_lines.append(line)
            continue

        # Only apply cleanup outside of code blocks:
        # drop lines that consist solely of a `{:...}` attribute block.
        # NOTE(review): the empty alternative `|)` makes this pattern also
        # match whitespace-only lines, so ALL blank lines outside code
        # blocks are removed here — verify that is intended, since it can
        # merge markdown paragraphs.
        if re.match(r'^\s*(\{:[^}]*\}|)\s*$', line):
            continue  # Skip this line entirely
        else:
            processed_lines.append(line)

    content = '\n'.join(processed_lines)

    # Remove Hugo template syntax ({{< ... >}} and {{% ... %}})
    content = re.sub(r'{{[%<][\s\S]*?[%>]}}', '', content)
    return content
287 |
def prepare_cover_for_latex(cover_path, config, temp_dir, cache_dir=None):
    """Prepare cover image for LaTeX processing without text overlay.

    Returns None when no cover exists.  WebP covers are converted to PNG
    (LaTeX engines handle PNG more reliably), with the conversion cached
    under ``cache_dir`` when given.  All other formats — and every error
    path — fall back to returning ``cover_path`` unchanged.
    """
    if not cover_path or not os.path.exists(cover_path):
        print("No cover image found")
        return None

    try:
        # Get cover configuration
        cover_config = config.get('cover_config', {})

        # If text overlay is disabled, just return the original image
        if not cover_config.get('overlay_enabled', True):
            print("Cover text overlay disabled, using original image")
            return cover_path

        # For WebP covers, convert to PNG for better LaTeX compatibility
        if cover_path.lower().endswith('.webp'):
            print(f"Converting WebP cover to PNG for LaTeX: {cover_path}")
            try:
                from PIL import Image
                import uuid

                # Generate cache key for WebP conversion
                source_hash = cache_utils.get_file_hash(cover_path)[:12] if cover_path else "default"
                cache_key = f"cover_png_{source_hash}"

                # Check cache first
                if cache_dir:
                    cached_cover = _get_cached_image_by_key(cache_key, cache_dir, '.png')
                    if cached_cover:
                        print(f"Using cached PNG cover: {cached_cover}")
                        return cached_cover

                # Convert WebP to PNG
                with Image.open(cover_path) as img:
                    if img.mode != 'RGB':
                        # NOTE(review): this flattens/discards any alpha
                        # channel — confirm covers never rely on
                        # transparency.
                        img = img.convert('RGB')

                    png_cover_path = os.path.join(temp_dir, f"cover_{uuid.uuid4().hex[:8]}.png")
                    # 'quality' is a JPEG option and was meaningless for
                    # PNG output; removed.
                    img.save(png_cover_path, 'PNG')

                # Cache the result
                if cache_dir:
                    _save_to_cache_with_key(cache_key, png_cover_path, cache_dir)

                print(f"Converted cover to PNG: {png_cover_path}")
                return png_cover_path

            except ImportError:
                print("PIL (Pillow) not available, using original WebP cover")
                return cover_path
            except Exception as e:
                print(f"Error converting WebP cover: {e}")
                return cover_path
        else:
            # For non-WebP images, use directly
            print(f"Using original cover image: {cover_path}")
            return cover_path

    except Exception as e:
        print(f"Error preparing cover: {e}")
        return cover_path
350 |
351 |
def _get_cached_image_by_key(cache_key, cache_dir, extension='.png'):
    """Return the cached file registered under ``cache_key``, or None.

    Scans the cache metadata for entries tagged with the given key and
    returns the first recorded path that still exists on disk.  Any
    metadata error is printed and treated as a miss.
    """
    try:
        metadata = cache_utils.load_cache_metadata(cache_dir)
        candidate_paths = (
            info.get('cache_path')
            for info in metadata.values()
            if info.get('cache_key') == cache_key
        )
        for cache_path in candidate_paths:
            if cache_path and os.path.exists(cache_path):
                return cache_path
    except Exception as e:
        print(f"Error checking cache: {e}")
    return None
364 |
365 |
def _save_to_cache_with_key(cache_key, file_path, cache_dir):
    """Copy ``file_path`` into the cache under ``cache_key``.

    The file is stored as ``<cache_key>.png`` inside ``cache_dir`` and a
    matching metadata record is written.  Returns the cache path on
    success, None on any error (which is printed).
    """
    import time

    try:
        cache_filename = f"{cache_key}.png"
        cache_path = os.path.join(cache_dir, cache_filename)

        # Copy the file into the cache directory.
        shutil.copy2(file_path, cache_path)

        # Record the entry so future lookups by key can find it.
        metadata = cache_utils.load_cache_metadata(cache_dir)
        metadata[cache_filename] = {
            'cache_key': cache_key,
            'cached_at': time.time(),
            'cache_path': cache_path,
        }
        cache_utils.save_cache_metadata(cache_dir, metadata)

        print(f"Cached enhanced image: {cache_path}")
        return cache_path
    except Exception as e:
        print(f"Error saving to cache: {e}")
        return None
391 |
392 |
def get_available_fonts():
    """Pick the best installed font for Chinese text on this system.

    Runs ``fc-list`` and returns the first name in an ordered preference
    list that appears among the installed font families.  Falls back to
    'Source Han Sans SC' when fc-list is unavailable, fails, or nothing
    in the list matches.
    """
    import subprocess

    fallback = 'Source Han Sans SC'

    # Preference order, most desirable first.
    preferred_fonts = [
        'Source Han Sans SC',    # Adobe/Google Source Han Sans (Simplified Chinese)
        'Noto Sans CJK SC',      # Google Noto Sans CJK
        'PingFang SC',           # macOS default Chinese font
        'STSong',                # macOS Song typeface
        'FangSong',              # FangSong typeface
        'Hiragino Mincho Pro',   # Japanese typeface
        'Times New Roman',       # Fallback serif font
        'DejaVu Serif'           # Universal fallback
    ]

    try:
        listing = subprocess.run(['fc-list', ':', 'family'],
                                 capture_output=True, text=True, check=True)
    except (subprocess.CalledProcessError, FileNotFoundError):
        return fallback

    families = listing.stdout.split('\n')
    installed = lambda font: any(font in line for line in families)
    return next((font for font in preferred_fonts if installed(font)), fallback)
422 |
--------------------------------------------------------------------------------
/filters/emoji-passthrough.lua:
--------------------------------------------------------------------------------
1 | -- tools/pdf-book-exporter/filters/emoji-passthrough.lua
2 | -- Advanced emoji processing filter for LaTeX PDF generation
3 | --
4 | -- This filter provides comprehensive emoji support for LaTeX documents:
5 | -- 1. Detects Unicode emoji characters using strict range checking
6 | -- 2. Handles emoji variation selectors (emoji vs text style)
7 | -- 3. Wraps emojis with appropriate LaTeX font commands
8 | -- 4. Provides fallback text representations for emoji in code contexts
9 | -- 5. Processes inline code and code blocks with emoji-safe replacements
10 | --
11 | -- Key features:
12 | -- - Precise emoji detection (avoids false positives with punctuation)
13 | -- - Support for composite emoji sequences with variation selectors
14 | -- - Context-aware processing (normal text vs code blocks)
15 | -- - Comprehensive emoji-to-text mapping for accessibility
16 | -- - Integration with LaTeX emoji font commands
17 |
18 | -- Function to check if a character is an emoji
19 | --
20 | -- Uses strict Unicode range checking to identify genuine emoji characters
21 | -- while avoiding false positives with:
22 | -- - ASCII punctuation and symbols
23 | -- - Mathematical operators
24 | -- - Currency symbols
25 | -- - Arrows and technical symbols
26 | --
27 | -- Only includes actual emoji Unicode blocks:
28 | -- - Miscellaneous Symbols and Pictographs (1F300-1F5FF)
29 | -- - Emoticons (1F600-1F64F)
30 | -- - Transport and Map Symbols (1F680-1F6FF)
31 | -- - Plus specific common emoji from other ranges
-- Check whether a UTF-8 character is an emoji.
--
-- Fix: utf8.codepoint RAISES an error on malformed UTF-8 (it never
-- returns nil in Lua 5.3+), so the old `if not code` guard was dead and
-- a bad byte sequence could abort the whole pandoc filter.  Wrapping the
-- call in pcall makes malformed input simply report "not an emoji".
function is_emoji(char)
    local ok, code = pcall(utf8.codepoint, char)
    if not ok or not code then return false end

    -- Exclude ASCII range entirely (0x00-0x7F)
    if code < 0x80 then
        return false -- ASCII range, includes all punctuation, letters, numbers
    end

    -- Exclude Latin-1 supplement range (common punctuation)
    if code >= 0x80 and code <= 0xFF then
        return false
    end

    -- Exclude common punctuation ranges that might be misidentified
    if (code >= 0x2000 and code <= 0x206F) or -- General Punctuation (includes quotes, dashes)
       (code >= 0x20A0 and code <= 0x20CF) or -- Currency Symbols
       (code >= 0x2100 and code <= 0x214F and code ~= 0x2139) or -- Letterlike Symbols (except info ℹ)
       (code >= 0x2150 and code <= 0x218F) or -- Number Forms
       (code >= 0x2190 and code <= 0x21FF) or -- Arrows (might be emoji-like but usually not)
       (code >= 0x2200 and code <= 0x22FF) or -- Mathematical Operators
       (code >= 0x2300 and code <= 0x23FF and code ~= 0x2328) then -- Miscellaneous Technical (except keyboard)
        return false
    end

    -- Be very specific about emoji ranges - only include actual emoji blocks
    return (code >= 0x1F300 and code <= 0x1F5FF) or -- Symbols & Pictographs
           (code >= 0x1F600 and code <= 0x1F64F) or -- Emoticons
           (code >= 0x1F680 and code <= 0x1F6FF) or -- Transport & Map
           (code >= 0x1F700 and code <= 0x1F77F) or -- Alchemical
           (code >= 0x1F780 and code <= 0x1F7FF) or -- Geometric Shapes Extended
           (code >= 0x1F800 and code <= 0x1F8FF) or -- Supplemental Arrows-C
           (code >= 0x1F900 and code <= 0x1F9FF) or -- Supplemental Symbols
           (code >= 0x1FA00 and code <= 0x1FA6F) or -- Chess Symbols
           (code >= 0x1FA70 and code <= 0x1FAFF) or -- Symbols and Pictographs Extended-A
           (code >= 0x1F000 and code <= 0x1F02F) or -- Mahjong & Dominoes
           -- Very specific common emoji symbols from Miscellaneous Symbols
           (code == 0x2600) or -- Sun
           (code == 0x2601) or -- Cloud
           (code == 0x2614) or -- Umbrella
           (code == 0x2615) or -- Coffee
           (code == 0x26A0) or -- Warning Sign ⚠
           (code == 0x26BD) or -- Soccer Ball
           (code == 0x26BE) or -- Baseball
           (code == 0x2728) or -- Sparkles ✨
           (code == 0x2764) or -- Heavy Black Heart ❤
           (code == 0x2B50) or -- White Medium Star ⭐
           (code == 0x2139) or -- Information Source ℹ
           (code == 0x2328) or -- Keyboard ⌨
           -- Specific technical/UI symbols that are commonly used as emoji
           (code == 0x2713) or -- Check Mark ✓
           (code == 0x2717) or -- Cross Mark ✗
           (code == 0x274C) or -- Cross Mark ❌
           (code == 0x2705) -- White Heavy Check Mark ✅
end
87 |
88 | -- Function to check if a character is a variation selector
89 | --
90 | -- Variation selectors control whether Unicode characters appear in:
91 | -- - Text style (VS15 - 0xFE0E): Black and white, text-like appearance
92 | -- - Emoji style (VS16 - 0xFE0F): Colorful, pictographic appearance
93 | --
94 | -- This function detects these selectors to properly group them with
95 | -- their base emoji characters for LaTeX processing
-- Check whether a UTF-8 character is a Unicode variation selector
-- (VS15/VS16 or the supplementary selector plane).
--
-- Fix: utf8.codepoint RAISES an error on malformed UTF-8 rather than
-- returning nil, so the old `if not code` guard was dead code and bad
-- input could crash the filter; pcall makes it return false instead.
function is_variation_selector(char)
    local ok, code = pcall(utf8.codepoint, char)
    if not ok or not code then return false end

    return code == 0xFE0F or -- Variation Selector-16 (emoji style)
           code == 0xFE0E or -- Variation Selector-15 (text style)
           (code >= 0xE0100 and code <= 0xE01EF) -- Variation Selectors Supplement
end
104 |
105 | -- Process text and wrap emojis, handling composite emojis with variation selectors
106 | --
107 | -- This function is the core text processor for normal document content:
108 | -- 1. Iterates through text using UTF-8 aware character processing
109 | -- 2. Identifies emoji characters using is_emoji function
110 | -- 3. Looks ahead for variation selectors to build complete emoji sequences
111 | -- 4. Wraps emoji sequences with {\emojifont ...} LaTeX commands
112 | -- 5. Preserves non-emoji text unchanged
113 | --
114 | -- Special handling:
115 | -- - Properly calculates UTF-8 character boundaries (1-4 bytes)
116 | -- - Groups emojis with their variation selectors
117 | -- - Uses direct font commands for better LaTeX compatibility
-- Walk `text` byte by byte, wrapping each emoji (plus any trailing
-- variation selectors) in {\emojifont ...}; all other characters pass
-- through unchanged, except lone variation selectors, which are dropped.
-- NOTE(review): the byte-length decoding below assumes well-formed
-- UTF-8; a continuation byte (0x80-0xBF) appearing as a lead byte would
-- be misread as a 2-byte start — confirm inputs are valid UTF-8.
function process_text(text)
    local result = {}  -- output fragments, joined at the end
    local i = 1        -- current byte position in `text`

    while i <= #text do
        local char_start = i
        local char_end = i
        local byte = text:byte(i)

        if byte then
            -- Determine UTF-8 character length from the lead byte:
            -- <0x80 = 1 byte, <0xE0 = 2 bytes, <0xF0 = 3 bytes, else 4.
            if byte < 0x80 then
                char_end = i
            elseif byte < 0xE0 then
                char_end = i + 1
            elseif byte < 0xF0 then
                char_end = i + 2
            else
                char_end = i + 3
            end

            local char = text:sub(char_start, char_end)

            if is_emoji(char) then
                -- Check if next character is a variation selector
                local next_i = char_end + 1
                local emoji_sequence = char

                -- Look ahead for variation selectors so the selector stays
                -- inside the same font group as its base emoji.
                while next_i <= #text do
                    local next_byte = text:byte(next_i)
                    local next_char_end = next_i

                    if next_byte then
                        -- Determine next character length (same lead-byte rule)
                        if next_byte < 0x80 then
                            next_char_end = next_i
                        elseif next_byte < 0xE0 then
                            next_char_end = next_i + 1
                        elseif next_byte < 0xF0 then
                            next_char_end = next_i + 2
                        else
                            next_char_end = next_i + 3
                        end

                        local next_char = text:sub(next_i, next_char_end)

                        if is_variation_selector(next_char) then
                            -- Include the variation selector in the emoji sequence
                            emoji_sequence = emoji_sequence .. next_char
                            next_i = next_char_end + 1
                        else
                            break
                        end
                    else
                        break
                    end
                end

                -- Use direct emoji font command for better compatibility
                table.insert(result, '{\\emojifont ' .. emoji_sequence .. '}')
                i = next_i
            else
                -- Skip lone variation selectors that weren't processed with an emoji
                if not is_variation_selector(char) then
                    table.insert(result, char)
                end
                i = char_end + 1
            end
        else
            break
        end
    end

    return table.concat(result)
end
194 |
-- Convert emoji to text representation
--
-- Provides accessible fallback text for emojis in contexts where
-- emoji fonts are not available or appropriate (e.g., code blocks).
--
-- Maps common emojis to:
-- - Descriptive text (e.g., 😀 → ':grin:')
-- - Symbolic representations (e.g., ✅ → '[check]')
-- - Technical abbreviations (e.g., 🔧 → ':wrench:')
--
-- Covers major emoji categories:
-- - Facial expressions and emotions
-- - UI/UX symbols (check marks, warnings)
-- - Technical and office objects
-- - Nature and geography symbols
--
-- NOTE: the original table contained duplicate keys (📝, 📈, 📉 with
-- conflicting values; 📊 🛠 ⚙ 🔧 💡 🔥 ⭐ ⚡ repeated identically). In a Lua
-- table constructor the later entry silently wins, so the duplicates are
-- removed here keeping the winning (later) value. The map is also hoisted
-- to a file-local so it is built once instead of on every call.
local EMOJI_MAP = {
    ['😄'] = ':smile:',
    ['😀'] = ':grin:',
    ['😃'] = ':happy:',
    ['😁'] = ':beam:',
    ['😆'] = ':laugh:',
    ['😅'] = ':sweat:',
    ['😂'] = ':joy:',
    ['🤣'] = ':rofl:',
    ['✅'] = '[check]',
    ['❌'] = '[x]',
    ['⚠'] = '[warning]',
    ['ℹ'] = '[info]',
    ['🎉'] = ':party:',
    ['💥'] = ':boom:',
    ['📝'] = ':memo:',
    ['👍'] = ':+1:',
    ['👎'] = ':-1:',
    ['👌'] = ':ok:',
    ['🤝'] = ':handshake:',
    ['👏'] = ':clap:',
    ['🙏'] = ':pray:',
    ['💪'] = ':muscle:',
    ['✊'] = ':fist:',
    ['🔥'] = ':fire:',
    ['💡'] = ':bulb:',
    ['🚀'] = ':rocket:',
    ['⭐'] = ':star:',
    ['💯'] = ':100:',
    ['🎯'] = ':target:',
    ['📊'] = ':chart:',
    ['📈'] = ':trending_up:',
    ['📉'] = ':trending_down:',
    ['🔧'] = ':wrench:',
    ['⚙'] = ':gear:',
    ['🛠'] = ':tools:',
    ['🔍'] = ':search:',
    ['📱'] = ':phone:',
    ['💻'] = ':computer:',
    ['🖥'] = ':desktop:',
    ['⌨'] = ':keyboard:',
    ['🖱'] = ':mouse:',
    ['🖨'] = ':printer:',
    ['📷'] = ':camera:',
    ['🎥'] = ':video:',
    ['🎵'] = ':music:',
    ['🎶'] = ':notes:',
    ['📚'] = ':books:',
    ['📖'] = ':book:',
    ['✏'] = ':pencil:',
    ['🖊'] = ':pen:',
    ['📌'] = ':pin:',
    ['📎'] = ':paperclip:',
    ['🔗'] = ':link:',
    ['📧'] = ':email:',
    ['📨'] = ':inbox:',
    ['📩'] = ':outbox:',
    ['📤'] = ':outbox_tray:',
    ['📥'] = ':inbox_tray:',
    ['📦'] = ':package:',
    ['🏷'] = ':label:',
    ['🔖'] = ':bookmark:',
    ['📋'] = ':clipboard:',
    ['📄'] = ':page:',
    ['📃'] = ':document:',
    ['📑'] = ':pages:',
    ['🗂'] = ':folder:',
    ['📁'] = ':folder_open:',
    ['📂'] = ':folder_closed:',
    ['🗃'] = ':file_cabinet:',
    ['🗄'] = ':filing_cabinet:',
    ['🗑'] = ':trash:',
    ['🔒'] = ':lock:',
    ['🔓'] = ':unlock:',
    ['🔐'] = ':locked:',
    ['🔑'] = ':key:',
    ['🗝'] = ':old_key:',
    ['🔨'] = ':hammer:',
    ['⚒'] = ':hammer_pick:',
    ['🔩'] = ':nut_and_bolt:',
    ['⚡'] = ':zap:',
    ['🔋'] = ':battery:',
    ['🔌'] = ':plug:',
    ['🔦'] = ':flashlight:',
    ['🕯'] = ':candle:',
    ['🪔'] = ':lamp:',
    ['💧'] = ':droplet:',
    ['🌊'] = ':ocean:',
    ['❄'] = ':snowflake:',
    ['☀'] = ':sun:',
    ['🌙'] = ':moon:',
    ['🌟'] = ':star2:',
    ['✨'] = ':sparkles:',
    ['☁'] = ':cloud:',
    ['🌈'] = ':rainbow:',
    ['🌍'] = ':earth_africa:',
    ['🌎'] = ':earth_americas:',
    ['🌏'] = ':earth_asia:',
    ['🌐'] = ':globe:',
    ['🗺'] = ':world_map:',
    ['🧭'] = ':compass:',
    ['🏔'] = ':mountain:',
    ['⛰'] = ':mountain_peak:',
    ['🌋'] = ':volcano:',
    ['🗻'] = ':mount_fuji:',
    ['🏕'] = ':camping:',
    ['🏖'] = ':beach:',
    ['🏜'] = ':desert:',
    ['🏝'] = ':island:',
    ['🏞'] = ':park:',
    ['🏟'] = ':stadium:',
    ['🏛'] = ':classical_building:',
    ['🏗'] = ':construction:',
    ['🧱'] = ':brick:',
    ['🏘'] = ':houses:',
    ['🏚'] = ':house_abandoned:',
    ['🏠'] = ':house:',
    ['🏡'] = ':house_garden:',
    ['🏢'] = ':office:',
    ['🏣'] = ':post_office:',
    ['🏤'] = ':european_post_office:',
    ['🏥'] = ':hospital:',
    ['🏦'] = ':bank:',
    ['🏨'] = ':hotel:',
    ['🏩'] = ':love_hotel:',
    ['🏪'] = ':convenience_store:',
    ['🏫'] = ':school:',
    ['🏬'] = ':department_store:',
    ['🏭'] = ':factory:',
    ['🏯'] = ':japanese_castle:',
    ['🏰'] = ':european_castle:',
    ['💒'] = ':wedding:',
    ['🗼'] = ':tokyo_tower:',
    ['🗽'] = ':statue_of_liberty:',
    ['⛪'] = ':church:',
    ['🕌'] = ':mosque:',
    ['🛕'] = ':hindu_temple:',
    ['🕍'] = ':synagogue:',
    ['⛩'] = ':shinto_shrine:',
    ['🕋'] = ':kaaba:'
}

-- @param emoji a UTF-8 string holding one emoji character
-- @return the mapped text representation, or the generic ':emoji:' fallback
function emoji_to_text(emoji)
    return EMOJI_MAP[emoji] or ':emoji:'
end
366 |
-- Process text for code blocks - use a special marker that can be processed by LaTeX
--
-- Identical scanning logic to process_text, but emoji sequences are emitted
-- as (*@\emoji{...}@*) escape markers that lstlisting can interpret instead
-- of direct font commands.
function process_text_for_code(text)
    -- Number of bytes occupied by the UTF-8 character starting with byte b.
    local function char_len(b)
        if b < 0x80 then return 1 end
        if b < 0xE0 then return 2 end
        if b < 0xF0 then return 3 end
        return 4
    end

    local out = {}
    local pos = 1
    local len = #text

    while pos <= len do
        local lead = text:byte(pos)
        if not lead then break end

        local last = pos + char_len(lead) - 1
        local ch = text:sub(pos, last)

        if is_emoji(ch) then
            -- Absorb any variation selectors that immediately follow the emoji.
            local seq = ch
            local scan = last + 1

            while scan <= len do
                local nb = text:byte(scan)
                if not nb then break end

                local nlast = scan + char_len(nb) - 1
                local nch = text:sub(scan, nlast)

                if not is_variation_selector(nch) then break end
                seq = seq .. nch
                scan = nlast + 1
            end

            -- Escaped-listing marker so lstlisting can hand the emoji to LaTeX.
            out[#out + 1] = '(*@\\emoji{' .. seq .. '}@*)'
            pos = scan
        else
            -- Drop stray variation selectors that have no preceding emoji.
            if not is_variation_selector(ch) then
                out[#out + 1] = ch
            end
            pos = last + 1
        end
    end

    return table.concat(out)
end
444 |
-- Main filter function for processing Str (string) elements
--
-- Called by Pandoc for every string element in the document. Only acts when
-- the output format is LaTeX; other formats receive the element untouched.
--
-- If process_text wrapped any emoji in the string, the wrapped result is
-- returned as raw LaTeX; otherwise the original element passes through.
function Str(elem)
    if FORMAT:match 'latex' then
        local rendered = process_text(elem.text)
        if rendered ~= elem.text then
            return pandoc.RawInline('latex', rendered)
        end
    end
    return elem
end
466 |
-- Process inline code elements with emoji-to-text conversion
--
-- Inline code requires special handling because:
-- 1. Emoji fonts may not work properly in monospace/code contexts
-- 2. Code should remain readable in all output formats
-- 3. Emojis in code often serve as UI indicators or comments
--
-- Each emoji character is replaced with its textual fallback from
-- emoji_to_text; every other character (including CJK text and HTML tags)
-- is copied through unchanged. When at least one replacement happened the
-- result is emitted as raw LaTeX wrapped in \texttt{}.
function Code(elem)
    if not FORMAT:match 'latex' then
        return elem
    end

    local src = elem.text
    local pieces = {}
    local replaced = false
    local pos = 1

    while pos <= #src do
        local lead = src:byte(pos)
        if not lead then break end

        -- Width of the UTF-8 character starting at pos (1-4 bytes).
        local width
        if lead < 0x80 then
            width = 1
        elseif lead < 0xE0 then
            width = 2
        elseif lead < 0xF0 then
            width = 3
        else
            width = 4
        end

        local ch = src:sub(pos, pos + width - 1)

        if is_emoji(ch) then
            pieces[#pieces + 1] = emoji_to_text(ch)
            replaced = true
        else
            -- Non-emoji characters (CJK, HTML tags, ...) pass through as-is.
            pieces[#pieces + 1] = ch
        end

        pos = pos + width
    end

    if replaced then
        return pandoc.RawInline('latex', '\\texttt{' .. table.concat(pieces) .. '}')
    end
    return elem
end
533 |
-- Process code blocks with emoji-to-text conversion
--
-- Code blocks require emoji replacement because:
-- 1. LaTeX listings/minted packages may not handle emoji fonts properly
-- 2. Code blocks should maintain consistent monospace appearance
-- 3. Emojis in code are usually semantic indicators
--
-- Scans the block character by character, substituting each emoji with its
-- emoji_to_text fallback while leaving all other characters (CJK, HTML tags)
-- intact. A new CodeBlock carrying the original attributes is returned only
-- when at least one substitution was made.
function CodeBlock(elem)
    if not FORMAT:match 'latex' then
        return elem
    end

    local src = elem.text
    local pieces = {}
    local replaced = false
    local pos = 1

    while pos <= #src do
        local lead = src:byte(pos)
        if not lead then break end

        -- Width of the UTF-8 character starting at pos (1-4 bytes).
        local width
        if lead < 0x80 then
            width = 1
        elseif lead < 0xE0 then
            width = 2
        elseif lead < 0xF0 then
            width = 3
        else
            width = 4
        end

        local ch = src:sub(pos, pos + width - 1)

        if is_emoji(ch) then
            pieces[#pieces + 1] = emoji_to_text(ch)
            replaced = true
        else
            -- Non-emoji characters (CJK, HTML tags, ...) pass through as-is.
            pieces[#pieces + 1] = ch
        end

        pos = pos + width
    end

    if replaced then
        -- Preserve the original attributes (language class, ids, etc.).
        return pandoc.CodeBlock(table.concat(pieces), elem.attr)
    end
    return elem
end
600 |
--------------------------------------------------------------------------------