├── .gitignore
├── .pre-commit-config.yaml
├── LICENSE
├── README.md
├── assets
    └── images
    │   └── d99084735737c77dc3d3304cb78a411f.png
├── requirements-gpu.txt
├── requirements.txt
├── setup.py
└── x_pdf2md
    ├── .env.example
    ├── __init__.py
    ├── config.py
    ├── convert.py
    ├── image2md
        ├── __init__.py
        ├── car.png
        ├── get_image_title.py
        ├── image2text.py
        ├── prompts
        │   ├── description_prompt.md
        │   └── ocr_prompt.md
        └── vlm_function.py
    ├── image_utils
        ├── __init__.py
        ├── crop_text_areas.py
        ├── detect_and_sort.py
        ├── formula_recognize.py
        ├── image.png
        ├── layout_config.py
        ├── layout_detect.py
        ├── layout_sorter.py
        ├── layout_visualizer.py
        ├── models.py
        ├── process_page.py
        ├── region_image.py
        └── visualize_formula.py
    ├── markdown_formatter.py
    ├── ocr_utils
        ├── __init__.py
        ├── ocr_image.py
        ├── text_detection.py
        └── text_recogniize.py
    ├── pdf2md_converter.py
    ├── pdf_utils
        ├── __init__.py
        ├── pdf_to_image.py
        └── test_x_pdf2md.pdf
    ├── remote_image
        ├── __init__.py
        ├── image_names.json
        ├── image_serve.py
        ├── image_uploader.py
        ├── remote_image_config.py
        └── static
        │   ├── index.html
        │   ├── list.html
        │   └── upload.html
    ├── test_convert.py
    └── tests
        ├── __init__.py
        └── test_x_pdf2md.pdf


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Python
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | *.so
 6 | .Python
 7 | build/
 8 | develop-eggs/
 9 | dist/
10 | downloads/
11 | eggs/
12 | .eggs/
13 | lib/
14 | lib64/
15 | parts/
16 | sdist/
17 | var/
18 | wheels/
19 | *.egg-info/
20 | .installed.cfg
21 | *.egg
22 | 
23 | # Distribution / packaging
24 | .Python
25 | env/
26 | build/
27 | develop-eggs/
28 | dist/
29 | downloads/
30 | eggs/
31 | .eggs/
32 | lib/
33 | lib64/
34 | parts/
35 | sdist/
36 | var/
37 | wheels/
38 | *.egg-info/
39 | .installed.cfg
40 | *.egg
41 | 
42 | # Virtual environments
43 | venv/
44 | ENV/
45 | env/
46 | 
47 | # IDEs and editors
48 | .idea/
49 | .vscode/
50 | *.swp
51 | *.swo
52 | .DS_Store
53 | 
54 | # 环境变量文件
55 | .env
56 | .env.local
57 | .env.development.local
58 | .env.test.local
59 | .env.production.local
60 | 
61 | # others
62 | *.log
63 | test_datas
64 | output.md
65 | output


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | repos:
 2 | -   repo: https://github.com/pre-commit/pre-commit-hooks
 3 |     rev: v4.5.0
 4 |     hooks:
 5 |     -   id: trailing-whitespace
 6 |     -   id: end-of-file-fixer
 7 |     -   id: check-yaml
 8 |     -   id: check-added-large-files
 9 | 
10 | -   repo: https://github.com/psf/black
11 |     rev: 24.2.0
12 |     hooks:
13 |     -   id: black
14 |         language_version: python3
15 |         args: [--line-length=100]
16 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 3-Clause License
 2 | 
 3 | Copyright (c) 2025, li-xiu-qi
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | 1. Redistributions of source code must retain the above copyright
10 |    notice, this list of conditions and the following disclaimer.
11 | 
12 | 2. Redistributions in binary form must reproduce the above copyright
13 |    notice, this list of conditions and the following disclaimer in the
14 |    documentation and/or other materials provided with the distribution.
15 | 
16 | 3. Neither the name of the copyright holder nor the
17 |    names of its contributors may be used to endorse or promote products
18 |    derived from this software without specific prior written permission.
19 | 
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # x-pdf2md
  2 | 
  3 | ![alt text](assets/images/d99084735737c77dc3d3304cb78a411f.png)
  4 | 一个将PDF文档转换为Markdown的高级工具包，支持自动提取文本、识别公式、表格和图像。
  5 | 
  6 | ## 功能特点
  7 | 
  8 | - PDF文档页面转换为图像
  9 | - 基于深度学习的版面分析
 10 | - 数学公式识别并转换为LaTeX格式
 11 | - 表格提取并转换为HTML格式
 12 | - 图像自动通过多模态模型描述并上传到自定义的服务端
 13 | - 多栏文本智能识别与重排版
 14 | 
 15 | ## 安装
 16 | 
 17 | ### 1. 安装特殊依赖
 18 | 
 19 | 本项目依赖于PaddlePaddle和PaddleX进行深度学习模型推理，这些依赖需要单独安装：
 20 | 
 21 | #### CPU版本
 22 | 
 23 | ```bash
 24 | # 首先安装PaddlePaddle CPU版本
 25 | pip install paddlepaddle==3.0.0rc0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
 26 | 
 27 | # 然后安装PaddleX
 28 | pip install https://paddle-model-ecology.bj.bcebos.com/paddlex/whl/paddlex-3.0.0rc0-py3-none-any.whl
 29 | ```
 30 | 
 31 | #### GPU版本（CUDA 11.8）
 32 | 
 33 | ```bash
 34 | # 安装PaddlePaddle GPU版本
 35 | pip install paddlepaddle-gpu==3.0.0rc0 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
 36 | 
 37 | # 然后安装PaddleX
 38 | pip install https://paddle-model-ecology.bj.bcebos.com/paddlex/whl/paddlex-3.0.0rc0-py3-none-any.whl
 39 | ```
 40 | 
 41 | ### 2. 安装项目依赖与开发依赖
 42 | 
 43 | ```bash
 44 | # 安装项目依赖与开发依赖（requirements.txt 同时包含两者）
 45 | pip install -r requirements.txt
 46 | 
 47 | ```
 48 | 
 49 | #### 其他CUDA版本
 50 | 
 51 | 如果需要支持其他CUDA版本，请参考[PaddlePaddle官方安装指南](https://www.paddlepaddle.org.cn/install/quick)选择合适的安装命令。
 52 | 
 53 | ## 使用方法
 54 | 
 55 | ### 作为Python包导入
 56 | 
 57 | #### 快速转换方法
 58 | 
 59 | ```python
 60 | import os
 61 | from pathlib import Path
 62 | 
 63 | from x_pdf2md.convert import convert_pdf_to_markdown
 64 | 
 65 | 
 66 | def test_convert_pdf():
 67 |     """
 68 |     处理x_pdf2md/tests/test_x_pdf2md.pdf文件，并输出结果到output目录
 69 |     """
 70 |     # 获取当前模块所在目录
 71 |     current_module_dir = os.path.dirname(os.path.abspath(__file__))
 72 |     
 73 |     # 构建项目根目录路径
 74 |     project_root = os.path.dirname(current_module_dir)
 75 |     
 76 |     # 构建PDF文件路径（使用tests目录下的测试文件）
 77 |     pdf_path = os.path.join(project_root, "x_pdf2md", "tests", "test_x_pdf2md.pdf")
 78 |     
 79 |     # 构建输出目录路径
 80 |     output_dir = os.path.join(os.getcwd(), "output")
 81 |     
 82 |     # 确保PDF文件存在
 83 |     if not os.path.exists(pdf_path):
 84 |         print(f"错误：找不到测试PDF文件: {pdf_path}")
 85 |         print(f"请确保在 x_pdf2md/tests 目录中存在 test_x_pdf2md.pdf 文件")
 86 |         return False
 87 |     
 88 |     # 创建输出目录（如果不存在）
 89 |     os.makedirs(output_dir, exist_ok=True)
 90 |     
 91 |     print(f"开始处理PDF文件: {pdf_path}")
 92 |     
 93 |     try:
 94 |         # 调用转换函数
 95 |         output_path = convert_pdf_to_markdown(
 96 |             pdf_path=pdf_path,
 97 |             output_dir=output_dir,
 98 |             start_page=0,
 99 |             end_page=None,  # 处理所有页面
100 |             dpi=300,
101 |             upload_images=False,  # 默认不上传图片
102 |             output_md_path=os.path.join(output_dir, "test_result.md")
103 |         )
104 |         
105 |         print(f"PDF转换成功！输出文件路径: {output_path}")
106 |         return True
107 |     except Exception as e:
108 |         print(f"转换过程中出错: {str(e)}")
109 |         return False
110 | 
111 | 
112 | if __name__ == "__main__":
113 |     # 当直接运行此文件时执行转换
114 |     test_convert_pdf()
115 | ```
116 | 
117 | #### 图片上传服务
118 | 
119 | 启动本地图片上传服务器：
120 | 
121 | ```bash
122 | # 进入项目目录
123 | cd x_pdf2md/remote_image
124 | 
125 | # 启动服务
126 | python image_serve.py
127 | ```
128 | 
129 | 服务启动后，访问 <http://localhost:8100> 可以使用Web界面上传和管理图片。
130 | 
131 | #### 调用的时候可以传入default_uploader进行上传文件
132 | 
133 | ```python
134 | from x_pdf2md.remote_image import default_uploader
135 | 
136 |     # 初始化图片上传器（如果需要）
137 |     image_uploader = None
138 |     if upload_images:
139 |         image_uploader = default_uploader
140 | 
141 |     # 格式化结果，传递输出目录
142 |     formatted_pages = format_pdf_regions(regions, image_uploader, output_dir=output_dir)
143 |     
144 |     # 创建输出目录（如果需要）
145 |     if output_md_path:
146 |         output_dir = os.path.dirname(os.path.abspath(output_md_path))
147 |         if output_dir and not os.path.exists(output_dir):
148 |             os.makedirs(output_dir, exist_ok=True)
149 |             
150 |         # 保存为Markdown文件
151 |         with open(output_md_path, "w", encoding="utf-8") as f:
152 |             f.write("\n\n---\n\n".join(formatted_pages))
153 |         
154 |         # 输出处理统计
155 |         total_pages = len(regions)
156 |         total_regions = sum(len(page_regions) for page_regions in regions)
157 |         print(f"处理完成！共处理 {total_pages} 页，生成 {total_regions} 个区域图片")
158 |         print(f"Markdown文件已保存到: {output_md_path}")
159 |         
160 |         return output_md_path
161 | ```
162 | 
163 | ## 开源协议
164 | 
165 | 本项目使用 [BSD 3-Clause 开源协议](./LICENSE)。
166 | 


--------------------------------------------------------------------------------
/assets/images/d99084735737c77dc3d3304cb78a411f.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/li-xiu-qi/x-pdf2md/ef69a42c2b8ca9da128762fe2bda03bd909cbee0/assets/images/d99084735737c77dc3d3304cb78a411f.png


--------------------------------------------------------------------------------
/requirements-gpu.txt:
--------------------------------------------------------------------------------
 1 | # 项目依赖
 2 | fastapi
 3 | # pymupdf
 4 | pdfplumber
 5 | numpy
 6 | openai
 7 | opencv_contrib_python
 8 | opencv_python
 9 | opencv_python_headless
10 | 
11 | # PaddlePaddle GPU版本
12 | # 等效于conda命令: conda install paddlepaddle-gpu==3.0.0rc1 paddlepaddle-cuda=12.3 -c paddle -c nvidia
13 | # CUDA 12.3版本
14 | paddlepaddle-gpu==3.0.0rc1 -i https://www.paddlepaddle.org.cn/packages/stable/cu123/
15 | # 如需CUDA 11.8版本，请使用下面的命令替代上面的命令
16 | # paddlepaddle-gpu==3.0.0rc0 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
17 | 
18 | https://paddle-model-ecology.bj.bcebos.com/paddlex/whl/paddlex-3.0.0rc0-py3-none-any.whl
19 | 
20 | Pillow
21 | python-dotenv
22 | Requests
23 | tqdm
24 | urllib3
25 | uvicorn
26 | 
27 | # 开发依赖
28 | pre-commit>=3.6.2
29 | black>=24.2.0
30 | isort>=5.13.2
31 | flake8>=7.0.0
32 | flake8-docstrings>=1.7.0
33 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | # 项目依赖
 2 | fastapi
 3 | # pymupdf
 4 | pdfplumber
 5 | numpy
 6 | openai
 7 | opencv_contrib_python
 8 | opencv_python
 9 | opencv_python_headless
10 | # PaddlePaddle CPU版本（必须在paddlex之前安装）
11 | paddlepaddle==3.0.0rc0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
12 | https://paddle-model-ecology.bj.bcebos.com/paddlex/whl/paddlex-3.0.0rc0-py3-none-any.whl
13 | Pillow
14 | python-dotenv
15 | Requests
16 | tqdm
17 | urllib3
18 | uvicorn
19 | python-multipart
20 | 
21 | 
22 | # 开发依赖
23 | pre-commit>=3.6.2
24 | black>=24.2.0
25 | isort>=5.13.2
26 | flake8>=7.0.0
27 | flake8-docstrings>=1.7.0
28 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | 
 3 | setup(
 4 |     name="x-pdf2md",
 5 |     version="0.1.0",
 6 |     packages=find_packages(include=['x_pdf2md', 'x_pdf2md.*']),  # 明确包含x_pdf2md包及其子包
 7 |     install_requires=[
 8 |         "tqdm>=4.45.0",
 9 |         "pdf2image>=1.14.0",
10 |         "Pillow>=8.0.0",
11 |         "numpy>=1.18.0",
12 |         "opencv-python>=4.5.0",
13 |         "pytesseract>=0.3.0",
14 |         "requests>=2.25.0",
15 |         "fastapi",
16 |         "pymupdf", 
17 |         "python-dotenv",
18 |         "uvicorn",
19 |         # PaddlePaddle和PaddleX需要特殊安装方式，不在这里列出
20 |     ],
21 |     author="li-xiu-qi",
22 |     author_email="lixiuqixiaoke@qq.com",
23 |     description="将PDF文档转换为Markdown的工具",
24 |     keywords="pdf, markdown, conversion",
25 |     url="",
26 |     license="BSD",
27 |     classifiers=[
28 |         "Development Status :: 3 - Alpha",
29 |         "Intended Audience :: Developers",
30 |         "Programming Language :: Python :: 3",
31 |         "License :: OSI Approved :: BSD License",
32 |     ],
33 |     entry_points={
34 |         'console_scripts': [
35 |             'x-pdf2md=x_pdf2md.main:main',  # 更新入口点指向新的main.py
36 |         ],
37 |     },
38 |     python_requires='>=3.7',
39 | )
40 | 


--------------------------------------------------------------------------------
/x_pdf2md/.env.example:
--------------------------------------------------------------------------------
1 | API_KEY=your_api_key
2 | BASE_URL=https://api.siliconflow.cn/v1
3 | 


--------------------------------------------------------------------------------
/x_pdf2md/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | x_pdf2md - 一个将PDF文档转换为Markdown的工具包
3 | """
4 | 
5 | __version__ = '0.1.0'
6 | 


--------------------------------------------------------------------------------
/x_pdf2md/config.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | 配置管理模块 - 集中管理项目配置
  5 | """
  6 | import os
  7 | from typing import Dict, Any, Optional
  8 | from dotenv import load_dotenv
  9 | 
 10 | # 加载.env文件中的环境变量
 11 | load_dotenv()
 12 | 
 13 | # 默认配置
 14 | DEFAULT_CONFIG = {
 15 |     # API配置
 16 |     "API_KEY": os.getenv("API_KEY", ""),  # 从环境变量获取API密钥
 17 |     "BASE_URL": os.getenv("BASE_URL", "https://api.siliconflow.cn/v1"),  # API基础URL
 18 | 
 19 |     # 图片服务配置
 20 |     "IMAGE_HOST": os.getenv("HOST", "0.0.0.0"),  # 图片服务器主机
 21 |     "IMAGE_PORT": int(os.getenv("PORT", "8100")),  # 图片服务器端口
 22 |     "UPLOAD_DIR": os.getenv("UPLOAD_DIR", "./uploads"),  # 图片上传目录
 23 | 
 24 |     # 模型配置
 25 |     "FORMULA_MODEL": os.getenv("FORMULA_MODEL", "PP-FormulaNet-L"),  # 公式识别模型
 26 |     "OCR_DET_MODEL": os.getenv("OCR_DET_MODEL", "PP-OCRv4_mobile_det"),  # OCR检测模型
 27 |     "OCR_REC_MODEL": os.getenv("OCR_REC_MODEL", "PP-OCRv4_mobile_rec"),  # OCR识别模型
 28 |     "LAYOUT_MODEL": os.getenv("LAYOUT_MODEL", "PP-DocLayout-L"),  # 版面分析模型 (更新为PP-DocLayout-L)
 29 | 
 30 |     # 多模态模型
 31 |     "VLM_MODEL": os.getenv("VLM_MODEL", "Qwen/Qwen2.5-VL-72B-Instruct"),  # 多模态模型
 32 | 
 33 |     # 处理配置
 34 |     "DEFAULT_DPI": int(os.getenv("DEFAULT_DPI", "300")),  # 默认DPI
 35 |     "THRESHOLD_LEFT_RIGHT": float(os.getenv("THRESHOLD_LEFT_RIGHT", "0.9")),  # 左右栏阈值
 36 |     "THRESHOLD_CROSS": float(os.getenv("THRESHOLD_CROSS", "0.3")),  # 跨栏阈值
 37 | }
 38 | 
 39 | # 运行时配置(可覆盖默认配置)
 40 | _runtime_config = {}
 41 | 
 42 | def get_config() -> Dict[str, Any]:
 43 |     """
 44 |     获取当前配置(默认配置+运行时配置)
 45 |     
 46 |     Returns:
 47 |         Dict: 合并后的配置字典
 48 |     """
 49 |     config = DEFAULT_CONFIG.copy()
 50 |     config.update(_runtime_config)
 51 |     return config
 52 | 
 53 | def set_config(key: str, value: Any) -> None:
 54 |     """
 55 |     设置运行时配置
 56 |     
 57 |     Args:
 58 |         key: 配置键名
 59 |         value: 配置值
 60 |     """
 61 |     _runtime_config[key] = value
 62 | 
 63 | def update_config(config_dict: Dict[str, Any]) -> None:
 64 |     """
 65 |     批量更新运行时配置
 66 |     
 67 |     Args:
 68 |         config_dict: 配置字典
 69 |     """
 70 |     _runtime_config.update(config_dict)
 71 | 
 72 | def get_api_key() -> str:
 73 |     """获取API密钥"""
 74 |     return get_config()["API_KEY"]
 75 | 
 76 | def get_base_url() -> str:
 77 |     """获取API基础URL"""
 78 |     return get_config()["BASE_URL"]
 79 | 
 80 | def get_model_config(model_type: str) -> str:
 81 |     """
 82 |     获取特定类型的模型配置
 83 |     
 84 |     Args:
 85 |         model_type: 模型类型，如'formula', 'ocr_det', 'ocr_rec', 'layout'
 86 |         
 87 |     Returns:
 88 |         str: 模型名称
 89 |     """
 90 |     model_map = {
 91 |         'formula': 'FORMULA_MODEL',
 92 |         'ocr_det': 'OCR_DET_MODEL',
 93 |         'ocr_rec': 'OCR_REC_MODEL',
 94 |         'layout': 'LAYOUT_MODEL',
 95 |         'vlm': 'VLM_MODEL'
 96 |     }
 97 |     
 98 |     key = model_map.get(model_type)
 99 |     if not key:
100 |         raise ValueError(f"未知的模型类型: {model_type}")
101 |     
102 |     return get_config()[key]
103 | 


--------------------------------------------------------------------------------
/x_pdf2md/convert.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | 命令行入口点 - 处理命令行参数并调用相应功能
  5 | """
  6 | 
  7 | import argparse
  8 | import os
  9 | from pathlib import Path
 10 | from typing import Optional, List, Union
 11 | # 从process_pdf.py导入必要的依赖
 12 | 
 13 | from tqdm import tqdm
 14 | 
 15 | from x_pdf2md.config import DEFAULT_CONFIG, update_config
 16 | from x_pdf2md.image_utils.process_page import process_page_layout
 17 | from x_pdf2md.image_utils.region_image import RegionImage
 18 | from x_pdf2md.markdown_formatter import format_pdf_regions
 19 | from x_pdf2md.pdf_utils.pdf_to_image import pdf_to_images
 20 | from x_pdf2md.remote_image import default_uploader
 21 | 
 22 | 
 23 | # 保留原有的导入
 24 | 
 25 | 
 26 | 
 27 | def process_pdf_document(
 28 |     pdf_path: str,
 29 |     output_dir: str,
 30 |     start_page: int = 0,
 31 |     end_page: Optional[int] = None,
 32 |     dpi: int = 300,
 33 |     threshold_left_right: float = 0.9,
 34 |     threshold_cross: float = 0.3,
 35 | ) -> List[List[RegionImage]]: 
 36 |     """
 37 |     处理PDF文档：将PDF转换为图像，并对每页进行版面分析和区域裁剪
 38 | 
 39 |     参数:
 40 |         pdf_path: PDF文件路径
 41 |         output_dir: 输出目录路径
 42 |         start_page: 起始页码（从0开始）
 43 |         end_page: 结束页码（包含），如果为None则处理所有页面
 44 |         dpi: PDF转图像的分辨率
 45 |         threshold_left_right: 判定左右栏的阈值
 46 |         threshold_cross: 判定跨栏的阈值
 47 | 
 48 |     返回:
 49 |         List[List[RegionImage]]: 每页的RegionImage对象列表
 50 |     """
 51 |     # 创建输出目录
 52 |     pdf_name = Path(pdf_path).stem
 53 |     output_dir = os.path.abspath(output_dir)
 54 |     temp_images_dir = os.path.join(output_dir, f"{pdf_name}_images")
 55 |     os.makedirs(temp_images_dir, exist_ok=True)
 56 | 
 57 |     # 将PDF转换为图像
 58 |     print("正在将PDF转换为图像...")
 59 |     image_paths = pdf_to_images(
 60 |         pdf_path=pdf_path,
 61 |         output_dir=temp_images_dir,
 62 |         start_page=start_page,
 63 |         end_page=end_page,
 64 |         dpi=dpi,
 65 |     )
 66 | 
 67 |     # 处理每个页面的布局
 68 |     print("正在分析和裁剪页面...")
 69 |     all_page_regions = []
 70 |     for i, image_path in enumerate(tqdm(image_paths, desc="处理页面")):
 71 |         page_num = i + 1
 72 |         page_dir = os.path.join(output_dir, f"{pdf_name}_page_{page_num}")
 73 |         os.makedirs(page_dir, exist_ok=True)
 74 | 
 75 |         # 处理页面布局并获取区域信息
 76 |         regions = process_page_layout(
 77 |             image_path=image_path,
 78 |             output_dir=page_dir,
 79 |             page_number=page_num,
 80 |             threshold_left_right=threshold_left_right,
 81 |             threshold_cross=threshold_cross,
 82 |         )
 83 | 
 84 |         all_page_regions.append(regions)
 85 | 
 86 |     return all_page_regions
 87 | 
 88 | 
 89 | def convert_pdf_to_markdown(
 90 |     pdf_path: str,
 91 |     output_dir: str = "output",
 92 |     start_page: int = 0,
 93 |     end_page: Optional[int] = None,
 94 |     dpi: int = DEFAULT_CONFIG["DEFAULT_DPI"],  # 使用配置中的默认值
 95 |     threshold_left_right: float = DEFAULT_CONFIG["THRESHOLD_LEFT_RIGHT"],  # 使用配置中的默认值
 96 |     threshold_cross: float = DEFAULT_CONFIG["THRESHOLD_CROSS"],  # 使用配置中的默认值
 97 |     upload_images: bool = False,
 98 |     output_md_path: Optional[str] = None,
 99 |     api_key: Optional[str] = None,
100 |     base_url: Optional[str] = None,  # 从config中获取，不设默认值
101 | ) -> Union[str, List[str]]:
102 |     """
103 |     将PDF文档转换为Markdown
104 |     
105 |     Args:
106 |         pdf_path: PDF文件路径
107 |         output_dir: 输出目录路径，默认为"output"
108 |         start_page: 起始页码（从0开始），默认为0
109 |         end_page: 结束页码（包含），如果为None则处理所有页面
110 |         dpi: PDF转图像的分辨率，默认为300
111 |         threshold_left_right: 判定左右栏的阈值，默认为0.9
112 |         threshold_cross: 判定跨栏的阈值，默认为0.3
113 |         upload_images: 是否上传图片，默认为False
114 |         output_md_path: Markdown输出文件路径，如果为None则不保存文件
115 |         api_key: API密钥，可选，默认从config获取
116 |         base_url: API基础URL，可选，默认从config获取
117 |         
118 |     Returns:
119 |         如果提供了output_md_path，返回保存的文件路径；否则返回Markdown内容的列表
120 |     """
121 |     # 更新配置
122 |     config_updates = {}
123 |     if api_key:
124 |         config_updates["API_KEY"] = api_key
125 |     if base_url:
126 |         config_updates["BASE_URL"] = base_url
127 |     if dpi and dpi != DEFAULT_CONFIG["DEFAULT_DPI"]:
128 |         config_updates["DEFAULT_DPI"] = dpi
129 |     if threshold_left_right is not None and threshold_left_right != DEFAULT_CONFIG["THRESHOLD_LEFT_RIGHT"]:
130 |         config_updates["THRESHOLD_LEFT_RIGHT"] = threshold_left_right
131 |     if threshold_cross is not None and threshold_cross != DEFAULT_CONFIG["THRESHOLD_CROSS"]:
132 |         config_updates["THRESHOLD_CROSS"] = threshold_cross
133 |     
134 |     if config_updates:
135 |         update_config(config_updates)
136 |     
137 |     # 处理PDF
138 |     regions = process_pdf_document(
139 |         pdf_path=pdf_path,
140 |         output_dir=output_dir,
141 |         start_page=start_page,
142 |         end_page=end_page,
143 |         dpi=dpi,
144 |         threshold_left_right=threshold_left_right,
145 |         threshold_cross=threshold_cross,
146 |     )
147 | 
148 |     # 初始化图片上传器（如果需要）
149 |     image_uploader = None
150 |     if upload_images:
151 |         image_uploader = default_uploader
152 | 
153 |     # 格式化结果，传递输出目录
154 |     formatted_pages = format_pdf_regions(regions, image_uploader, output_dir=output_dir)
155 |     
156 |     # 创建输出目录（如果需要）
157 |     if output_md_path:
158 |         output_dir = os.path.dirname(os.path.abspath(output_md_path))
159 |         if output_dir and not os.path.exists(output_dir):
160 |             os.makedirs(output_dir, exist_ok=True)
161 |             
162 |         # 保存为Markdown文件
163 |         with open(output_md_path, "w", encoding="utf-8") as f:
164 |             f.write("\n\n---\n\n".join(formatted_pages))
165 |         
166 |         # 输出处理统计
167 |         total_pages = len(regions)
168 |         total_regions = sum(len(page_regions) for page_regions in regions)
169 |         print(f"处理完成！共处理 {total_pages} 页，生成 {total_regions} 个区域图片")
170 |         print(f"Markdown文件已保存到: {output_md_path}")
171 |         
172 |         return output_md_path
173 |     
174 |     # 如果没有指定输出路径，则直接返回格式化后的内容
175 |     return formatted_pages
176 | 
177 | 
def main():
    """Command-line entry point.

    Parses the command-line arguments, stores every model name that differs
    from its default in the runtime configuration, and then calls
    ``convert_pdf_to_markdown`` with the parsed values.
    """
    parser = argparse.ArgumentParser(description="PDF文档处理工具")
    parser.add_argument("-p", "--pdf", required=True, help="输入PDF文件路径")
    parser.add_argument("-o", "--output", default="output", help="输出目录路径")
    parser.add_argument(
        "-s", "--start_page", type=int, default=0, help="起始页码（从0开始）"
    )
    parser.add_argument("-e", "--end_page", type=int, default=None, help="结束页码")
    parser.add_argument("-d", "--dpi", type=int, default=DEFAULT_CONFIG["DEFAULT_DPI"],
                        help=f"图像分辨率，默认为{DEFAULT_CONFIG['DEFAULT_DPI']}")
    parser.add_argument("--threshold_lr", type=float, default=DEFAULT_CONFIG["THRESHOLD_LEFT_RIGHT"],
                        help=f"左右栏阈值，默认为{DEFAULT_CONFIG['THRESHOLD_LEFT_RIGHT']}")
    parser.add_argument("--threshold_cross", type=float, default=DEFAULT_CONFIG["THRESHOLD_CROSS"],
                        help=f"跨栏阈值，默认为{DEFAULT_CONFIG['THRESHOLD_CROSS']}")
    # NOTE(review): filter_regions is parsed but not used anywhere in this
    # function — confirm where it is supposed to be forwarded.
    parser.add_argument(
        "--no-filter", action="store_false", dest="filter_regions", help="不过滤区域"
    )
    parser.add_argument("--upload", action="store_true", help="启用图片上传")
    parser.add_argument("--output-md", type=str, default="output.md", help="Markdown输出文件路径")

    # API and model configuration options.
    parser.add_argument("--api-key", type=str, help="API密钥")
    parser.add_argument("--base-url", type=str, default=DEFAULT_CONFIG["BASE_URL"],
                        help=f"API基础URL，默认为{DEFAULT_CONFIG['BASE_URL']}")
    parser.add_argument("--formula-model", type=str, default=DEFAULT_CONFIG["FORMULA_MODEL"],
                        help=f"公式识别模型名称，默认为{DEFAULT_CONFIG['FORMULA_MODEL']}")
    parser.add_argument("--ocr-det-model", type=str, default=DEFAULT_CONFIG["OCR_DET_MODEL"],
                        help=f"OCR检测模型名称，默认为{DEFAULT_CONFIG['OCR_DET_MODEL']}")
    parser.add_argument("--ocr-rec-model", type=str, default=DEFAULT_CONFIG["OCR_REC_MODEL"],
                        help=f"OCR识别模型名称，默认为{DEFAULT_CONFIG['OCR_REC_MODEL']}")
    parser.add_argument("--layout-model", type=str, default=DEFAULT_CONFIG["LAYOUT_MODEL"],
                        help=f"版面分析模型名称，默认为{DEFAULT_CONFIG['LAYOUT_MODEL']}")
    parser.add_argument("--vlm-model", type=str, default=DEFAULT_CONFIG["VLM_MODEL"],
                        help=f"多模态模型名称，默认为{DEFAULT_CONFIG['VLM_MODEL']}")

    args = parser.parse_args()

    # Store every model name that differs from its default. VLM_MODEL is
    # included so that --vlm-model actually takes effect; previously the
    # flag was parsed and then ignored.
    requested_models = {
        "FORMULA_MODEL": args.formula_model,
        "OCR_DET_MODEL": args.ocr_det_model,
        "OCR_REC_MODEL": args.ocr_rec_model,
        "LAYOUT_MODEL": args.layout_model,
        "VLM_MODEL": args.vlm_model,
    }
    config_updates = {
        key: value
        for key, value in requested_models.items()
        if value != DEFAULT_CONFIG[key]
    }

    if config_updates:
        update_config(config_updates)

    # Run the conversion with the parsed arguments.
    convert_pdf_to_markdown(
        pdf_path=args.pdf,
        output_dir=args.output,
        start_page=args.start_page,
        end_page=args.end_page,
        dpi=args.dpi,
        threshold_left_right=args.threshold_lr,
        threshold_cross=args.threshold_cross,
        upload_images=args.upload,
        output_md_path=args.output_md,
        api_key=args.api_key,
        base_url=args.base_url
    )
243 | 
244 | 
245 | if __name__ == "__main__":
246 |     main()
247 | 


--------------------------------------------------------------------------------
/x_pdf2md/image2md/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/li-xiu-qi/x-pdf2md/ef69a42c2b8ca9da128762fe2bda03bd909cbee0/x_pdf2md/image2md/__init__.py


--------------------------------------------------------------------------------
/x_pdf2md/image2md/car.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/li-xiu-qi/x-pdf2md/ef69a42c2b8ca9da128762fe2bda03bd909cbee0/x_pdf2md/image2md/car.png


--------------------------------------------------------------------------------
/x_pdf2md/image2md/get_image_title.py:
--------------------------------------------------------------------------------
 1 | from openai import OpenAI
 2 | from dotenv import load_dotenv
 3 | import os
 4 | 
 5 | load_dotenv()
 6 | 
 7 | SYSTEM_PROMPT = """你是一个专业图像标题生成助手。
 8 | 任务：根据提供的图像描述生成一个简短、准确且具有描述性的标题。
 9 | 
10 | 输出要求：
11 | - 标题应简洁（通常控制在5-20个字之间）
12 | - 突出图像的核心主题或最显著特征
13 | - 使用具体而非抽象的词语
14 | - 不要包含"这是"、"这张图片"等冗余词语
15 | - 学术论文或技术图像应保留专业术语的准确性
16 | - 直接输出标题文本，无需额外说明或引号
17 | 
18 | 示例：
19 | 描述：茂密森林中，阳光透过树叶洒落在地面，形成斑驳光影。远处小溪流淌，水面反射着周围绿色植被。
20 | 标题：晨光森林溪流
21 | 
22 | 描述：年轻女性在实验室使用显微镜观察样本。她穿白色实验服，戴护目镜，专注调整显微镜。旁边放着试管和实验笔记。
23 | 标题：科研人员显微观察
24 | 
25 | 描述：学术论文封面，白色背景。标题"ISAM-MTL: Cross-subject multi-task learning model with identifiable spikes and associative memory networks"位于顶部，黑色字体。下方是作者名字"Junyan Li", "Bin Hu", "Zhi-Hong Guan"。摘要部分介绍EEG信号跨主体变化性和ISAM-MTL模型。页面右下角显示DOI和版权信息。
26 | 标题：ISAM-MTL 论文封面首页
27 | """
28 | 
29 | 
30 | USER_PROMPT_TEMPLATE = """基于以下图像描述，提供一个简洁、专业的标题：
31 | ----
32 | 描述：{description}
33 | ----
34 | 直接输出标题（5-15字）："""
35 | 
36 | 
def get_image_title(image_description, api_key=None, base_url=None):
    """Generate a short title for an image from its textual description.

    The description is sent to the ``deepseek-ai/DeepSeek-V3`` chat model
    through an OpenAI-compatible endpoint, together with ``SYSTEM_PROMPT``
    and ``USER_PROMPT_TEMPLATE``.

    Args:
        image_description (str): Textual description of the image.
        api_key (str | None): API key. Falls back to the ``API_KEY``
            environment variable when not given.
        base_url (str | None): Base URL of the endpoint. Falls back to the
            ``BASE_URL`` environment variable and, if that is unset or
            empty, to ``https://api.siliconflow.com/v1``.

    Returns:
        str: The generated title with surrounding whitespace removed; an
        empty string when the response carries no text content.
    """
    if not api_key:
        api_key = os.getenv("API_KEY")
    # Honour the configured endpoint instead of always using a fixed host,
    # so the key and the endpoint come from the same configuration.
    if not base_url:
        base_url = os.getenv("BASE_URL") or "https://api.siliconflow.com/v1"

    client = OpenAI(api_key=api_key, base_url=base_url)

    response = client.chat.completions.create(
        model="deepseek-ai/DeepSeek-V3",
        messages=[
            {
                "role": "system",
                "content": SYSTEM_PROMPT,
            },
            {
                "role": "user",
                "content": USER_PROMPT_TEMPLATE.format(description=image_description),
            },
        ],
    )

    # The message content may be None; treat that as an empty title rather
    # than failing on .strip().
    content = response.choices[0].message.content
    return (content or "").strip()
72 | 
73 | 
74 | if __name__ == "__main__":
75 | 
76 |     image_description = """
77 |     这张图片显示了一篇学术论文的封面。
78 |     封面的背景是白色的，标题
79 |     "ISAM-MTL: Cross-subject multi-task learning model with identifiable spikes and associative memory networks"
80 |     位于页面的顶部，使用了黑色的字体。
81 |     标题下方是作者的名字，分别是"Junyan Li", "Bin Hu", 和"Zhi-Hong Guan"。再往下是摘要部分，使用了较小的字体。
82 |     摘要的标题是"Abstract"，内容是关于EEG（脑电图）信号的跨主体变化性，
83 |     以及一种新的模型"ISAM-MTL"（Identifiable Spikes and Associative Memory Multi-Task Learning）的介绍。
84 |     摘要的最后是"Introduction"部分的开头，介绍了脑机接 口（BCI）系统和EEG信号的相关背景。
85 |     页面的右下角显示了论文的引用信息，包括DOI（数字对象标识符）和版权信息。
86 |     整体构图简洁明了，信息层次分明。
87 |     """
88 |     title = get_image_title(image_description)
89 |     print(title)
90 | 


--------------------------------------------------------------------------------
/x_pdf2md/image2md/image2text.py:
--------------------------------------------------------------------------------
  1 | """
  2 | #### 使用说明：
  3 | 
  4 | 1. 初始化 `ImageTextExtractor` 实例时可以传入 `api_key`、`base_url`、`prompt` 或 `prompt_path`。
  5 | 2. 使用 `extract_image_text` 方法可以提取图像中的文本并转换为 Markdown 格式。
  6 | 
  7 | #### 主要功能：
  8 | - 初始化时可以从环境变量读取 API 密钥，或者手动传入。
  9 | - 提供了从文件读取自定义提示文本的功能。
 10 | - 支持提取图像 URL 或本地图像文件路径中的文本。
 11 | - 将提取的文本转换为 Markdown 格式，包括数学公式的格式化。
 12 | - 支持图像 URL 或 Base64 编码图像的解析。
 13 | - 提供多种模型和生成文本的细节级别设置。
 14 | 
 15 | #### 参数说明：
 16 | 
 17 | - **`ImageTextExtractor.__init__`**：
 18 |   - `api_key` (str): API 密钥，默认从环境变量读取。
 19 |   - `base_url` (str): API 基础 URL，默认值为 "https://api.siliconflow.cn/v1"。
 20 |   - `prompt` (str | None): 提示文本，优先使用传入的值。
 21 |   - `prompt_path` (str | None): 提示文本文件路径，读取指定文件中的内容作为提示文本。
 22 | 
 23 | - **`ImageTextExtractor._read_prompt`**：
 24 |   - `prompt_path` (str): 提示文本文件路径。
 25 |   - 返回值 (str): 读取的提示文本内容。
 26 | 
 27 | - **`ImageTextExtractor.extract_image_text`**：
 28 |   - `image_url` (str | None): 图像的 URL 地址。
 29 |   - `local_image_path` (str | None): 本地图像文件路径。
  - `model` (str | None): 使用的模型名称，默认 None，此时使用 `get_model_config('vlm')` 返回的模型。
 31 |   - `detail` (str): 细节级别，允许值为 'low', 'high', 'auto'，默认 "low"。
 32 |   - `prompt` (str | None): 提示文本，优先使用传入的值。
 33 |   - `temperature` (float): 生成文本的温度参数，默认 0.1。
  - `top_p`: 当前 `extract_image_text` 的签名不接受该参数，仅支持 `temperature`。
 35 |   - 返回值 (str): 提取的 Markdown 格式文本。
 36 | 
 37 | - **`ImageTextExtractor._is_base64`**：
 38 |   - `s` (str): 待检查的字符串。
 39 |   - 返回值 (bool): 如果是 Base64 编码则返回 True，否则返回 False。
 40 | 
 41 | - **`ImageTextExtractor._get_image_extension`**：
 42 |   - `file_path` (str): 图像文件路径。
 43 |   - 返回值 (str): 图像文件的扩展名。
 44 | 
 45 | #### 注意事项：
 46 | - `api_key` 是必须的，可以通过环境变量或初始化时传入。
 47 | - 需要安装 `PIL` 库来获取图像的扩展名。
 48 | - 图像文件必须是有效的图像格式，如 PNG、JPG 或 TIFF。
 49 | - 如果使用 Base64 编码的图像，确保传入的字符串是有效的 Base64 编码。
 50 | 
 51 | #### 更多信息：
 52 | - 该类依赖于 OpenAI 的 API 服务以及环境变量中的 API 密钥。
 53 | - 提取的 Markdown 格式文本会保留图像中的结构和公式，适用于文档集成。
 54 | 
 55 | """
 56 | from x_pdf2md.config import get_model_config
 57 | 
 58 | _prompt = """
 59 | 你是一个可以识别图片的AI，你可以基于图片与用户进行友好的对话。
 60 | """
 61 | 
 62 | from openai import OpenAI
 63 | from dotenv import load_dotenv
 64 | import os
 65 | import base64
 66 | 
 67 | 
 68 | def extract_markdown_content(text: str) -> str:
 69 |     """
 70 |     从文本中提取Markdown内容，自动去除markdown和html代码块标记。
 71 | 
 72 |     参数:
 73 |     text (str): 输入文本。
 74 | 
 75 |     返回:
 76 |     str: 提取的内容，如果没有找到Markdown或HTML标记，则返回原始文本。
 77 |     """
 78 |     md_start_marker = "```markdown"
 79 |     html_start_marker = "```html"
 80 |     end_marker = "```"
 81 | 
 82 |     # 处理markdown代码块
 83 |     md_start_index = text.find(md_start_marker)
 84 |     if md_start_index != -1:
 85 |         start_index = md_start_index + len(md_start_marker)
 86 |         end_index = text.find(end_marker, start_index)
 87 |         
 88 |         if end_index == -1:
 89 |             return text[start_index:].strip()
 90 |         return text[start_index:end_index].strip()
 91 |     
 92 |     # 处理html代码块
 93 |     html_start_index = text.find(html_start_marker)
 94 |     if html_start_index != -1:
 95 |         start_index = html_start_index + len(html_start_marker)
 96 |         end_index = text.find(end_marker, start_index)
 97 |         
 98 |         if end_index == -1:
 99 |             return text[start_index:].strip()
100 |         return text[start_index:end_index].strip()
101 |     
102 |     # 如果没有找到特定标记，返回原始文本
103 |     return text.strip() if text else None
104 | 
105 | 
def image_to_base64(image_path: str) -> str:
    """
    Read a file in binary mode and return its contents Base64-encoded.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The Base64 text (decoded as UTF-8) of the file's bytes.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    return base64.b64encode(raw_bytes).decode("utf-8")
119 | 
120 | 
121 | class ImageTextExtractor:
122 |     """
123 |     图像文本提取器类，用于将图像内容转换为 Markdown 格式的文本。
124 |     """
125 | 
126 |     def __init__(
127 |         self,
128 |         api_key: str = None,
129 |         base_url: str = "https://api.siliconflow.cn/v1",
130 |         prompt: str | None = None,
131 |         prompt_path: str | None = None,
132 |     ):
133 |         """
134 |         初始化 ImageTextExtractor 实例。
135 | 
136 |         :param api_key: API 密钥，如果未提供则从环境变量中读取
137 |         :param base_url: API 基础 URL
138 |         :param prompt: 提示文本
139 |         :param prompt_path: 提示文本文件路径
140 |         """
141 |         load_dotenv()
142 |         self.api_key: str = api_key or os.getenv("API_KEY")
143 | 
144 |         if not self.api_key:
145 |             raise ValueError("API key is required")
146 | 
147 |         self.client: OpenAI = OpenAI(
148 |             api_key=self.api_key,
149 |             base_url=base_url,
150 |         )
151 |         self._prompt: str = (
152 |             prompt or self._read_prompt(prompt_path)  or _prompt
153 |         )
154 | 
155 |     def _read_prompt(self, prompt_path: str) -> str:
156 |         """
157 |         从文件中读取提示文本。
158 | 
159 |         :param prompt_path: 提示文本文件路径
160 |         :return: 提示文本内容
161 |         """
162 |         if not prompt_path.endswith((".md", ".txt")):
163 |             raise ValueError("Prompt file must be a .md or .txt file")
164 |         with open(prompt_path, "r", encoding="utf-8") as f:
165 |             return f.read()
166 | 
167 |     def extract_image_text(
168 |         self,
169 |         image_url: str = None,
170 |         local_image_path: str = None,
171 |         model: str = None,
172 |         detail: str = "low",
173 |         prompt: str = None,
174 |         temperature: float = 0.1,
175 |     ) -> str:
176 |         """
177 |         提取图像中的文本并转换为 Markdown 格式。
178 | 
179 |         :param image_url: 图像的 URL
180 |         :param local_image_path: 本地图像文件路径
181 |         :param model: 使用的模型名称
182 |         :param detail: 细节级别，允许值为 'low', 'high', 'auto'
183 |         :param prompt: 提示文本
184 |         :param temperature: 生成文本的温度参数
185 |         :param top_p: 生成文本的 top_p 参数
186 |         :return: 提取的 Markdown 格式文本
187 |         """
188 |         if model is None:
189 |             model = get_model_config('vlm')
190 |         if not image_url and not local_image_path:
191 |             raise ValueError("Either image_url or local_image_path is required")
192 | 
193 |         if image_url and not (
194 |             image_url.startswith("http://")
195 |             or image_url.startswith("https://")
196 |             or self._is_base64(image_url)
197 |         ):
198 |             raise ValueError(
199 |                 "Image URL must be a valid HTTP/HTTPS URL or a Base64 encoded string"
200 |             )
201 | 
202 |         if local_image_path:
203 |             if not os.path.exists(local_image_path):
204 |                 raise FileNotFoundError(f"The file {local_image_path} does not exist.")
205 |             image_extension: str = self._get_image_extension(local_image_path)
206 |             with open(local_image_path, "rb") as image_file:
207 |                 base64_image: str = base64.b64encode(image_file.read()).decode("utf-8")
208 |                 image_url = f"data:image/{image_extension};base64,{base64_image}"
209 | 
210 |         if detail not in ["low", "high", "auto"]:
211 |             raise ValueError(
212 |                 "Invalid detail value. Allowed values are 'low', 'high', 'auto'"
213 |             )
214 | 
215 |         if detail == "auto":
216 |             detail = "low"
217 | 
218 |         prompt = prompt or self._prompt
219 | 
220 |         try:
221 |             response = self.client.chat.completions.create(
222 |                 model=model,
223 |                 messages=[
224 |                     {
225 |                         "role": "user",
226 |                         "content": [
227 |                             {
228 |                                 "type": "image_url",
229 |                                 "image_url": {"url": image_url, "detail": detail},
230 |                             },
231 |                             {"type": "text", "text": prompt},
232 |                         ],
233 |                     }
234 |                 ],
235 |                 stream=True,
236 |                 temperature=temperature,
237 |             )
238 | 
239 |             result: str = ""
240 |             for chunk in response:
241 |                 chunk_message: str = chunk.choices[0].delta.content
242 |                 result += chunk_message
243 |             return result
244 |         except Exception as e:
245 |             raise RuntimeError(f"Failed to extract text from image: {e}")
246 | 
247 |     def _is_base64(self, s: str) -> bool:
248 |         """
249 |         检查字符串是否为 Base64 编码。
250 | 
251 |         :param s: 待检查的字符串
252 |         :return: 如果是 Base64 编码则返回 True，否则返回 False
253 |         """
254 |         try:
255 |             if isinstance(s, str):
256 |                 if s.strip().startswith("data:image"):
257 |                     return True
258 |                 return base64.b64encode(base64.b64decode(s)).decode("utf-8") == s
259 |             return False
260 |         except Exception:
261 |             return False
262 | 
263 |     def _get_image_extension(self, file_path: str) -> str:
264 |         """
265 |         获取图像文件的扩展名。
266 | 
267 |         :param file_path: 图像文件路径
268 |         :return: 图像文件的扩展名
269 |         """
270 |         try:
271 |             from PIL import Image
272 | 
273 |             with Image.open(file_path) as img:
274 |                 return img.format.lower()
275 |         except Exception as e:
276 |             raise ValueError(f"Failed to determine image format: {e}")
277 | 
278 | 


--------------------------------------------------------------------------------
/x_pdf2md/image2md/prompts/description_prompt.md:
--------------------------------------------------------------------------------
 1 | # PDF图像内容描述提示
 2 | 
 3 | ## 任务
 4 | 
 5 | 使用视觉语言模型生成从PDF提取的图像内容的简洁描述。
 6 | 
 7 | ## 背景
 8 | 
 9 | - 图像来源于PDF文档
10 | - 需要清晰理解图像的主要内容和用途
11 | - 避免冗余描述，保持精简
12 | 
13 | ## 输入
14 | 
15 | - 从PDF提取的图像
16 | 
17 | ## 输出
18 | 
19 | 请简洁描述图像的以下关键方面：
20 | 
21 | 1. 图像类型（图表、示意图、照片等）
22 | 2. 主要内容/主题
23 | 3. 包含的关键信息点
24 | 4. 文本或标签（如有）
25 | 5. 图像的可能用途
26 | 
27 | 示例格式：
28 | "这是一张[图像类型]，展示了[主要内容]。包含[关键信息]。[其他相关细节]。"
29 | 


--------------------------------------------------------------------------------
/x_pdf2md/image2md/prompts/ocr_prompt.md:
--------------------------------------------------------------------------------
 1 | # OCR 图像到 Markdown 转换提示
 2 | 
 3 | ## 背景
 4 | 
你可能会接收到从 PDF 文件中裁剪下来、以图片形式存在的内容，它可能是一个标题，也可能是一个表格。
 6 | 
 7 | ## 任务
 8 | 
 9 | 将图像中的内容精确转换为格式化的 Markdown，保留原始文档的结构、布局和语义。
10 | 
11 | ## 输入
12 | 
13 | - 图像类型：文档扫描件、屏幕截图、手写内容照片
14 | - 支持格式：PNG、JPG、TIFF 等常见图像格式
15 | - 内容类型：文本段落、标题、列表、表格、数学公式、简单图表
16 | 
17 | ## 输出要求
18 | 
19 | - 完整的 Markdown 文本，使用适当的语法元素
20 | - 数学公式使用 LaTeX 语法，内联公式使用单个 `$` 分隔，独立公式使用 `$$` 分隔
21 | - 表格处理方式：
22 |   - 对于简单表格，使用标准 Markdown 表格语法
23 |   - 对于复杂表格（包含合并单元格等），使用 HTML 表格标记
24 |     - 跨列单元格使用 `<td colspan="n">内容</td>` 标记
25 |     - 跨行单元格使用 `<td rowspan="n">内容</td>` 标记
26 | - 列表保留原始层级和编号
27 | - 保持原始段落结构和文本流
28 | 
29 | ## 转换规则
30 | 
31 | 1. **文本内容**：保留原始格式，包括段落分隔、强调和标点符号
32 | 2. **数学公式**：使用 LaTeX 语法准确转录，保持数学符号和结构
33 | 3. **表格**：根据复杂程度选择 Markdown 或 HTML 表格格式
34 | 4. **列表**：保持原始缩进和编号系统
35 | 5. **标题**：使用适当级别的 Markdown 标题标记
36 | 
37 | ## 注意事项
38 | 
39 | - 确保转换内容的真实性
40 | - 只转换图像中实际存在的内容，不添加额外解释或内容
41 | 
42 | ## 输出示例
43 | 
44 | ```markdown
45 | # 文档标题
46 | 
47 | 正文内容，包含 $E=mc^2$ 内联公式。
48 | 
49 | ## 小节标题
50 | 
51 | 1. 列表项一
52 | 2. 列表项二
53 | 
54 | $$
55 | \int_{0}^{\infty} e^{-x^2} dx = \frac{\sqrt{\pi}}{2}
56 | $$
57 | 
58 | # 简单表格示例（Markdown语法）
59 | | 列 1 | 列 2 | 列 3 |
60 | |-----|-----|-----|
61 | | 数据 | 数据 | 数据 |
62 | 
63 | # 复杂表格示例（HTML语法）
64 | <table>
65 |         <tr>
66 |                 <th>标题 1</th>
67 |                 <th>标题 2</th>
68 |                 <th>标题 3</th>
69 |         </tr>
70 |         <tr>
71 |                 <td colspan="2">跨列单元格</td>
72 |                 <td>普通单元格</td>
73 |         </tr>
74 |         <tr>
75 |                 <td rowspan="2">跨行单元格</td>
76 |                 <td>数据</td>
77 |                 <td>数据</td>
78 |         </tr>
79 | </table>
80 | ```
81 | 


--------------------------------------------------------------------------------
/x_pdf2md/image2md/vlm_function.py:
--------------------------------------------------------------------------------
  1 | from .image2text import ImageTextExtractor, extract_markdown_content
  2 | import os
  3 | 
# Built-in prompt definitions. Each helper in this module falls back to the
# matching constant only when the caller does not pass a prompt file path.

# Fallback prompt for extract_text_from_image.
ocr_prompt = """
使用OCR的模式提取图像中的文本内容，并转换为Markdown格式。
注意：不要输出图片以外的内容。
其中表格输出为Markdown格式，或者html格式，公式输出为带有$或者$$风格的LaTeX格式。
"""

# Fallback prompt for describe_image.
description_prompt = """
# PDF图像内容描述提示

## 任务

使用视觉语言模型生成从PDF提取的图像内容的简洁描述。

## 背景

- 图像来源于PDF文档
- 需要清晰理解图像的主要内容和用途
- 避免冗余描述，保持精简

## 输入

- 从PDF提取的图像

## 输出

请简洁描述图像的以下关键方面：

1. 图像类型（图表、示意图、照片等）
2. 主要内容/主题
3. 包含的关键信息点
4. 文本或标签（如有）
5. 图像的可能用途

示例格式：
"这是一张[图像类型]，展示了[主要内容]。包含[关键信息]。[其他相关细节]。"
"""

# Fallback prompt for extract_table_from_image.
extract_table_prompt = """
提取图片当中的表格，并输出为支持markdown格式的html语法。
注意：不要输出图片以外的内容。
"""
 46 | 
 47 | 
 48 | def _process_image_with_model(
 49 |     image_path: str,
 50 |     model: str,
 51 |     prompt_path: str = None,
 52 |     prompt_text: str = None,
 53 |     api_key: str = None,
 54 |     detail: str = "low",
 55 |     post_process_func = None
 56 | ) -> str:
 57 |     """处理图像并返回模型输出的基础函数"""
 58 |     if api_key is None:
 59 |         api_key = os.getenv("API_KEY")
 60 |     
 61 |     extractor = ImageTextExtractor(
 62 |         api_key=api_key,
 63 |         prompt_path=prompt_path,
 64 |         prompt=prompt_text
 65 |     )
 66 | 
 67 |     try:
 68 |         result = extractor.extract_image_text(
 69 |             local_image_path=image_path, model=model, detail=detail
 70 |         )
 71 |         
 72 |         if not result.strip():
 73 |             return "No content extracted from the image"
 74 |         
 75 |         if post_process_func:
 76 |             return post_process_func(result)
 77 |         return extract_markdown_content(result)
 78 |     except Exception as e:
 79 |         return f"Error processing image: {str(e)}"
 80 | 
 81 | 
 82 | def extract_text_from_image(
 83 |     image_path: str,
 84 |     model: str = None,
 85 |     ocr_prompt_path: str = None,
 86 |     api_key: str = None,
 87 | ) -> str:
 88 |     """从图像中提取文本内容并转换为Markdown格式"""
 89 |     return _process_image_with_model(
 90 |         image_path=image_path,
 91 |         model=model,
 92 |         prompt_path=ocr_prompt_path,
 93 |         prompt_text=ocr_prompt if not ocr_prompt_path else None,
 94 |         api_key=api_key,
 95 |         detail="low"
 96 |     )
 97 | 
 98 | 
 99 | def describe_image(
100 |     image_path: str,
101 |     model: str = None,
102 |     description_prompt_path: str = None,
103 |     api_key: str = None,
104 | ) -> str:
105 |     """描述图像内容并生成文本描述"""
106 |     return _process_image_with_model(
107 |         image_path=image_path,
108 |         model=model,
109 |         prompt_path=description_prompt_path,
110 |         prompt_text=description_prompt if not description_prompt_path else None,
111 |         api_key=api_key,
112 |         detail="low"
113 |     )
114 | 
115 | 
def _is_table_delimiter_row(line: str) -> bool:
    """Return True if *line* looks like a Markdown table delimiter row.

    Accepts the common spellings ``|---|---|``, ``| --- | --- |`` and the
    aligned forms ``|:---|:---:|---:|``.
    """
    import re

    stripped = line.strip()
    if '|' not in stripped:
        return False
    cells = stripped.strip('|').split('|')
    return all(re.fullmatch(r'\s*:?-+:?\s*', cell) for cell in cells)


def process_table_content(result):
    """Post-process model output that is expected to contain a table.

    The Markdown/HTML payload is extracted with ``extract_markdown_content``.
    It is returned unchanged when it is a Markdown table (starts with ``|``
    and has a delimiter row) or an HTML table (``<table ...>`` through
    ``</table>``). Anything else is wrapped in a plain code fence.

    Args:
        result: Raw text returned by the model.

    Returns:
        The table markup, or the fenced payload when no table is recognised.
    """
    import re

    # extract_markdown_content may return None for empty input; normalise so
    # the string checks below cannot fail.
    table_content = extract_markdown_content(result) or ""
    lowered = table_content.lower()

    # '|---' is the original check; the delimiter-row scan additionally
    # accepts rows written with spaces or alignment colons.
    is_markdown_table = table_content.startswith('|') and (
        '|---' in table_content
        or any(_is_table_delimiter_row(line) for line in table_content.splitlines())
    )
    # Allow attributes on the opening tag, e.g. <table border="1">.
    is_html_table = (
        re.search(r'<table[\s>]', lowered) is not None and '</table>' in lowered
    )

    if is_markdown_table or is_html_table:
        return table_content
    return f"```\n{table_content}\n```"
126 | 
127 | 
def extract_table_from_image(
    image_path: str,
    model: str = None,
    extract_table_prompt_path: str = None,
    api_key: str = None,
) -> str:
    """Extract a table from an image as Markdown or HTML, using high detail."""
    # The built-in prompt is only used when no prompt file was supplied.
    inline_prompt = None if extract_table_prompt_path else extract_table_prompt
    return _process_image_with_model(
        image_path=image_path,
        model=model,
        api_key=api_key,
        prompt_path=extract_table_prompt_path,
        prompt_text=inline_prompt,
        detail="high",
        post_process_func=process_table_content,
    )
144 | 
145 | 
# Manual smoke test: run the three helpers against the bundled sample image.
if __name__ == "__main__":
    import sys
    from pathlib import Path

    current_dir = Path(__file__).parent
    test_image_path = current_dir / "car.png"

    if not test_image_path.exists():
        print(f"测试图像文件不存在: {test_image_path}")
        sys.exit(1)

    print(f"正在处理图像: {test_image_path}")

    separator = "=" * 50

    # (heading, helper, name of its prompt-path keyword, prompt file)
    demo_steps = [
        (
            "1. 提取的文本内容:",
            extract_text_from_image,
            "ocr_prompt_path",
            current_dir / "prompts/ocr_prompt.md",
        ),
        (
            "2. 图像描述:",
            describe_image,
            "description_prompt_path",
            current_dir / "prompts/description_prompt.md",
        ),
        (
            "3. 提取的表格内容:",
            extract_table_from_image,
            "extract_table_prompt_path",
            current_dir / "prompts/extract_table_prompt.md",
        ),
    ]

    for heading, run_step, path_keyword, prompt_file in demo_steps:
        print("\n" + separator)
        print(heading)
        print(separator)
        # Pass the prompt file only if it exists; otherwise the helper uses
        # its built-in prompt.
        prompt_argument = str(prompt_file) if prompt_file.exists() else None
        print(run_step(str(test_image_path), **{path_keyword: prompt_argument}))

    print(separator)
198 | 


--------------------------------------------------------------------------------
/x_pdf2md/image_utils/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | 图像处理相关工具
3 | """
4 | 


--------------------------------------------------------------------------------
/x_pdf2md/image_utils/crop_text_areas.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python  
  2 | # -*- coding: utf-8 -*-
  3 | # author：筱可
  4 | # 2025-03-10
  5 | """
  6 | 使用说明：
  7 | 1. 将图像文件和对应的JSON检测结果文件放在指定目录
  8 | 2. 设置相应的输入输出路径
  9 | 3. 运行脚本即可获得裁剪后的文本区域图像
 10 | 
 11 | 主要功能：
 12 | 1. 读取原始图像和文本检测结果JSON文件
 13 | 2. 支持矩形和多边形两种裁剪方式
 14 | 3. 将检测到的文本区域裁剪并保存
 15 | 
 16 | 参数说明：
 17 | TextAreaCropper类方法：
 18 | - crop_text_areas: 处理图像和JSON检测结果，裁剪文本区域
 19 | 返回值：无
 20 | 
 21 | 注意事项：
 22 | 1. 依赖库：opencv-python, numpy
2. JSON文件需包含boxes字段，其中每个元素包含coordinate（[x1, y1, x2, y2]）、label和score字段
 24 | 3. 确保具有目录的写入权限
 25 | """
 26 | 
 27 | import os
 28 | import json
 29 | import cv2
 30 | import numpy as np
 31 | from typing import List, Dict, Any, Optional
 32 | from abc import ABC, abstractmethod
 33 | 
 34 | class TextCropper(ABC):
 35 |     """文本区域裁剪抽象基类"""
 36 |     
 37 |     @abstractmethod
 38 |     def crop(self, image: np.ndarray, polygon: np.ndarray) -> np.ndarray:
 39 |         """裁剪图像中的文本区域
 40 |         
 41 |         Args:
 42 |             image: 原始图像
 43 |             polygon: 文本区域多边形坐标
 44 |             
 45 |         Returns:
 46 |             裁剪后的图像区域
 47 |         """
 48 |         pass
 49 | 
 50 | class RectCropper(TextCropper):
 51 |     """矩形裁剪实现类 - 简单直接的矩形裁剪"""
 52 |     
 53 |     def crop(self, image: np.ndarray, polygon: np.ndarray) -> np.ndarray:
 54 |         # 计算外接矩形
 55 |         x, y, w, h = cv2.boundingRect(polygon)
 56 |         
 57 |         # 检查裁剪区域是否有效
 58 |         if w <= 0 or h <= 0:
 59 |             print(f"警告：无效的裁剪区域 x={x}, y={y}, w={w}, h={h}")
 60 |             # 返回一个小的空白图像，保持与原图像相同的通道数
 61 |             channels = 3 if len(image.shape) == 3 else 1
 62 |             return np.zeros((1, 1, channels), dtype=np.uint8)
 63 |         
 64 |         # 简单的矩形裁剪
 65 |         if len(image.shape) == 3:  # RGB/BGR图像
 66 |             cropped = image[y:y+h, x:x+w].copy()
 67 |         else:  # 灰度图像
 68 |             cropped = image[y:y+h, x:x+w].copy()
 69 |             
 70 |         return cropped
 71 | 
 72 | # 保留原有的PolyCropper类，但不再使用它
 73 | class PolyCropper(TextCropper):
 74 |     """多边形裁剪实现类 - 支持透明背景"""
 75 |     
 76 |     def crop(self, image: np.ndarray, polygon: np.ndarray) -> np.ndarray:
 77 |         # 计算外接矩形
 78 |         x, y, w, h = cv2.boundingRect(polygon)
 79 |         
 80 |         # 检查裁剪区域是否有效
 81 |         if w <= 0 or h <= 0:
 82 |             print(f"警告：无效的裁剪区域 x={x}, y={y}, w={w}, h={h}")
 83 |             # 返回一个小的空白图像，保持与原图像相同的通道数
 84 |             return np.zeros((1, 1, image.shape[2]), dtype=np.uint8)
 85 |         
 86 |         # 调整多边形坐标为相对于裁剪区域的坐标
 87 |         shifted_polygon = polygon - np.array([x, y])
 88 |         
 89 |         # 创建透明背景的图像(BGRA)
 90 |         cropped = np.zeros((h, w, 4), dtype=np.uint8)
 91 |         
 92 |         # 将原始图像复制到透明图像的BGR通道
 93 |         if len(image.shape) == 3:  # BGR图像
 94 |             cropped[:, :, 0:3] = image[y:y+h, x:x+w]
 95 |         else:  # 灰度图像
 96 |             for i in range(3):
 97 |                 cropped[:, :, i] = image[y:y+h, x:x+w]
 98 |         
 99 |         # 创建alpha通道掩码
100 |         mask = np.zeros((h, w), dtype=np.uint8)
101 |         cv2.fillPoly(mask, [shifted_polygon], 255)
102 |         
103 |         # 将掩码应用到alpha通道
104 |         cropped[:, :, 3] = mask
105 |         
106 |         return cropped
107 | 
class TextAreaCropper:
    """Crops every detected region of an image and saves each to its own file."""

    def __init__(self, cropper: TextCropper = None):
        """Initialise the processor.

        Args:
            cropper: Cropping strategy; defaults to RectCropper.
        """
        self.cropper = cropper if cropper is not None else RectCropper()

    @staticmethod
    def _flatten_alpha(cropped: np.ndarray, bg_color: tuple) -> np.ndarray:
        """Blend a 4-channel crop onto a solid background, dropping alpha."""
        background = np.ones((cropped.shape[0], cropped.shape[1], 3), dtype=np.uint8)
        background[:] = bg_color

        # Alpha channel scaled to 0..1, broadcast over the colour planes.
        alpha = (cropped[:, :, 3] / 255.0)[:, :, np.newaxis]
        foreground = cropped[:, :, :3]
        return cv2.convertScaleAbs(foreground * alpha + background * (1 - alpha))

    def crop_text_areas(self, image_path: str, json_path: str, output_dir: str, output_format: str = 'png', bg_color: tuple = (255, 255, 255)) -> None:
        """Crop the regions listed in a detection JSON file and save them.

        The JSON is read for a ``boxes`` list. Each box contributes its
        ``coordinate`` ([x1, y1, x2, y2]), ``label`` and ``score``. Boxes
        without coordinates are skipped. Files are named
        ``{index}_{label}_{score:.4f}.{format}``.

        Args:
            image_path: Path of the source image.
            json_path: Path of the detection-result JSON file.
            output_dir: Directory for the crops (created if missing).
            output_format: Output image extension, 'png' by default.
            bg_color: Background colour used to flatten crops that have an
                alpha channel when the output format is not PNG.

        Returns:
            None. Results are written to ``output_dir``; an unreadable image
            is reported with a message and the method returns early.
        """
        os.makedirs(output_dir, exist_ok=True)

        image = cv2.imread(image_path)
        if image is None:
            print(f"无法读取图像: {image_path}")
            return

        with open(json_path, 'r', encoding='utf-8') as f:
            result = json.load(f)

        boxes = result.get('boxes', [])
        extension = output_format.lower()

        for i, box in enumerate(boxes):
            coords = box.get('coordinate', [])
            if not coords:
                continue

            # Axis-aligned box turned into a 4-point polygon. The int32 cast
            # truncates fractional coordinates.
            x1, y1, x2, y2 = map(float, coords)
            poly = np.array([[x1, y1], [x2, y1], [x2, y2], [x1, y2]], np.int32)

            cropped = self.cropper.crop(image, poly)

            label = box.get('label', 'unknown')
            score = box.get('score', 0)

            # The index prefix keeps file names unique.
            output_filename = f"{i}_{label}_{score:.4f}.{extension}"
            output_path = os.path.join(output_dir, output_filename)

            # A non-PNG output of a crop that carries an alpha channel is
            # flattened onto the background colour before writing.
            if extension != 'png' and len(cropped.shape) == 3 and cropped.shape[2] == 4:
                cropped = self._flatten_alpha(cropped, bg_color)

            # cv2.imwrite returns False on failure; report it instead of
            # announcing a file that was never written.
            if not cv2.imwrite(output_path, cropped):
                print(f"保存失败{label}区域 {i+1}: {output_path}")
                continue

            print(f"已保存{label}区域 {i+1}: {output_path}")
190 | 
if __name__ == "__main__":
    # Example run: crop every detected region of the sample page with the
    # rectangular strategy and save the crops as PNG files.
    demo_cropper = TextAreaCropper(RectCropper())
    demo_cropper.crop_text_areas(
        "page_layout.png",
        "./4_output/res.json",
        "./test_cropped_output",
        output_format='png',
    )
202 | 


--------------------------------------------------------------------------------
/x_pdf2md/image_utils/detect_and_sort.py:
--------------------------------------------------------------------------------
 1 | from typing import Dict, List
 2 | import cv2
 3 | 
 4 | from image_utils.layout_detect import detect_layout
 5 | from image_utils.layout_sorter import LayoutSorter
 6 | 
 7 | 
 8 | def detect_and_sort_layout(image_path: str,
 9 |                           output_path: str = "./layout_output/layout_detection.json",
10 |                           threshold_left_right: float = 0.9,
11 |                           threshold_cross: float = 0.3) -> List[Dict]:
12 |     """
13 |     检测图片版面并对检测结果进行排序
14 |     
15 |     Args:
16 |         image_path: 输入图片路径
17 |         output_path: 布局检测结果保存路径
18 |         threshold_left_right: 判定元素属于左/右栏的阈值
19 |         threshold_cross: 判定元素跨栏的阈值
20 |         
21 |     Returns:
22 |         排序后的版面元素列表
23 |     """
24 |     # 读取图片获取宽度
25 |     image = cv2.imread(image_path)
26 |     page_width = image.shape[1]
27 |     
28 |     # 检测版面
29 |     layout_result = detect_layout(image_path, output_path)
30 |     
31 |     # 创建排序器并排序
32 |     sorter = LayoutSorter(threshold_left_right, threshold_cross)
33 |     sorted_elements = sorter.sort_layout(layout_result, page_width)
34 |     
35 |     return sorted_elements
36 | 
if __name__ == "__main__":
    # Usage example: detect, sort, then draw the result.
    sample_image = "formula_inline.png"
    ordered_boxes = detect_and_sort_layout(sample_image)
    print(f"检测到 {len(ordered_boxes)} 个已排序的版面元素")

    # Visualise the sorted boxes on top of the input image.
    from image_utils.layout_visualizer import LayoutVisualizer
    LayoutVisualizer().save_visualization(
        image_path=sample_image,
        boxes=ordered_boxes,
        output_path="output/visualization_output.png",
    )
51 | 


--------------------------------------------------------------------------------
/x_pdf2md/image_utils/formula_recognize.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- coding: utf-8 -*-
 3 | """
 4 | 公式识别模块 - 从图像中识别数学公式并转换为LaTeX格式
 5 | """
 6 | 
 7 | import json
 8 | import os
 9 | from typing import Optional, Dict, Any
10 | 
11 | from x_pdf2md.image_utils.models import get_or_create_model
12 | 
13 | 
14 | 
def recognize_formula(input_path: str, output_path: Optional[str] = None) -> str:
    """
    Recognise the math formula in an image.

    The model result is saved as JSON (to ``output_path``, or to
    ``./UniMERNet_output//res.json`` by default) and the ``rec_formula``
    field is read back from that file.

    Args:
        input_path: Path of the input image.
        output_path: Optional path for the JSON result file.

    Returns:
        str: The recognised formula text, or "" when nothing was recognised.

    Raises:
        Exception: If the formula model could not be loaded.
    """
    print(f"处理公式图片: {input_path}")

    # Make sure the output directory exists. dirname() is "" for a bare
    # file name, which os.makedirs rejects, so use the current directory.
    if output_path is None:
        output_dir = "./UniMERNet_output/"
    else:
        output_dir = os.path.dirname(output_path) or "."
    os.makedirs(output_dir, exist_ok=True)

    # Get or create the model.
    model = get_or_create_model('formula')
    if model is None:
        raise Exception("模型加载失败")

    output = model.predict(input=input_path, batch_size=1)

    # Stays empty when the model yields no result, so the lookup below is
    # always defined.
    results = {}
    res_path = output_path or f"{output_dir}/res.json"
    for res in output:
        res.save_to_json(save_path=res_path)

        # Read the result back from the JSON file that was just written.
        with open(res_path, 'r') as f:
            results = json.load(f)

    # Normalise a missing or empty formula to "".
    return results.get("rec_formula", "") or ""
56 | 
57 | 
58 | 
59 | 
if __name__ == "__main__":
    # Quick manual check of formula recognition on the sample image.
    print(f"识别结果: {recognize_formula('image.png')}")
65 | 


--------------------------------------------------------------------------------
/x_pdf2md/image_utils/image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/li-xiu-qi/x-pdf2md/ef69a42c2b8ca9da128762fe2bda03bd909cbee0/x_pdf2md/image_utils/image.png


--------------------------------------------------------------------------------
/x_pdf2md/image_utils/layout_config.py:
--------------------------------------------------------------------------------
 1 | from typing import Dict, List, Tuple
 2 | 
 3 | 
class LayoutConfig:
    # Static configuration for layout detection: per-label drawing colours,
    # the labels to process or filter, and the known label-id mapping.

    # Visualisation colours (the original author's note says RGB order).
    COLORS: Dict[str, Tuple[int, int, int]] = {
        # Titles
        'doc_title': (255, 0, 128),  # magenta
        'paragraph_title': (255, 0, 0),  # red
        'figure_title': (128, 0, 255),  # purple
        'table_title': (255, 140, 0),  # dark orange
        'chart_title': (0, 215, 255),  # light cyan

        # Body text
        'text': (0, 255, 0),  # green
        'abstract': (0, 255, 191),  # turquoise
        'aside_text': (152, 251, 152),  # light green
        'footnote': (144, 238, 144),  # pale green

        # Figures, charts and tables
        'image': (0, 0, 255),  # blue
        'chart': (0, 255, 255),  # cyan
        'table': (255, 165, 0),  # orange

        # Formulas and numbers
        'formula': (255, 255, 0),  # yellow
        'formula_number': (255, 215, 0),  # gold
        'number': (218, 165, 32),  # goldenrod

        # Headers and footers
        'header': (169, 169, 169),  # dark gray
        'footer': (192, 192, 192),  # light gray
    }
    DEFAULT_COLOR: Tuple[int, int, int] = (128, 128, 128)  # gray, for labels without a colour entry

    # Labels that need further processing (whitelist)
    PROCESS_LABELS: List[str] = [
        'paragraph_title', 'image', 'text',
        'abstract', 'figure_title', 'formula',
        'table_title', 'doc_title', 'table',
        'chart', 'chart_title', 'formula_number'
    ]

    # Labels to be filtered out (blacklist)
    FILTER_LABELS: List[str] = [
        'footnote', 'header', 'footer',
        'aside_text', 'number'
    ]

    # All known labels and their ids (used to detect unknown labels).
    # NOTE(review): 'reference' and 'header_image' are listed here but appear
    # in neither PROCESS_LABELS nor FILTER_LABELS, and have no COLORS entry;
    # confirm whether that is intentional.
    KNOWN_LABELS: Dict[int, str] = {
        0: 'paragraph_title',
        1: 'image',
        2: 'text',
        3: 'number',
        4: 'abstract',
        6: 'figure_title',
        7: 'formula',
        8: 'table',
        9: 'table_title',
        10: 'reference',
        11: 'doc_title',
        12: 'footnote',
        13: 'header',
        15: 'footer',
        17: 'chart_title',
        18: 'chart',
        19: 'formula_number',
        20: 'header_image',
        22: 'aside_text',
    }
72 | 


--------------------------------------------------------------------------------
/x_pdf2md/image_utils/layout_detect.py:
--------------------------------------------------------------------------------
  1 | """
  2 | 布局处理模块
  3 | 负责文档版面分析和处理
  4 | """
  5 | 
  6 | import cv2
  7 | import json
  8 | import os
  9 | import time
 10 | from typing import Dict, List, Any
 11 | from paddlex import create_model
 12 | 
 13 | from image_utils.layout_config import LayoutConfig
 14 | from x_pdf2md.config import get_model_config
 15 | 
 16 | 
 17 | def is_box_inside(box1: List[float], box2: List[float]) -> bool:
 18 |     """
 19 |     判断box1是否在box2内部（如果box1有80%以上区域被box2包含，则视为被包含）
 20 |     
 21 |     参数:
 22 |         box1: [x1, y1, x2, y2] 格式的框坐标
 23 |         box2: [x1, y1, x2, y2] 格式的框坐标
 24 |     
 25 |     返回:
 26 |         bool: 如果box1有80%以上区域被box2包含，返回True，否则返回False
 27 |     """
 28 |     # 计算box1的面积
 29 |     area_box1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
 30 |     
 31 |     # 计算交集的坐标
 32 |     intersect_x1 = max(box1[0], box2[0])
 33 |     intersect_y1 = max(box1[1], box2[1])
 34 |     intersect_x2 = min(box1[2], box2[2])
 35 |     intersect_y2 = min(box1[3], box2[3])
 36 |     
 37 |     # 如果没有交集，直接返回False
 38 |     if intersect_x1 >= intersect_x2 or intersect_y1 >= intersect_y2:
 39 |         return False
 40 |     
 41 |     # 计算交集的面积
 42 |     intersection_area = (intersect_x2 - intersect_x1) * (intersect_y2 - intersect_y1)
 43 |     
 44 |     # 计算交集面积占box1面积的比例
 45 |     overlap_ratio = intersection_area / area_box1
 46 |     
 47 |     # 如果交集面积占box1面积的比例大于等于0.8，则视为被包含
 48 |     return overlap_ratio >= 0.8
 49 | 
 50 | def build_box_hierarchy(boxes: List[Dict]) -> List[Dict]:
 51 |     """
 52 |     为每个框添加包含关系，并移除嵌套在其他框内部的框
 53 |     
 54 |     参数:
 55 |         boxes: 包含框信息的字典列表，每个字典包含coordinate字段
 56 |         
 57 |     返回:
 58 |         List[Dict]: 添加了包含关系的框列表
 59 |     """
 60 |     n = len(boxes)
 61 |     is_nested = [False] * n
 62 |     
 63 |     # 为每个框添加contains属性
 64 |     for i in range(n):
 65 |         boxes[i]["contains"] = []
 66 |     
 67 |     # 检查每个框是否被其他框包含
 68 |     for i in range(n):
 69 |         box1 = boxes[i]["coordinate"]
 70 |         for j in range(n):
 71 |             if i != j:
 72 |                 box2 = boxes[j]["coordinate"]
 73 |                 if is_box_inside(box1, box2):
 74 |                     is_nested[i] = True
 75 |                     # 将被包含的框添加到外部框的contains列表中
 76 |                     boxes[j]["contains"].append(boxes[i])
 77 |                     break
 78 |     
 79 |     # 只保留不是嵌套框的框
 80 |     result = []
 81 |     for i in range(n):
 82 |         if not is_nested[i]:
 83 |             result.append(boxes[i])
 84 |             
 85 |     return result
 86 | 
 87 | def calculate_boundary_distance(box1: List[float], box2: List[float]) -> float:
 88 |     """
 89 |     计算两个框之间的最小边界距离
 90 |     
 91 |     参数:
 92 |         box1: [x1, y1, x2, y2] 格式的框坐标
 93 |         box2: [x1, y1, x2, y2] 格式的框坐标
 94 |     
 95 |     返回:
 96 |         float: 两个框之间的最小距离，如果重叠则为0
 97 |     """
 98 |     # 计算水平方向上的距离
 99 |     if box1[0] > box2[2]:  # box1在box2右侧
100 |         horizontal_dist = box1[0] - box2[2]
101 |     elif box2[0] > box1[2]:  # box2在box1右侧
102 |         horizontal_dist = box2[0] - box1[2]
103 |     else:  # 水平方向上有重叠
104 |         horizontal_dist = 0
105 |     
106 |     # 计算垂直方向上的距离
107 |     if box1[1] > box2[3]:  # box1在box2下方
108 |         vertical_dist = box1[1] - box2[3]
109 |     elif box2[1] > box1[3]:  # box2在box1下方
110 |         vertical_dist = box2[1] - box1[3]
111 |     else:  # 垂直方向上有重叠
112 |         vertical_dist = 0
113 |     
114 |     # 计算欧几里得距离
115 |     return (horizontal_dist ** 2 + vertical_dist ** 2) ** 0.5
116 | 
def merge_formula_numbers(boxes: List[Dict]) -> List[Dict]:
    """
    Merge each "formula_number" box into one "formula" box.

    For every formula number the target formula is chosen in this order:
    1. among formulas whose vertical center lies within twice the number's
       height of the number's vertical center, those ending at or left of the
       number (up to 20% of the number's width may overlap); the one whose
       right edge is closest to the number's left edge wins;
    2. otherwise the vertically aligned formula with the closest horizontal
       center;
    3. otherwise the formula with the smallest boundary distance.

    The chosen formula's coordinate becomes the union of both boxes and the
    number is recorded in its ``formula_numbers`` list.

    Args:
        boxes: Box dicts with ``label`` and ``coordinate`` entries.

    Returns:
        List[Dict]: Shallow copies of all other boxes (original order)
        followed by the formula boxes; formula-number boxes are dropped. If
        there is no formula or no formula number, a shallow copy of the input
        list is returned instead. The input list and its dicts are not
        modified.
    """
    formulas = [b for b in boxes if b.get("label") == "formula"]
    numbers = [b for b in boxes if b.get("label") == "formula_number"]

    if not (numbers and formulas):
        return boxes.copy()

    def _nearest(candidates, measure):
        # First candidate with the strictly smallest measure(coordinate).
        best, best_distance = None, float('inf')
        for candidate in candidates:
            current = measure(candidate["coordinate"])
            if current < best_distance:
                best, best_distance = candidate, current
        return best

    # Everything that is neither a formula nor a formula number is passed
    # through as a shallow copy.
    merged = [
        b.copy() for b in boxes
        if b.get("label") not in ("formula", "formula_number")
    ]

    # Work on copies of the formula boxes so the caller's dicts stay untouched.
    formula_copies = []
    for source in formulas:
        duplicate = source.copy()
        duplicate["formula_numbers"] = []
        formula_copies.append(duplicate)

    for number in numbers:
        num_coord = number["coordinate"]
        num_left = num_coord[0]
        num_mid_y = (num_coord[1] + num_coord[3]) / 2
        # Allowed vertical offset between centers: twice the number's height.
        tolerance = (num_coord[3] - num_coord[1]) * 2

        aligned = [
            f for f in formula_copies
            if abs((f["coordinate"][1] + f["coordinate"][3]) / 2 - num_mid_y) <= tolerance
        ]

        # A formula counts as "left of the number" when its right edge does
        # not reach further than 20% into the number box.
        overlap_limit = num_left + (num_coord[2] - num_left) * 0.2
        on_left = [f for f in aligned if f["coordinate"][2] <= overlap_limit]

        if on_left:
            target = _nearest(on_left, lambda c: num_left - c[2])
        elif aligned:
            num_mid_x = (num_coord[0] + num_coord[2]) / 2
            target = _nearest(aligned, lambda c: abs((c[0] + c[2]) / 2 - num_mid_x))
        else:
            target = _nearest(
                formula_copies,
                lambda c: calculate_boundary_distance(c, num_coord),
            )

        if target:
            current = target["coordinate"]
            # Grow the formula box to the union of formula and number.
            target["coordinate"] = [
                min(current[0], num_coord[0]),
                min(current[1], num_coord[1]),
                max(current[2], num_coord[2]),
                max(current[3], num_coord[3])
            ]
            target["formula_numbers"].append({
                "coordinate": num_coord,
                "score": number.get("score", 0),
                "text": number.get("text", "")
            })

    merged.extend(formula_copies)
    return merged
247 | 
def detect_layout(image_path: str, output_path: str = "./layout_output/layout_detection.json", model_name="PP-DocLayout-L") -> Dict:
    """
    Run layout detection on an image and post-process the detected boxes.

    The raw model result is saved as JSON, read back, and then:
    boxes whose label is in ``LayoutConfig.FILTER_LABELS`` are dropped,
    formula numbers are merged into formulas (``merge_formula_numbers``) and
    nested boxes are attached to their container (``build_box_hierarchy``).
    The processed result is written next to the raw JSON with a ``_final``
    suffix.

    Args:
        image_path: Path of the image to analyse.
        output_path: Path of the raw result JSON. If it does not end with
            ".json", "layout_detection.json" inside ``dirname(output_path)``
            is used instead.
        model_name: Layout model name; ``None`` means "take it from
            ``get_model_config('layout')``".

    Returns:
        Dict: The result loaded from the model's JSON output with its
        ``boxes`` entry replaced by the processed box list.
    """
    if model_name is None:
        model_name = get_model_config('layout')

    # dirname is "" for a bare file name, and os.makedirs("") raises
    # FileNotFoundError, so only create a directory when there is one.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    json_path = output_path if output_path.endswith(".json") else os.path.join(output_dir, "layout_detection.json")

    model = create_model(model_name=model_name)
    output = model.predict(image_path, batch_size=1, layout_nms=True)

    for res in output:
        res.save_to_json(save_path=json_path)
        # NOTE(review): the visualization always goes to this fixed,
        # cwd-relative path regardless of output_path; kept as-is because
        # other code may rely on it -- confirm before changing.
        res.save_to_img("./output/layout_result.jpg")

    with open(json_path, "r", encoding="utf-8") as f:
        result = json.load(f)

    # Drop labels that are not processed further (headers, footers, ...).
    result["boxes"] = [box for box in result["boxes"] if box.get("label") not in LayoutConfig.FILTER_LABELS]

    result["boxes"] = merge_formula_numbers(result["boxes"])
    result["boxes"] = build_box_hierarchy(result["boxes"])

    # Only swap the extension; str.replace would also rewrite a ".json" that
    # appears earlier in the path (e.g. in a directory name).
    final_json_path = os.path.splitext(json_path)[0] + "_final.json"
    print("Final JSON path:", final_json_path)
    with open(final_json_path, "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    return result
298 | 
299 | 
if __name__ == "__main__":
    # Manual smoke test: run layout detection on a sample image.
    output_dir = "output"
    os.makedirs(output_dir, exist_ok=True)
    image_path = "./formula_inline.png"
    result = detect_layout(
        image_path=image_path,
        output_path=os.path.join(output_dir, "layout_detection.json"),
    )
306 | 
307 | 
308 | 
# # 布局检测结果分析

# ## 检测信息
312 | 
313 | # - 输入图片：layout.png
314 | # - 检测项目：页面布局元素
315 | # - 总检测框数：13
316 | 
317 | # ## 检测结果详情
318 | 
319 | # 检测到的元素类型统计：
320 | 
321 | # - 表格(table): 2个
322 | # - 正文(text): 5个
323 | # - 表格标题(table_title): 2个
324 | # - 段落标题(paragraph_title): 4个
325 | 
326 | # ```json
327 | # {
328 | #     // 输入图片路径
329 | #     "input_path": "layout.png",
330 | #     "page_index": null,
331 | #     "boxes": [
332 | #         // 表格区域 1
333 | #         {
334 | #             "cls_id": 8,
335 | #             "label": "table",
336 | #             "score": 0.9866,  // 置信度 98.66%
337 | #             "coordinate": [74.31, 105.71, 321.99, 299.11]  // [x1, y1, x2, y2]
338 | #         },
339 | #         // 正文区域 1
340 | #         {
341 | #             "cls_id": 2,
342 | #             "label": "text",
343 | #             "score": 0.9860,  // 置信度 98.60%
344 | #             "coordinate": [34.66, 349.91, 358.34, 611.34]  // [x1, y1, x2, y2]
345 | #         },
346 | #        ……
347 | #     ]
348 | # }
349 | # ```
350 | 
351 | # ## 注意事项
352 | 
353 | # 1. coordinate 坐标格式为 [x1, y1, x2, y2]，表示检测框的左上角和右下角坐标
354 | # 2. score 表示检测结果的置信度，范围 0-1
355 | # 3. cls_id 对应关系：
356 | #    - 0: paragraph_title
357 | #    - 2: text
358 | #    - 8: table
359 | #    - 9: table_title
360 | #    - 7: formula
361 | #    - 19: formula_number
362 | 


--------------------------------------------------------------------------------
/x_pdf2md/image_utils/layout_sorter.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | import json
  5 | from typing import List, Dict, Optional, Union
  6 | 
  7 | class LayoutSorter:
  8 |     """版面布局元素排序处理器"""
  9 |     
 10 |     def __init__(self, 
 11 |                  threshold_left_right: float = 0.9,
 12 |                  threshold_cross: float = 0.3):
 13 |         """
 14 |         初始化排序器
 15 |         
 16 |         Args:
 17 |             threshold_left_right: 判定元素属于左/右栏的阈值(0-1)，默认0.9
 18 |             threshold_cross: 判定元素跨栏的阈值(0-1)，默认0.3
 19 |         """
 20 |         self.threshold_left_right = threshold_left_right
 21 |         self.threshold_cross = threshold_cross
 22 |     
 23 |     def sort_layout(self, layout_result: Union[str, dict], page_width: float) -> List[Dict]:
 24 |         """
 25 |         对版面检测结果进行排序
 26 |         
 27 |         Args:
 28 |             layout_result: JSON文件路径或者包含检测结果的字典
 29 |             page_width: 页面宽度,必须提供
 30 |             
 31 |         Returns:
 32 |             排序后的元素列表
 33 |         """
 34 |         # 加载检测结果
 35 |         if isinstance(layout_result, str):
 36 |             with open(layout_result, 'r', encoding='utf-8') as f:
 37 |                 result = json.load(f)
 38 |         else:
 39 |             result = layout_result
 40 |             
 41 |         # 获取元素列表
 42 |         elements = result.get('boxes', [])
 43 |         return self._sort_elements(elements, page_width)
 44 |     
 45 |     def _sort_elements(self, elements: List[Dict], page_width: float) -> List[Dict]:
 46 |         """
 47 |         对元素按照左右栏进行排序
 48 |         
 49 |         Args:
 50 |             elements: 元素列表
 51 |             page_width: 页面宽度,必须提供
 52 |             
 53 |         Returns:
 54 |             排序后的元素列表
 55 |         """
 56 |         # 筛选有效元素
 57 |         valid_elements = [
 58 |             elem for elem in elements 
 59 |             if "coordinate" in elem and len(elem["coordinate"]) == 4
 60 |         ]
 61 |         
 62 |         if not valid_elements:
 63 |             return []
 64 |             
 65 |         page_center_x = page_width / 2
 66 |         left_column = []
 67 |         right_column = []
 68 |         
 69 |         # 分配元素到左右栏
 70 |         for elem in valid_elements:
 71 |             x1, _, x2, _ = elem["coordinate"]
 72 |             elem_width = x2 - x1
 73 |             
 74 |             # 计算左右覆盖比例
 75 |             left_part = max(0, min(x2, page_center_x) - x1)
 76 |             right_part = max(0, x2 - max(x1, page_center_x))
 77 |             
 78 |             left_ratio = left_part / elem_width if elem_width > 0 else 0
 79 |             right_ratio = right_part / elem_width if elem_width > 0 else 0
 80 |             
 81 |             # 根据覆盖比例分配
 82 |             if left_ratio >= self.threshold_left_right:
 83 |                 left_column.append(elem)
 84 |             elif right_ratio >= self.threshold_left_right:
 85 |                 right_column.append(elem)
 86 |             elif left_ratio > self.threshold_cross and right_ratio > self.threshold_cross:
 87 |                 left_column.append(elem)
 88 |             else:
 89 |                 elem_center_x = (x1 + x2) / 2
 90 |                 if elem_center_x <= page_center_x:
 91 |                     left_column.append(elem)
 92 |                 else:
 93 |                     right_column.append(elem)
 94 |                     
 95 |         # 按垂直位置排序
 96 |         left_column.sort(key=lambda e: e["coordinate"][1])
 97 |         right_column.sort(key=lambda e: e["coordinate"][1])
 98 |         
 99 |         return left_column + right_column
100 |     
101 | 
if __name__ == "__main__":
    # Usage example: build a sorter with the default thresholds.
    sorter = LayoutSorter()
105 |     
106 | 


--------------------------------------------------------------------------------
/x_pdf2md/image_utils/layout_visualizer.py:
--------------------------------------------------------------------------------
  1 | import cv2
  2 | import numpy as np
  3 | from PIL import Image, ImageDraw, ImageFont
  4 | from typing import Dict, List
  5 | from layout_config import LayoutConfig
  6 | 
  7 | class LayoutVisualizer:
  8 |     def __init__(self, font_path: str = None, font_size: int = 24):  # 修改默认字体大小为24
  9 |         """初始化可视化器
 10 |         
 11 |         Args:
 12 |             font_path: 字体文件路径，默认使用PIL默认字体
 13 |             font_size: 字体大小，默认24
 14 |         """
 15 |         self.font = ImageFont.load_default()
 16 |         if font_path:
 17 |             try:
 18 |                 self.font = ImageFont.truetype(font_path, font_size)
 19 |             except:
 20 |                 print("无法加载指定字体，使用默认字体")
 21 |         
 22 |         # 使用配置文件中的颜色设置
 23 |         self.colors = LayoutConfig.COLORS
 24 |         self.default_color = LayoutConfig.DEFAULT_COLOR
 25 |         
 26 |     def draw_boxes(self, image: np.ndarray, boxes: List[Dict], 
 27 |                   show_order: bool = True, show_label: bool = True) -> np.ndarray:
 28 |         """绘制排序后的检测框
 29 |         
 30 |         Args:
 31 |             image: 原始图像(RGB格式)
 32 |             boxes: 排序后的检测框列表
 33 |             show_order: 是否显示排序顺序
 34 |             show_label: 是否显示标签类型
 35 |             
 36 |         Returns:
 37 |             绘制了检测框的图像
 38 |         """
 39 |         # 转换为PIL图像
 40 |         image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
 41 |         pil_image = Image.fromarray(image_rgb)
 42 |         draw = ImageDraw.Draw(pil_image)
 43 |         
 44 |         # 绘制每个元素
 45 |         for i, box in enumerate(boxes):
 46 |             x1, y1, x2, y2 = map(int, box['coordinate'])
 47 |             label = box['label']
 48 |             score = box.get('score', 0)
 49 |             color = self.colors.get(label, self.default_color)
 50 |             
 51 |             # 绘制矩形框
 52 |             draw.rectangle([x1, y1, x2, y2], outline=color, width=2)
 53 |             
 54 |             # 准备标注文本
 55 |             text_elements = []
 56 |             if show_order:
 57 |                 text_elements.append(f"#{i+1}")
 58 |             if show_label:
 59 |                 text_elements.append(f"{label}")
 60 |                 if score > 0:
 61 |                     text_elements.append(f"{score:.2f}")
 62 |             text = " ".join(text_elements)
 63 |             
 64 |             if text:
 65 |                 # 获取文本尺寸
 66 |                 text_bbox = draw.textbbox((0, 0), text, font=self.font)
 67 |                 text_width = text_bbox[2] - text_bbox[0]
 68 |                 text_height = text_bbox[3] - text_bbox[1]
 69 |                 
 70 |                 # 绘制文本背景
 71 |                 draw.rectangle(
 72 |                     [x1, y1 - text_height - 8, x1 + text_width + 8, y1],  # 增加内边距
 73 |                     fill=color
 74 |                 )
 75 |                 
 76 |                 # 绘制文本
 77 |                 draw.text(
 78 |                     (x1 + 4, y1 - text_height - 4),  # 调整文本位置
 79 |                     text,
 80 |                     fill=(255, 255, 255),
 81 |                     font=self.font
 82 |                 )
 83 |         
 84 |         # 转换回OpenCV格式
 85 |         result = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
 86 |         return result
 87 | 
 88 |     def save_visualization(self, image_path: str, boxes: List[Dict], 
 89 |                          output_path: str,
 90 |                          show_order: bool = True,
 91 |                          show_label: bool = True) -> None:
 92 |         """保存可视化结果
 93 |         
 94 |         Args:
 95 |             image_path: 原始图像路径
 96 |             boxes: 排序后的检测框列表
 97 |             output_path: 输出图像路径
 98 |             show_order: 是否显示排序顺序
 99 |             show_label: 是否显示标签类型
100 |         """
101 |         # 读取原始图像
102 |         image = cv2.imread(image_path)
103 |         if image is None:
104 |             raise ValueError(f"无法读取图像: {image_path}")
105 |         
106 |         # 绘制检测框
107 |         result = self.draw_boxes(image, boxes, show_order, show_label)
108 |         
109 |         # 保存结果
110 |         cv2.imwrite(output_path, result)
111 |         print(f"可视化结果已保存至: {output_path}")
112 | 


--------------------------------------------------------------------------------
/x_pdf2md/image_utils/models.py:
--------------------------------------------------------------------------------
 1 | from typing import Any, Dict
 2 | 
 3 | from paddlex import create_model
 4 | 
 5 | from x_pdf2md.config import get_model_config
 6 | 
 7 | # 全局模型字典，用于存储已加载的模型
 8 | _GLOBAL_MODELS: Dict[str, Any] = {}
 9 | 
def get_or_create_model(model_type: str) -> Any:
    """
    Return the model registered for ``model_type``, loading it on first use.

    Loaded models are kept in the module-level ``_GLOBAL_MODELS`` dict so each
    model type is created at most once per process.

    Args:
        model_type: Key passed to ``get_model_config`` to look up the model
            name; also the key under which the model is cached.

    Returns:
        The loaded model instance, or None when loading failed (the error is
        printed, not raised).
    """
    cached = _GLOBAL_MODELS.get(model_type)
    if cached is not None:
        return cached

    # Not cached yet: resolve the configured model name and load it.
    model_name = get_model_config(model_type)
    try:
        loaded = create_model(model_name=model_name)
        _GLOBAL_MODELS[model_type] = loaded
        print(f"模型 {model_type} 加载成功")
    except Exception as e:
        print(f"模型 {model_type} 加载失败: {e}")
        return None
    return loaded
36 | 


--------------------------------------------------------------------------------
/x_pdf2md/image_utils/process_page.py:
--------------------------------------------------------------------------------
 1 | from typing import List, Dict
 2 | import os
 3 | import json
 4 | 
 5 | from x_pdf2md.image_utils.crop_text_areas import PolyCropper, TextAreaCropper
 6 | from x_pdf2md.image_utils.detect_and_sort import detect_and_sort_layout
 7 | from x_pdf2md.image_utils.region_image import RegionImage
 8 | 
 9 | 
def process_page_layout(
        image_path: str,
        output_dir: str,
        page_number: int = 1,
        layout_json_path: str = None,
        threshold_left_right: float = 0.9,
        threshold_cross: float = 0.3
) -> List[RegionImage]:
    """
    Detect and sort the layout of one page image, crop every region and
    describe the crops as RegionImage objects.

    Args:
        image_path: Path of the page image.
        output_dir: Directory that receives the cropped region images
            (created if missing).
        page_number: Page number stored in every returned RegionImage.
        layout_json_path: Where the sorted layout is written as JSON;
            defaults to "temp_layout.json" inside ``output_dir``.
        threshold_left_right: Threshold for assigning a region to the left or
            right column.
        threshold_cross: Threshold for treating a region as spanning both
            columns.

    Returns:
        List[RegionImage]: One entry per sorted region whose cropped image
        file exists, in sorted order.
    """
    os.makedirs(output_dir, exist_ok=True)

    if layout_json_path is None:
        layout_json_path = os.path.join(output_dir, "temp_layout.json")

    sorted_elements = detect_and_sort_layout(
        image_path,
        layout_json_path,
        threshold_left_right,
        threshold_cross
    )

    # The cropper reads the layout from a file, so persist the sorted order.
    with open(layout_json_path, 'w', encoding='utf-8') as f:
        json.dump({"boxes": sorted_elements}, f, ensure_ascii=False, indent=2)

    cropper = TextAreaCropper(PolyCropper())
    cropper.crop_text_areas(
        image_path,
        layout_json_path,
        output_dir,
        output_format='png'
    )

    region_images = []
    for i, element in enumerate(sorted_elements):
        label = element.get('label', 'unknown')
        score = element.get('score', 0)
        # Layout elements store their bounding box under "coordinate";
        # reading only "box" left original_box empty for every region.
        # "box" is still honoured as a fallback.
        box = element.get('coordinate', element.get('box', []))
        contains = element.get('contains', [])

        # NOTE(review): assumes the cropper names its files
        # "<index>_<label>_<score>.png" -- verify against crop_text_areas.
        filename = f"{i}_{label}_{score:.4f}.png"
        cropped_path = os.path.join(output_dir, filename)
        if os.path.exists(cropped_path):
            region_images.append(RegionImage(
                image_path=cropped_path,
                label=label,
                score=score,
                page_number=page_number,
                region_index=i,
                original_box=box,
                contains=contains
            ))

    return region_images
83 | 
84 | 
if __name__ == "__main__":
    # Usage example: process one image and list the resulting regions.
    output_dir = "./processed_output"
    image_path = "car.png"

    cropped_images = process_page_layout(image_path, output_dir)
    print(f"处理完成，共生成 {len(cropped_images)} 个区域图片")
    for index, region in enumerate(cropped_images, start=1):
        print(f"区域 {index}: {region}")
94 | 


--------------------------------------------------------------------------------
/x_pdf2md/image_utils/region_image.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass
 2 | 
 3 | 
 4 | @dataclass
 5 | class RegionImage:
 6 |     """表示文档中的一个区域图片"""
 7 |     image_path: str  # 图片文件路径
 8 |     label: str  # 区域标签 (如 'text', 'title' 等)
 9 |     score: float  # 检测置信度分数
10 |     page_number: int  # 页码
11 |     region_index: int  # 区域在页面中的序号
12 |     original_box: list  # 原始边界框坐标 [x1,y1,x2,y2]
13 |     content: str = None  # 识别出的内容
14 |     contains: list = None  # 包含的区域
15 | 
16 |     def __str__(self) -> str:
17 |         return f"RegionImage(label={self.label}, page={self.page_number}, index={self.region_index}, path={self.image_path})"
18 | 


--------------------------------------------------------------------------------
/x_pdf2md/image_utils/visualize_formula.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | 
 3 | 
 4 | with open("S_output/res.json", "r", encoding="utf-8") as f:
 5 |     s_data = json.load(f)
 6 |     
 7 | with open("L_output/res.json", "r", encoding="utf-8") as f:
 8 |     l_data = json.load(f)
 9 | 
10 | with open("UniMERNet_output/res.json", "r", encoding="utf-8") as f:
11 |     u_data = json.load(f)
12 | 
13 | # 写入到md文件
14 | with open("output/formula_recognition.md", "w", encoding="utf-8") as f:
15 |     f.write("# 公式识别结果对比\n\n")
16 |     # 小模型识别结果
17 |     f.write("## 小模型识别结果\n\n")
18 |     f.write(f"输入图像: {s_data['input_path']}\n\n")
19 |     f.write(f"识别结果: $${s_data['rec_formula']}$$\n\n")
20 |     # 大模型识别结果
21 |     f.write("## 大模型识别结果\n\n")
22 |     f.write(f"输入图像: {l_data['input_path']}\n\n")
23 |     f.write(f"识别结果:$$ {l_data['rec_formula']}$$\n\n")
24 |     # 加入UniMERNet_output的识别结果
25 |     f.write("## UniMERNet模型识别结果\n\n")
26 |     f.write(f"输入图像: {u_data['input_path']}\n\n")
27 |     f.write(f"识别结果:$$ {u_data['rec_formula']}$$\n\n")


--------------------------------------------------------------------------------
/x_pdf2md/markdown_formatter.py:
--------------------------------------------------------------------------------
  1 | from typing import List, Optional
  2 | import os
  3 | 
  4 | from x_pdf2md.image2md.get_image_title import get_image_title
  5 | from x_pdf2md.image2md.vlm_function import extract_table_from_image, extract_text_from_image, describe_image
  6 | from x_pdf2md.image_utils.formula_recognize import recognize_formula
  7 | from x_pdf2md.ocr_utils.ocr_image import OCRProcessor
  8 | from x_pdf2md.remote_image.image_uploader import ImageUploader
  9 | from x_pdf2md.image_utils.region_image import RegionImage
 10 | 
# Shared OCR processor, created once when this module is imported;
# format_region_content uses it for title and abstract regions.
ocr_processor = OCRProcessor()
 12 | 
 13 | def format_region_content(
 14 |     region: RegionImage, 
 15 |     image_upload_obj: Optional[ImageUploader] = None,
 16 |     output_dir: Optional[str] = None
 17 | ) -> None:
 18 |     """
 19 |     根据区域标签类型生成或增强内容
 20 | 
 21 |     参数:
 22 |         region: RegionImage对象
 23 |         image_upload_obj: 可选的图片上传器对象
 24 |         output_dir: 可选的输出目录，用于保存处理结果
 25 |     """
 26 | 
 27 |     # 获取标签
 28 |     label = region.label
 29 |     # 清空区域内容
 30 |     region.content = ""
 31 |     # 默认内容为空
 32 |     content = ""
 33 |     
 34 |     # 使用图片路径
 35 |     image_path = region.image_path
 36 |     
 37 |     # 排除图片相关部分，这些已在format_region中单独处理
 38 |     if label in ["image", "figure", "chart"]:
 39 |         print("处理图片：", image_path)
 40 |         # 获取图片描述
 41 |         image_describe = describe_image(image_path)
 42 |         print("图片描述：", image_describe)
 43 |         
 44 |         # 如果有图片路径且有上传器，尝试上传
 45 |         image_title = get_image_title(image_describe)
 46 |         if not image_title:
 47 |             image_title = f"{label}_{region.region_index+1}"
 48 |         print(f"处理图片: {image_title}")
 49 |         
 50 |         # 如果指定了输出目录，可以在这里处理输出相关的逻辑
 51 |         result_path = image_path
 52 |         if output_dir:
 53 |             # 这里可以添加将处理结果保存到输出目录的逻辑
 54 |             # 例如: 复制图片到输出目录或者生成新的输出文件
 55 |             result_filename = os.path.basename(image_path)
 56 |             result_path = os.path.join(output_dir, result_filename)
 57 |             
 58 |         # 如果有图片路径且有上传器，尝试上传
 59 |         if image_path and image_upload_obj:
 60 |             print(f"上传图片: {image_path}")
 61 |             try:
 62 |                 # 上传图片
 63 |                 image_url = image_upload_obj.upload(image_path)
 64 |                 # 如果上传成功，使用图片URL
 65 |                 if image_url:
 66 |                     content = f"![{image_title}]({image_url})\n\n" + (
 67 |                         f"**{image_title}描述:** {region.content}"
 68 |                         if region.content
 69 |                         else ""
 70 |                     )
 71 |             except Exception as e:
 72 |                 print(f"图片上传失败: {e}")
 73 |         else:
 74 |             # 如果有输出目录，将图片复制到images子文件夹并使用相对路径
 75 |             if output_dir:
 76 |                 # 创建images子文件夹
 77 |                 images_dir = os.path.join(output_dir, "images")
 78 |                 os.makedirs(images_dir, exist_ok=True)
 79 |                 
 80 |                 # 获取原图片的文件名
 81 |                 image_filename = os.path.basename(image_path)
 82 |                 # 构建目标路径
 83 |                 target_image_path = os.path.join(images_dir, image_filename)
 84 |                 
 85 |                 # 复制图片到目标路径
 86 |                 import shutil
 87 |                 try:
 88 |                     shutil.copy2(image_path, target_image_path)
 89 |                     print(f"图片已复制到: {target_image_path}")
 90 |                     # 使用相对路径引用图片
 91 |                     image_rel_path = f"./images/{image_filename}"
 92 |                     content = f"![{image_title}]({image_rel_path})\n\n" + (
 93 |                         f"**{image_title}描述:** {region.content}" if region.content else ""
 94 |                     )
 95 |                 except Exception as e:
 96 |                     print(f"复制图片失败: {e}")
 97 |                     # 失败时回退到使用原始路径
 98 |                     content = f"![{image_title}]({image_path})\n\n" + (
 99 |                         f"**{image_title}描述:** {region.content}" if region.content else ""
100 |                     )
101 |             else:
102 |                 # 没有输出目录时使用原始路径
103 |                 content = f"![{image_title}]({image_path})\n\n" + (
104 |                     f"**{image_title}描述:** {region.content}" if region.content else ""
105 |                 )
106 |     
107 |     # 根据标签类型处理内容
108 |     if label == "text":
109 |         # 文本内容处理
110 |         content = extract_text_from_image(image_path=image_path)
111 |          
112 |     elif label == "formula":
113 |         content = recognize_formula(input_path=image_path)
114 |         # 公式内容处理
115 |         if not content.startswith("$$") and not content.endswith("$$"):
116 |             content = f"$$\n{content}\n$$"
117 |             
118 |     elif label == "table":
119 |         # 表格内容处理
120 |         content = extract_table_from_image(image_path=image_path)
121 |     elif label in ["doc_title", "paragraph_title",
122 |                    "chart_title", "table_title", "figure_title",
123 |                    "abstract"]:
124 |         # 其他类型标签的默认处理
125 |         content = ocr_processor.extract_text(image_path)
126 |     
127 |     region.content = content
128 | 
129 | 
def format_pdf_regions(
    page_regions: List[List[RegionImage]],
    image_uploader: Optional[ImageUploader] = None,
    output_dir: Optional[str] = None,
) -> List[str]:
    """
    Render the regions of every page into Markdown text.

    Args:
        page_regions: One list of RegionImage objects per page.
        image_uploader: Optional uploader forwarded to format_region_content.
        output_dir: Optional output directory forwarded to format_region_content.

    Returns:
        List[str]: One Markdown string per page; the non-empty region texts of
        a page are joined with a blank line.
    """

    def _render(region):
        # format_region_content fills region.content in place; anything falsy
        # is normalised to an empty string so it can be filtered out below.
        format_region_content(region, image_uploader, output_dir)
        return region.content or ""

    pages_markdown = []
    for page_index, regions in enumerate(page_regions):
        print(f"\n处理第 {page_index + 1} 页的格式化...")
        rendered = (_render(region) for region in regions)
        pages_markdown.append("\n\n".join(text for text in rendered if text))
    return pages_markdown
182 | 


--------------------------------------------------------------------------------
/x_pdf2md/ocr_utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/li-xiu-qi/x-pdf2md/ef69a42c2b8ca9da128762fe2bda03bd909cbee0/x_pdf2md/ocr_utils/__init__.py


--------------------------------------------------------------------------------
/x_pdf2md/ocr_utils/ocr_image.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | from typing import Dict, List, Union
  4 | import os
  5 | import json
  6 | from PIL import Image
  7 | 
  8 | from x_pdf2md.ocr_utils.text_detection import text_detection
  9 | from x_pdf2md.ocr_utils.text_recogniize import recognize_text
 10 | from x_pdf2md.config import get_model_config
 11 | 
 12 | 
 13 | class OCRProcessor:
 14 |     def __init__(self, det_model=None, rec_model=None):
 15 |         """
 16 |         初始化OCR处理器
 17 |         
 18 |         Args:
 19 |             det_model: 文本检测模型名称，None则使用配置
 20 |             rec_model: 文本识别模型名称，None则使用配置
 21 |         """
 22 |         # 使用传入的模型名称或从配置中获取
 23 |         self.det_model = det_model or get_model_config('ocr_det')
 24 |         self.rec_model = rec_model or get_model_config('ocr_rec')
 25 |         
 26 |     def crop_image(self, image_path: str, box_coordinates: List) -> Image.Image:
 27 |         """根据坐标裁剪图像区域"""
 28 |         image = Image.open(image_path)
 29 |         # 将坐标转换为矩形边界框
 30 |         x_coordinates = [int(point[0]) for point in box_coordinates]
 31 |         y_coordinates = [int(point[1]) for point in box_coordinates]
 32 |         left, top = min(x_coordinates), min(y_coordinates)
 33 |         right, bottom = max(x_coordinates), max(y_coordinates)
 34 |         # 裁剪图像
 35 |         cropped = image.crop((left, top, right, bottom))
 36 |         return cropped
 37 | 
 38 |     def process_image(self, image_path: str, save_crops: bool = True, output_dir: str = "./output/crops") -> List[Dict]:
 39 |         """
 40 |         处理图像的完整OCR流程
 41 |         Args:
 42 |             image_path: 输入图像路径
 43 |             save_crops: 是否保存裁剪后的图像
 44 |             output_dir: 裁剪图像的保存目录
 45 |         Returns:
 46 |             包含文本位置和识别结果的列表
 47 |         """
 48 |         # 创建输出目录
 49 |         if save_crops:
 50 |             os.makedirs(output_dir, exist_ok=True)
 51 |         
 52 |         # 1. 首先进行文本检测
 53 |         det_results = text_detection(image_path)
 54 |         
 55 |         all_results = []
 56 |         # 2. 对每个检测到的区域进行处理
 57 |         for idx, (poly, score) in enumerate(zip(det_results['dt_polys'], det_results['dt_scores'])):
 58 |             # 裁剪检测到的文本区域
 59 |             cropped = self.crop_image(image_path, poly)
 60 |             
 61 |             # 保存裁剪的图像（如果需要）
 62 |             if save_crops:
 63 |                 crop_filename = f"text_area_{idx}_score_{score:.4f}.png"
 64 |                 crop_path = os.path.join(output_dir, crop_filename)
 65 |                 cropped.save(crop_path)
 66 |                 temp_path = crop_path
 67 |             else:
 68 |                 # 如果不保存，使用临时目录
 69 |                 temp_dir = "./output/temp"
 70 |                 os.makedirs(temp_dir, exist_ok=True)
 71 |                 temp_path = os.path.join(temp_dir, f"temp_{idx}.png")
 72 |                 cropped.save(temp_path)
 73 |             
 74 |             # 3. 对裁剪区域进行文本识别
 75 |             rec_result = recognize_text(temp_path)
 76 |             
 77 |             # 4. 整合结果
 78 |             result = {
 79 |                 'position': poly,
 80 |                 'detection_score': score,
 81 |                 'text': rec_result['rec_text'],
 82 |                 'recognition_score': rec_result['rec_score']
 83 |             }
 84 |             if save_crops:
 85 |                 result['crop_path'] = crop_path
 86 |             all_results.append(result)
 87 |             
 88 |             # 清理临时文件（如果不需要保存）
 89 |             if not save_crops:
 90 |                 os.remove(temp_path)
 91 |             
 92 |         return all_results
 93 | 
 94 |     def extract_text(self, image_path: str, as_list: bool = False, save_crops: bool = False, output_dir: str = "./output/crops") -> Union[str, List[str]]:
 95 |         """
 96 |         直接从图像中提取文本内容
 97 |         Args:
 98 |             image_path: 输入图像路径
 99 |             as_list: 是否以列表形式返回每个检测区域的文本
100 |             save_crops: 是否保存裁剪后的图像
101 |             output_dir: 裁剪图像的保存目录
102 |         Returns:
103 |             提取的文本内容，可以是字符串或字符串列表
104 |         """
105 |         # 调用OCR处理流程
106 |         results = self.process_image(image_path, save_crops, output_dir)
107 |         
108 |         # 提取所有文本
109 |         texts = [result['text'] for result in results]
110 |         
111 |         # 根据参数决定返回列表还是合并后的字符串
112 |         if as_list:
113 |             return texts
114 |         else:
115 |             return ''.join(texts)
116 |     
117 |     def save_results_to_json(self, results: List[Dict], output_path: str):
118 |         """
119 |         将OCR结果保存到JSON文件
120 |         Args:
121 |             results: OCR处理结果列表
122 |             output_path: JSON文件保存路径
123 |         """
124 |         # 确保输出目录存在
125 |         os.makedirs(os.path.dirname(output_path), exist_ok=True)
126 |         
127 |         # 将numpy数组转换为列表以便JSON序列化
128 |         serializable_results = []
129 |         for result in results:
130 |             result_copy = result.copy()
131 |             # 检查position是否为numpy数组，如果是则转换为列表
132 |             if hasattr(result_copy['position'], 'tolist'):
133 |                 result_copy['position'] = result_copy['position'].tolist()
134 |             # 如果已经是列表则不需要转换
135 |             serializable_results.append(result_copy)
136 |             
137 |         # 保存到JSON文件
138 |         with open(output_path, 'w', encoding='utf-8') as f:
139 |             json.dump(serializable_results, f, ensure_ascii=False, indent=2)
140 | 
if __name__ == "__main__":
    # Manual smoke test: run the full pipeline on a sample image, dump the
    # results to JSON and print them in a few different shapes.
    sample_image = "test_text.png"  # replace with a real test image path
    processor = OCRProcessor()
    ocr_results = processor.process_image(
        sample_image, save_crops=True, output_dir="output/test_crops"
    )

    # Persist the structured results.
    processor.save_results_to_json(ocr_results, "output/ocr_results.json")

    # Per-region report.
    separator = "-" * 50
    print("\nOCR Results:")
    print(separator)
    for number, item in enumerate(ocr_results, start=1):
        print(f"Region {number}:")
        print(f"Text: {item['text']}")
        print(f"Detection Score: {item['detection_score']:.4f}")
        print(f"Recognition Score: {item['recognition_score']:.4f}")
        if 'crop_path' in item:
            print(f"Crop saved at: {item['crop_path']}")
        print(separator)

    # Plain-text extraction as a single string.
    joined_text = processor.extract_text(sample_image)
    print("\nExtracted Text:")
    print(separator)
    print(joined_text)

    # Plain-text extraction as one entry per region.
    region_texts = processor.extract_text(sample_image, as_list=True)
    print("\nExtracted Text as List:")
    print(separator)
    for position, line in enumerate(region_texts):
        print(f"{position+1}. {line}")
179 | 


--------------------------------------------------------------------------------
/x_pdf2md/ocr_utils/text_detection.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python  
  2 | # -*- coding: utf-8 -*-
  3 | # author：筱可
  4 | # 2025-03-16
  5 | 
  6 | # 导入必要的库
  7 | import json
  8 | import os
  9 | from typing import List
 10 | from paddlex import create_model  # PaddleX模型创建工具
 11 | import numpy as np
 12 | import cv2
 13 | 
 14 | def is_same_line(box1, box2, height_threshold=0.5):
 15 |     """
 16 |     判断两个文本框是否在同一行
 17 |     Args:
 18 |         box1: 第一个文本框坐标 shape:(4,2)
 19 |         box2: 第二个文本框坐标 shape:(4,2)
 20 |         height_threshold: 判定阈值，默认为文本框高度的0.5倍
 21 |     Returns:
 22 |         bool: True表示在同一行，False表示不在同一行
 23 |     """
 24 |     box1_center = np.mean(box1, axis=0)[1]  # y坐标的中心点
 25 |     box2_center = np.mean(box2, axis=0)[1]
 26 |     box1_height = abs(max(box1[:,1]) - min(box1[:,1]))
 27 |     box2_height = abs(max(box2[:,1]) - min(box2[:,1]))
 28 |     avg_height = (box1_height + box2_height) / 2
 29 |     
 30 |     return abs(box1_center - box2_center) < avg_height * height_threshold
 31 | 
 32 | def merge_overlapping_boxes(boxes, scores):
 33 |     """
 34 |     合并同一行的重叠文本框
 35 |     Args:
 36 |         boxes: 所有文本框坐标列表 shape:(N,4,2)
 37 |         scores: 对应的置信度得分列表 shape:(N,)
 38 |     Returns:
 39 |         tuple: (合并后的文本框列表, 合并后的置信度列表)
 40 |     """
 41 |     # 如果只有一个或没有文本框，直接返回
 42 |     if len(boxes) <= 1:
 43 |         return boxes, scores
 44 |     
 45 |     # 将输入的文本框列表转换为numpy数组，便于后续处理    
 46 |     boxes = np.array(boxes)
 47 |     # 初始化合并后的文本框列表
 48 |     merged_boxes = []
 49 |     # 初始化合并后的得分列表
 50 |     merged_scores = []
 51 |     # 初始化标记数组，用于记录每个文本框是否已被处理
 52 |     used = [False] * len(boxes)
 53 |     
 54 |     # 遍历所有文本框
 55 |     for i in range(len(boxes)):
 56 |         # 如果当前文本框已被处理，则跳过
 57 |         if used[i]:
 58 |             continue
 59 |         
 60 |         # 获取当前文本框和其得分    
 61 |         current_box = boxes[i]
 62 |         current_score = scores[i]
 63 |         # 初始化待合并文本框的索引列表
 64 |         merged_indices = [i]
 65 |         
 66 |         # 寻找与当前文本框在同一行的其他文本框
 67 |         for j in range(i + 1, len(boxes)):
 68 |             # 如果目标文本框已被处理，则跳过
 69 |             if used[j]:
 70 |                 continue
 71 |             
 72 |             # 判断两个文本框是否在同一行    
 73 |             if is_same_line(boxes[i], boxes[j]):
 74 |                 merged_indices.append(j)
 75 |         
 76 |         # 如果找到了需要合并的文本框
 77 |         if len(merged_indices) > 1:
 78 |             # 将所有待合并文本框的坐标点重新整理
 79 |             merged_points = boxes[merged_indices].reshape(-1, 2)
 80 |             # 计算合并后文本框的最小x和y坐标
 81 |             x_min, y_min = np.min(merged_points, axis=0)
 82 |             # 计算合并后文本框的最大x和y坐标
 83 |             x_max, y_max = np.max(merged_points, axis=0)
 84 |             # 构建合并后的矩形文本框坐标
 85 |             merged_box = np.array([[x_min, y_min], [x_max, y_min],
 86 |                                  [x_max, y_max], [x_min, y_max]])
 87 |             # 计算合并后文本框的平均置信度得分
 88 |             merged_score = np.mean([scores[idx] for idx in merged_indices])
 89 |         else:
 90 |             # 如果没有需要合并的文本框，保持原状
 91 |             merged_box = current_box
 92 |             merged_score = current_score
 93 |         
 94 |         # 标记所有已处理的文本框    
 95 |         for idx in merged_indices:
 96 |             used[idx] = True
 97 |         
 98 |         # 将处理结果添加到输出列表    
 99 |         merged_boxes.append(merged_box)
100 |         merged_scores.append(merged_score)
101 |     
102 |     # 返回合并后的文本框和对应的置信度得分
103 |     return merged_boxes, merged_scores
104 | 
def visualize_boxes(image_path, boxes, output_path="./output/merged_result.jpg"):
    """
    Draw text boxes onto an image and save the result.

    Args:
        image_path: Path of the original image.
        boxes: Iterable of box polygons (array-likes of (x, y) points).
        output_path: Where the annotated image is written.
    Raises:
        FileNotFoundError: If the image cannot be read from image_path.
    """
    image = cv2.imread(image_path)
    # cv2.imread signals failure by returning None instead of raising; fail
    # here with a clear message rather than deep inside the drawing call.
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")

    # Make sure the destination directory exists before writing.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for box in boxes:
        # np.asarray accepts plain lists as well as ndarrays; polylines needs int32.
        polygon = np.asarray(box).astype(np.int32)
        cv2.polylines(image, [polygon], True, (0, 255, 0), 2)
    cv2.imwrite(output_path, image)
118 | 
def text_detection(image_path, output_path="./output/res.json", model="PP-OCRv4_mobile_det",
                  visualize=False) -> dict:
    """
    Run text detection on an image and merge boxes that share a line.

    Args:
        image_path: Path of the input image.
        output_path: JSON file that receives the (post-processed) detection result.
        model: Name of the detection model passed to create_model.
        visualize: When True, also write ./output/original_result.jpg and
            ./output/merged_result.jpg with the boxes drawn.
    Returns:
        dict: Detection result in the form
            {
                'input_path': str,   # input image path
                'page_index': int,   # page index (if any)
                'dt_polys': List[List[List[float]]],  # box coordinates
                'dt_scores': List[float]              # confidence scores
            }
        If the predictor yields no result at all, a dict with empty
        'dt_polys' / 'dt_scores' is returned.
    """
    # Create the directory the JSON is written to (not a hard-coded one).
    output_parent = os.path.dirname(output_path)
    if output_parent:
        os.makedirs(output_parent, exist_ok=True)
    if visualize:
        # The visualisation images always go to ./output.
        os.makedirs("./output", exist_ok=True)

    # Keep the model object in its own name so the `model` argument (the
    # model name) is not shadowed.
    detector = create_model(model_name=model)
    output = detector.predict(image_path, batch_size=1)

    # Fallback so the function never hits an unbound local when the predictor
    # yields nothing.
    detection_result = {
        'input_path': image_path,
        'page_index': None,
        'dt_polys': [],
        'dt_scores': [],
    }

    for res in output:
        # Persist the raw result, then read it back as plain JSON data.
        res.save_to_json(output_path)
        with open(output_path, 'r', encoding='utf-8') as f:
            detection_result = json.load(f)

        boxes = np.array(detection_result['dt_polys'])
        scores = np.array(detection_result['dt_scores'])

        merged_boxes, merged_scores = merge_overlapping_boxes(boxes, scores)

        # Convert everything back to builtin types so the result is JSON-serialisable.
        detection_result['dt_polys'] = [np.asarray(box).tolist() for box in merged_boxes]
        detection_result['dt_scores'] = [float(score) for score in merged_scores]

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(detection_result, f, indent=4, ensure_ascii=False)

        if visualize:
            visualize_boxes(image_path, boxes, "./output/original_result.jpg")  # raw boxes
            visualize_boxes(image_path, merged_boxes, "./output/merged_result.jpg")  # merged boxes

    return detection_result


# Script entry point
if __name__ == "__main__":
    # Run text detection on the test image
    res = text_detection(image_path="test_fomula_text_block.png")
183 |     
184 |     
185 | # 文本检测结果说明
186 | 
187 | # 以下是OCR文本检测的JSON结果，包含了检测到的文本区域及其相关信息：
188 | 
189 | # ```json
190 | # {
191 | #     "input_path": "general_ocr_001.png",  // 输入图像的文件路径
192 | #     "page_index": null,  // 页面索引，null表示不适用或单页面文档
193 | #     "dt_polys": [  // 检测到的文本多边形区域，每个区域由四个坐标点[x,y]组成
194 | #         [[73, 552], [453, 542], [454, 575], [74, 585]],  // 第1个文本区域的四个顶点坐标
195 | #         [[17, 506], [515, 486], [517, 535], [19, 555]],  // 第2个文本区域的四个顶点坐标
196 | #         [[189, 457], [398, 449], [399, 482], [190, 490]],  // 第3个文本区域的四个顶点坐标
197 | #         [[41, 412], [484, 387], [486, 433], [43, 457]],  // 第4个文本区域的四个顶点坐标
198 | #         [[510, 32], [525, 32], [525, 49], [510, 49]]  // 第5个文本区域的四个顶点坐标
199 | #     ],
200 | #     "dt_scores": [  // 每个检测区域的置信度得分，值范围0-1，越高表示越可信
201 | #         0.7650322239059382,  // 第1个区域的置信度
202 | #         0.7197010251844577,  // 第2个区域的置信度
203 | #         0.8289373546662983,  // 第3个区域的置信度 (最高置信度)
204 | #         0.7989932734846841,  // 第4个区域的置信度
205 | #         0.7363050443898626   // 第5个区域的置信度
206 | #     ]
207 | # }
208 | # ```
209 | 
210 | # ## 字段说明
211 | 
212 | # - **input_path**: 输入的图像文件路径
213 | # - **page_index**: 多页文档的页码索引，null表示单页或不适用
214 | # - **dt_polys**: 检测到的文本区域多边形，每个区域由4个点的坐标表示，按顺时针或逆时针排列
215 | # - **dt_scores**: 对应每个文本区域的检测置信度，值越大表示检测结果越可靠
216 | 
217 | 


--------------------------------------------------------------------------------
/x_pdf2md/ocr_utils/text_recogniize.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | from paddlex import create_model
 4 | 
 5 | 
 6 | def recognize_text(
 7 |     input_image: str,
 8 |     output_path: str = "./output/res.json",
 9 |     model="PP-OCRv4_mobile_rec",
10 | ) -> list:
11 |     """
12 |     识别图片中的文本
13 |     Args:
14 |         input_image: 输入图片路径
15 |         output_dir: 输出目录路径
16 |     Returns:
17 |         识别结果列表
18 |     """
19 |     output_dir = os.path.dirname(output_path)
20 |     # 确保输出目录存在
21 |     os.makedirs(output_dir, exist_ok=True)
22 | 
23 |     # 创建模型
24 |     model = create_model(model_name=model)
25 | 
26 |     # 预测
27 |     output = model.predict(input=input_image, batch_size=1)
28 | 
29 |     for res in output:
30 |         res.save_to_json(save_path=output_path)
31 | 
32 |     with open(output_path, "r", encoding="utf-8") as f:
33 |         result = json.load(f)
34 | 
35 |     return result
36 | 
37 | 
38 | # 使用示例:
39 | # results = recognize_text("text_area_4_score_0.9858.png")
40 | 
41 | # OCR 文本识别结果
42 | 
43 | # 以下是 OCR 文本识别的 JSON 结果数据：
44 | 
45 | # ```json
46 | # {
47 | #     "input_path": "general_ocr_rec_001.png",  // 输入图像文件路径
48 | #     "page_index": null,                        // 页码索引（多页文档时使用）
49 | #     "rec_text": "绿洲仕格维花园公寓",           // 识别出的文本内容
50 | #     "rec_score": 0.9875162839889526           // 识别结果的置信度分数（0-1之间）
51 | # }
52 | # ```
53 | 
54 | # ## 字段说明
55 | 
56 | # - **input_path**: 输入的源图像文件名
57 | # - **page_index**: 在多页文档中的页码索引，null 表示单页文档或默认页
58 | # - **rec_text**: OCR 识别出的文本内容
59 | # - **rec_score**: 识别结果的置信度，越接近 1 表示识别结果越可信
60 | 


--------------------------------------------------------------------------------
/x_pdf2md/pdf2md_converter.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | """
  4 | 命令行入口点 - 处理命令行参数并调用相应功能
  5 | """
  6 | 
  7 | import argparse
  8 | import os
  9 | from pathlib import Path
 10 | from typing import Optional, List, Union
 11 | # 从process_pdf.py导入必要的依赖
 12 | from x_pdf2md.image_utils.process_page import process_page_layout
 13 | from x_pdf2md.image_utils.layout_config import LayoutConfig
 14 | from x_pdf2md.image_utils.region_image import RegionImage
 15 | from tqdm import tqdm
 16 | # 更新导入路径
 17 | from x_pdf2md.markdown_formatter import format_pdf_regions
 18 | from x_pdf2md.pdf_utils.pdf_to_image import pdf_to_images
 19 | from x_pdf2md.remote_image import default_uploader
 20 | from x_pdf2md.config import update_config, get_config, DEFAULT_CONFIG
 21 | 
 22 | 
 23 | def process_pdf_document(
 24 |     pdf_path: str,
 25 |     output_dir: str,
 26 |     start_page: int = 0,
 27 |     end_page: Optional[int] = None,
 28 |     dpi: int = 300,
 29 |     threshold_left_right: float = 0.9,
 30 |     threshold_cross: float = 0.3,
 31 | ) -> List[List[RegionImage]]: 
 32 |     """
 33 |     处理PDF文档：将PDF转换为图像，并对每页进行版面分析和区域裁剪
 34 | 
 35 |     参数:
 36 |         pdf_path: PDF文件路径
 37 |         output_dir: 输出目录路径
 38 |         start_page: 起始页码（从0开始）
 39 |         end_page: 结束页码（包含），如果为None则处理所有页面
 40 |         dpi: PDF转图像的分辨率
 41 |         threshold_left_right: 判定左右栏的阈值
 42 |         threshold_cross: 判定跨栏的阈值
 43 | 
 44 |     返回:
 45 |         List[List[RegionImage]]: 每页的RegionImage对象列表
 46 |     """
 47 |     # 创建输出目录
 48 |     pdf_name = Path(pdf_path).stem
 49 |     output_dir = os.path.abspath(output_dir)
 50 |     temp_images_dir = os.path.join(output_dir, f"{pdf_name}_images")
 51 |     os.makedirs(temp_images_dir, exist_ok=True)
 52 | 
 53 |     # 将PDF转换为图像
 54 |     print("正在将PDF转换为图像...")
 55 |     image_paths = pdf_to_images(
 56 |         pdf_path=pdf_path,
 57 |         output_dir=temp_images_dir,
 58 |         start_page=start_page,
 59 |         end_page=end_page,
 60 |         dpi=dpi,
 61 |     )
 62 | 
 63 |     # 处理每个页面的布局
 64 |     print("正在分析和裁剪页面...")
 65 |     all_page_regions = []
 66 |     for i, image_path in enumerate(tqdm(image_paths, desc="处理页面")):
 67 |         page_num = i + 1
 68 |         page_dir = os.path.join(output_dir, f"{pdf_name}_page_{page_num}")
 69 |         os.makedirs(page_dir, exist_ok=True)
 70 | 
 71 |         # 处理页面布局并获取区域信息
 72 |         regions = process_page_layout(
 73 |             image_path=image_path,
 74 |             output_dir=page_dir,
 75 |             page_number=page_num,
 76 |             threshold_left_right=threshold_left_right,
 77 |             threshold_cross=threshold_cross,
 78 |         )
 79 | 
 80 |         all_page_regions.append(regions)
 81 | 
 82 |     return all_page_regions
 83 | 
 84 | 
 85 | def convert_pdf_to_markdown(
 86 |     pdf_path: str,
 87 |     output_dir: str = "output",
 88 |     start_page: int = 0,
 89 |     end_page: Optional[int] = None,
 90 |     dpi: int = DEFAULT_CONFIG["DEFAULT_DPI"],  # 使用配置中的默认值
 91 |     threshold_left_right: float = DEFAULT_CONFIG["THRESHOLD_LEFT_RIGHT"],  # 使用配置中的默认值
 92 |     threshold_cross: float = DEFAULT_CONFIG["THRESHOLD_CROSS"],  # 使用配置中的默认值
 93 |     upload_images: bool = False,
 94 |     output_md_path: Optional[str] = None,
 95 |     api_key: Optional[str] = None,
 96 |     base_url: Optional[str] = None,  # 从config中获取，不设默认值
 97 | ) -> Union[str, List[str]]:
 98 |     """
 99 |     将PDF文档转换为Markdown
100 |     
101 |     Args:
102 |         pdf_path: PDF文件路径
103 |         output_dir: 输出目录路径，默认为"output"
104 |         start_page: 起始页码（从0开始），默认为0
105 |         end_page: 结束页码（包含），如果为None则处理所有页面
106 |         dpi: PDF转图像的分辨率，默认为300
107 |         threshold_left_right: 判定左右栏的阈值，默认为0.9
108 |         threshold_cross: 判定跨栏的阈值，默认为0.3
109 |         upload_images: 是否上传图片，默认为False
110 |         output_md_path: Markdown输出文件路径，如果为None则不保存文件
111 |         api_key: API密钥，可选，默认从config获取
112 |         base_url: API基础URL，可选，默认从config获取
113 |         
114 |     Returns:
115 |         如果提供了output_md_path，返回保存的文件路径；否则返回Markdown内容的列表
116 |     """
117 |     # 更新配置
118 |     config_updates = {}
119 |     if api_key:
120 |         config_updates["API_KEY"] = api_key
121 |     if base_url:
122 |         config_updates["BASE_URL"] = base_url
123 |     if dpi and dpi != DEFAULT_CONFIG["DEFAULT_DPI"]:
124 |         config_updates["DEFAULT_DPI"] = dpi
125 |     if threshold_left_right is not None and threshold_left_right != DEFAULT_CONFIG["THRESHOLD_LEFT_RIGHT"]:
126 |         config_updates["THRESHOLD_LEFT_RIGHT"] = threshold_left_right
127 |     if threshold_cross is not None and threshold_cross != DEFAULT_CONFIG["THRESHOLD_CROSS"]:
128 |         config_updates["THRESHOLD_CROSS"] = threshold_cross
129 |     
130 |     if config_updates:
131 |         update_config(config_updates)
132 |     
133 |     # 处理PDF
134 |     regions = process_pdf_document(
135 |         pdf_path=pdf_path,
136 |         output_dir=output_dir,
137 |         start_page=start_page,
138 |         end_page=end_page,
139 |         dpi=dpi,
140 |         threshold_left_right=threshold_left_right,
141 |         threshold_cross=threshold_cross,
142 |     )
143 | 
144 |     # 初始化图片上传器（如果需要）
145 |     image_uploader = None
146 |     if upload_images:
147 |         image_uploader = default_uploader
148 | 
149 |     # 格式化结果，传递输出目录
150 |     formatted_pages = format_pdf_regions(regions, image_uploader, output_dir=output_dir)
151 |     
152 |     # 创建输出目录（如果需要）
153 |     if output_md_path:
154 |         output_dir = os.path.dirname(os.path.abspath(output_md_path))
155 |         if output_dir and not os.path.exists(output_dir):
156 |             os.makedirs(output_dir, exist_ok=True)
157 |             
158 |         # 保存为Markdown文件
159 |         with open(output_md_path, "w", encoding="utf-8") as f:
160 |             f.write("\n\n---\n\n".join(formatted_pages))
161 |         
162 |         # 输出处理统计
163 |         total_pages = len(regions)
164 |         total_regions = sum(len(page_regions) for page_regions in regions)
165 |         print(f"处理完成！共处理 {total_pages} 页，生成 {total_regions} 个区域图片")
166 |         print(f"Markdown文件已保存到: {output_md_path}")
167 |         
168 |         return output_md_path
169 |     
170 |     # 如果没有指定输出路径，则直接返回格式化后的内容
171 |     return formatted_pages
172 | 
173 | 
def main():
    """Command-line entry point: parse arguments, update the config and run the conversion."""
    parser = argparse.ArgumentParser(description="PDF文档处理工具")
    parser.add_argument("-p", "--pdf", required=True, help="输入PDF文件路径")
    parser.add_argument("-o", "--output", default="output", help="输出目录路径")
    parser.add_argument(
        "-s", "--start_page", type=int, default=0, help="起始页码（从0开始）"
    )
    parser.add_argument("-e", "--end_page", type=int, default=None, help="结束页码")
    parser.add_argument("-d", "--dpi", type=int, default=DEFAULT_CONFIG["DEFAULT_DPI"],
                        help=f"图像分辨率，默认为{DEFAULT_CONFIG['DEFAULT_DPI']}")
    parser.add_argument("--threshold_lr", type=float, default=DEFAULT_CONFIG["THRESHOLD_LEFT_RIGHT"],
                        help=f"左右栏阈值，默认为{DEFAULT_CONFIG['THRESHOLD_LEFT_RIGHT']}")
    parser.add_argument("--threshold_cross", type=float, default=DEFAULT_CONFIG["THRESHOLD_CROSS"],
                        help=f"跨栏阈值，默认为{DEFAULT_CONFIG['THRESHOLD_CROSS']}")
    # NOTE(review): filter_regions is parsed but not consumed anywhere in this
    # function; kept so existing command lines keep working.
    parser.add_argument(
        "--no-filter", action="store_false", dest="filter_regions", help="不过滤区域"
    )
    parser.add_argument("--upload", action="store_true", help="启用图片上传")
    parser.add_argument("--output-md", type=str, default="output.md", help="Markdown输出文件路径")

    # API and model configuration options
    parser.add_argument("--api-key", type=str, help="API密钥")
    parser.add_argument("--base-url", type=str, default=DEFAULT_CONFIG["BASE_URL"],
                        help=f"API基础URL，默认为{DEFAULT_CONFIG['BASE_URL']}")
    parser.add_argument("--formula-model", type=str, default=DEFAULT_CONFIG["FORMULA_MODEL"],
                        help=f"公式识别模型名称，默认为{DEFAULT_CONFIG['FORMULA_MODEL']}")
    parser.add_argument("--ocr-det-model", type=str, default=DEFAULT_CONFIG["OCR_DET_MODEL"],
                        help=f"OCR检测模型名称，默认为{DEFAULT_CONFIG['OCR_DET_MODEL']}")
    parser.add_argument("--ocr-rec-model", type=str, default=DEFAULT_CONFIG["OCR_REC_MODEL"],
                        help=f"OCR识别模型名称，默认为{DEFAULT_CONFIG['OCR_REC_MODEL']}")
    parser.add_argument("--layout-model", type=str, default=DEFAULT_CONFIG["LAYOUT_MODEL"],
                        help=f"版面分析模型名称，默认为{DEFAULT_CONFIG['LAYOUT_MODEL']}")
    parser.add_argument("--vlm-model", type=str, default=DEFAULT_CONFIG["VLM_MODEL"],
                        help=f"视觉语言模型名称，默认为{DEFAULT_CONFIG['VLM_MODEL']}")

    args = parser.parse_args()

    # Only model options that differ from the defaults are written to the
    # config. VLM_MODEL is included so --vlm-model actually takes effect.
    model_options = {
        "FORMULA_MODEL": args.formula_model,
        "OCR_DET_MODEL": args.ocr_det_model,
        "OCR_REC_MODEL": args.ocr_rec_model,
        "LAYOUT_MODEL": args.layout_model,
        "VLM_MODEL": args.vlm_model,
    }
    config_updates = {
        key: value
        for key, value in model_options.items()
        if value != DEFAULT_CONFIG[key]
    }

    if config_updates:
        update_config(config_updates)

    # Run the conversion
    convert_pdf_to_markdown(
        pdf_path=args.pdf,
        output_dir=args.output,
        start_page=args.start_page,
        end_page=args.end_page,
        dpi=args.dpi,
        threshold_left_right=args.threshold_lr,
        threshold_cross=args.threshold_cross,
        upload_images=args.upload,
        output_md_path=args.output_md,
        api_key=args.api_key,
        base_url=args.base_url
    )


if __name__ == "__main__":
    main()


--------------------------------------------------------------------------------
/x_pdf2md/pdf_utils/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | PDF处理相关工具
3 | """
4 | 


--------------------------------------------------------------------------------
/x_pdf2md/pdf_utils/pdf_to_image.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | import pdfplumber
  4 | from PIL import Image
  5 | from pathlib import Path
  6 | from tqdm import tqdm
  7 | 
  8 | def pdf_page_to_image(pdf_path, page_number, output_path, dpi=300):
  9 |     """
 10 |     将PDF中的指定页面提取为高分辨率图片。
 11 |     
 12 |     参数:
 13 |         pdf_path (str): PDF文件路径
 14 |         page_number (int): 要提取的页码（从0开始索引）
 15 |         output_path (str): 输出图片的保存路径
 16 |         dpi (int): 分辨率（每英寸点数），数值越高质量越好
 17 |     
 18 |     返回:
 19 |         str: 已保存图片的路径
 20 |     """
 21 |     try:
 22 |         # 创建输出目录（如果不存在）
 23 |         os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True)
 24 |         
 25 |         # 使用pdfplumber打开PDF
 26 |         with pdfplumber.open(pdf_path) as pdf:
 27 |             # 检查页码是否有效
 28 |             if page_number < 0 or page_number >= len(pdf.pages):
 29 |                 raise ValueError(f"页码 {page_number} 超出范围。PDF共有 {len(pdf.pages)} 页。")
 30 |             
 31 |             # 获取指定页面
 32 |             page = pdf.pages[page_number]
 33 |             
 34 |             # 将页面转换为图像
 35 |             img = page.to_image(resolution=dpi)
 36 |             
 37 |             # 保存图像
 38 |             img.save(output_path, format="PNG")
 39 |         
 40 |         return output_path
 41 |     
 42 |     except Exception as e:
 43 |         print(f"提取PDF页面时出错: {e}")
 44 |         return None
 45 | 
 46 | def pdf_to_images(pdf_path, output_dir, start_page=0, end_page=None, dpi=300):
 47 |     """
 48 |     将PDF文件转换为一系列图像
 49 |     
 50 |     参数:
 51 |         pdf_path (str): PDF文件路径
 52 |         output_dir (str): 输出图像的目录
 53 |         start_page (int): 起始页码（从0开始索引）
 54 |         end_page (int): 结束页码（包含），如果为None则处理所有页面
 55 |         dpi (int): 分辨率
 56 |     
 57 |     返回:
 58 |         list: 已生成图像的路径列表
 59 |     """
 60 |     # 创建输出目录
 61 |     os.makedirs(output_dir, exist_ok=True)
 62 |     
 63 |     # 获取PDF文件名（不含扩展名）
 64 |     pdf_name = Path(pdf_path).stem
 65 |     
 66 |     try:
 67 |         # 使用pdfplumber获取PDF总页数
 68 |         with pdfplumber.open(pdf_path) as pdf:
 69 |             total_pages = len(pdf.pages)
 70 |         
 71 |         # 如果未指定结束页码，则处理所有页面
 72 |         if end_page is None:
 73 |             end_page = total_pages - 1
 74 |         
 75 |         # 验证页码范围
 76 |         if start_page < 0 or start_page >= total_pages:
 77 |             raise ValueError(f"起始页码 {start_page} 无效。PDF共有 {total_pages} 页。")
 78 |         
 79 |         if end_page < start_page or end_page >= total_pages:
 80 |             raise ValueError(f"结束页码 {end_page} 无效。PDF共有 {total_pages} 页。")
 81 |         
 82 |         # 存储图像路径
 83 |         image_paths = []
 84 |         
 85 |         # 处理每个页面
 86 |         page_range = range(start_page, end_page + 1)
 87 |         for page_num in tqdm(page_range, desc="转换PDF页面为图像"):
 88 |             # 设置输出图像路径
 89 |             output_image = os.path.join(output_dir, f"{pdf_name}_page_{page_num+1}.png")
 90 |             
 91 |             # 转换页面为图像
 92 |             result = pdf_page_to_image(pdf_path, page_num, output_image, dpi)
 93 |             
 94 |             if result:
 95 |                 image_paths.append(result)
 96 |         
 97 |         return image_paths
 98 |         
 99 |     except Exception as e:
100 |         print(f"处理PDF时出错: {e}")
101 |         return []
102 | 
103 | 
# Keep this section if command-line use is needed; otherwise it can be removed.
if __name__ == "__main__":
    
    pdf_path = "./test_x_pdf2md.pdf"
    output_dir = "./output"
    # Convert the PDF into one image per page.
    image_paths = pdf_to_images(
        pdf_path=pdf_path,
        output_dir=output_dir,
        dpi=300
    )


--------------------------------------------------------------------------------
/x_pdf2md/pdf_utils/test_x_pdf2md.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/li-xiu-qi/x-pdf2md/ef69a42c2b8ca9da128762fe2bda03bd909cbee0/x_pdf2md/pdf_utils/test_x_pdf2md.pdf


--------------------------------------------------------------------------------
/x_pdf2md/remote_image/__init__.py:
--------------------------------------------------------------------------------
 1 | from x_pdf2md.remote_image.image_uploader import ImageUploader
 2 | from x_pdf2md.remote_image.remote_image_config import BASE_URL
 3 | 
 4 | 
# Create the default uploader instance, pointed at the configured BASE_URL.
default_uploader = ImageUploader(BASE_URL)

# Export the commonly used interface.
__all__ = ['default_uploader']
10 | 


--------------------------------------------------------------------------------
/x_pdf2md/remote_image/image_names.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "d8cde17d3272483aadffee7942fa5a12.png": "文档飞舞.png",
 3 |   "d8b83c8b5db34b9abe81358248e01feb.png": "car.png",
 4 |   "26aec496c5b04c08b5b234657d332201.png": "car.png",
 5 |   "6440738c700d4a638af2e3bdb46d2b18.png": "3_image_0.9310.png",
 6 |   "ce61d84ea087494aba596334712a9d16.png": "1_image_0.8737.png",
 7 |   "d5263f6aab80473f9acb7b44da3ad13d.png": "3_image_0.9332.png",
 8 |   "d11d01ef03c44c0897fcaba132d76b15.png": "1_chart_0.6732.png",
 9 |   "14c685439a234a7992584ffa45baa00c.png": "6_image_0.9474.png",
10 |   "0bc2b863d25f462a9be99c8b59a94278.png": "1_image_0.8843.png",
11 |   "fc5b2e8a370343349efb19567176eba4.png": "5_image_0.9802.png",
12 |   "77a78431422645afa2438a6185f538ad.png": "1_image_0.9368.png",
13 |   "e2c8f97af1964083be85e9accad22d74.png": "3_chart_0.9602.png",
14 |   "e2f1c3479ea2451686b014935da352dc.png": "3_image_0.9310.png",
15 |   "a6a4b1ba811e461fbf9db0f5568544ae.png": "1_image_0.8737.png",
16 |   "14d9aac105404d909196c67b3db7d39f.png": "3_image_0.9332.png",
17 |   "0486c1d0971144e182563f5e468d7659.png": "1_chart_0.6732.png",
18 |   "796c9840eef843deaa8b0b2a10bdee00.png": "6_image_0.9474.png",
19 |   "acc78019636545ce914c28e66c270901.png": "1_image_0.8843.png",
20 |   "f9653e02835845f089df2bf58afe7665.png": "5_image_0.9802.png",
21 |   "b34f44dd026447f59e4ef6551a825d90.png": "1_image_0.9368.png",
22 |   "eac2252e2a2e4614bf9f4ddfe4d258a9.png": "3_chart_0.9602.png",
23 |   "00985878a4ea4bfcbfc57f37e294a29a.png": "2_image_0.9310.png",
24 |   "d7099bd9820c4785b2934d690f863168.png": "0_image_0.8737.png",
25 |   "afe096a66c224bcdbe153d4cf1289db1.png": "2_image_0.9332.png",
26 |   "44ce898afe704f59a90fe69a876f9450.png": "0_chart_0.6732.png",
27 |   "2a6ab8e378cf4a8ebdc028b41af25781.png": "5_image_0.9474.png",
28 |   "7c9f474ff43b48118601e031de081313.png": "0_image_0.8843.png",
29 |   "baf9911a9c244a6d938484aa46e22807.png": "4_image_0.9802.png",
30 |   "f7e6d0ebcd4e4deabb6d62cc8877e216.png": "0_image_0.9368.png",
31 |   "075771b951fe45e992da22b6281405a0.png": "2_chart_0.9602.png",
32 |   "d335ddab7ee1446e93c9fdbce532586c.png": "2_image_0.9310.png",
33 |   "e328186431ef4257925e29ae93bd85e7.png": "0_image_0.8737.png",
34 |   "668fb1eef5e64611980a016f2d060b43.png": "2_image_0.9332.png",
35 |   "e96fc48547b743dda00842d5ecd2f166.png": "0_chart_0.6732.png",
36 |   "569fd6ad0fec40318d1aa92c6b329827.png": "5_image_0.9474.png",
37 |   "3b65c3ce1bf140e48994782b1b15b52f.png": "0_image_0.8843.png",
38 |   "4b62cdb34c2a478d97d7f5417091edba.png": "4_image_0.9802.png",
39 |   "1a93ac8922714d7194b65d9cffddeb34.png": "0_image_0.9368.png",
40 |   "cdc04e2d086f450a87f882e40fdef62f.png": "2_chart_0.9602.png",
41 |   "09430e275628438c8b7bc1db393b5ceb.png": "2_image_0.9310.png",
42 |   "e6f5a4dbe62645fbb719c37c33aaadf0.png": "0_image_0.8737.png",
43 |   "76ec5a3704a2448a9d5242de4389d5f6.png": "2_image_0.9332.png",
44 |   "f76f60e9e5d448bdb5481c184275614e.png": "0_chart_0.6732.png",
45 |   "bc0614bfd5a34cd8b0fb626ab2c797b2.png": "5_image_0.9474.png",
46 |   "44878ac3ab6d41068fbe1bde4d6f89d3.png": "0_image_0.8843.png",
47 |   "7d0d7929c95544bdb87e8fc1f367664d.png": "4_image_0.9802.png",
48 |   "f2e6caad04194952a9f096fd0e73659e.png": "0_image_0.9368.png",
49 |   "8e7465cee34c4c458b04d885190f9b27.png": "2_chart_0.9602.png",
50 |   "55c8f8af6074461eb4e19c849e144aaa.png": "2_image_0.9310.png",
51 |   "7521553b2530417a807dd3bb6b1d4c68.png": "0_image_0.8737.png",
52 |   "159b393f626c4177b5a28a6bce955c50.png": "2_image_0.9332.png",
53 |   "a418628c21c6469698cda3035e879233.png": "0_chart_0.6732.png"
54 | }


--------------------------------------------------------------------------------
/x_pdf2md/remote_image/image_serve.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import json
 3 | from fastapi import FastAPI, UploadFile, Query
 4 | from fastapi.staticfiles import StaticFiles
 5 | from fastapi.responses import FileResponse, JSONResponse
 6 | import uuid
 7 | from typing import List
 8 | from remote_image_config import UPLOAD_DIR, BASE_URL, HOST, PORT, IMAGE_NAMES_FILE
 9 | 
# Make sure the upload directory exists before it is mounted below.
if not os.path.exists(UPLOAD_DIR):
    os.makedirs(UPLOAD_DIR)

app = FastAPI()

# Mount the static file directories: uploaded images under /images and the
# HTML pages under /static.
# NOTE(review): "static" is a relative path, so it presumably resolves against
# the process working directory — confirm how the server is launched.
app.mount("/images", StaticFiles(directory=UPLOAD_DIR), name="images")
app.mount("/static", StaticFiles(directory="static"), name="static")
18 | 
@app.get("/")
async def root():
    """Serve the navigation page ``static/index.html`` at the site root."""
    return FileResponse("static/index.html")
22 | 
@app.get("/health")
async def health_check():
    """Health check endpoint; always answers with ``{"status": "ok"}``."""
    return {"status": "ok"}
27 | 
# Load the stored-name -> original-name mapping from disk, if the file exists.
# The dict stays empty otherwise and is filled in by upload_image.
image_names = {}
if os.path.exists(IMAGE_NAMES_FILE):
    with open(IMAGE_NAMES_FILE, 'r', encoding='utf-8') as f:
        image_names = json.load(f)
33 | 
@app.post("/image_upload")
async def upload_image(file: UploadFile):
    """
    Image upload endpoint.

    Stores the uploaded file in ``UPLOAD_DIR`` under a random hex name that
    keeps the original extension, records the stored-name -> original-name
    pair in ``image_names`` and rewrites ``IMAGE_NAMES_FILE`` with it.

    Returns:
        A dict with ``url`` (path relative to the site root, without
        BASE_URL) and ``originalName``; on any failure a 500 JSON response
        carrying an ``error`` message.
    """
    try:
        original_filename = file.filename
        _, suffix = os.path.splitext(original_filename)
        unique_filename = f"{uuid.uuid4().hex}{suffix}"

        # The directory may have been removed since startup; recreate it.
        os.makedirs(UPLOAD_DIR, exist_ok=True)

        # Persist the uploaded bytes.
        payload = await file.read()
        with open(os.path.join(UPLOAD_DIR, unique_filename), "wb") as target:
            target.write(payload)

        # Remember the original name and write the whole mapping back to disk.
        image_names[unique_filename] = original_filename
        with open(IMAGE_NAMES_FILE, 'w', encoding='utf-8') as mapping_file:
            json.dump(image_names, mapping_file, ensure_ascii=False, indent=2)

        # The URL is relative on purpose; clients prepend their own base URL.
        return {"url": f"images/{unique_filename}", "originalName": original_filename}
    except Exception as exc:
        failure = {"error": f"Upload failed: {str(exc)}"}
        return JSONResponse(status_code=500, content=failure)
65 | 
@app.get("/api/images")
async def list_images(page: int = Query(default=1, ge=1), page_size: int = Query(default=20, ge=1, le=100)):
    """
    Return one page of the files stored in ``UPLOAD_DIR``.

    Files are ordered by ``os.path.getctime`` in descending order. Each entry
    carries the stored file name and the original name recorded in
    ``image_names`` (falling back to the stored name).

    Returns:
        A dict with ``images``, ``totalPages``, ``currentPage`` and ``total``.
    """
    def _stored_path(name):
        return os.path.join(UPLOAD_DIR, name)

    stored_files = sorted(
        (name for name in os.listdir(UPLOAD_DIR) if os.path.isfile(_stored_path(name))),
        key=lambda name: os.path.getctime(_stored_path(name)),
        reverse=True,
    )

    # Pagination: round the page count up when the last page is partial.
    total = len(stored_files)
    total_pages, leftover = divmod(total, page_size)
    if leftover:
        total_pages += 1

    offset = (page - 1) * page_size
    window = stored_files[offset:min(offset + page_size, total)]

    return {
        "images": [
            {"filename": name, "originalName": image_names.get(name, name)}
            for name in window
        ],
        "totalPages": total_pages,
        "currentPage": page,
        "total": total
    }
91 | 
# Start the server with uvicorn when this file is run as a script.
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host=HOST, port=PORT)
95 | 


--------------------------------------------------------------------------------
/x_pdf2md/remote_image/image_uploader.py:
--------------------------------------------------------------------------------
  1 | from .remote_image_config import BASE_URL, IMAGE_SERVER
  2 | import requests
  3 | from typing import Optional, Tuple
  4 | import logging
  5 | from requests.adapters import HTTPAdapter
  6 | from urllib3.util.retry import Retry
  7 | import os
  8 | 
  9 | logger = logging.getLogger(__name__)
 10 | 
 11 | class ImageUploader:
 12 |     """图片上传处理类"""
 13 | 
 14 |     def __init__(self, server_url: str = None, max_retries: int = 3, timeout: int = 10):
 15 |         """
 16 |         初始化图片上传器
 17 |         
 18 |         参数:
 19 |             server_url: 图片服务器的URL，如果不提供则使用配置文件中的设置
 20 |             max_retries: 最大重试次数
 21 |             timeout: 请求超时时间(秒)
 22 |         """
 23 |         self.server_url = server_url or IMAGE_SERVER['base_url']
 24 |         self.server_url = self.server_url.rstrip('/')
 25 |         self.timeout = timeout
 26 |         
 27 |         # 配置重试策略
 28 |         self.session = requests.Session()
 29 |         retries = Retry(
 30 |             total=max_retries,
 31 |             backoff_factor=0.5,
 32 |             status_forcelist=[502, 503, 504]
 33 |         )
 34 |         self.session.mount('http://', HTTPAdapter(max_retries=retries))
 35 |         self.session.mount('https://', HTTPAdapter(max_retries=retries))
 36 |         
 37 |     def get_absolute_url(self, relative_path: str) -> Optional[str]:
 38 |         """
 39 |         将相对路径转换为完整的URL地址
 40 |         
 41 |         参数:
 42 |             relative_path: 图片的相对路径
 43 |             
 44 |         返回:
 45 |             str: 完整的URL地址
 46 |             None: 如果输入路径为None
 47 |         """
 48 |         if relative_path is None:
 49 |             return None
 50 |         if relative_path.startswith(('http://', 'https://')):
 51 |             return relative_path
 52 |         return f"{self.server_url}/{relative_path.lstrip('/')}"
 53 |     
 54 |     def upload(self, image_path: str) -> Optional[str]:
 55 |         """上传图片到服务器"""
 56 |         try:
 57 |             # 检查文件是否存在
 58 |             if not os.path.exists(image_path):
 59 |                 logger.error(f"文件不存在: {image_path}")
 60 |                 return None
 61 | 
 62 |             # 获取文件名
 63 |             filename = os.path.basename(image_path)
 64 |             
 65 |             with open(image_path, 'rb') as f:
 66 |                 # 使用元组格式指定文件名
 67 |                 files = {
 68 |                     'file': (filename, f, 'image/jpeg')  
 69 |                 }
 70 |                 logger.info(f"正在上传文件 {image_path} 到 {self.server_url}/image_upload")
 71 |                 response = self.session.post(
 72 |                     f"{self.server_url}/image_upload",
 73 |                     files=files,
 74 |                     timeout=self.timeout
 75 |                 )
 76 |                 
 77 |             if response.status_code == 200:
 78 |                 result = response.json()
 79 |                 url = result.get('url')
 80 |                 if url:
 81 |                     # 服务器返回的是相对路径 'images/xxx.jpg'，需要拼接完整URL
 82 |                     absolute_url = f"{self.server_url}/{url}"
 83 |                     logger.info(f"上传成功，URL: {absolute_url}")
 84 |                     return absolute_url
 85 |                 else:
 86 |                     logger.error("服务器返回的URL为空")
 87 |                     return None
 88 |             else:
 89 |                 logger.error(
 90 |                     f"上传失败: HTTP {response.status_code}\n"
 91 |                     f"响应内容: {response.text}\n"
 92 |                     f"请求URL: {self.server_url}/image_upload"
 93 |                 )
 94 |                 return None
 95 |                 
 96 |         except requests.exceptions.ConnectionError as e:
 97 |             logger.error(f"服务器连接失败: {str(e)}")
 98 |             return None
 99 |         except requests.exceptions.Timeout as e:
100 |             logger.error(f"请求超时: {str(e)}")
101 |             return None
102 |         except Exception as e:
103 |             logger.error(f"上传图片时发生错误: {str(e)}")
104 |             return None
105 | 
106 |     def check_server(self) -> Tuple[bool, str]:
107 |         """检查服务器是否可用"""
108 |         try:
109 |             response = self.session.get(f"{self.server_url}/health", timeout=self.timeout)
110 |             if response.status_code == 200:
111 |                 return True, "服务器运行正常"
112 |             return False, f"服务器返回异常状态码: {response.status_code}"
113 |         except requests.exceptions.ConnectionError:
114 |             return False, f"无法连接到服务器 {self.server_url}"
115 |         except Exception as e:
116 |             return False, f"检查服务器时发生错误: {str(e)}"
117 | 
118 | 
# Manual smoke test: uploads car.png from the current working directory.
if __name__ == "__main__":
    # Send log records to the console.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s'
    )

    # The server address is given explicitly instead of relying on the config.
    uploader = ImageUploader(server_url="http://localhost:8100")
    print(f"使用服务器地址: {uploader.server_url}")

    # Check the server first; there is no point uploading if it is down.
    status, message = uploader.check_server()
    if not status:
        print(f"服务器检查失败: {message}")
        # SystemExit works without the interactive `exit` helper that the
        # `site` module injects.
        raise SystemExit(1)

    print("服务器连接正常，开始上传图片...")
    image_url = uploader.upload("car.png")
    if image_url:
        print(f"图片上传成功，URL为: {image_url}")
    else:
        print("图片上传失败，请检查日志获取详细信息")


--------------------------------------------------------------------------------
/x_pdf2md/remote_image/remote_image_config.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | # 服务器配置
 4 | HOST = "0.0.0.0"
 5 | PORT = 8100
 6 | BASE_URL = f"http://{HOST}:{PORT}"
 7 | 
 8 | # 上传配置
 9 | UPLOAD_DIR = os.path.join(os.path.dirname(__file__), "upload_images")
10 | if not os.path.exists(UPLOAD_DIR):
11 |     os.makedirs(UPLOAD_DIR)
12 | 
13 | # 图片名称映射文件路径
14 | IMAGE_NAMES_FILE = os.path.join(os.path.dirname(__file__), "image_names.json")
15 | 
16 | # 图片服务器配置
17 | IMAGE_SERVER = {
18 |     "base_url": BASE_URL,
19 |     'timeout': 10,
20 |     'max_retries': 3
21 | }
22 | 


--------------------------------------------------------------------------------
/x_pdf2md/remote_image/static/index.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |     <title>图片服务导航</title>
 5 |     <style>
 6 |         body {
 7 |             font-family: Arial, sans-serif;
 8 |             max-width: 800px;
 9 |             margin: 40px auto;
10 |             padding: 0 20px;
11 |         }
12 |         .nav-card {
13 |             background: #f5f5f5;
14 |             border-radius: 8px;
15 |             padding: 20px;
16 |             margin: 20px 0;
17 |             transition: transform 0.2s;
18 |         }
19 |         .nav-card:hover {
20 |             transform: translateY(-2px);
21 |             box-shadow: 0 4px 8px rgba(0,0,0,0.1);
22 |         }
23 |         h1 { color: #333; text-align: center; }
24 |         .nav-grid {
25 |             display: grid;
26 |             grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
27 |             gap: 20px;
28 |             margin-top: 30px;
29 |         }
30 |         a {
31 |             text-decoration: none;
32 |             color: inherit;
33 |         }
34 |         .nav-card h2 {
35 |             margin-top: 0;
36 |             color: #2196f3;
37 |         }
38 |         .nav-card p {
39 |             color: #666;
40 |             margin-bottom: 0;
41 |         }
42 |     </style>
43 | </head>
44 | <body>
45 |     <h1>图片服务中心</h1>
46 |     <div class="nav-grid">
47 |         <a href="/static/upload.html">
48 |             <div class="nav-card">
49 |                 <h2>上传图片</h2>
50 |                 <p>上传新的图片到服务器</p>
51 |             </div>
52 |         </a>
53 |         <a href="/static/list.html">
54 |             <div class="nav-card">
55 |                 <h2>图片列表</h2>
56 |                 <p>浏览所有已上传的图片</p>
57 |             </div>
58 |         </a>
59 |     </div>
60 | </body>
61 | </html>
62 | 


--------------------------------------------------------------------------------
/x_pdf2md/remote_image/static/list.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html>
 3 | <head>
 4 |     <title>图片列表</title>
 5 |     <style>
 6 |         body {
 7 |             font-family: Arial, sans-serif;
 8 |             max-width: 1200px;
 9 |             margin: 40px auto;
10 |             padding: 0 20px;
11 |         }
12 |         .image-grid {
13 |             display: grid;
14 |             grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
15 |             gap: 20px;
16 |             padding: 20px 0;
17 |         }
18 |         .image-item {
19 |             text-align: center;
20 |         }
21 |         .pagination {
22 |             text-align: center;
23 |             margin: 20px 0;
24 |         }
25 |         .back-link {
26 |             margin-bottom: 20px;
27 |         }
28 |     </style>
29 | </head>
30 | <body>
31 |     <div class="back-link">
        <a href="/">返回首页</a>
33 |     </div>
34 |     <h1>图片列表</h1>
35 |     <div id="imageGrid" class="image-grid"></div>
36 |     <div id="pagination" class="pagination"></div>
37 | 
38 |     <script>
39 |         async function loadImages(page = 1) {
40 |             try {
41 |                 const response = await fetch(`/api/images?page=${page}`);
42 |                 const data = await response.json();
43 |                 
44 |                 const imageGrid = document.getElementById('imageGrid');
45 |                 imageGrid.innerHTML = data.images.map(img => `
46 |                     <div class="image-item">
47 |                         <img src="/images/${img.filename}" alt="${img.originalName}" style="max-width: 200px; margin: 10px;">
48 |                         <br>
49 |                         <a href="/images/${img.filename}" title="${img.filename}">${img.originalName}</a>
50 |                     </div>
51 |                 `).join('');
52 | 
53 |                 const pagination = document.getElementById('pagination');
54 |                 pagination.innerHTML = createPaginationHTML(data.currentPage, data.totalPages);
55 |             } catch (error) {
56 |                 console.error('加载图片失败:', error);
57 |             }
58 |         }
59 | 
60 |         function createPaginationHTML(currentPage, totalPages) {
61 |             let html = '';
62 |             if (totalPages > 1) {
63 |                 if (currentPage > 1) {
64 |                     html += `<a href="#" onclick="loadImages(${currentPage-1})">上一页</a> `;
65 |                 }
66 |                 html += `第 ${currentPage}/${totalPages} 页 `;
67 |                 if (currentPage < totalPages) {
68 |                     html += `<a href="#" onclick="loadImages(${currentPage+1})">下一页</a>`;
69 |                 }
70 |             }
71 |             return html;
72 |         }
73 | 
74 |         // 初始加载
75 |         loadImages();
76 |     </script>
77 | </body>
78 | </html>
79 | 


--------------------------------------------------------------------------------
/x_pdf2md/remote_image/static/upload.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html>
  3 | <head>
  4 |     <title>上传图片</title>
  5 |     <style>
  6 |         body {
  7 |             font-family: Arial, sans-serif;
  8 |             max-width: 800px;
  9 |             margin: 40px auto;
 10 |             padding: 0 20px;
 11 |         }
 12 |         .upload-container {
 13 |             background: #f5f5f5;
 14 |             padding: 20px;
 15 |             border-radius: 8px;
 16 |             margin: 20px 0;
 17 |         }
 18 |         .upload-preview {
 19 |             max-width: 300px;
 20 |             margin: 20px 0;
 21 |             display: none;
 22 |         }
 23 |         .back-link {
 24 |             margin-bottom: 20px;
 25 |         }
 26 |         .result {
 27 |             margin-top: 20px;
 28 |             padding: 10px;
 29 |             display: none;
 30 |         }
 31 |         .success {
 32 |             background: #e8f5e9;
 33 |             border: 1px solid #a5d6a7;
 34 |         }
 35 |         .error {
 36 |             background: #ffebee;
 37 |             border: 1px solid #ffcdd2;
 38 |         }
 39 |     </style>
 40 | </head>
 41 | <body>
 42 |     <div class="back-link">
        <a href="/">返回首页</a>
 44 |     </div>
 45 |     <h1>上传图片</h1>
 46 |     <div class="upload-container">
 47 |         <form id="uploadForm">
 48 |             <input type="file" id="imageInput" accept="image/*" onchange="previewImage(event)">
 49 |             <button type="submit">上传</button>
 50 |         </form>
 51 |         <img id="preview" class="upload-preview">
 52 |         <div id="result" class="result"></div>
 53 |     </div>
 54 | 
 55 |     <script>
 56 |         function previewImage(event) {
 57 |             const preview = document.getElementById('preview');
 58 |             const file = event.target.files[0];
 59 |             const reader = new FileReader();
 60 | 
 61 |             reader.onload = function(e) {
 62 |                 preview.src = e.target.result;
 63 |                 preview.style.display = 'block';
 64 |             }
 65 | 
 66 |             if (file) {
 67 |                 reader.readAsDataURL(file);
 68 |             }
 69 |         }
 70 | 
 71 |         document.getElementById('uploadForm').onsubmit = async function(e) {
 72 |             e.preventDefault();
 73 |             const formData = new FormData();
 74 |             const fileInput = document.getElementById('imageInput');
 75 |             const resultDiv = document.getElementById('result');
 76 | 
 77 |             if (fileInput.files.length === 0) {
 78 |                 resultDiv.textContent = '请选择要上传的图片';
 79 |                 resultDiv.className = 'result error';
 80 |                 resultDiv.style.display = 'block';
 81 |                 return;
 82 |             }
 83 | 
 84 |             formData.append('file', fileInput.files[0]);
 85 | 
 86 |             try {
 87 |                 const response = await fetch('/image_upload', {
 88 |                     method: 'POST',
 89 |                     body: formData
 90 |                 });
 91 |                 const data = await response.json();
 92 |                 resultDiv.innerHTML = `上传成功！<br>
 93 |                     原始文件名: ${data.originalName}<br>
 94 |                     图片地址：<a href="${data.url}" target="_blank">${data.url}</a>`;
 95 |                 resultDiv.className = 'result success';
 96 |                 resultDiv.style.display = 'block';
 97 |             } catch (error) {
 98 |                 resultDiv.textContent = '上传失败：' + error.message;
 99 |                 resultDiv.className = 'result error';
100 |                 resultDiv.style.display = 'block';
101 |             }
102 |         };
103 |     </script>
104 | </body>
105 | </html>
106 | 


--------------------------------------------------------------------------------
/x_pdf2md/test_convert.py:
--------------------------------------------------------------------------------
"""
Manual test script that converts the sample PDF at x_pdf2md/tests/test_x_pdf2md.pdf.
Usage:
python -m x_pdf2md.test_convert
"""
 6 | 
 7 | import os
 8 | from pathlib import Path
 9 | 
10 | from x_pdf2md.convert import convert_pdf_to_markdown
11 | 
12 | 
def test_convert_pdf():
    """
    Convert the bundled sample PDF to Markdown.

    Reads ``tests/test_x_pdf2md.pdf`` next to this module and writes the
    result to ``output/test_result.md`` under the current working directory.

    Returns:
        ``False`` when the sample PDF is missing, otherwise whatever
        ``convert_pdf_to_markdown`` returns.
    """
    # Directory that contains this module (the x_pdf2md package directory).
    current_module_dir = os.path.dirname(os.path.abspath(__file__))

    # The sample PDF lives in the package's tests directory; resolving it from
    # the module directory avoids depending on the package directory's name.
    pdf_path = os.path.join(current_module_dir, "tests", "test_x_pdf2md.pdf")

    # Results go to ./output relative to where the script is started.
    output_dir = os.path.join(os.getcwd(), "output")

    # Make sure the sample PDF exists before doing any work.
    if not os.path.exists(pdf_path):
        print(f"错误：找不到测试PDF文件: {pdf_path}")
        # Name the file that is actually looked up, not a generic "test.pdf".
        print(f"请确保在 {os.path.dirname(pdf_path)} 目录中存在 {os.path.basename(pdf_path)} 文件")
        return False

    # Create the output directory if it does not exist yet.
    os.makedirs(output_dir, exist_ok=True)

    print(f"开始处理PDF文件: {pdf_path}")

    output_path = convert_pdf_to_markdown(
        pdf_path=pdf_path,
        output_dir=output_dir,
        start_page=0,
        end_page=None,  # process every page
        dpi=300,
        upload_images=False,  # keep images local by default
        output_md_path=os.path.join(output_dir, "test_result.md")
    )

    print(f"PDF转换成功！输出文件路径: {output_path}")
    return output_path
53 | 
54 | 
if __name__ == "__main__":
    # Run the conversion when this file is executed directly.

    test_convert_pdf()
59 | 
60 | 


--------------------------------------------------------------------------------
/x_pdf2md/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/li-xiu-qi/x-pdf2md/ef69a42c2b8ca9da128762fe2bda03bd909cbee0/x_pdf2md/tests/__init__.py


--------------------------------------------------------------------------------
/x_pdf2md/tests/test_x_pdf2md.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/li-xiu-qi/x-pdf2md/ef69a42c2b8ca9da128762fe2bda03bd909cbee0/x_pdf2md/tests/test_x_pdf2md.pdf


--------------------------------------------------------------------------------