├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── assets └── images │ └── d99084735737c77dc3d3304cb78a411f.png ├── requirements-gpu.txt ├── requirements.txt ├── setup.py └── x_pdf2md ├── .env.example ├── __init__.py ├── config.py ├── convert.py ├── image2md ├── __init__.py ├── car.png ├── get_image_title.py ├── image2text.py ├── prompts │ ├── description_prompt.md │ └── ocr_prompt.md └── vlm_function.py ├── image_utils ├── __init__.py ├── crop_text_areas.py ├── detect_and_sort.py ├── formula_recognize.py ├── image.png ├── layout_config.py ├── layout_detect.py ├── layout_sorter.py ├── layout_visualizer.py ├── models.py ├── process_page.py ├── region_image.py └── visualize_formula.py ├── markdown_formatter.py ├── ocr_utils ├── __init__.py ├── ocr_image.py ├── text_detection.py └── text_recogniize.py ├── pdf2md_converter.py ├── pdf_utils ├── __init__.py ├── pdf_to_image.py └── test_x_pdf2md.pdf ├── remote_image ├── __init__.py ├── image_names.json ├── image_serve.py ├── image_uploader.py ├── remote_image_config.py └── static │ ├── index.html │ ├── list.html │ └── upload.html ├── test_convert.py └── tests ├── __init__.py └── test_x_pdf2md.pdf /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.so 6 | .Python 7 | build/ 8 | develop-eggs/ 9 | dist/ 10 | downloads/ 11 | eggs/ 12 | .eggs/ 13 | lib/ 14 | lib64/ 15 | parts/ 16 | sdist/ 17 | var/ 18 | wheels/ 19 | *.egg-info/ 20 | .installed.cfg 21 | *.egg 22 | 23 | # Distribution / packaging 24 | .Python 25 | env/ 26 | build/ 27 | develop-eggs/ 28 | dist/ 29 | downloads/ 30 | eggs/ 31 | .eggs/ 32 | lib/ 33 | lib64/ 34 | parts/ 35 | sdist/ 36 | var/ 37 | wheels/ 38 | *.egg-info/ 39 | .installed.cfg 40 | *.egg 41 | 42 | # Virtual environments 43 | venv/ 44 | ENV/ 45 | env/ 46 | 47 | # IDEs and editors 48 | .idea/ 49 | .vscode/ 50 | *.swp 51 | *.swo 52 | .DS_Store 53 | 54 | # 环境变量文件 55 | .env 56 | 
.env.local 57 | .env.development.local 58 | .env.test.local 59 | .env.production.local 60 | 61 | # others 62 | *.log 63 | test_datas 64 | output.md 65 | output -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.5.0 4 | hooks: 5 | - id: trailing-whitespace 6 | - id: end-of-file-fixer 7 | - id: check-yaml 8 | - id: check-added-large-files 9 | 10 | - repo: https://github.com/psf/black 11 | rev: 24.2.0 12 | hooks: 13 | - id: black 14 | language_version: python3 15 | args: [--line-length=100] 16 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2025, li-xiu-qi 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright 13 | notice, this list of conditions and the following disclaimer in the 14 | documentation and/or other materials provided with the distribution. 15 | 16 | 3. Neither the name of the copyright holder nor the 17 | names of its contributors may be used to endorse or promote products 18 | derived from this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # x-pdf2md 2 | 3 | ![alt text](assets/images/d99084735737c77dc3d3304cb78a411f.png) 4 | 一个将PDF文档转换为Markdown的高级工具包,支持自动提取文本、识别公式、表格和图像。 5 | 6 | ## 功能特点 7 | 8 | - PDF文档页面转换为图像 9 | - 基于深度学习的版面分析 10 | - 数学公式识别并转换为LaTeX格式 11 | - 表格提取并转换为HTML格式 12 | - 图像自动通过多模态模型描述并上传到自定义的服务端 13 | - 多栏文本智能识别与重排版 14 | 15 | ## 安装 16 | 17 | ### 1. 
安装特殊依赖 18 | 19 | 本项目依赖于PaddlePaddle和PaddleX进行深度学习模型推理,这些依赖需要单独安装: 20 | 21 | #### CPU版本 22 | 23 | ```bash 24 | # 首先安装PaddlePaddle CPU版本 25 | pip install paddlepaddle==3.0.0rc0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/ 26 | 27 | # 然后安装PaddleX 28 | pip install https://paddle-model-ecology.bj.bcebos.com/paddlex/whl/paddlex-3.0.0rc0-py3-none-any.whl 29 | ``` 30 | 31 | #### GPU版本(CUDA 11.8) 32 | 33 | ```bash 34 | # 安装PaddlePaddle GPU版本 35 | pip install paddlepaddle-gpu==3.0.0rc0 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/ 36 | 37 | # 然后安装PaddleX 38 | pip install https://paddle-model-ecology.bj.bcebos.com/paddlex/whl/paddlex-3.0.0rc0-py3-none-any.whl 39 | ``` 40 | 41 | ### 安装开发依赖 42 | 43 | ```bash 44 | # 安装开发依赖 45 | pip install -r requirements.txt 46 | 47 | ``` 48 | 49 | #### 其他CUDA版本 50 | 51 | 如果需要支持其他CUDA版本,请参考[PaddlePaddle官方安装指南](https://www.paddlepaddle.org.cn/install/quick)选择合适的安装命令。 52 | 53 | ## 使用方法 54 | 55 | ### 作为Python包导入 56 | 57 | #### 快速转换方法 58 | 59 | ```python 60 | import os 61 | from pathlib import Path 62 | 63 | from x_pdf2md.convert import convert_pdf_to_markdown 64 | 65 | 66 | def test_convert_pdf(): 67 | """ 68 | 处理x_pdf2md/tests/test.pdf文件,并输出结果到output目录 69 | """ 70 | # 获取当前模块所在目录 71 | current_module_dir = os.path.dirname(os.path.abspath(__file__)) 72 | 73 | # 构建项目根目录路径 74 | project_root = os.path.dirname(current_module_dir) 75 | 76 | # 构建PDF文件路径(使用tests目录下的测试文件) 77 | pdf_path = os.path.join(project_root, "x_pdf2md", "tests", "test_x_pdf2md.pdf") 78 | 79 | # 构建输出目录路径 80 | output_dir = os.path.join(os.getcwd(), "output") 81 | 82 | # 确保PDF文件存在 83 | if not os.path.exists(pdf_path): 84 | print(f"错误:找不到测试PDF文件: {pdf_path}") 85 | print(f"请确保在 x_pdf2md/tests 目录中存在 test.pdf 文件") 86 | return False 87 | 88 | # 创建输出目录(如果不存在) 89 | os.makedirs(output_dir, exist_ok=True) 90 | 91 | print(f"开始处理PDF文件: {pdf_path}") 92 | 93 | try: 94 | # 调用转换函数 95 | output_path = convert_pdf_to_markdown( 96 | pdf_path=pdf_path, 97 | output_dir=output_dir, 98 | 
start_page=0, 99 | end_page=None, # 处理所有页面 100 | dpi=300, 101 | upload_images=False, # 默认不上传图片 102 | output_md_path=os.path.join(output_dir, "test_result.md") 103 | ) 104 | 105 | print(f"PDF转换成功!输出文件路径: {output_path}") 106 | return True 107 | except Exception as e: 108 | print(f"转换过程中出错: {str(e)}") 109 | return False 110 | 111 | 112 | if __name__ == "__main__": 113 | # 当直接运行此文件时执行转换 114 | test_convert_pdf() 115 | ``` 116 | 117 | #### 图片上传服务 118 | 119 | 启动本地图片上传服务器: 120 | 121 | ```bash 122 | # 进入项目目录 123 | cd x_pdf2md/remote_image 124 | 125 | # 启动服务 126 | python image_serve.py 127 | ``` 128 | 129 | 服务启动后,访问 可以使用Web界面上传和管理图片。 130 | 131 | #### 调用的时候可以传入default_uploader进行上传文件 132 | 133 | ```python 134 | from x_pdf2md.remote_image import default_uploader 135 | 136 | # 初始化图片上传器(如果需要) 137 | image_uploader = None 138 | if upload_images: 139 | image_uploader = default_uploader 140 | 141 | # 格式化结果,传递输出目录 142 | formatted_pages = format_pdf_regions(regions, image_uploader, output_dir=output_dir) 143 | 144 | # 创建输出目录(如果需要) 145 | if output_md_path: 146 | output_dir = os.path.dirname(os.path.abspath(output_md_path)) 147 | if output_dir and not os.path.exists(output_dir): 148 | os.makedirs(output_dir, exist_ok=True) 149 | 150 | # 保存为Markdown文件 151 | with open(output_md_path, "w", encoding="utf-8") as f: 152 | f.write("\n\n---\n\n".join(formatted_pages)) 153 | 154 | # 输出处理统计 155 | total_pages = len(regions) 156 | total_regions = sum(len(page_regions) for page_regions in regions) 157 | print(f"处理完成!共处理 {total_pages} 页,生成 {total_regions} 个区域图片") 158 | print(f"Markdown文件已保存到: {output_md_path}") 159 | 160 | return output_md_path 161 | ``` 162 | 163 | ## 开源协议 164 | 165 | 本项目使用 [BSD 开源协议](./LICENSE)。 166 | -------------------------------------------------------------------------------- /assets/images/d99084735737c77dc3d3304cb78a411f.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/li-xiu-qi/x-pdf2md/ef69a42c2b8ca9da128762fe2bda03bd909cbee0/assets/images/d99084735737c77dc3d3304cb78a411f.png -------------------------------------------------------------------------------- /requirements-gpu.txt: -------------------------------------------------------------------------------- 1 | # 项目依赖 2 | fastapi 3 | # pymupdf 4 | pdfplumber 5 | numpy 6 | openai 7 | opencv_contrib_python 8 | opencv_python 9 | opencv_python_headless 10 | 11 | # PaddlePaddle GPU版本 12 | # 等效于conda命令: conda install paddlepaddle-gpu==3.0.0rc1 paddlepaddle-cuda=12.3 -c paddle -c nvidia 13 | # CUDA 12.3版本 14 | paddlepaddle-gpu==3.0.0rc1 -i https://www.paddlepaddle.org.cn/packages/stable/cu123/ 15 | # 如需CUDA 11.8版本,请使用下面的命令替代上面的命令 16 | # paddlepaddle-gpu==3.0.0rc0 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/ 17 | 18 | https://paddle-model-ecology.bj.bcebos.com/paddlex/whl/paddlex-3.0.0rc0-py3-none-any.whl 19 | 20 | Pillow 21 | python-dotenv 22 | Requests 23 | tqdm 24 | urllib3 25 | uvicorn 26 | 27 | # 开发依赖 28 | pre-commit>=3.6.2 29 | black>=24.2.0 30 | isort>=5.13.2 31 | flake8>=7.0.0 32 | flake8-docstrings>=1.7.0 33 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # 项目依赖 2 | fastapi 3 | # pymupdf 4 | pdfplumber 5 | numpy 6 | openai 7 | opencv_contrib_python 8 | opencv_python 9 | opencv_python_headless 10 | # PaddlePaddle CPU版本(必须在paddlex之前安装) 11 | paddlepaddle==3.0.0rc0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/ 12 | https://paddle-model-ecology.bj.bcebos.com/paddlex/whl/paddlex-3.0.0rc0-py3-none-any.whl 13 | Pillow 14 | python-dotenv 15 | Requests 16 | tqdm 17 | urllib3 18 | uvicorn 19 | python-multipart 20 | 21 | 22 | # 开发依赖 23 | pre-commit>=3.6.2 24 | black>=24.2.0 25 | isort>=5.13.2 26 | flake8>=7.0.0 27 | flake8-docstrings>=1.7.0 28 | 
from setuptools import setup, find_packages

# Packaging metadata for the x-pdf2md distribution.
#
# PaddlePaddle and PaddleX are intentionally NOT listed in install_requires:
# they need a dedicated installation procedure (custom index / wheel URL),
# which is described in the README.
setup(
    name="x-pdf2md",
    version="0.1.0",
    # Explicitly include the x_pdf2md package and all of its sub-packages.
    packages=find_packages(include=['x_pdf2md', 'x_pdf2md.*']),
    install_requires=[
        "tqdm>=4.45.0",
        "pdf2image>=1.14.0",
        "Pillow>=8.0.0",
        "numpy>=1.18.0",
        "opencv-python>=4.5.0",
        "pytesseract>=0.3.0",
        "requests>=2.25.0",
        "fastapi",
        "pymupdf",
        "python-dotenv",
        "uvicorn",
        # x_pdf2md.image2md imports `openai` at module level, so the package
        # cannot be used for image description without it.
        "openai",
    ],
    author="li-xiu-qi",
    author_email="lixiuqixiaoke@qq.com",
    description="将PDF文档转换为Markdown的工具",
    keywords="pdf, markdown, conversion",
    url="",
    license="BSD",
    classifiers=[
        "Development Status :: 3 - Alpha",
        "Intended Audience :: Developers",
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: BSD License",
    ],
    entry_points={
        'console_scripts': [
            # The command-line `main()` lives in x_pdf2md/convert.py; the
            # package has no x_pdf2md/main.py, so the previous target
            # `x_pdf2md.main:main` made the installed script fail on start-up.
            'x-pdf2md=x_pdf2md.convert:main',
        ],
    },
    # x_pdf2md/image2md/image2text.py uses `str | None` in function
    # signatures without `from __future__ import annotations`, which is
    # only valid at runtime on Python 3.10 and newer.
    python_requires='>=3.10',
)
def get_config() -> Dict[str, Any]:
    """
    Return the effective configuration.

    The result is a fresh dictionary built from ``DEFAULT_CONFIG`` with every
    entry of the runtime overrides layered on top, so runtime values win when
    a key exists in both. Mutating the returned dictionary does not affect
    either source mapping.

    Returns:
        Dict: the merged configuration dictionary
    """
    # Later unpacking wins, so runtime overrides shadow the defaults.
    return {**DEFAULT_CONFIG, **_runtime_config}
def process_pdf_document(
    pdf_path: str,
    output_dir: str,
    start_page: int = 0,
    end_page: Optional[int] = None,
    dpi: int = 300,
    threshold_left_right: float = 0.9,
    threshold_cross: float = 0.3,
) -> List[List[RegionImage]]:
    """
    Render a PDF to page images, then run layout analysis on every page.

    Page images are written to ``<output_dir>/<pdf stem>_images`` and each
    page gets its own working directory ``<output_dir>/<pdf stem>_page_<n>``,
    both created on demand.

    Args:
        pdf_path: path of the PDF file
        output_dir: output directory (converted to an absolute path)
        start_page: first page to render, forwarded to ``pdf_to_images``
        end_page: last page to render, forwarded to ``pdf_to_images``;
            ``None`` is passed through unchanged
        dpi: rendering resolution forwarded to ``pdf_to_images``
        threshold_left_right: left/right column threshold forwarded to
            ``process_page_layout``
        threshold_cross: cross-column threshold forwarded to
            ``process_page_layout``

    Returns:
        List[List[RegionImage]]: one entry per rendered page image, each being
        whatever ``process_page_layout`` returned for that page
    """
    stem = Path(pdf_path).stem
    root_dir = os.path.abspath(output_dir)

    # Directory that receives the rendered page images.
    images_dir = os.path.join(root_dir, f"{stem}_images")
    os.makedirs(images_dir, exist_ok=True)

    print("正在将PDF转换为图像...")
    page_images = pdf_to_images(
        pdf_path=pdf_path,
        output_dir=images_dir,
        start_page=start_page,
        end_page=end_page,
        dpi=dpi,
    )

    print("正在分析和裁剪页面...")
    regions_per_page = []
    progress = tqdm(page_images, desc="处理页面")
    # NOTE(review): numbering always starts at 1 for the first rendered image,
    # regardless of start_page — confirm this is intended when start_page > 0.
    for number, page_image in enumerate(progress, start=1):
        work_dir = os.path.join(root_dir, f"{stem}_page_{number}")
        os.makedirs(work_dir, exist_ok=True)

        # Detect, sort and crop the regions of this page.
        regions_per_page.append(
            process_page_layout(
                image_path=page_image,
                output_dir=work_dir,
                page_number=number,
                threshold_left_right=threshold_left_right,
                threshold_cross=threshold_cross,
            )
        )

    return regions_per_page
threshold_cross: 判定跨栏的阈值,默认为0.3 113 | upload_images: 是否上传图片,默认为False 114 | output_md_path: Markdown输出文件路径,如果为None则不保存文件 115 | api_key: API密钥,可选,默认从config获取 116 | base_url: API基础URL,可选,默认从config获取 117 | 118 | Returns: 119 | 如果提供了output_md_path,返回保存的文件路径;否则返回Markdown内容的列表 120 | """ 121 | # 更新配置 122 | config_updates = {} 123 | if api_key: 124 | config_updates["API_KEY"] = api_key 125 | if base_url: 126 | config_updates["BASE_URL"] = base_url 127 | if dpi and dpi != DEFAULT_CONFIG["DEFAULT_DPI"]: 128 | config_updates["DEFAULT_DPI"] = dpi 129 | if threshold_left_right is not None and threshold_left_right != DEFAULT_CONFIG["THRESHOLD_LEFT_RIGHT"]: 130 | config_updates["THRESHOLD_LEFT_RIGHT"] = threshold_left_right 131 | if threshold_cross is not None and threshold_cross != DEFAULT_CONFIG["THRESHOLD_CROSS"]: 132 | config_updates["THRESHOLD_CROSS"] = threshold_cross 133 | 134 | if config_updates: 135 | update_config(config_updates) 136 | 137 | # 处理PDF 138 | regions = process_pdf_document( 139 | pdf_path=pdf_path, 140 | output_dir=output_dir, 141 | start_page=start_page, 142 | end_page=end_page, 143 | dpi=dpi, 144 | threshold_left_right=threshold_left_right, 145 | threshold_cross=threshold_cross, 146 | ) 147 | 148 | # 初始化图片上传器(如果需要) 149 | image_uploader = None 150 | if upload_images: 151 | image_uploader = default_uploader 152 | 153 | # 格式化结果,传递输出目录 154 | formatted_pages = format_pdf_regions(regions, image_uploader, output_dir=output_dir) 155 | 156 | # 创建输出目录(如果需要) 157 | if output_md_path: 158 | output_dir = os.path.dirname(os.path.abspath(output_md_path)) 159 | if output_dir and not os.path.exists(output_dir): 160 | os.makedirs(output_dir, exist_ok=True) 161 | 162 | # 保存为Markdown文件 163 | with open(output_md_path, "w", encoding="utf-8") as f: 164 | f.write("\n\n---\n\n".join(formatted_pages)) 165 | 166 | # 输出处理统计 167 | total_pages = len(regions) 168 | total_regions = sum(len(page_regions) for page_regions in regions) 169 | print(f"处理完成!共处理 {total_pages} 页,生成 {total_regions} 
def main(argv: Optional[List[str]] = None):
    """
    Command-line entry point: parse arguments, apply model overrides, convert.

    Model names given on the command line that differ from ``DEFAULT_CONFIG``
    are written into the runtime configuration before the conversion starts;
    everything else is forwarded to ``convert_pdf_to_markdown``.

    Args:
        argv: argument list to parse instead of ``sys.argv[1:]``. ``None``
            (the default) keeps the normal command-line behaviour.
    """
    parser = argparse.ArgumentParser(description="PDF文档处理工具")
    parser.add_argument("-p", "--pdf", required=True, help="输入PDF文件路径")
    parser.add_argument("-o", "--output", default="output", help="输出目录路径")
    parser.add_argument(
        "-s", "--start_page", type=int, default=0, help="起始页码(从0开始)"
    )
    parser.add_argument("-e", "--end_page", type=int, default=None, help="结束页码")
    parser.add_argument("-d", "--dpi", type=int, default=DEFAULT_CONFIG["DEFAULT_DPI"],
                        help=f"图像分辨率,默认为{DEFAULT_CONFIG['DEFAULT_DPI']}")
    parser.add_argument("--threshold_lr", type=float, default=DEFAULT_CONFIG["THRESHOLD_LEFT_RIGHT"],
                        help=f"左右栏阈值,默认为{DEFAULT_CONFIG['THRESHOLD_LEFT_RIGHT']}")
    parser.add_argument("--threshold_cross", type=float, default=DEFAULT_CONFIG["THRESHOLD_CROSS"],
                        help=f"跨栏阈值,默认为{DEFAULT_CONFIG['THRESHOLD_CROSS']}")
    # NOTE(review): args.filter_regions is parsed but not consumed anywhere in
    # this function; the flag is kept so existing command lines keep working.
    parser.add_argument(
        "--no-filter", action="store_false", dest="filter_regions", help="不过滤区域"
    )
    parser.add_argument("--upload", action="store_true", help="启用图片上传")
    parser.add_argument("--output-md", type=str, default="output.md", help="Markdown输出文件路径")

    # API and model configuration options.
    parser.add_argument("--api-key", type=str, help="API密钥")
    parser.add_argument("--base-url", type=str, default=DEFAULT_CONFIG["BASE_URL"],
                        help=f"API基础URL,默认为{DEFAULT_CONFIG['BASE_URL']}")
    parser.add_argument("--formula-model", type=str, default=DEFAULT_CONFIG["FORMULA_MODEL"],
                        help=f"公式识别模型名称,默认为{DEFAULT_CONFIG['FORMULA_MODEL']}")
    parser.add_argument("--ocr-det-model", type=str, default=DEFAULT_CONFIG["OCR_DET_MODEL"],
                        help=f"OCR检测模型名称,默认为{DEFAULT_CONFIG['OCR_DET_MODEL']}")
    parser.add_argument("--ocr-rec-model", type=str, default=DEFAULT_CONFIG["OCR_REC_MODEL"],
                        help=f"OCR识别模型名称,默认为{DEFAULT_CONFIG['OCR_REC_MODEL']}")
    parser.add_argument("--layout-model", type=str, default=DEFAULT_CONFIG["LAYOUT_MODEL"],
                        help=f"版面分析模型名称,默认为{DEFAULT_CONFIG['LAYOUT_MODEL']}")
    parser.add_argument("--vlm-model", type=str, default=DEFAULT_CONFIG["VLM_MODEL"],
                        help=f"多模态模型名称,默认为{DEFAULT_CONFIG['VLM_MODEL']}")

    args = parser.parse_args(argv)

    # Map each model config key to the value chosen on the command line.
    # VLM_MODEL is included here: previously --vlm-model was accepted but
    # never applied, so the flag silently had no effect.
    requested_models = {
        "FORMULA_MODEL": args.formula_model,
        "OCR_DET_MODEL": args.ocr_det_model,
        "OCR_REC_MODEL": args.ocr_rec_model,
        "LAYOUT_MODEL": args.layout_model,
        "VLM_MODEL": args.vlm_model,
    }
    # Only values that differ from the defaults become runtime overrides.
    config_updates = {
        key: value
        for key, value in requested_models.items()
        if value != DEFAULT_CONFIG[key]
    }

    if config_updates:
        update_config(config_updates)

    # Run the conversion with the parsed options.
    convert_pdf_to_markdown(
        pdf_path=args.pdf,
        output_dir=args.output,
        start_page=args.start_page,
        end_page=args.end_page,
        dpi=args.dpi,
        threshold_left_right=args.threshold_lr,
        threshold_cross=args.threshold_cross,
        upload_images=args.upload,
        output_md_path=args.output_md,
        api_key=args.api_key,
        base_url=args.base_url
    )
def get_image_title(
    image_description,
    api_key=None,
    base_url=None,
    model="deepseek-ai/DeepSeek-V3",
):
    """
    Generate a short title for an image from its textual description.

    Sends ``SYSTEM_PROMPT`` plus the description (wrapped in
    ``USER_PROMPT_TEMPLATE``) to an OpenAI-compatible chat-completions
    endpoint and returns the stripped reply.

    Args:
        image_description (str): textual description of the image
        api_key (str): API key for the endpoint; when empty, the ``API_KEY``
            environment variable is used
        base_url (str): endpoint base URL; when empty, the SiliconFlow URL
            that was previously hard-coded is used
        model (str): chat model name

    Returns:
        str: the generated title, or an empty string when the service
        returns a message without text content
    """
    if not api_key:
        api_key = os.getenv("API_KEY")

    # NOTE(review): this default uses the siliconflow.com host, while the
    # project configuration and .env.example use siliconflow.cn — confirm
    # which host is intended. The default is kept as-is for compatibility.
    endpoint = base_url or "https://api.siliconflow.com/v1"
    client = OpenAI(api_key=api_key, base_url=endpoint)

    response = client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "system",
                "content": SYSTEM_PROMPT,
            },
            {
                "role": "user",
                "content": USER_PROMPT_TEMPLATE.format(description=image_description),
            },
        ],
    )

    # message.content is optional in the chat-completions response; guard
    # against None so an empty reply does not raise AttributeError.
    content = response.choices[0].message.content
    return (content or "").strip()
使用 `extract_image_text` 方法可以提取图像中的文本并转换为 Markdown 格式。 6 | 7 | #### 主要功能: 8 | - 初始化时可以从环境变量读取 API 密钥,或者手动传入。 9 | - 提供了从文件读取自定义提示文本的功能。 10 | - 支持提取图像 URL 或本地图像文件路径中的文本。 11 | - 将提取的文本转换为 Markdown 格式,包括数学公式的格式化。 12 | - 支持图像 URL 或 Base64 编码图像的解析。 13 | - 提供多种模型和生成文本的细节级别设置。 14 | 15 | #### 参数说明: 16 | 17 | - **`ImageTextExtractor.__init__`**: 18 | - `api_key` (str): API 密钥,默认从环境变量读取。 19 | - `base_url` (str): API 基础 URL,默认值为 "https://api.siliconflow.cn/v1"。 20 | - `prompt` (str | None): 提示文本,优先使用传入的值。 21 | - `prompt_path` (str | None): 提示文本文件路径,读取指定文件中的内容作为提示文本。 22 | 23 | - **`ImageTextExtractor._read_prompt`**: 24 | - `prompt_path` (str): 提示文本文件路径。 25 | - 返回值 (str): 读取的提示文本内容。 26 | 27 | - **`ImageTextExtractor.extract_image_text`**: 28 | - `image_url` (str | None): 图像的 URL 地址。 29 | - `local_image_path` (str | None): 本地图像文件路径。 30 | - `model` (str): 使用的模型名称,默认 "Qwen/Qwen2-VL-72B-Instruct"。 31 | - `detail` (str): 细节级别,允许值为 'low', 'high', 'auto',默认 "low"。 32 | - `prompt` (str | None): 提示文本,优先使用传入的值。 33 | - `temperature` (float): 生成文本的温度参数,默认 0.1。 34 | - `top_p` (float): 生成文本的 top_p 参数,默认 0.5。 35 | - 返回值 (str): 提取的 Markdown 格式文本。 36 | 37 | - **`ImageTextExtractor._is_base64`**: 38 | - `s` (str): 待检查的字符串。 39 | - 返回值 (bool): 如果是 Base64 编码则返回 True,否则返回 False。 40 | 41 | - **`ImageTextExtractor._get_image_extension`**: 42 | - `file_path` (str): 图像文件路径。 43 | - 返回值 (str): 图像文件的扩展名。 44 | 45 | #### 注意事项: 46 | - `api_key` 是必须的,可以通过环境变量或初始化时传入。 47 | - 需要安装 `PIL` 库来获取图像的扩展名。 48 | - 图像文件必须是有效的图像格式,如 PNG、JPG 或 TIFF。 49 | - 如果使用 Base64 编码的图像,确保传入的字符串是有效的 Base64 编码。 50 | 51 | #### 更多信息: 52 | - 该类依赖于 OpenAI 的 API 服务以及环境变量中的 API 密钥。 53 | - 提取的 Markdown 格式文本会保留图像中的结构和公式,适用于文档集成。 54 | 55 | """ 56 | from x_pdf2md.config import get_model_config 57 | 58 | _prompt = """ 59 | 你是一个可以识别图片的AI,你可以基于图片与用户进行友好的对话。 60 | """ 61 | 62 | from openai import OpenAI 63 | from dotenv import load_dotenv 64 | import os 65 | import base64 66 | 67 | 68 | def extract_markdown_content(text: str) -> str: 69 | """ 70 | 
从文本中提取Markdown内容,自动去除markdown和html代码块标记。 71 | 72 | 参数: 73 | text (str): 输入文本。 74 | 75 | 返回: 76 | str: 提取的内容,如果没有找到Markdown或HTML标记,则返回原始文本。 77 | """ 78 | md_start_marker = "```markdown" 79 | html_start_marker = "```html" 80 | end_marker = "```" 81 | 82 | # 处理markdown代码块 83 | md_start_index = text.find(md_start_marker) 84 | if md_start_index != -1: 85 | start_index = md_start_index + len(md_start_marker) 86 | end_index = text.find(end_marker, start_index) 87 | 88 | if end_index == -1: 89 | return text[start_index:].strip() 90 | return text[start_index:end_index].strip() 91 | 92 | # 处理html代码块 93 | html_start_index = text.find(html_start_marker) 94 | if html_start_index != -1: 95 | start_index = html_start_index + len(html_start_marker) 96 | end_index = text.find(end_marker, start_index) 97 | 98 | if end_index == -1: 99 | return text[start_index:].strip() 100 | return text[start_index:end_index].strip() 101 | 102 | # 如果没有找到特定标记,返回原始文本 103 | return text.strip() if text else None 104 | 105 | 106 | def image_to_base64(image_path: str) -> str: 107 | """ 108 | 将图像文件转换为Base64编码的字符串。 109 | 110 | 参数: 111 | image_path (str): 图像文件路径。 112 | 113 | 返回: 114 | str: Base64编码的字符串。 115 | """ 116 | with open(image_path, "rb") as image_file: 117 | encoded_string = base64.b64encode(image_file.read()).decode("utf-8") 118 | return encoded_string 119 | 120 | 121 | class ImageTextExtractor: 122 | """ 123 | 图像文本提取器类,用于将图像内容转换为 Markdown 格式的文本。 124 | """ 125 | 126 | def __init__( 127 | self, 128 | api_key: str = None, 129 | base_url: str = "https://api.siliconflow.cn/v1", 130 | prompt: str | None = None, 131 | prompt_path: str | None = None, 132 | ): 133 | """ 134 | 初始化 ImageTextExtractor 实例。 135 | 136 | :param api_key: API 密钥,如果未提供则从环境变量中读取 137 | :param base_url: API 基础 URL 138 | :param prompt: 提示文本 139 | :param prompt_path: 提示文本文件路径 140 | """ 141 | load_dotenv() 142 | self.api_key: str = api_key or os.getenv("API_KEY") 143 | 144 | if not self.api_key: 145 | raise ValueError("API key is 
required") 146 | 147 | self.client: OpenAI = OpenAI( 148 | api_key=self.api_key, 149 | base_url=base_url, 150 | ) 151 | self._prompt: str = ( 152 | prompt or self._read_prompt(prompt_path) or _prompt 153 | ) 154 | 155 | def _read_prompt(self, prompt_path: str) -> str: 156 | """ 157 | 从文件中读取提示文本。 158 | 159 | :param prompt_path: 提示文本文件路径 160 | :return: 提示文本内容 161 | """ 162 | if not prompt_path.endswith((".md", ".txt")): 163 | raise ValueError("Prompt file must be a .md or .txt file") 164 | with open(prompt_path, "r", encoding="utf-8") as f: 165 | return f.read() 166 | 167 | def extract_image_text( 168 | self, 169 | image_url: str = None, 170 | local_image_path: str = None, 171 | model: str = None, 172 | detail: str = "low", 173 | prompt: str = None, 174 | temperature: float = 0.1, 175 | ) -> str: 176 | """ 177 | 提取图像中的文本并转换为 Markdown 格式。 178 | 179 | :param image_url: 图像的 URL 180 | :param local_image_path: 本地图像文件路径 181 | :param model: 使用的模型名称 182 | :param detail: 细节级别,允许值为 'low', 'high', 'auto' 183 | :param prompt: 提示文本 184 | :param temperature: 生成文本的温度参数 185 | :param top_p: 生成文本的 top_p 参数 186 | :return: 提取的 Markdown 格式文本 187 | """ 188 | if model is None: 189 | model = get_model_config('vlm') 190 | if not image_url and not local_image_path: 191 | raise ValueError("Either image_url or local_image_path is required") 192 | 193 | if image_url and not ( 194 | image_url.startswith("http://") 195 | or image_url.startswith("https://") 196 | or self._is_base64(image_url) 197 | ): 198 | raise ValueError( 199 | "Image URL must be a valid HTTP/HTTPS URL or a Base64 encoded string" 200 | ) 201 | 202 | if local_image_path: 203 | if not os.path.exists(local_image_path): 204 | raise FileNotFoundError(f"The file {local_image_path} does not exist.") 205 | image_extension: str = self._get_image_extension(local_image_path) 206 | with open(local_image_path, "rb") as image_file: 207 | base64_image: str = base64.b64encode(image_file.read()).decode("utf-8") 208 | image_url = 
f"data:image/{image_extension};base64,{base64_image}" 209 | 210 | if detail not in ["low", "high", "auto"]: 211 | raise ValueError( 212 | "Invalid detail value. Allowed values are 'low', 'high', 'auto'" 213 | ) 214 | 215 | if detail == "auto": 216 | detail = "low" 217 | 218 | prompt = prompt or self._prompt 219 | 220 | try: 221 | response = self.client.chat.completions.create( 222 | model=model, 223 | messages=[ 224 | { 225 | "role": "user", 226 | "content": [ 227 | { 228 | "type": "image_url", 229 | "image_url": {"url": image_url, "detail": detail}, 230 | }, 231 | {"type": "text", "text": prompt}, 232 | ], 233 | } 234 | ], 235 | stream=True, 236 | temperature=temperature, 237 | ) 238 | 239 | result: str = "" 240 | for chunk in response: 241 | chunk_message: str = chunk.choices[0].delta.content 242 | result += chunk_message 243 | return result 244 | except Exception as e: 245 | raise RuntimeError(f"Failed to extract text from image: {e}") 246 | 247 | def _is_base64(self, s: str) -> bool: 248 | """ 249 | 检查字符串是否为 Base64 编码。 250 | 251 | :param s: 待检查的字符串 252 | :return: 如果是 Base64 编码则返回 True,否则返回 False 253 | """ 254 | try: 255 | if isinstance(s, str): 256 | if s.strip().startswith("data:image"): 257 | return True 258 | return base64.b64encode(base64.b64decode(s)).decode("utf-8") == s 259 | return False 260 | except Exception: 261 | return False 262 | 263 | def _get_image_extension(self, file_path: str) -> str: 264 | """ 265 | 获取图像文件的扩展名。 266 | 267 | :param file_path: 图像文件路径 268 | :return: 图像文件的扩展名 269 | """ 270 | try: 271 | from PIL import Image 272 | 273 | with Image.open(file_path) as img: 274 | return img.format.lower() 275 | except Exception as e: 276 | raise ValueError(f"Failed to determine image format: {e}") 277 | 278 | -------------------------------------------------------------------------------- /x_pdf2md/image2md/prompts/description_prompt.md: -------------------------------------------------------------------------------- 1 | # PDF图像内容描述提示 2 | 3 | ## 任务 4 
| 5 | 使用视觉语言模型生成从PDF提取的图像内容的简洁描述。 6 | 7 | ## 背景 8 | 9 | - 图像来源于PDF文档 10 | - 需要清晰理解图像的主要内容和用途 11 | - 避免冗余描述,保持精简 12 | 13 | ## 输入 14 | 15 | - 从PDF提取的图像 16 | 17 | ## 输出 18 | 19 | 请简洁描述图像的以下关键方面: 20 | 21 | 1. 图像类型(图表、示意图、照片等) 22 | 2. 主要内容/主题 23 | 3. 包含的关键信息点 24 | 4. 文本或标签(如有) 25 | 5. 图像的可能用途 26 | 27 | 示例格式: 28 | "这是一张[图像类型],展示了[主要内容]。包含[关键信息]。[其他相关细节]。" 29 | -------------------------------------------------------------------------------- /x_pdf2md/image2md/prompts/ocr_prompt.md: -------------------------------------------------------------------------------- 1 | # OCR 图像到 Markdown 转换提示 2 | 3 | ## 背景 4 | 5 | 你有可能接受到从pdf文件内裁剪下来以图片的形式存在的内容,有可能是一个标题,或者是表格。 6 | 7 | ## 任务 8 | 9 | 将图像中的内容精确转换为格式化的 Markdown,保留原始文档的结构、布局和语义。 10 | 11 | ## 输入 12 | 13 | - 图像类型:文档扫描件、屏幕截图、手写内容照片 14 | - 支持格式:PNG、JPG、TIFF 等常见图像格式 15 | - 内容类型:文本段落、标题、列表、表格、数学公式、简单图表 16 | 17 | ## 输出要求 18 | 19 | - 完整的 Markdown 文本,使用适当的语法元素 20 | - 数学公式使用 LaTeX 语法,内联公式使用单个 `$` 分隔,独立公式使用 `$$` 分隔 21 | - 表格处理方式: 22 | - 对于简单表格,使用标准 Markdown 表格语法 23 | - 对于复杂表格(包含合并单元格等),使用 HTML 表格标记 24 | - 跨列单元格使用 `内容` 标记 25 | - 跨行单元格使用 `内容` 标记 26 | - 列表保留原始层级和编号 27 | - 保持原始段落结构和文本流 28 | 29 | ## 转换规则 30 | 31 | 1. **文本内容**:保留原始格式,包括段落分隔、强调和标点符号 32 | 2. **数学公式**:使用 LaTeX 语法准确转录,保持数学符号和结构 33 | 3. **表格**:根据复杂程度选择 Markdown 或 HTML 表格格式 34 | 4. **列表**:保持原始缩进和编号系统 35 | 5. **标题**:使用适当级别的 Markdown 标题标记 36 | 37 | ## 注意事项 38 | 39 | - 确保转换内容的真实性 40 | - 只转换图像中实际存在的内容,不添加额外解释或内容 41 | 42 | ## 输出示例 43 | 44 | ```markdown 45 | # 文档标题 46 | 47 | 正文内容,包含 $E=mc^2$ 内联公式。 48 | 49 | ## 小节标题 50 | 51 | 1. 列表项一 52 | 2. 列表项二 53 | 54 | $$ 55 | \int_{0}^{\infty} e^{-x^2} dx = \frac{\sqrt{\pi}}{2} 56 | $$ 57 | 58 | # 简单表格示例(Markdown语法) 59 | | 列 1 | 列 2 | 列 3 | 60 | |-----|-----|-----| 61 | | 数据 | 数据 | 数据 | 62 | 63 | # 复杂表格示例(HTML语法) 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 |
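<table>
  <tr>
    <th>标题 1</th>
    <th>标题 2</th>
    <th>标题 3</th>
  </tr>
  <tr>
    <td colspan="2">跨列单元格</td>
    <td>普通单元格</td>
  </tr>
  <tr>
    <td rowspan="2">跨行单元格</td>
    <td>数据</td>
    <td>数据</td>
  </tr>
</table>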
def _process_image_with_model(
    image_path: str,
    model: str,
    prompt_path: str = None,
    prompt_text: str = None,
    api_key: str = None,
    detail: str = "low",
    post_process_func = None
) -> str:
    """Run one local image through the vision model and post-process the reply.

    The extractor is built outside the ``try`` block, so construction errors
    propagate to the caller. Any exception raised while querying the model or
    post-processing its output is turned into an error string instead.

    Args:
        image_path: path of the local image file.
        model: model name forwarded to the extractor.
        prompt_path: optional prompt file forwarded to the extractor.
        prompt_text: optional prompt text forwarded to the extractor.
        api_key: API key; read from ``API_KEY`` when ``None``.
        detail: detail level forwarded to the extractor.
        post_process_func: optional callable applied to the raw reply; when
            falsy, ``extract_markdown_content`` is applied instead.

    Returns:
        The processed reply, a fixed notice when the reply is blank, or an
        error message string.
    """
    resolved_key = os.getenv("API_KEY") if api_key is None else api_key

    extractor = ImageTextExtractor(
        api_key=resolved_key,
        prompt_path=prompt_path,
        prompt=prompt_text,
    )

    try:
        raw_reply = extractor.extract_image_text(
            local_image_path=image_path, model=model, detail=detail
        )

        if not raw_reply.strip():
            return "No content extracted from the image"

        finalize = post_process_func or extract_markdown_content
        return finalize(raw_reply)
    except Exception as exc:
        return f"Error processing image: {str(exc)}"
def process_table_content(result):
    """Normalize a model reply that is expected to contain a table.

    The reply is first stripped of its code-fence markers. A Markdown table
    (starts with ``|`` and contains a ``|---`` separator) or an HTML table
    (contains both ``<table`` and ``</table>``) is returned unchanged; any
    other content is wrapped in a plain code fence so it renders verbatim.

    Args:
        result: raw text returned by the model.

    Returns:
        The table text, the fenced fallback, or ``""`` when nothing could be
        extracted from ``result``.
    """
    table_content = extract_markdown_content(result)

    # extract_markdown_content() yields None for empty input; there is
    # nothing to format in that case.
    if not table_content:
        return ""

    if table_content.startswith('|') and '|---' in table_content:
        return table_content

    lowered = table_content.lower()
    if '<table' in lowered and '</table>' in lowered:
        return table_content

    return f"```\n{table_content}\n```"
提取的表格内容:") 189 | print("=" * 50) 190 | table_content = extract_table_from_image( 191 | str(test_image_path), 192 | extract_table_prompt_path=( 193 | str(table_prompt_path) if table_prompt_path.exists() else None 194 | ), 195 | ) 196 | print(table_content) 197 | print("=" * 50) 198 | -------------------------------------------------------------------------------- /x_pdf2md/image_utils/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | 图像处理相关工具 3 | """ 4 | -------------------------------------------------------------------------------- /x_pdf2md/image_utils/crop_text_areas.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # author:筱可 4 | # 2025-03-10 5 | """ 6 | 使用说明: 7 | 1. 将图像文件和对应的JSON检测结果文件放在指定目录 8 | 2. 设置相应的输入输出路径 9 | 3. 运行脚本即可获得裁剪后的文本区域图像 10 | 11 | 主要功能: 12 | 1. 读取原始图像和文本检测结果JSON文件 13 | 2. 支持矩形和多边形两种裁剪方式 14 | 3. 将检测到的文本区域裁剪并保存 15 | 16 | 参数说明: 17 | TextAreaCropper类方法: 18 | - crop_text_areas: 处理图像和JSON检测结果,裁剪文本区域 19 | 返回值:无 20 | 21 | 注意事项: 22 | 1. 依赖库:opencv-python, numpy 23 | 2. JSON文件需包含dt_polys和dt_scores字段 24 | 3. 
class RectCropper(TextCropper):
    """Rectangular cropping strategy: cut out the polygon's bounding rectangle."""

    def crop(self, image: np.ndarray, polygon: np.ndarray) -> np.ndarray:
        """Return a copy of the bounding rectangle of ``polygon`` inside ``image``.

        Args:
            image: source image (2-D grayscale or 3-D with a channel axis).
            polygon: polygon vertices of the text region.

        Returns:
            A copy of the cropped region, or a 1x1 zero image with the same
            channel layout as ``image`` when the rectangle does not intersect
            the image.
        """
        x, y, w, h = cv2.boundingRect(polygon)

        # Clamp to the image: a negative origin would otherwise be read by
        # NumPy as "count from the end" and produce an empty or wrong crop.
        img_h, img_w = image.shape[:2]
        left, top = max(x, 0), max(y, 0)
        right, bottom = min(x + w, img_w), min(y + h, img_h)

        if right <= left or bottom <= top:
            print(f"警告:无效的裁剪区域 x={x}, y={y}, w={w}, h={h}")
            # Placeholder keeps the channel layout of the input image.
            return np.zeros((1, 1) + tuple(image.shape[2:]), dtype=np.uint8)

        return image[top:bottom, left:right].copy()
class TextAreaCropper:
    """Cut every detected box out of a page image and save each one to disk."""

    def __init__(self, cropper: TextCropper = None):
        """Store the cropping strategy.

        Args:
            cropper: cropping strategy; a ``RectCropper`` is used when omitted.
        """
        self.cropper = RectCropper() if cropper is None else cropper

    def crop_text_areas(self, image_path: str, json_path: str, output_dir: str, output_format: str = 'png', bg_color: tuple = (255, 255, 255)) -> None:
        """Crop the boxes listed in a detection JSON file and write them as images.

        Args:
            image_path: path of the source image.
            json_path: path of the detection result; its ``boxes`` entries are
                read for ``coordinate``, ``label`` and ``score``.
            output_dir: directory receiving the crops (created if missing).
            output_format: file extension of the crops, default ``'png'``.
            bg_color: background colour used to flatten 4-channel crops when
                the output format is not PNG.

        Returns:
            None. Files are written to ``output_dir``.
        """
        os.makedirs(output_dir, exist_ok=True)

        page = cv2.imread(image_path)
        if page is None:
            print(f"无法读取图像: {image_path}")
            return

        with open(json_path, 'r', encoding='utf-8') as handle:
            detection = json.load(handle)

        for index, entry in enumerate(detection.get('boxes', [])):
            rect = entry.get('coordinate', [])
            if not rect:
                continue

            # Expand the [x1, y1, x2, y2] rectangle into its four corners.
            left, top, right, bottom = map(float, rect)
            corners = np.array(
                [[left, top], [right, top], [right, bottom], [left, bottom]],
                np.int32,
            )
            region = self.cropper.crop(page, corners)

            label = entry.get('label', 'unknown')
            score = entry.get('score', 0)
            output_path = os.path.join(
                output_dir, f"{index}_{label}_{score:.4f}.{output_format.lower()}"
            )

            if output_format.lower() != 'png' and len(region.shape) == 3 and region.shape[2] == 4:
                # Non-PNG output of a 4-channel crop: blend onto a solid background.
                backdrop = np.empty((region.shape[0], region.shape[1], 3), dtype=np.uint8)
                backdrop[:] = bg_color
                opacity = (region[:, :, 3] / 255.0)[:, :, np.newaxis]
                blended = cv2.convertScaleAbs(
                    region[:, :, :3] * opacity + backdrop * (1 - opacity)
                )
                cv2.imwrite(output_path, blended)
            else:
                cv2.imwrite(output_path, region)

            print(f"已保存{label}区域 {index+1}: {output_path}")
def recognize_formula(input_path: str, output_path: Optional[str] = None) -> str:
    """
    Recognize the mathematical formula in an image.

    The prediction is saved as JSON and the ``rec_formula`` field is read
    back from that file.

    Args:
        input_path: path of the input image.
        output_path: optional path of the JSON result file; defaults to
            ``./UniMERNet_output/res.json``.

    Returns:
        str: the recognized formula text, or ``""`` when the model produced
        no result or an empty formula.

    Raises:
        RuntimeError: if the formula model could not be loaded.
    """
    print(f"处理公式图片: {input_path}")

    # Resolve the result file once; a bare filename has no directory part,
    # and os.makedirs("") would fail, so only create real directories.
    res_path = output_path or os.path.join("./UniMERNet_output", "res.json")
    output_dir = os.path.dirname(res_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    model = get_or_create_model('formula')
    if model is None:
        raise RuntimeError("模型加载失败")

    output = model.predict(input=input_path, batch_size=1)

    saved = False
    for res in output:
        res.save_to_json(save_path=res_path)
        saved = True

    # Nothing was predicted, so there is no result file to read back.
    if not saved:
        return ""

    with open(res_path, 'r', encoding='utf-8') as f:
        results = json.load(f)

    return results.get("rec_formula", "") or ""
def is_box_inside(box1: List[float], box2: List[float], threshold: float = 0.8) -> bool:
    """
    Tell whether ``box1`` is (mostly) contained in ``box2``.

    Args:
        box1: box coordinates in ``[x1, y1, x2, y2]`` form.
        box2: box coordinates in ``[x1, y1, x2, y2]`` form.
        threshold: minimum share of ``box1``'s area that must be covered by
            ``box2`` for it to count as contained (default 0.8).

    Returns:
        bool: ``True`` if the covered share of ``box1`` is at least
        ``threshold``, ``False`` otherwise (including when the boxes do not
        intersect or ``box1`` has no area).
    """
    # Corners of the intersection rectangle.
    left = max(box1[0], box2[0])
    top = max(box1[1], box2[1])
    right = min(box1[2], box2[2])
    bottom = min(box1[3], box2[3])

    # No overlap. A zero-area box1 also ends up here, which keeps the
    # division below safe.
    if left >= right or top >= bottom:
        return False

    box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
    overlap_area = (right - left) * (bottom - top)

    return overlap_area / box1_area >= threshold
def calculate_boundary_distance(box1: List[float], box2: List[float]) -> float:
    """
    Compute the smallest distance between the borders of two boxes.

    Args:
        box1: box coordinates in ``[x1, y1, x2, y2]`` form.
        box2: box coordinates in ``[x1, y1, x2, y2]`` form.

    Returns:
        float: Euclidean distance built from the horizontal and vertical
        gaps; ``0.0`` when the boxes overlap on both axes.
    """
    def axis_gap(start_a, end_a, start_b, end_b):
        # Gap between two intervals on one axis; 0 when they overlap.
        if start_a > end_b:
            return start_a - end_b
        if start_b > end_a:
            return start_b - end_a
        return 0

    gap_x = axis_gap(box1[0], box1[2], box2[0], box2[2])
    gap_y = axis_gap(box1[1], box1[3], box2[1], box2[3])

    return (gap_x ** 2 + gap_y ** 2) ** 0.5
"formula_number"] 130 | 131 | # 如果没有公式序号框或公式框,直接返回原列表的副本 132 | if not formula_number_boxes or not formula_boxes: 133 | return boxes.copy() 134 | 135 | # 创建结果列表,首先加入除了公式框和公式序号框外的所有框 136 | result_boxes = [] 137 | formula_ids = [id(box) for box in formula_boxes] 138 | formula_number_ids = [id(box) for box in formula_number_boxes] 139 | 140 | # 复制非公式和非公式序号的框到结果列表 141 | for box in boxes: 142 | if id(box) not in formula_ids and id(box) not in formula_number_ids: 143 | result_boxes.append(box.copy()) 144 | 145 | # 处理公式框,为每个公式框创建副本 146 | processed_formula_boxes = [] 147 | for formula_box in formula_boxes: 148 | new_formula_box = formula_box.copy() 149 | new_formula_box["formula_numbers"] = [] 150 | processed_formula_boxes.append(new_formula_box) 151 | 152 | # 对于每个公式序号框,找到最合适的公式框并融合 153 | for number_box in formula_number_boxes: 154 | number_coord = number_box["coordinate"] 155 | 156 | # 计算公式序号框的左边缘和中心点 157 | number_left = number_coord[0] 158 | number_center_y = (number_coord[1] + number_coord[3]) / 2 159 | 160 | # 定义垂直容忍度(公式中心点和序号中心点的垂直距离允许范围) 161 | vertical_tolerance = (number_coord[3] - number_coord[1]) * 2 # 序号高度的2倍 162 | 163 | # 筛选出垂直方向上大致对齐的公式框 164 | aligned_formulas = [] 165 | for formula_box in processed_formula_boxes: 166 | formula_coord = formula_box["coordinate"] 167 | formula_center_y = (formula_coord[1] + formula_coord[3]) / 2 168 | 169 | # 检查垂直方向上是否对齐 170 | if abs(formula_center_y - number_center_y) <= vertical_tolerance: 171 | aligned_formulas.append(formula_box) 172 | 173 | # 先尝试找位于序号左侧的公式框(公式在左,序号在右) 174 | left_side_formulas = [] 175 | for formula_box in aligned_formulas: 176 | formula_coord = formula_box["coordinate"] 177 | formula_right = formula_coord[2] # 公式框的右边缘 178 | 179 | # 如果公式框的右边缘在序号框的左边缘的左侧或接近(允许少量重叠) 180 | if formula_right <= number_left + (number_coord[2] - number_left) * 0.2: # 允许20%的重叠 181 | left_side_formulas.append(formula_box) 182 | 183 | closest_formula = None 184 | 185 | # 如果找到了位于序号左侧的公式框 186 | if left_side_formulas: 187 | # 
def detect_layout(image_path: str, output_path: str = "./layout_output/layout_detection.json", model_name: str = "PP-DocLayout-L") -> Dict:
    """
    Detect the page layout of a document image.

    The raw prediction is written to a JSON file. It is then filtered, formula
    numbers are merged into their formulas, nested boxes are folded into
    their containers, and the final result is written next to the raw file
    with a ``_final`` suffix.

    Args:
        image_path: path of the input image.
        output_path: path of the raw JSON result. If it does not end in
            ``.json``, ``layout_detection.json`` in its parent directory is
            used instead.
        model_name: name of the layout model, ``"PP-DocLayout-L"`` by
            default; pass ``None`` to take the name from the ``'layout'``
            configuration entry.

    Returns:
        The post-processed layout result dictionary.
    """
    if model_name is None:
        model_name = get_model_config('layout')

    # A bare filename has an empty directory part; os.makedirs("") would fail.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    json_path = output_path if output_path.endswith(".json") else os.path.join(output_dir, "layout_detection.json")

    model = create_model(model_name=model_name)
    output = model.predict(image_path, batch_size=1, layout_nms=True)

    for res in output:
        res.save_to_json(save_path=json_path)
        # NOTE(review): the preview image path is fixed and independent of
        # output_path - confirm whether it should follow output_dir.
        res.save_to_img("./output/layout_result.jpg")

    with open(json_path, "r", encoding="utf-8") as f:
        result = json.load(f)

    # Drop labels that are not processed further (headers, footers, ...).
    result["boxes"] = [box for box in result["boxes"] if box.get("label") not in LayoutConfig.FILTER_LABELS]

    # Attach formula numbers to their formulas.
    result["boxes"] = merge_formula_numbers(result["boxes"])

    # Fold nested boxes into the boxes that contain them.
    result["boxes"] = build_box_hierarchy(result["boxes"])

    # Only the extension is touched, so a ".json" fragment elsewhere in the
    # path is left alone.
    stem, extension = os.path.splitext(json_path)
    final_json_path = f"{stem}_final{extension}"
    print("Final JSON path:", final_json_path)
    with open(final_json_path, "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    return result
class LayoutSorter:
    """Put layout boxes into order: left column top-down, then right column."""

    def __init__(self,
                 threshold_left_right: float = 0.9,
                 threshold_cross: float = 0.3):
        """
        Store the column-assignment thresholds.

        Args:
            threshold_left_right: share of a box's width that must lie in one
                half of the page for it to belong to that column, default 0.9.
            threshold_cross: share that must lie in each half for a box to
                count as spanning both columns, default 0.3.
        """
        self.threshold_left_right = threshold_left_right
        self.threshold_cross = threshold_cross

    def sort_layout(self, layout_result: Union[str, dict], page_width: float) -> List[Dict]:
        """
        Sort a layout detection result.

        Args:
            layout_result: path of a JSON file, or the already loaded result
                dictionary; its ``boxes`` list is sorted.
            page_width: width of the page.

        Returns:
            The sorted list of elements.
        """
        if isinstance(layout_result, str):
            with open(layout_result, 'r', encoding='utf-8') as handle:
                loaded = json.load(handle)
        else:
            loaded = layout_result

        return self._sort_elements(loaded.get('boxes', []), page_width)

    def _in_left_column(self, x1: float, x2: float, middle: float) -> bool:
        """Decide whether the horizontal span ``x1..x2`` belongs to the left column."""
        width = x2 - x1
        if width > 0:
            left_share = max(0, min(x2, middle) - x1) / width
            right_share = max(0, x2 - max(x1, middle)) / width
        else:
            left_share = right_share = 0

        if left_share >= self.threshold_left_right:
            return True
        if right_share >= self.threshold_left_right:
            return False
        # Boxes spanning both halves are read together with the left column.
        if left_share > self.threshold_cross and right_share > self.threshold_cross:
            return True
        # Otherwise the box centre decides.
        return (x1 + x2) / 2 <= middle

    def _sort_elements(self, elements: List[Dict], page_width: float) -> List[Dict]:
        """
        Split elements into a left and a right column and order each by top edge.

        Args:
            elements: list of elements; entries without a four-value
                ``coordinate`` are ignored.
            page_width: width of the page.

        Returns:
            Left-column elements followed by right-column elements.
        """
        usable = [
            item for item in elements
            if "coordinate" in item and len(item["coordinate"]) == 4
        ]
        if not usable:
            return []

        middle = page_width / 2
        left, right = [], []
        for item in usable:
            x1, _, x2, _ = item["coordinate"]
            (left if self._in_left_column(x1, x2, middle) else right).append(item)

        def top_edge(item):
            return item["coordinate"][1]

        return sorted(left, key=top_edge) + sorted(right, key=top_edge)
class LayoutVisualizer:
    """Draw layout boxes, with order / label / score captions, onto a page image."""

    def __init__(self, font_path: str = None, font_size: int = 24):
        """Create a visualizer.

        Args:
            font_path: Path of a TrueType font file. When omitted, or when the
                font cannot be loaded, PIL's default font is used.
            font_size: Font size used with ``font_path``. Defaults to 24.
        """
        self.font = ImageFont.load_default()
        if font_path:
            try:
                self.font = ImageFont.truetype(font_path, font_size)
            except (OSError, ValueError):
                # Unreadable font file or invalid size: keep the default font.
                # A bare ``except`` here would also swallow KeyboardInterrupt.
                print("无法加载指定字体,使用默认字体")

        # Colours come from the shared layout configuration.
        self.colors = LayoutConfig.COLORS
        self.default_color = LayoutConfig.DEFAULT_COLOR

    def draw_boxes(self, image: np.ndarray, boxes: List[Dict],
                   show_order: bool = True, show_label: bool = True) -> np.ndarray:
        """Draw the given boxes on a copy of ``image``.

        Args:
            image: Source image in OpenCV BGR channel order (it is converted
                with ``COLOR_BGR2RGB`` before drawing).
            boxes: Boxes in the order they should be numbered; each needs
                ``"coordinate"`` ([x1, y1, x2, y2]) and ``"label"`` and may
                carry ``"score"``.
            show_order: Whether to prefix the caption with ``#<index>``.
            show_label: Whether to include the label (and the score, when it
                is greater than 0) in the caption.

        Returns:
            A new BGR image with the boxes and captions drawn.
        """
        image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        pil_image = Image.fromarray(image_rgb)
        draw = ImageDraw.Draw(pil_image)

        for i, box in enumerate(boxes):
            x1, y1, x2, y2 = map(int, box['coordinate'])
            label = box['label']
            score = box.get('score', 0)
            color = self.colors.get(label, self.default_color)

            draw.rectangle([x1, y1, x2, y2], outline=color, width=2)

            text_elements = []
            if show_order:
                text_elements.append(f"#{i+1}")
            if show_label:
                text_elements.append(f"{label}")
                if score > 0:
                    text_elements.append(f"{score:.2f}")
            text = " ".join(text_elements)

            if text:
                text_bbox = draw.textbbox((0, 0), text, font=self.font)
                text_width = text_bbox[2] - text_bbox[0]
                text_height = text_bbox[3] - text_bbox[1]

                # The caption normally sits just above the box. When the box
                # touches the top of the page there is no room there, so the
                # caption is moved inside the box instead of being clipped.
                label_top = y1 - text_height - 8
                if label_top < 0:
                    label_top = y1

                # Caption background (4 px padding on every side).
                draw.rectangle(
                    [x1, label_top, x1 + text_width + 8, label_top + text_height + 8],
                    fill=color
                )
                draw.text(
                    (x1 + 4, label_top + 4),
                    text,
                    fill=(255, 255, 255),
                    font=self.font
                )

        # Back to OpenCV's BGR layout.
        result = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
        return result

    def save_visualization(self, image_path: str, boxes: List[Dict],
                           output_path: str,
                           show_order: bool = True,
                           show_label: bool = True) -> None:
        """Read an image, draw the boxes on it and write the result to disk.

        Args:
            image_path: Path of the source image.
            boxes: Boxes to draw (see ``draw_boxes``).
            output_path: Where the annotated image is written.
            show_order: Whether to show the ordering index.
            show_label: Whether to show the label type.

        Raises:
            ValueError: If the source image cannot be read.
            OSError: If the annotated image cannot be written.
        """
        image = cv2.imread(image_path)
        if image is None:
            raise ValueError(f"无法读取图像: {image_path}")

        result = self.draw_boxes(image, boxes, show_order, show_label)

        # cv2.imwrite signals failure through its return value, not an exception.
        if not cv2.imwrite(output_path, result):
            raise OSError(f"Failed to write visualization to: {output_path}")
        print(f"可视化结果已保存至: {output_path}")
def process_page_layout(
    image_path: str,
    output_dir: str,
    page_number: int = 1,
    layout_json_path: str = None,
    threshold_left_right: float = 0.9,
    threshold_cross: float = 0.3
) -> List[RegionImage]:
    """
    Detect and sort the layout of one page image, crop every region in reading
    order, and describe the crops as ``RegionImage`` objects.

    Args:
        image_path: Path of the page image.
        output_dir: Directory that receives the cropped region images
            (created when missing).
        page_number: Page number stored on every returned ``RegionImage``.
        layout_json_path: Where the sorted layout is written as JSON. Defaults
            to ``temp_layout.json`` inside ``output_dir``.
        threshold_left_right: Threshold for assigning an element to the left or
            right column.
        threshold_cross: Threshold for treating an element as column-spanning.

    Returns:
        List[RegionImage]: One entry per sorted element whose cropped file
        exists in ``output_dir``.
    """
    os.makedirs(output_dir, exist_ok=True)

    if layout_json_path is None:
        layout_json_path = os.path.join(output_dir, "temp_layout.json")

    sorted_elements = detect_and_sort_layout(
        image_path,
        layout_json_path,
        threshold_left_right,
        threshold_cross
    )

    # Persist the SORTED order: the cropper reads this file, so the index in
    # each crop's filename matches the index used below.
    with open(layout_json_path, 'w', encoding='utf-8') as f:
        json.dump({"boxes": sorted_elements}, f, ensure_ascii=False, indent=2)

    cropper = TextAreaCropper(PolyCropper())
    cropper.crop_text_areas(
        image_path,
        layout_json_path,
        output_dir,
        output_format='png'
    )

    region_images = []
    for i, element in enumerate(sorted_elements):
        label = element.get('label', 'unknown')
        score = element.get('score', 0)
        # Layout elements carry their bounding box under "coordinate"; reading
        # only "box" left original_box empty. "box" is still honoured first in
        # case an upstream step provides it.
        # NOTE(review): confirm which key detect_and_sort_layout emits.
        box = element.get('box') or element.get('coordinate', [])
        contains = element.get('contains', [])

        # NOTE(review): presumably mirrors the filename scheme used by
        # TextAreaCropper — confirm against crop_text_areas.
        filename = f"{i}_{label}_{score:.4f}.png"
        cropped_path = os.path.join(output_dir, filename)
        if os.path.exists(cropped_path):
            region_images.append(
                RegionImage(
                    image_path=cropped_path,
                    label=label,
                    score=score,
                    page_number=page_number,
                    region_index=i,
                    original_box=box,
                    contains=contains
                )
            )

    return region_images


if __name__ == "__main__":
    # Usage example
    image_path = "car.png"
    output_dir = "./processed_output"

    cropped_images = process_page_layout(image_path, output_dir)
    print(f"处理完成,共生成 {len(cropped_images)} 个区域图片")
    for i, region in enumerate(cropped_images, 1):
        print(f"区域 {i}: {region}")
def _image_markdown(title: str, target: str, description: str) -> str:
    """Return the Markdown for one image: the image link plus an optional description line."""
    caption = f"**{title}描述:** {description}" if description else ""
    return f"![{title}]({target})\n\n" + caption


def _localize_image(image_path: str, output_dir: Optional[str]) -> str:
    """Copy the image into ``<output_dir>/images`` and return the link target to use.

    Falls back to the original ``image_path`` when no output directory is given
    or when the copy fails.
    """
    if not output_dir:
        return image_path

    import shutil

    images_dir = os.path.join(output_dir, "images")
    image_filename = os.path.basename(image_path)
    target_image_path = os.path.join(images_dir, image_filename)
    try:
        os.makedirs(images_dir, exist_ok=True)
        shutil.copy2(image_path, target_image_path)
    except OSError as e:
        print(f"复制图片失败: {e}")
        return image_path

    print(f"图片已复制到: {target_image_path}")
    # Relative link, valid for a Markdown file stored in output_dir.
    return f"./images/{image_filename}"


def format_region_content(
    region: RegionImage,
    image_upload_obj: Optional[ImageUploader] = None,
    output_dir: Optional[str] = None
) -> None:
    """
    Produce the Markdown content for a region and store it in ``region.content``.

    Handling by label:
      * image / figure / chart: describe the image, derive a title, then link
        to the uploaded URL or, failing that, to a local copy;
      * text: extract text from the image;
      * formula: recognise the formula and wrap it in ``$$`` delimiters;
      * table: extract the table;
      * titles and abstract: plain OCR.
    Any other label leaves ``region.content`` empty.

    Args:
        region: Region to process; its ``content`` attribute is overwritten.
        image_upload_obj: Optional uploader used for image regions.
        output_dir: Optional output directory; image regions are copied to its
            ``images`` sub-folder when no upload URL is available.
    """
    label = region.label
    image_path = region.image_path
    region.content = ""
    content = ""

    if label in ["image", "figure", "chart"]:
        print("处理图片:", image_path)
        image_describe = describe_image(image_path)
        print("图片描述:", image_describe)

        image_title = get_image_title(image_describe)
        if not image_title:
            image_title = f"{label}_{region.region_index+1}"
        print(f"处理图片: {image_title}")

        image_target = None
        if image_path and image_upload_obj:
            print(f"上传图片: {image_path}")
            try:
                image_target = image_upload_obj.upload(image_path)
            except Exception as e:
                # Best effort: an upload failure must not lose the image.
                print(f"图片上传失败: {e}")

        # No uploader, failed upload or empty URL: link to a local file so the
        # image still appears in the document.
        if not image_target:
            image_target = _localize_image(image_path, output_dir)

        # The description comes from describe_image(); region.content was
        # cleared above, so reading it here would always drop the description.
        content = _image_markdown(image_title, image_target, image_describe)

    elif label == "text":
        content = extract_text_from_image(image_path=image_path)

    elif label == "formula":
        content = recognize_formula(input_path=image_path) or ""
        if content:
            body = content.strip()
            delimited = len(body) >= 4 and body.startswith("$$") and body.endswith("$$")
            if not delimited:
                # Remove a stray delimiter on either end before wrapping so
                # the result is always a well-formed display-math block.
                if body.startswith("$$"):
                    body = body[2:]
                if body.endswith("$$"):
                    body = body[:-2]
                content = f"$$\n{body.strip()}\n$$"

    elif label == "table":
        content = extract_table_from_image(image_path=image_path)

    elif label in ["doc_title", "paragraph_title",
                   "chart_title", "table_title", "figure_title",
                   "abstract"]:
        # Short single-purpose text blocks: plain OCR.
        content = ocr_processor.extract_text(image_path)

    region.content = content


def format_pdf_regions(
    page_regions: List[List[RegionImage]],
    image_uploader: Optional[ImageUploader] = None,
    output_dir: Optional[str] = None,
) -> List[str]:
    """
    Format the regions of every page as Markdown text.

    Args:
        page_regions: One list of ``RegionImage`` objects per page.
        image_uploader: Optional uploader used for image regions.
        output_dir: Optional output directory for image copies.

    Returns:
        List[str]: One Markdown string per page; regions with empty content
        are skipped and the remaining ones are joined by blank lines.
    """
    formatted_pages = []
    for page_num, regions in enumerate(page_regions, 1):
        print(f"\n处理第 {page_num} 页的格式化...")
        page_content = []
        for region in regions:
            format_region_content(region, image_uploader, output_dir)
            if region.content:
                page_content.append(region.content)
        formatted_pages.append("\n\n".join(page_content))
    return formatted_pages
class OCRProcessor:
    """Two-stage OCR helper: detect text regions, crop each one, then recognise its text."""

    def __init__(self, det_model=None, rec_model=None):
        """
        Create an OCR processor.

        Args:
            det_model: Name of the text-detection model; ``None`` reads it from
                the configuration.
            rec_model: Name of the text-recognition model; ``None`` reads it
                from the configuration.
        """
        self.det_model = det_model or get_model_config('ocr_det')
        self.rec_model = rec_model or get_model_config('ocr_rec')

    def crop_image(self, image_path: str, box_coordinates: List) -> "Image.Image":
        """Crop the axis-aligned bounding rectangle of a polygon out of an image.

        Args:
            image_path: Path of the source image.
            box_coordinates: Polygon as a sequence of ``[x, y]`` points.

        Returns:
            The cropped image.
        """
        x_coordinates = [int(point[0]) for point in box_coordinates]
        y_coordinates = [int(point[1]) for point in box_coordinates]
        left, top = min(x_coordinates), min(y_coordinates)
        right, bottom = max(x_coordinates), max(y_coordinates)
        # The context manager closes the source file; the crop is a separate
        # in-memory image and stays valid afterwards.
        with Image.open(image_path) as image:
            return image.crop((left, top, right, bottom))

    def process_image(self, image_path: str, save_crops: bool = True, output_dir: str = "./output/crops") -> List[Dict]:
        """
        Run the full OCR pipeline on one image.

        Args:
            image_path: Path of the input image.
            save_crops: Whether to keep the cropped text-region images.
            output_dir: Directory for the kept crops.

        Returns:
            One dict per detected region with ``position``,
            ``detection_score``, ``text`` and ``recognition_score``, plus
            ``crop_path`` when ``save_crops`` is true.
        """
        import tempfile

        if save_crops:
            os.makedirs(output_dir, exist_ok=True)

        # 1. Detection, using the model this processor was configured with
        #    (previously the stored model names were never passed on).
        det_results = text_detection(image_path, model=self.det_model)

        all_results = []
        # 2. Crop and recognise every detected region.
        for idx, (poly, score) in enumerate(zip(det_results['dt_polys'], det_results['dt_scores'])):
            cropped = self.crop_image(image_path, poly)

            if save_crops:
                crop_filename = f"text_area_{idx}_score_{score:.4f}.png"
                crop_path = os.path.join(output_dir, crop_filename)
                cropped.save(crop_path)
                rec_result = recognize_text(crop_path, model=self.rec_model)
            else:
                # A unique scratch file avoids clashes between concurrent runs
                # and is removed even when recognition raises.
                fd, temp_path = tempfile.mkstemp(suffix=".png")
                os.close(fd)
                try:
                    cropped.save(temp_path)
                    rec_result = recognize_text(temp_path, model=self.rec_model)
                finally:
                    os.remove(temp_path)

            # 3. Merge detection and recognition output.
            result = {
                'position': poly,
                'detection_score': score,
                'text': rec_result['rec_text'],
                'recognition_score': rec_result['rec_score']
            }
            if save_crops:
                result['crop_path'] = crop_path
            all_results.append(result)

        return all_results

    def extract_text(self, image_path: str, as_list: bool = False, save_crops: bool = False, output_dir: str = "./output/crops") -> Union[str, List[str]]:
        """
        Extract only the text from an image.

        Args:
            image_path: Path of the input image.
            as_list: Return one string per detected region instead of a single
                string.
            save_crops: Whether to keep the cropped text-region images.
            output_dir: Directory for the kept crops.

        Returns:
            The region texts as a list when ``as_list`` is true, otherwise
            concatenated without a separator.
        """
        results = self.process_image(image_path, save_crops, output_dir)
        texts = [result['text'] for result in results]
        if as_list:
            return texts
        return ''.join(texts)

    def save_results_to_json(self, results: List[Dict], output_path: str):
        """
        Write OCR results to a JSON file.

        Args:
            results: Result dicts as returned by ``process_image``.
            output_path: Destination file; missing parent directories are
                created.
        """
        # A bare filename has no directory part; os.makedirs("") would raise.
        parent_dir = os.path.dirname(output_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        # Anything exposing ``tolist`` (numpy arrays and scalars) becomes plain
        # Python data; the caller's dicts are left untouched.
        serializable_results = [
            {
                key: (value.tolist() if hasattr(value, 'tolist') else value)
                for key, value in result.items()
            }
            for result in results
        ]

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(serializable_results, f, ensure_ascii=False, indent=2)
def is_same_line(box1, box2, height_threshold=0.5):
    """
    Decide whether two text boxes lie on the same text line.

    Two boxes count as being on the same line when the distance between their
    vertical centres is smaller than ``height_threshold`` times their average
    height.

    Args:
        box1: First box as an array-like of ``[x, y]`` points (nested list or
            numpy array).
        box2: Second box, same format.
        height_threshold: Fraction of the average box height used as the
            tolerance. Defaults to 0.5.

    Returns:
        bool: True when the boxes are on the same line.
    """
    # asarray lets plain nested lists (the JSON form of the polygons) through.
    ys1 = np.asarray(box1, dtype=float)[:, 1]
    ys2 = np.asarray(box2, dtype=float)[:, 1]

    center_gap = abs(ys1.mean() - ys2.mean())
    avg_height = ((ys1.max() - ys1.min()) + (ys2.max() - ys2.min())) / 2

    return bool(center_gap < avg_height * height_threshold)


def merge_overlapping_boxes(boxes, scores):
    """
    Merge text boxes that lie on the same line into one bounding rectangle.

    Boxes are visited in input order. Each unprocessed box collects every later
    unprocessed box that is on the same line as it (see ``is_same_line``); the
    group is replaced by its axis-aligned bounding rectangle and the mean of
    its scores.

    NOTE(review): despite the name, grouping depends only on vertical
    alignment, so boxes on the same line are merged even when they are far
    apart horizontally — confirm this is intended.

    Args:
        boxes: Array-like of boxes, each a sequence of ``[x, y]`` points.
        scores: Confidence score per box.

    Returns:
        tuple: ``(merged_boxes, merged_scores)``. With zero or one input box
        the inputs are returned unchanged; otherwise two lists (numpy arrays
        for the boxes, floats for the scores).
    """
    if len(boxes) <= 1:
        return boxes, scores

    boxes = np.array(boxes)
    merged_boxes = []
    merged_scores = []
    used = [False] * len(boxes)

    for i in range(len(boxes)):
        if used[i]:
            continue

        # The seed box plus every later, still unused box on the same line.
        merged_indices = [i] + [
            j for j in range(i + 1, len(boxes))
            if not used[j] and is_same_line(boxes[i], boxes[j])
        ]
        for idx in merged_indices:
            used[idx] = True

        if len(merged_indices) > 1:
            # Bounding rectangle over all corner points of the group.
            merged_points = boxes[merged_indices].reshape(-1, 2)
            x_min, y_min = np.min(merged_points, axis=0)
            x_max, y_max = np.max(merged_points, axis=0)
            merged_box = np.array([[x_min, y_min], [x_max, y_min],
                                   [x_max, y_max], [x_min, y_max]])
            merged_score = float(np.mean([scores[idx] for idx in merged_indices]))
        else:
            merged_box = boxes[i]
            merged_score = float(scores[i])

        merged_boxes.append(merged_box)
        merged_scores.append(merged_score)

    return merged_boxes, merged_scores
def visualize_boxes(image_path, boxes, output_path="./output/merged_result.jpg"):
    """
    Draw text boxes on an image and save the result.

    Args:
        image_path: Path of the original image.
        boxes: Boxes to draw, each an array-like of ``[x, y]`` points.
        output_path: Where the annotated image is written; the parent
            directory is created when missing.

    Raises:
        ValueError: If the image cannot be read.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread returns None instead of raising on unreadable files.
        raise ValueError(f"无法读取图像: {image_path}")

    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    for box in boxes:
        # asarray accepts both numpy arrays and plain nested lists.
        polygon = np.asarray(box).astype(np.int32)
        cv2.polylines(image, [polygon], True, (0, 255, 0), 2)
    cv2.imwrite(output_path, image)


def text_detection(image_path, output_path="./output/res.json", model="PP-OCRv4_mobile_det",
                   visualize=False) -> dict:
    """
    Detect text regions in an image and merge the boxes that share a line.

    Args:
        image_path: Path of the input image.
        output_path: Path of the JSON file that receives the detection result.
        model: Name of the detection model passed to ``create_model``.
        visualize: Whether to also write images showing the original and the
            merged boxes to ``./output``.

    Returns:
        dict: The detection result loaded from the model's JSON output, with
        ``dt_polys`` (box coordinates) and ``dt_scores`` (confidence scores)
        replaced by their merged versions.
    """
    # Create the directory the result is actually written to, rather than a
    # hard-coded "output" folder that may differ from output_path's parent.
    result_dir = os.path.dirname(output_path)
    if result_dir:
        os.makedirs(result_dir, exist_ok=True)

    # Separate name so the ``model`` argument (a model name) is not shadowed.
    detector = create_model(model_name=model)
    output = detector.predict(image_path, batch_size=1)

    # Each result overwrites the same file; with a single input image there is
    # a single result.
    for res in output:
        res.save_to_json(output_path)

    with open(output_path, 'r', encoding='utf-8') as f:
        detection_result = json.load(f)

    boxes = np.array(detection_result['dt_polys'])
    scores = np.array(detection_result['dt_scores'])

    merged_boxes, merged_scores = merge_overlapping_boxes(boxes, scores)

    # Back to plain lists / floats so the result is JSON serialisable.
    detection_result['dt_polys'] = [np.asarray(box).tolist() for box in merged_boxes]
    detection_result['dt_scores'] = [float(score) for score in merged_scores]

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(detection_result, f, indent=4, ensure_ascii=False)

    if visualize:
        visualize_boxes(image_path, boxes, "./output/original_result.jpg")  # boxes as detected
        visualize_boxes(image_path, merged_boxes, "./output/merged_result.jpg")  # boxes after merging

    return detection_result


# Script entry point
if __name__ == "__main__":
    # Run text detection on the sample image
    res = text_detection(image_path = "test_fomula_text_block.png")
16 | Returns: 17 | 识别结果列表 18 | """ 19 | output_dir = os.path.dirname(output_path) 20 | # 确保输出目录存在 21 | os.makedirs(output_dir, exist_ok=True) 22 | 23 | # 创建模型 24 | model = create_model(model_name=model) 25 | 26 | # 预测 27 | output = model.predict(input=input_image, batch_size=1) 28 | 29 | for res in output: 30 | res.save_to_json(save_path=output_path) 31 | 32 | with open(output_path, "r", encoding="utf-8") as f: 33 | result = json.load(f) 34 | 35 | return result 36 | 37 | 38 | # 使用示例: 39 | # results = recognize_text("text_area_4_score_0.9858.png") 40 | 41 | # OCR 文本识别结果 42 | 43 | # 以下是 OCR 文本识别的 JSON 结果数据: 44 | 45 | # ```json 46 | # { 47 | # "input_path": "general_ocr_rec_001.png", // 输入图像文件路径 48 | # "page_index": null, // 页码索引(多页文档时使用) 49 | # "rec_text": "绿洲仕格维花园公寓", // 识别出的文本内容 50 | # "rec_score": 0.9875162839889526 // 识别结果的置信度分数(0-1之间) 51 | # } 52 | # ``` 53 | 54 | # ## 字段说明 55 | 56 | # - **input_path**: 输入的源图像文件名 57 | # - **page_index**: 在多页文档中的页码索引,null 表示单页文档或默认页 58 | # - **rec_text**: OCR 识别出的文本内容 59 | # - **rec_score**: 识别结果的置信度,越接近 1 表示识别结果越可信 60 | -------------------------------------------------------------------------------- /x_pdf2md/pdf2md_converter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | """ 4 | 命令行入口点 - 处理命令行参数并调用相应功能 5 | """ 6 | 7 | import argparse 8 | import os 9 | from pathlib import Path 10 | from typing import Optional, List, Union 11 | # 从process_pdf.py导入必要的依赖 12 | from x_pdf2md.image_utils.process_page import process_page_layout 13 | from x_pdf2md.image_utils.layout_config import LayoutConfig 14 | from x_pdf2md.image_utils.region_image import RegionImage 15 | from tqdm import tqdm 16 | # 更新导入路径 17 | from x_pdf2md.markdown_formatter import format_pdf_regions 18 | from x_pdf2md.pdf_utils.pdf_to_image import pdf_to_images 19 | from x_pdf2md.remote_image import default_uploader 20 | from x_pdf2md.config import update_config, get_config, 
def process_pdf_document(
    pdf_path: str,
    output_dir: str,
    start_page: int = 0,
    end_page: Optional[int] = None,
    dpi: int = 300,
    threshold_left_right: float = 0.9,
    threshold_cross: float = 0.3,
) -> List[List[RegionImage]]:
    """
    Render a PDF to page images, then analyse and crop the layout of each page.

    Args:
        pdf_path: Path of the PDF file.
        output_dir: Root output directory. Page images go to
            ``<pdf name>_images`` and the crops of page N to
            ``<pdf name>_page_N`` beneath it.
        start_page: First page to process (0-based).
        end_page: Last page to process (inclusive); ``None`` means all pages.
        dpi: Resolution used when rendering pages to images.
        threshold_left_right: Threshold for assigning an element to the left or
            right column.
        threshold_cross: Threshold for treating an element as column-spanning.

    Returns:
        List[List[RegionImage]]: One list of regions per processed page.
    """
    stem = Path(pdf_path).stem
    root_dir = os.path.abspath(output_dir)
    images_root = os.path.join(root_dir, f"{stem}_images")
    os.makedirs(images_root, exist_ok=True)

    print("正在将PDF转换为图像...")
    page_images = pdf_to_images(
        pdf_path=pdf_path,
        output_dir=images_root,
        start_page=start_page,
        end_page=end_page,
        dpi=dpi,
    )

    print("正在分析和裁剪页面...")
    regions_per_page = []
    # NOTE(review): numbering restarts at 1 for the first rendered page, so it
    # does not reflect start_page — confirm whether that is intended.
    for page_no, page_image in enumerate(tqdm(page_images, desc="处理页面"), start=1):
        page_root = os.path.join(root_dir, f"{stem}_page_{page_no}")
        os.makedirs(page_root, exist_ok=True)

        regions_per_page.append(
            process_page_layout(
                image_path=page_image,
                output_dir=page_root,
                page_number=page_no,
                threshold_left_right=threshold_left_right,
                threshold_cross=threshold_cross,
            )
        )

    return regions_per_page
DEFAULT_CONFIG["THRESHOLD_CROSS"], # 使用配置中的默认值 93 | upload_images: bool = False, 94 | output_md_path: Optional[str] = None, 95 | api_key: Optional[str] = None, 96 | base_url: Optional[str] = None, # 从config中获取,不设默认值 97 | ) -> Union[str, List[str]]: 98 | """ 99 | 将PDF文档转换为Markdown 100 | 101 | Args: 102 | pdf_path: PDF文件路径 103 | output_dir: 输出目录路径,默认为"output" 104 | start_page: 起始页码(从0开始),默认为0 105 | end_page: 结束页码(包含),如果为None则处理所有页面 106 | dpi: PDF转图像的分辨率,默认为300 107 | threshold_left_right: 判定左右栏的阈值,默认为0.9 108 | threshold_cross: 判定跨栏的阈值,默认为0.3 109 | upload_images: 是否上传图片,默认为False 110 | output_md_path: Markdown输出文件路径,如果为None则不保存文件 111 | api_key: API密钥,可选,默认从config获取 112 | base_url: API基础URL,可选,默认从config获取 113 | 114 | Returns: 115 | 如果提供了output_md_path,返回保存的文件路径;否则返回Markdown内容的列表 116 | """ 117 | # 更新配置 118 | config_updates = {} 119 | if api_key: 120 | config_updates["API_KEY"] = api_key 121 | if base_url: 122 | config_updates["BASE_URL"] = base_url 123 | if dpi and dpi != DEFAULT_CONFIG["DEFAULT_DPI"]: 124 | config_updates["DEFAULT_DPI"] = dpi 125 | if threshold_left_right is not None and threshold_left_right != DEFAULT_CONFIG["THRESHOLD_LEFT_RIGHT"]: 126 | config_updates["THRESHOLD_LEFT_RIGHT"] = threshold_left_right 127 | if threshold_cross is not None and threshold_cross != DEFAULT_CONFIG["THRESHOLD_CROSS"]: 128 | config_updates["THRESHOLD_CROSS"] = threshold_cross 129 | 130 | if config_updates: 131 | update_config(config_updates) 132 | 133 | # 处理PDF 134 | regions = process_pdf_document( 135 | pdf_path=pdf_path, 136 | output_dir=output_dir, 137 | start_page=start_page, 138 | end_page=end_page, 139 | dpi=dpi, 140 | threshold_left_right=threshold_left_right, 141 | threshold_cross=threshold_cross, 142 | ) 143 | 144 | # 初始化图片上传器(如果需要) 145 | image_uploader = None 146 | if upload_images: 147 | image_uploader = default_uploader 148 | 149 | # 格式化结果,传递输出目录 150 | formatted_pages = format_pdf_regions(regions, image_uploader, output_dir=output_dir) 151 | 152 | # 创建输出目录(如果需要) 153 | 
def main(argv=None):
    """Command-line entry point: parse arguments and convert a PDF to Markdown.

    Args:
        argv: Optional list of argument strings to parse. When ``None`` (the
            default) argparse falls back to ``sys.argv[1:]``, which is what
            the previous zero-argument ``main()`` did.
    """
    # (flag, DEFAULT_CONFIG key, help label) for every model-name option.
    # The argparse ``dest`` of each flag equals the lower-cased config key.
    model_options = (
        ("--formula-model", "FORMULA_MODEL", "公式识别模型名称"),
        ("--ocr-det-model", "OCR_DET_MODEL", "OCR检测模型名称"),
        ("--ocr-rec-model", "OCR_REC_MODEL", "OCR识别模型名称"),
        ("--layout-model", "LAYOUT_MODEL", "版面分析模型名称"),
        ("--vlm-model", "VLM_MODEL", "视觉语言模型名称"),
    )

    parser = argparse.ArgumentParser(description="PDF文档处理工具")
    parser.add_argument("-p", "--pdf", required=True, help="输入PDF文件路径")
    parser.add_argument("-o", "--output", default="output", help="输出目录路径")
    parser.add_argument(
        "-s", "--start_page", type=int, default=0, help="起始页码(从0开始)"
    )
    parser.add_argument("-e", "--end_page", type=int, default=None, help="结束页码")
    parser.add_argument(
        "-d",
        "--dpi",
        type=int,
        default=DEFAULT_CONFIG["DEFAULT_DPI"],
        help=f"图像分辨率,默认为{DEFAULT_CONFIG['DEFAULT_DPI']}",
    )
    parser.add_argument(
        "--threshold_lr",
        type=float,
        default=DEFAULT_CONFIG["THRESHOLD_LEFT_RIGHT"],
        help=f"左右栏阈值,默认为{DEFAULT_CONFIG['THRESHOLD_LEFT_RIGHT']}",
    )
    parser.add_argument(
        "--threshold_cross",
        type=float,
        default=DEFAULT_CONFIG["THRESHOLD_CROSS"],
        help=f"跨栏阈值,默认为{DEFAULT_CONFIG['THRESHOLD_CROSS']}",
    )
    # NOTE(review): --no-filter is parsed but never forwarded to
    # convert_pdf_to_markdown below; confirm whether it should be wired up.
    parser.add_argument(
        "--no-filter", action="store_false", dest="filter_regions", help="不过滤区域"
    )
    parser.add_argument("--upload", action="store_true", help="启用图片上传")
    parser.add_argument(
        "--output-md", type=str, default="output.md", help="Markdown输出文件路径"
    )

    # API and model configuration options.
    parser.add_argument("--api-key", type=str, help="API密钥")
    parser.add_argument(
        "--base-url",
        type=str,
        default=DEFAULT_CONFIG["BASE_URL"],
        help=f"API基础URL,默认为{DEFAULT_CONFIG['BASE_URL']}",
    )
    for flag, config_key, label in model_options:
        parser.add_argument(
            flag,
            type=str,
            default=DEFAULT_CONFIG[config_key],
            help=f"{label},默认为{DEFAULT_CONFIG[config_key]}",
        )

    args = parser.parse_args(argv)

    # Push every model name that differs from its default into the shared
    # config. VLM_MODEL is included here: previously --vlm-model was accepted
    # on the command line but its value was silently dropped.
    config_updates = {}
    for _flag, config_key, _label in model_options:
        chosen = getattr(args, config_key.lower())
        if chosen != DEFAULT_CONFIG[config_key]:
            config_updates[config_key] = chosen

    if config_updates:
        update_config(config_updates)

    # Run the conversion with the parsed options.
    convert_pdf_to_markdown(
        pdf_path=args.pdf,
        output_dir=args.output,
        start_page=args.start_page,
        end_page=args.end_page,
        dpi=args.dpi,
        threshold_left_right=args.threshold_lr,
        threshold_cross=args.threshold_cross,
        upload_images=args.upload,
        output_md_path=args.output_md,
        api_key=args.api_key,
        base_url=args.base_url,
    )


if __name__ == "__main__":
    main()
def _render_page(page, output_path, dpi):
    """Rasterize one already-opened pdfplumber page and save it as a PNG file."""
    page.to_image(resolution=dpi).save(output_path, format="PNG")
    return output_path


def pdf_page_to_image(pdf_path, page_number, output_path, dpi=300):
    """
    Extract a single page of a PDF as a PNG image.

    Args:
        pdf_path (str): Path of the PDF file.
        page_number (int): Zero-based index of the page to extract.
        output_path (str): Where the PNG is written; its parent directory is
            created when missing.
        dpi (int): Rendering resolution passed to ``page.to_image``.

    Returns:
        str | None: ``output_path`` on success. Any failure (unreadable PDF,
        page index out of range, save error) is printed and reported as
        ``None`` rather than raised.
    """
    try:
        # Create the output directory first, before the PDF is opened.
        os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True)

        with pdfplumber.open(pdf_path) as pdf:
            if page_number < 0 or page_number >= len(pdf.pages):
                raise ValueError(f"页码 {page_number} 超出范围。PDF共有 {len(pdf.pages)} 页。")
            return _render_page(pdf.pages[page_number], output_path, dpi)

    except Exception as e:
        # Deliberate best-effort: callers test the return value for None.
        print(f"提取PDF页面时出错: {e}")
        return None


def pdf_to_images(pdf_path, output_dir, start_page=0, end_page=None, dpi=300):
    """
    Convert a range of PDF pages into PNG images.

    The PDF is opened once and every requested page is rendered from that
    single handle (previously the file was reopened for each page).

    Args:
        pdf_path (str): Path of the PDF file.
        output_dir (str): Directory receiving the images (created if missing).
        start_page (int): Zero-based index of the first page.
        end_page (int | None): Zero-based index of the last page, inclusive;
            ``None`` means "through the last page".
        dpi (int): Rendering resolution.

    Returns:
        list: Paths of the images that were written, named
        ``<pdf stem>_page_<1-based page number>.png``. A page that fails to
        render is reported and skipped; an invalid page range or an
        unreadable PDF is reported and yields an empty list.
    """
    os.makedirs(output_dir, exist_ok=True)

    # File name of the PDF without its extension, used as the image prefix.
    pdf_name = Path(pdf_path).stem

    try:
        with pdfplumber.open(pdf_path) as pdf:
            total_pages = len(pdf.pages)

            if end_page is None:
                end_page = total_pages - 1

            if start_page < 0 or start_page >= total_pages:
                raise ValueError(f"起始页码 {start_page} 无效。PDF共有 {total_pages} 页。")

            if end_page < start_page or end_page >= total_pages:
                raise ValueError(f"结束页码 {end_page} 无效。PDF共有 {total_pages} 页。")

            image_paths = []
            page_range = range(start_page, end_page + 1)
            for page_num in tqdm(page_range, desc="转换PDF页面为图像"):
                output_image = os.path.join(output_dir, f"{pdf_name}_page_{page_num+1}.png")
                try:
                    image_paths.append(_render_page(pdf.pages[page_num], output_image, dpi))
                except Exception as e:
                    # Same per-page best-effort as pdf_page_to_image: report
                    # the failure and carry on with the remaining pages.
                    print(f"提取PDF页面时出错: {e}")

            return image_paths

    except Exception as e:
        print(f"处理PDF时出错: {e}")
        return []


# Kept for ad-hoc command-line use.
if __name__ == "__main__":

    pdf_path = "./test_x_pdf2md.pdf"
    output_dir = "./output"
    # Convert the sample PDF into page images.
    image_paths = pdf_to_images(
        pdf_path=pdf_path,
        output_dir=output_dir,
        dpi=300
    )
['default_uploader'] 10 | -------------------------------------------------------------------------------- /x_pdf2md/remote_image/image_names.json: -------------------------------------------------------------------------------- 1 | { 2 | "d8cde17d3272483aadffee7942fa5a12.png": "文档飞舞.png", 3 | "d8b83c8b5db34b9abe81358248e01feb.png": "car.png", 4 | "26aec496c5b04c08b5b234657d332201.png": "car.png", 5 | "6440738c700d4a638af2e3bdb46d2b18.png": "3_image_0.9310.png", 6 | "ce61d84ea087494aba596334712a9d16.png": "1_image_0.8737.png", 7 | "d5263f6aab80473f9acb7b44da3ad13d.png": "3_image_0.9332.png", 8 | "d11d01ef03c44c0897fcaba132d76b15.png": "1_chart_0.6732.png", 9 | "14c685439a234a7992584ffa45baa00c.png": "6_image_0.9474.png", 10 | "0bc2b863d25f462a9be99c8b59a94278.png": "1_image_0.8843.png", 11 | "fc5b2e8a370343349efb19567176eba4.png": "5_image_0.9802.png", 12 | "77a78431422645afa2438a6185f538ad.png": "1_image_0.9368.png", 13 | "e2c8f97af1964083be85e9accad22d74.png": "3_chart_0.9602.png", 14 | "e2f1c3479ea2451686b014935da352dc.png": "3_image_0.9310.png", 15 | "a6a4b1ba811e461fbf9db0f5568544ae.png": "1_image_0.8737.png", 16 | "14d9aac105404d909196c67b3db7d39f.png": "3_image_0.9332.png", 17 | "0486c1d0971144e182563f5e468d7659.png": "1_chart_0.6732.png", 18 | "796c9840eef843deaa8b0b2a10bdee00.png": "6_image_0.9474.png", 19 | "acc78019636545ce914c28e66c270901.png": "1_image_0.8843.png", 20 | "f9653e02835845f089df2bf58afe7665.png": "5_image_0.9802.png", 21 | "b34f44dd026447f59e4ef6551a825d90.png": "1_image_0.9368.png", 22 | "eac2252e2a2e4614bf9f4ddfe4d258a9.png": "3_chart_0.9602.png", 23 | "00985878a4ea4bfcbfc57f37e294a29a.png": "2_image_0.9310.png", 24 | "d7099bd9820c4785b2934d690f863168.png": "0_image_0.8737.png", 25 | "afe096a66c224bcdbe153d4cf1289db1.png": "2_image_0.9332.png", 26 | "44ce898afe704f59a90fe69a876f9450.png": "0_chart_0.6732.png", 27 | "2a6ab8e378cf4a8ebdc028b41af25781.png": "5_image_0.9474.png", 28 | "7c9f474ff43b48118601e031de081313.png": 
"0_image_0.8843.png", 29 | "baf9911a9c244a6d938484aa46e22807.png": "4_image_0.9802.png", 30 | "f7e6d0ebcd4e4deabb6d62cc8877e216.png": "0_image_0.9368.png", 31 | "075771b951fe45e992da22b6281405a0.png": "2_chart_0.9602.png", 32 | "d335ddab7ee1446e93c9fdbce532586c.png": "2_image_0.9310.png", 33 | "e328186431ef4257925e29ae93bd85e7.png": "0_image_0.8737.png", 34 | "668fb1eef5e64611980a016f2d060b43.png": "2_image_0.9332.png", 35 | "e96fc48547b743dda00842d5ecd2f166.png": "0_chart_0.6732.png", 36 | "569fd6ad0fec40318d1aa92c6b329827.png": "5_image_0.9474.png", 37 | "3b65c3ce1bf140e48994782b1b15b52f.png": "0_image_0.8843.png", 38 | "4b62cdb34c2a478d97d7f5417091edba.png": "4_image_0.9802.png", 39 | "1a93ac8922714d7194b65d9cffddeb34.png": "0_image_0.9368.png", 40 | "cdc04e2d086f450a87f882e40fdef62f.png": "2_chart_0.9602.png", 41 | "09430e275628438c8b7bc1db393b5ceb.png": "2_image_0.9310.png", 42 | "e6f5a4dbe62645fbb719c37c33aaadf0.png": "0_image_0.8737.png", 43 | "76ec5a3704a2448a9d5242de4389d5f6.png": "2_image_0.9332.png", 44 | "f76f60e9e5d448bdb5481c184275614e.png": "0_chart_0.6732.png", 45 | "bc0614bfd5a34cd8b0fb626ab2c797b2.png": "5_image_0.9474.png", 46 | "44878ac3ab6d41068fbe1bde4d6f89d3.png": "0_image_0.8843.png", 47 | "7d0d7929c95544bdb87e8fc1f367664d.png": "4_image_0.9802.png", 48 | "f2e6caad04194952a9f096fd0e73659e.png": "0_image_0.9368.png", 49 | "8e7465cee34c4c458b04d885190f9b27.png": "2_chart_0.9602.png", 50 | "55c8f8af6074461eb4e19c849e144aaa.png": "2_image_0.9310.png", 51 | "7521553b2530417a807dd3bb6b1d4c68.png": "0_image_0.8737.png", 52 | "159b393f626c4177b5a28a6bce955c50.png": "2_image_0.9332.png", 53 | "a418628c21c6469698cda3035e879233.png": "0_chart_0.6732.png" 54 | } -------------------------------------------------------------------------------- /x_pdf2md/remote_image/image_serve.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from fastapi import FastAPI, UploadFile, Query 4 | from 
# Resolve the bundled static assets relative to this module, so the server can
# be started from any working directory instead of only from this folder.
STATIC_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "static")

os.makedirs(UPLOAD_DIR, exist_ok=True)

app = FastAPI()

# Mount the static file directories.
app.mount("/images", StaticFiles(directory=UPLOAD_DIR), name="images")
app.mount("/static", StaticFiles(directory=STATIC_DIR), name="static")


@app.get("/")
async def root():
    """Serve the navigation page."""
    return FileResponse(os.path.join(STATIC_DIR, "index.html"))


@app.get("/health")
async def health_check():
    """Health-check endpoint."""
    return {"status": "ok"}


# Mapping of stored (unique) file name -> original file name, loaded from
# IMAGE_NAMES_FILE when that file exists.
image_names = {}
if os.path.exists(IMAGE_NAMES_FILE):
    with open(IMAGE_NAMES_FILE, 'r', encoding='utf-8') as f:
        image_names = json.load(f)


@app.post("/image_upload")
async def upload_image(file: UploadFile):
    """
    Image upload endpoint.

    Stores the upload under a random hex name that keeps the original
    extension, records the original name in IMAGE_NAMES_FILE and returns the
    path of the stored file relative to the server root. Any failure is
    returned as an HTTP 500 JSON error.
    """
    try:
        # The file name is client-supplied and may be absent; keep only its
        # final path component so no directory part is ever recorded.
        original_filename = os.path.basename(file.filename or "")
        file_extension = os.path.splitext(original_filename)[1]
        unique_filename = f"{uuid.uuid4().hex}{file_extension}"

        # Make sure the upload directory exists.
        os.makedirs(UPLOAD_DIR, exist_ok=True)

        # Save the file.
        file_path = os.path.join(UPLOAD_DIR, unique_filename)
        content = await file.read()
        with open(file_path, "wb") as f:
            f.write(content)

        # Persist the stored-name -> original-name mapping.
        image_names[unique_filename] = original_filename
        with open(IMAGE_NAMES_FILE, 'w', encoding='utf-8') as f:
            json.dump(image_names, f, ensure_ascii=False, indent=2)

        # Return a relative path; BASE_URL is not included.
        return {"url": f"images/{unique_filename}", "originalName": original_filename}
    except Exception as e:
        return JSONResponse(
            status_code=500,
            content={"error": f"Upload failed: {str(e)}"}
        )


@app.get("/api/images")
async def list_images(page: int = Query(default=1, ge=1), page_size: int = Query(default=20, ge=1, le=100)):
    """Return one page of the uploaded images, ordered by ``os.path.getctime`` descending."""
    all_images = [f for f in os.listdir(UPLOAD_DIR) if os.path.isfile(os.path.join(UPLOAD_DIR, f))]
    all_images.sort(key=lambda x: os.path.getctime(os.path.join(UPLOAD_DIR, x)), reverse=True)

    # Pagination arithmetic (ceil division for the page count).
    total = len(all_images)
    total_pages = (total + page_size - 1) // page_size
    start = (page - 1) * page_size
    end = min(start + page_size, total)

    # Attach the original file name, falling back to the stored name.
    image_list = [
        {"filename": img, "originalName": image_names.get(img, img)}
        for img in all_images[start:end]
    ]

    return {
        "images": image_list,
        "totalPages": total_pages,
        "currentPage": page,
        "total": total
    }


if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host=HOST, port=PORT)
class ImageUploader:
    """Client that uploads image files to the image server over HTTP."""

    def __init__(self, server_url: Optional[str] = None, max_retries: int = 3, timeout: int = 10):
        """
        Initialise the uploader.

        Args:
            server_url: Base URL of the image server; when omitted (or empty)
                ``IMAGE_SERVER['base_url']`` from the config module is used.
            max_retries: Total retry budget handed to urllib3's ``Retry``.
            timeout: Per-request timeout in seconds.
        """
        self.server_url = server_url or IMAGE_SERVER['base_url']
        self.server_url = self.server_url.rstrip('/')
        self.timeout = timeout

        # Session with a retry policy for 502/503/504 responses.
        self.session = requests.Session()
        retries = Retry(
            total=max_retries,
            backoff_factor=0.5,
            status_forcelist=[502, 503, 504]
        )
        self.session.mount('http://', HTTPAdapter(max_retries=retries))
        self.session.mount('https://', HTTPAdapter(max_retries=retries))

    @staticmethod
    def _guess_content_type(filename: str) -> str:
        """Return the MIME type implied by *filename*'s extension, or a generic binary type."""
        import mimetypes  # local import: only needed when building an upload

        guessed, _encoding = mimetypes.guess_type(filename)
        return guessed or 'application/octet-stream'

    def get_absolute_url(self, relative_path: str) -> Optional[str]:
        """
        Turn a server-relative path into a full URL.

        Args:
            relative_path: Path of the image relative to the server root.

        Returns:
            The full URL; the input unchanged when it already starts with
            ``http://`` or ``https://``; ``None`` when the input is ``None``.
        """
        if relative_path is None:
            return None
        if relative_path.startswith(('http://', 'https://')):
            return relative_path
        return f"{self.server_url}/{relative_path.lstrip('/')}"

    def upload(self, image_path: str) -> Optional[str]:
        """
        Upload one image file to the server.

        Args:
            image_path: Local path of the file to upload.

        Returns:
            The absolute URL of the uploaded image, or ``None`` when the file
            is missing, the server answers with a non-200 status or without a
            ``url`` field, or the request fails. Failures are logged, not
            raised.
        """
        try:
            # Bail out early when there is nothing to send.
            if not os.path.exists(image_path):
                logger.error(f"文件不存在: {image_path}")
                return None

            filename = os.path.basename(image_path)
            upload_endpoint = f"{self.server_url}/image_upload"

            with open(image_path, 'rb') as f:
                # Tuple form sets the multipart file name explicitly; the
                # content type follows the real extension instead of always
                # claiming JPEG.
                files = {
                    'file': (filename, f, self._guess_content_type(filename))
                }
                logger.info(f"正在上传文件 {image_path} 到 {upload_endpoint}")
                response = self.session.post(
                    upload_endpoint,
                    files=files,
                    timeout=self.timeout
                )

            if response.status_code != 200:
                logger.error(
                    f"上传失败: HTTP {response.status_code}\n"
                    f"响应内容: {response.text}\n"
                    f"请求URL: {upload_endpoint}"
                )
                return None

            url = response.json().get('url')
            if not url:
                logger.error("服务器返回的URL为空")
                return None

            # The server answers with a relative path such as 'images/xxx.png'.
            # Reuse the shared join logic so a leading slash or an
            # already-absolute URL is handled the same way everywhere.
            absolute_url = self.get_absolute_url(url)
            logger.info(f"上传成功,URL: {absolute_url}")
            return absolute_url

        except requests.exceptions.ConnectionError as e:
            logger.error(f"服务器连接失败: {str(e)}")
            return None
        except requests.exceptions.Timeout as e:
            logger.error(f"请求超时: {str(e)}")
            return None
        except Exception as e:
            logger.error(f"上传图片时发生错误: {str(e)}")
            return None

    def check_server(self) -> Tuple[bool, str]:
        """
        Probe the server's ``/health`` endpoint.

        Returns:
            ``(True, message)`` when the endpoint answers 200, otherwise
            ``(False, message)`` describing the status code or the error.
        """
        try:
            response = self.session.get(f"{self.server_url}/health", timeout=self.timeout)
            if response.status_code == 200:
                return True, "服务器运行正常"
            return False, f"服务器返回异常状态码: {response.status_code}"
        except requests.exceptions.ConnectionError:
            return False, f"无法连接到服务器 {self.server_url}"
        except Exception as e:
            return False, f"检查服务器时发生错误: {str(e)}"


# Manual smoke test: uploads car.png from the current directory.
if __name__ == "__main__":
    # Configure log output.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s'
    )

    uploader = ImageUploader(server_url="http://localhost:8100")  # explicit server address
    print(f"使用服务器地址: {uploader.server_url}")

    # Check the server before attempting the upload.
    status, message = uploader.check_server()
    if not status:
        print(f"服务器检查失败: {message}")
        exit(1)

    print("服务器连接正常,开始上传图片...")
    image_url = uploader.upload("car.png")
    if image_url:
        print(f"图片上传成功,URL为: {image_url}")
    else:
        print("图片上传失败,请检查日志获取详细信息")

图片服务中心

46 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /x_pdf2md/remote_image/static/list.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 图片列表 5 | 29 | 30 | 31 | 34 |

图片列表

35 |
36 | 37 | 38 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /x_pdf2md/remote_image/static/upload.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 上传图片 5 | 40 | 41 | 42 | 45 |

上传图片

46 |
47 |
48 | 49 | 50 |
51 | 52 |
53 |
def test_convert_pdf():
    """
    Convert the bundled sample PDF (tests/test_x_pdf2md.pdf next to this
    module) and write the result into ./output under the current directory.

    Returns:
        The value returned by ``convert_pdf_to_markdown`` on success, or
        ``False`` when the sample PDF cannot be found.
    """
    # Directory containing this module; the sample PDF lives in its tests/
    # sub-directory.
    current_module_dir = os.path.dirname(os.path.abspath(__file__))
    pdf_path = os.path.join(current_module_dir, "tests", "test_x_pdf2md.pdf")

    # Output goes below the current working directory.
    output_dir = os.path.join(os.getcwd(), "output")

    # Guard clause: nothing to do without the sample file.
    if not os.path.exists(pdf_path):
        print(f"错误:找不到测试PDF文件: {pdf_path}")
        # Name the file that is actually looked up (the message used to say
        # test.pdf, which does not match the path checked above).
        print(f"请确保在 x_pdf2md/tests 目录中存在 {os.path.basename(pdf_path)} 文件")
        return False

    os.makedirs(output_dir, exist_ok=True)

    print(f"开始处理PDF文件: {pdf_path}")

    output_path = convert_pdf_to_markdown(
        pdf_path=pdf_path,
        output_dir=output_dir,
        start_page=0,
        end_page=None,  # process every page
        dpi=300,
        upload_images=False,  # do not upload images by default
        output_md_path=os.path.join(output_dir, "test_result.md")
    )

    print(f"PDF转换成功!输出文件路径: {output_path}")
    return output_path


if __name__ == "__main__":
    # Run the conversion when this file is executed directly.
    test_convert_pdf()
-------------------------------------------------------------------------------- /x_pdf2md/tests/test_x_pdf2md.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/li-xiu-qi/x-pdf2md/ef69a42c2b8ca9da128762fe2bda03bd909cbee0/x_pdf2md/tests/test_x_pdf2md.pdf --------------------------------------------------------------------------------