├── magic_doc ├── __init__.py ├── common │ ├── __init__.py │ └── default_config.py ├── conv │ ├── __init__.py │ ├── base.py │ ├── pptx_python_pptx.py │ ├── docx_xml_parse.py │ ├── doc_antiword.py │ ├── conv_html.py │ ├── pdf.py │ ├── doc_libreoffice.py │ ├── ppt_libreoffice.py │ ├── pdf_pp_structurev2.py │ └── pdf_magicpdf.py ├── libs │ ├── __init__.py │ └── version.py ├── model │ ├── __init__.py │ ├── sub_modules │ │ ├── __init__.py │ │ ├── UniMERNet │ │ │ └── __init__.py │ │ ├── layoutlmv3 │ │ │ ├── __init__.py │ │ │ ├── layoutlmft │ │ │ │ ├── data │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── funsd.py │ │ │ │ │ └── data_collator.py │ │ │ │ ├── __init__.py │ │ │ │ └── models │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── layoutlmv3 │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── tokenization_layoutlmv3.py │ │ │ │ │ ├── tokenization_layoutlmv3_fast.py │ │ │ │ │ └── configuration_layoutlmv3.py │ │ │ └── model_init.py │ │ └── post_process.py │ ├── seq_layout.py │ ├── seq_paddle.py │ ├── seq_ocr.py │ ├── doc_analysis_by_pp.py │ ├── parallel_paddle.py │ ├── parallel_layout.py │ └── parallel_ocr.py ├── contrib │ ├── __init__.py │ ├── pdf │ │ ├── __init__.py │ │ └── pdf_extractor.py │ ├── office │ │ ├── formula │ │ │ ├── __init__.py │ │ │ ├── mml │ │ │ │ ├── __init__.py │ │ │ │ └── xsl │ │ │ │ │ ├── mmltex.xsl │ │ │ │ │ ├── README │ │ │ │ │ └── tables.xsl │ │ │ └── omml │ │ │ │ └── __init__.py │ │ ├── ppt_extract.py │ │ ├── __init__.py │ │ ├── doc.py │ │ ├── pptx_extract.py │ │ └── docx_extract.py │ ├── magic_html │ │ ├── extractors │ │ │ ├── __init__.py │ │ │ ├── article_extractor.py │ │ │ ├── title_extractor.py │ │ │ ├── custom_extractor.py │ │ │ ├── weixin_extractor.py │ │ │ └── forum_extractor.py │ │ ├── mmltex │ │ │ ├── mmltex.xsl │ │ │ ├── README │ │ │ ├── tables.xsl │ │ │ └── glayout.xsl │ │ └── __init__.py │ ├── wrapper_exceptions.py │ ├── test_data │ │ ├── doc │ │ │ └── test.doc │ │ └── url_service │ │ │ └── run.py │ └── model │ │ └── __init__.py ├── progress │ ├── __init__.py │ ├── pupdator.py │ └── filepupdator.py ├── restful_api │ ├── __init__.py │ ├── common │ │ ├── __init__.py │ │ ├── import_models.py │ │ ├── oss │ │ │ ├── __init__.py │ │ │ └── oss.py │ │ ├── web_hook.py │ │ ├── ext.py │ │ └── custom_response.py │ ├── config │ │ ├── __init__.py │ │ └── config.yaml │ ├── api │ │ ├── analysis │ │ │ ├── __init__.py │ │ │ ├── ext.py │ │ │ ├── serialization.py │ │ │ ├── magic_html_view.py │ │ │ └── magic_pdf_view.py │ │ ├── extentions.py │ │ └── __init__.py │ └── app.py ├── bin │ └── linux │ │ ├── antiword │ │ └── share │ │ └── antiword │ │ ├── UTF-8.txt │ │ ├── fontnames.russian │ │ ├── Example │ │ ├── Default │ │ └── fontnames ├── utils │ ├── null_writer.py │ ├── yaml_load.py │ ├── __init__.py │ ├── path_utils.py │ └── config.py └── resources │ └── model │ ├── model_configs.yaml │ └── UniMERNet │ ├── demo.yaml │ └── demo_old.yaml ├── assets ├── contributor.png └── license.svg ├── magic-doc-template.json ├── test ├── test_cli │ ├── conf │ │ └── conf.py │ └── test_cli.py └── test_docconv.py ├── .gitignore ├── requirements.txt ├── update_version.py ├── .github └── workflows │ ├── benchmark.yml │ ├── ci.yml │ └── python-package.yml ├── tools ├── scoring.py ├── benchmark.py ├── clean_photo.py └── markdown_calculate.py ├── setup.py ├── README_zh-CN.md └── README.md /magic_doc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /magic_doc/common/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /magic_doc/conv/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /magic_doc/libs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /magic_doc/model/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /magic_doc/contrib/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /magic_doc/contrib/pdf/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /magic_doc/progress/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /magic_doc/restful_api/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /magic_doc/model/sub_modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /magic_doc/restful_api/common/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /magic_doc/restful_api/config/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /magic_doc/contrib/office/formula/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /magic_doc/restful_api/common/import_models.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /magic_doc/restful_api/common/oss/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /magic_doc/libs/version.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.1.38" 2 | -------------------------------------------------------------------------------- /magic_doc/model/sub_modules/UniMERNet/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /magic_doc/model/sub_modules/layoutlmv3/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /magic_doc/contrib/magic_html/extractors/__init__.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | -------------------------------------------------------------------------------- /assets/contributor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/magicpdf/Magic-Doc/HEAD/assets/contributor.png -------------------------------------------------------------------------------- /magic_doc/bin/linux/antiword: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/magicpdf/Magic-Doc/HEAD/magic_doc/bin/linux/antiword -------------------------------------------------------------------------------- /magic-doc-template.json: -------------------------------------------------------------------------------- 1 | { 2 | "s3_config": { 3 | "ak": "", 4 | "sk": "", 5 | "endpoint": "" 6 | } 7 | } -------------------------------------------------------------------------------- /magic_doc/contrib/wrapper_exceptions.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | class NotSupportOcrPDFException(BaseException): 5 | pass 6 | 7 | -------------------------------------------------------------------------------- /magic_doc/contrib/test_data/doc/test.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/magicpdf/Magic-Doc/HEAD/magic_doc/contrib/test_data/doc/test.doc -------------------------------------------------------------------------------- /magic_doc/utils/null_writer.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class NullWriter: 4 | def write(self, *args, **kwargs): 5 | return None 6 | 7 | -------------------------------------------------------------------------------- /magic_doc/restful_api/common/web_hook.py: -------------------------------------------------------------------------------- 1 | from flask import request 2 | 3 | 4 | # @jwt_required() 5 | def before_request(): 6 | return None 7 | -------------------------------------------------------------------------------- /magic_doc/model/sub_modules/layoutlmv3/layoutlmft/data/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from .data_collator import DataCollatorForKeyValueExtraction 3 | -------------------------------------------------------------------------------- /magic_doc/bin/linux/share/antiword/UTF-8.txt: -------------------------------------------------------------------------------- 1 | # UTF-8 to Unicode 2 | # This file is a dummy. 3 | # The conversion is done algorithmically, not by a table look-up.
4 | -------------------------------------------------------------------------------- /test/test_cli/conf/conf.py: -------------------------------------------------------------------------------- 1 | import os 2 | conf = { 3 | "code_path": os.environ.get('GITHUB_WORKSPACE'), 4 | "pdf_dev_path": os.environ.get('GITHUB_WORKSPACE') + "/test/test_cli/pdf_dev", 5 | "pdf_res_path": "/home/quyuan/code/Magic-PDF/Magic-PDF/Magic-PDF/ci" 6 | 7 | } -------------------------------------------------------------------------------- /magic_doc/model/sub_modules/layoutlmv3/layoutlmft/__init__.py: -------------------------------------------------------------------------------- 1 | from .models import ( 2 | LayoutLMv3Config, 3 | LayoutLMv3ForTokenClassification, 4 | LayoutLMv3ForQuestionAnswering, 5 | LayoutLMv3ForSequenceClassification, 6 | LayoutLMv3Tokenizer, 7 | ) 8 | -------------------------------------------------------------------------------- /magic_doc/model/sub_modules/layoutlmv3/layoutlmft/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .layoutlmv3 import ( 2 | LayoutLMv3Config, 3 | LayoutLMv3ForTokenClassification, 4 | LayoutLMv3ForQuestionAnswering, 5 | LayoutLMv3ForSequenceClassification, 6 | LayoutLMv3Tokenizer, 7 | ) 8 | -------------------------------------------------------------------------------- /magic_doc/restful_api/common/ext.py: -------------------------------------------------------------------------------- 1 | def singleton_func(cls): 2 | instance = {} 3 | 4 | def _singleton(*args, **kwargs): 5 | if cls not in instance: 6 | instance[cls] = cls(*args, **kwargs) 7 | return instance[cls] 8 | 9 | return _singleton 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.ipynb* 2 | *.ipynb 3 | *.png 4 | *.jpg 5 | *.pdf 6 | 7 | # python 8 | .ipynb_checkpoints 9 | *.ipynb 10 | **/__pycache__/ 11 | 12 | # vscode 13 | .vscode 14 | 15 | # logs 16 | *.log 17 | *.out 18 | 19 | # debug directory 20 | debug/ 21 | source.dev.env 22 | 23 | # pycharm 24 | .idea/ 25 | 26 | -------------------------------------------------------------------------------- /magic_doc/restful_api/api/analysis/__init__.py: -------------------------------------------------------------------------------- 1 | from flask import Blueprint 2 | from .magic_pdf_view import * 3 | from .magic_html_view import * 4 | from ..extentions import Api 5 | 6 | analysis_blue = Blueprint('analysis', __name__, url_prefix='/analysis') 7 | 8 | api = Api(analysis_blue) 9 | api.add_resource(MagicPdfView, '/pdf') 10 | api.add_resource(MagicHtmlView, '/html') -------------------------------------------------------------------------------- /magic_doc/progress/pupdator.py: -------------------------------------------------------------------------------- 1 | 2 | from abc import ABC, abstractmethod 3 | 4 | 5 | class ConvProgressUpdator(ABC): 6 | def __init__(self):
 7 | pass 8 | 9 | def update(self, progress: int) -> bool: 10 | # TODO: rate limit 11 | return self.do_update(progress) 12 | 13 | @abstractmethod 14 | def do_update(self, progress: int): 15 | pass 16 | 17 | 18 | -------------------------------------------------------------------------------- /magic_doc/resources/model/model_configs.yaml: -------------------------------------------------------------------------------- 1 | models: 2 | layout: True 3 | formula: False 4 | ocr: False 5 | 6 | weights: 7 | layout:
/mnt/hwfile/opendatalab/wufan/weights/ocr_pipeline/Layoutlmv3/model_final.pth 8 | mfd: /mnt/hwfile/opendatalab/wufan/weights/ocr_pipeline/yolov8/withouscihubtrain_addr4_epoch91.pt 9 | mfr: /mnt/hwfile/opendatalab/wufan/weights/ocr_pipeline/UniMERNet/models_old 10 | 11 | 12 | -------------------------------------------------------------------------------- /magic_doc/contrib/office/formula/mml/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from lxml import etree as ET 4 | 5 | transform = None 6 | 7 | _xslt_filename = os.path.join( 8 | os.path.dirname(os.path.abspath(__file__)), "xsl/mmltex.xsl" 9 | ) 10 | 11 | 12 | def mml2tex(mml_xml): 13 | tree = ET.fromstring(mml_xml) 14 | global transform 15 | if not transform: 16 | transform = ET.XSLT(ET.parse(_xslt_filename)) 17 | return str(transform(tree)) 18 | -------------------------------------------------------------------------------- /magic_doc/progress/filepupdator.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from magic_doc.progress.pupdator import ConvProgressUpdator 4 | 5 | 6 | class FileBaseProgressUpdator(ConvProgressUpdator): 7 | def __init__(self, progress_file_path:str): 8 | self.__progress_file_path = progress_file_path 9 | 10 | def do_update(self, progress:int) -> bool: 11 | with open(self.__progress_file_path, 'w', encoding='utf-8') as fout: 12 | fout.write(str(int(progress))) 13 | 14 | return True -------------------------------------------------------------------------------- /magic_doc/model/seq_layout.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from magic_doc.model.sub_modules.layoutlmv3.model_init import Layoutlmv3_Predictor 4 | 5 | class SeqLayout: 6 | def __init__(self, config): 7 | self.model = Layoutlmv3_Predictor(config) 8 | 9 | def __call__(self, params): 10 | """ 11 | params: list[(idx, image)] 12 | """ 13 | if len(params) == 0: 14 | return [] 15 | 16 | results = [] 17 | for idx, image in params: 18 | layout_res = self.model(image) 19 | results.append((idx, layout_res)) 20 | return results 21 | 22 | -------------------------------------------------------------------------------- /magic_doc/conv/base.py: -------------------------------------------------------------------------------- 1 | 2 | from abc import ABC, abstractmethod 3 | 4 | from magic_doc.progress.pupdator import ConvProgressUpdator 5 | from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter 6 | class BaseConv(ABC): 7 | def __init__(self, *args, **kwargs): 8 | pass 9 | 10 | @abstractmethod 11 | def to_md(self, bits: bytes | str, pupdator:ConvProgressUpdator) -> str: 12 | return NotImplemented 13 | 14 | def to_mid_result(self, rw: AbsReaderWriter, bits: bytes | str, pupdator:ConvProgressUpdator) -> list[dict] | dict: 15 | pupdator.update(100) 16 | return {} 17 | 18 | 19 | class ParseFailed(BaseException): 20 | pass -------------------------------------------------------------------------------- /magic_doc/model/seq_paddle.py: -------------------------------------------------------------------------------- 1 | 2 | from magic_pdf.model.pp_structure_v2 import CustomPaddleModel 3 | from magic_doc.utils import split_to_chunks 4 | import paddle 5 | from concurrent.futures import ThreadPoolExecutor, as_completed 6 | 7 | 8 | class SeqPaddle: 9 | def __init__(self, **kwargs): 10 | self.model = CustomPaddleModel(ocr=True, show_log=False) 11 | 12 | def __call__(self, params): 13 | """ 14 | params: 
list[(idx, image)] 15 | """ 16 | results = [] 17 | for idx, img in params: 18 | ocr_res = self.model(img) 19 | results.append((idx, ocr_res)) 20 | 21 | return results 22 | -------------------------------------------------------------------------------- /magic_doc/common/default_config.py: -------------------------------------------------------------------------------- 1 | 2 | from magic_doc.utils.yaml_load import patch_dict_with_env 3 | 4 | class PdfFastParseMethod: 5 | AUTO = "auto" 6 | FAST = "fast" 7 | LITEOCR = "lite_ocr" 8 | 9 | class PdfHqParseMethod: 10 | AUTO = "auto" 11 | OCR = "ocr" 12 | TXT = "txt" 13 | 14 | 15 | DEFAULT_CONFIG = { 16 | "pdf": { 17 | "fast": { 18 | "parsemethod": PdfFastParseMethod.AUTO, 19 | "liteocrmodelinstance": 1, 20 | }, 21 | "hq": { 22 | "parsemethod": PdfHqParseMethod.OCR, 23 | } 24 | } 25 | } 26 | 27 | 28 | DEFAULT_CONFIG = patch_dict_with_env("filter", DEFAULT_CONFIG) 29 | 30 | -------------------------------------------------------------------------------- /magic_doc/restful_api/common/custom_response.py: -------------------------------------------------------------------------------- 1 | from flask import jsonify 2 | 3 | 4 | class ResponseCode: 5 | SUCCESS = 200 6 | PARAM_WARING = 400 7 | MESSAGE = "success" 8 | 9 | 10 | def generate_response(data=None, code=ResponseCode.SUCCESS, msg=ResponseCode.MESSAGE, **kwargs): 11 | """ 12 | Build a custom JSON response 13 | :param code: status code 14 | :param data: response payload 15 | :param msg: response message 16 | :param kwargs: 17 | :return: 18 | """ 19 | msg = msg or 'success' if code == 200 else msg or 'fail' 20 | success = True if code == 200 else False 21 | res = jsonify(dict(code=code, success=success, data=data, msg=msg, **kwargs)) 22 | res.status_code = 200 23 | return res 24 | -------------------------------------------------------------------------------- /magic_doc/restful_api/config/config.yaml: -------------------------------------------------------------------------------- 1 | # Base config 2 | BaseConfig: &base 3 | DEBUG: true 4 | LOG_LEVEL: "DEBUG" 5 | SQLALCHEMY_TRACK_MODIFICATIONS: true 6 | SQLALCHEMY_DATABASE_URI: "" 7 | SECRET_KEY: "#$%^&**$##*(*^%%$**((&" 8 | JWT_SECRET_KEY: "#$%^&**$##*(*^%%$**((&" 9 | JWT_ACCESS_TOKEN_EXPIRES: 300 10 | AccessKeyID: "" 11 | AccessKeySecret: "" 12 | Endpoint: "" 13 | BucketName: "" 14 | UrlExpires: 60 15 | 16 | S3AK: "" 17 | S3SK: "" 18 | S3ENDPOINT: "" 19 | 20 | 21 | # Development config 22 | DevelopmentConfig: 23 | <<: *base 24 | 25 | # Production config 26 | ProductionConfig: 27 | <<: *base 28 | 29 | # Testing config 30 | TestingConfig: 31 | <<: *base 32 | 33 | # Currently active config 34 | CurrentConfig: "DevelopmentConfig" 35 | -------------------------------------------------------------------------------- /magic_doc/model/seq_ocr.py: -------------------------------------------------------------------------------- 1 | 2 | from magic_doc.model.sub_modules.self_modify import ModifiedPaddleOCR 3 | from magic_doc.utils import split_to_chunks 4 | import paddle 5 | from concurrent.futures import ThreadPoolExecutor, as_completed 6 | 7 | 8 | class SeqOCR: 9 | def __init__(self, **kwargs): 10 | self.model = ModifiedPaddleOCR(show_log=False, **kwargs) 11 | 12 | def __call__(self, params): 13 | """ 14 | params: list[(idx, cropped_image, mfdetrec_res)] 15 | """ 16 | results = [] 17 | for idx, cropped_image, single_page_mfdetrec_res in params: 18 | ocr_res = self.model.ocr(cropped_image, mfd_res=single_page_mfdetrec_res)[0] 19 | if ocr_res: 20 | results.append((idx, ocr_res)) 21 | 22 | return results 23 | 24 | 
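25 | # Minimal usage sketch (not part of the original module): "demo_page.png" is a
26 | # placeholder path, and PaddleOCR weights are assumed to be available locally.
27 | if __name__ == "__main__":
28 |     import cv2
29 | 
30 |     page = cv2.imread("demo_page.png")
31 |     # An empty mfd list means no math-formula regions are masked out of the OCR pass.
32 |     for idx, ocr_res in SeqOCR()([(0, page, [])]):
33 |         print(idx, ocr_res)
34 | 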
-------------------------------------------------------------------------------- /magic_doc/restful_api/api/extentions.py: -------------------------------------------------------------------------------- 1 | from flask import Flask 2 | from flask_restful import Api 3 | from flask_cors import CORS 4 | from flask_sqlalchemy import SQLAlchemy as _SQLAlchemy 5 | from flask_migrate import Migrate 6 | from contextlib import contextmanager 7 | from flask_jwt_extended import JWTManager 8 | from flask_marshmallow import Marshmallow 9 | 10 | 11 | class SQLAlchemy(_SQLAlchemy): 12 | @contextmanager 13 | def auto_commit(self): 14 | try: 15 | yield 16 | db.session.commit() 17 | db.session.flush() 18 | except Exception as e: 19 | db.session.rollback() 20 | raise e 21 | 22 | 23 | app = Flask(__name__) 24 | CORS(app, supports_credentials=True) 25 | db = SQLAlchemy() 26 | migrate = Migrate() 27 | jwt = JWTManager() 28 | ma = Marshmallow() -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | alembic==1.13.1 2 | aniso8601==9.0.1 3 | blinker==1.8.2 4 | cchardet==2.1.7 5 | certifi==2024.2.2 6 | charset-normalizer==3.3.2 7 | docopt==0.6.2 8 | Flask==3.0.3 9 | Flask-Cors==4.0.1 10 | Flask-JWT-Extended==4.6.0 11 | flask-marshmallow==1.2.1 12 | Flask-Migrate==4.0.7 13 | Flask-RESTful==0.3.10 14 | Flask-SQLAlchemy==3.1.1 15 | func-timeout==4.3.5 16 | greenlet==3.0.3 17 | idna==3.7 18 | itsdangerous==2.2.0 19 | Jinja2==3.1.4 20 | lark-parser==0.12.0 21 | lxml==5.1.1 22 | Mako==1.3.5 23 | MarkupSafe==2.1.5 24 | marshmallow==3.21.2 25 | marshmallow-sqlalchemy==1.0.0 26 | packaging==24.0 27 | py-asciimath==0.3.0 28 | PyJWT==2.8.0 29 | pytz==2024.1 30 | PyYAML==6.0.1 31 | requests==2.32.2 32 | six==1.16.0 33 | SQLAlchemy==2.0.30 34 | typing_extensions==4.11.0 35 | urllib3 36 | Werkzeug==3.0.3 37 | python-pptx 38 | s3pathlib 39 | PyMuPDF==1.24.5 40 | smart-open[s3] 41 | -------------------------------------------------------------------------------- /magic_doc/restful_api/api/analysis/ext.py: -------------------------------------------------------------------------------- 1 | import time 2 | from pathlib import Path 3 | from loguru import logger 4 | 5 | 6 | def upload_image_to_oss(oss_client, file_name, img_path, NULL_IMG_DIR, bucket_name): 7 | img_object_name = f"pdf/{file_name}/{Path(img_path).name}" 8 | local_img_path = f"{NULL_IMG_DIR}/images/{Path(img_path).name}" 9 | t3 = time.time() 10 | oss_rep = oss_client.put_file(bucket_name, img_object_name, local_img_path) 11 | t4 = time.time() 12 | logger.info(f"upload img:{t4 - t3}") 13 | file_link = oss_rep["file_link"] 14 | return str(img_path), file_link 15 | 16 | 17 | def upload_md_to_oss(oss_client, bucket_name, md_object_name, md_content): 18 | t3 = time.time() 19 | oss_rep = oss_client.pub_object(bucket_name, md_object_name, md_content) 20 | t4 = time.time() 21 | logger.info(f"upload md:{t4 - t3}") 22 | md_link = oss_rep["file_link"] 23 | return md_link 24 | 25 | -------------------------------------------------------------------------------- /magic_doc/restful_api/api/analysis/serialization.py: -------------------------------------------------------------------------------- 1 | import lxml.html 2 | from marshmallow import Schema, fields, validates, ValidationError 3 | 4 | 5 | class MagicHtmlSchema(Schema): 6 | pageUrl = fields.Str() 7 | html = fields.Str(required=True) 8 | html_type = fields.Str() 9 | 10 | @validates('html') 11 | def
validate_html(self, data, **kwargs): 12 | if not data: 13 | raise ValidationError('HTML cannot be empty') 14 | else: 15 | if lxml.html.fromstring(data).find('.//*') is None: 16 | raise ValidationError('Content is not HTML') 17 | return data 18 | 19 | 20 | class MagicPdfSchema(Schema): 21 | pageUrl = fields.Str(required=True) 22 | 23 | @validates('pageUrl') 24 | def validate_url(self, data, **kwargs): 25 | if not data: 26 | raise ValidationError('pageUrl cannot be empty') 27 | else: 28 | return data 29 | -------------------------------------------------------------------------------- /magic_doc/utils/yaml_load.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import yaml 4 | from collections import deque 5 | 6 | 7 | def patch_dict_with_env(env_namespace, configs): 8 | for env_var in os.environ: 9 | arr = deque(map(lambda x: x.lower(), env_var.split("_"))) 10 | if arr[0] != env_namespace: 11 | continue 12 | arr.popleft() 13 | d = configs 14 | while arr: 15 | if arr[0] not in d: 16 | break 17 | if len(arr) > 1: 18 | d = d[arr[0]] 19 | arr.popleft() 20 | else: 21 | d[arr[0]] = os.environ[env_var] 22 | break 23 | return configs 24 | 25 | 26 | def patch_yaml_load_with_env(yaml_file, env_namespace, loader=yaml.FullLoader): 27 | with open(yaml_file, "r") as f: 28 | configs = yaml.load(f, Loader=loader) 29 | 30 | return patch_dict_with_env(env_namespace, configs) 31 | 32 | 
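33 | # Minimal usage sketch (not part of the original module): an env var named
34 | # <NAMESPACE>_<KEY>_<SUBKEY>... overrides the matching nested config entry.
35 | if __name__ == "__main__":
36 |     os.environ["FILTER_PDF_FAST_PARSEMETHOD"] = "fast"
37 |     cfg = {"pdf": {"fast": {"parsemethod": "auto"}}}
38 |     print(patch_dict_with_env("filter", cfg))
39 |     # -> {'pdf': {'fast': {'parsemethod': 'fast'}}}
40 | 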
-------------------------------------------------------------------------------- /update_version.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | 4 | 5 | def get_version(): 6 | command = ["git", "describe", "--tags"] 7 | try: 8 | version = subprocess.check_output(command).decode().strip() 9 | version_parts = version.split("-") 10 | if len(version_parts) > 1 and version_parts[0].startswith("magic_doc"): 11 | return version_parts[1] 12 | else: 13 | raise ValueError(f"Invalid version tag {version}. Expected format is magic_doc-<version>-released.") 14 | except Exception as e: 15 | print(e) 16 | return "0.0.0" 17 | 18 | 19 | def write_version_to_commons(version): 20 | commons_path = os.path.join(os.path.dirname(__file__), 'magic_doc', 'libs', 'version.py') 21 | with open(commons_path, 'w') as f: 22 | f.write(f'__version__ = "{version}"\n') 23 | 24 | 25 | if __name__ == '__main__': 26 | version_name = get_version() 27 | write_version_to_commons(version_name) 28 | -------------------------------------------------------------------------------- /magic_doc/model/doc_analysis_by_pp.py: -------------------------------------------------------------------------------- 1 | from magic_doc.model.parallel_paddle import ParallelPaddle 2 | 3 | 4 | class PaddleDocAnalysis: 5 | def __init__(self, **kwargs): 6 | self.model = ParallelPaddle(**kwargs) 7 | 8 | def __call__(self, image_dicts): 9 | images = [(i, image_dicts[i]["img"]) for i in range(len(image_dicts))] 10 | results = sorted(self.model(images), key=lambda x: x[0]) 11 | if len(results) != len(image_dicts): 12 | raise Exception("fatal error: failed to run inference with paddleocr") 13 | 14 | model_json = [] 15 | for index, img_dict in enumerate(image_dicts): 16 | img = img_dict["img"] 17 | page_width = img_dict["width"] 18 | page_height = img_dict["height"] 19 | page_info = {"page_no": index, "height": page_height, "width": page_width} 20 | page_dict = {"layout_dets": results[index][1], "page_info": page_info} 21 | model_json.append(page_dict) 22 | return model_json 23 | 24 | -------------------------------------------------------------------------------- /magic_doc/resources/model/UniMERNet/demo.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: unimernet 3 | model_type: unimernet 4 | model_config: 5 | model_name: ./models 6 | max_seq_len: 1024 7 | length_aware: True 8 | load_pretrained: True 9 | pretrained: ./models/pytorch_model.bin 10 | tokenizer_config: 11 | path: ./models 12 | 13 | datasets: 14 | formula_rec_eval: 15 | vis_processor: 16 | eval: 17 | name: "formula_image_eval" 18 | image_size: 19 | - 192 20 | - 672 21 | 22 | run: 23 | runner: runner_iter 24 | task: unimernet_train 25 | 26 | batch_size_train: 64 27 | batch_size_eval: 64 28 | num_workers: 1 29 | 30 | iters_per_inner_epoch: 2000 31 | max_iters: 60000 32 | 33 | seed: 42 34 | output_dir: "../output/demo" 35 | 36 | evaluate: True 37 | test_splits: [ "eval" ] 38 | 39 | device: "cuda" 40 | world_size: 1 41 | dist_url: "env://" 42 | distributed: True 43 | distributed_type: ddp # or fsdp when training an llm 44 | 45 | generate_cfg: 46 | temperature: 0.0 47 | -------------------------------------------------------------------------------- /magic_doc/resources/model/UniMERNet/demo_old.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: unimernet 3 | model_type: unimernet 4 | model_config: 5 | model_name: ./models 6 | max_seq_len: 1024 7 | length_aware: False 8 | load_pretrained: True 9 | pretrained: ./models/pytorch_model.bin 10 | tokenizer_config: 11 | path: ./models 12 | 13 | datasets: 14 | formula_rec_eval: 15 | vis_processor: 16 | eval: 17 | name: "formula_image_eval" 18 | image_size: 19 | - 192 20 | - 672 21 | 22 | run: 23 | runner: runner_iter 24 | task: unimernet_train 25 | 26 | batch_size_train: 64 27 | batch_size_eval: 64 28 | num_workers: 1 29 | 30 | iters_per_inner_epoch: 2000 31 | max_iters: 60000 32 | 33 | seed: 42 34 | output_dir: "../output/demo" 35 | 36 | evaluate: True 37 | 
test_splits: [ "eval" ] 38 | 39 | device: "cuda" 40 | world_size: 1 41 | dist_url: "env://" 42 | distributed: True 43 | distributed_type: ddp # or fsdp when training an llm 44 | 45 | generate_cfg: 46 | temperature: 0.0 47 | -------------------------------------------------------------------------------- /magic_doc/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from magic_pdf.libs.pdf_check import detect_invalid_chars 2 | 3 | import magic_doc 4 | import os 5 | import random 6 | import fitz 7 | 8 | def get_repo_directory(): 9 | return os.path.dirname(magic_doc.__file__) 10 | 11 | 12 | def is_digital(bits: bytes) -> bool: 13 | def _is_digital(doc, check_page=10, text_len_thrs=100) -> bool: 14 | sample_page_num = min(check_page, doc.page_count) 15 | page_ids = random.sample(range(doc.page_count), sample_page_num) 16 | page_text_len = [ 17 | len(doc[pno].get_text("text")) > text_len_thrs for pno in page_ids 18 | ] 19 | if any(page_text_len): 20 | return True 21 | return False 22 | 23 | def _check_invalid_chars(pdf_bytes: bytes) -> bool: 24 | return detect_invalid_chars(pdf_bytes) 25 | 26 | with fitz.open(stream=bits) as doc: 27 | return _is_digital(doc) and _check_invalid_chars(bits) 28 | 29 | 30 | def split_to_chunks(lst, n): 31 | for i in range(0, len(lst), n): 32 | yield lst[i:i + n] -------------------------------------------------------------------------------- /magic_doc/restful_api/api/analysis/magic_html_view.py: -------------------------------------------------------------------------------- 1 | from flask import request 2 | from flask_restful import Resource 3 | from .serialization import MagicHtmlSchema 4 | from marshmallow import ValidationError 5 | from magic_doc.restful_api.common.custom_response import generate_response 6 | from magic_doc.contrib.magic_html import GeneralExtractor 7 | from loguru import logger 8 | 9 | extractor = GeneralExtractor() 10 | 11 | 12 | class MagicHtmlView(Resource): 13 | @logger.catch 14 | def post(self): 15 | """ 16 | Extract the main content from a web page 17 | :return: 18 | """ 19 | magic_html_schema = MagicHtmlSchema() 20 | try: 21 | params = magic_html_schema.load(request.get_json()) 22 | except ValidationError as err: 23 | return generate_response(code=400, msg=err.messages) 24 | url = params.get("pageUrl", "") 25 | html_type = params.get("html_type") 26 | html = params.get("html") 27 | data = extractor.extract(html, base_url=url, html_type=html_type) 28 | return generate_response(data=data) 29 | -------------------------------------------------------------------------------- /magic_doc/utils/path_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from s3pathlib import S3Path 4 | 5 | from magic_doc.utils.config import read_config 6 | 7 | 8 | def get_local_dir(): 9 | config = read_config() 10 | return config.get("temp-output-dir", "/tmp") 11 | 12 | 13 | def prepare_env(doc_file_name, doc_type="") -> str: 14 | if doc_type == "": 15 | doc_type = "unknown" 16 | local_parent_dir = os.path.join( 17 | get_local_dir(), "magic-doc", doc_type, doc_file_name 18 | ) 19 | 20 | # local_image_dir = os.path.join(local_parent_dir, "images") 21 | local_md_dir = local_parent_dir 22 | # os.makedirs(local_image_dir, exist_ok=True) 23 | os.makedirs(local_md_dir, exist_ok=True) 24 | return str(local_md_dir) 25 | 26 | 27 | def remove_non_official_s3_args(s3path): 28 | """ 29 | example: s3://abc/xxxx.json?bytes=0,81350 ==> s3://abc/xxxx.json 30 | """ 31 | arr = s3path.split("?") 32 | return arr[0] 33 | 34 | 35 | def parse_s3path(s3path: str): 36 | p = S3Path(remove_non_official_s3_args(s3path)) 37 | return p.bucket, p.key 38 | 
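39 | # Minimal usage sketch (not part of the original module):
40 | if __name__ == "__main__":
41 |     print(remove_non_official_s3_args("s3://abc/xxxx.json?bytes=0,81350"))  # s3://abc/xxxx.json
42 |     print(parse_s3path("s3://abc/dir/xxxx.json"))  # ('abc', 'dir/xxxx.json')
43 | 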
-------------------------------------------------------------------------------- /assets/license.svg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/magicpdf/Magic-Doc/HEAD/assets/license.svg -------------------------------------------------------------------------------- /magic_doc/utils/config.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | 5 | def read_config(): 6 | home_dir = os.path.expanduser("~") 7 | 8 | config_file = os.path.join(home_dir, "magic-doc.json") 9 | 10 | if not os.path.exists(config_file): 11 | raise Exception(f"{config_file} not found") 12 | 13 | with open(config_file, "r") as f: 14 | config = json.load(f) 15 | return config 16 | 17 | 18 | def get_s3_config(bucket_name: str): 19 | """ 20 | Read the S3 config from ~/magic-doc.json 21 | """ 22 | config = read_config() 23 | 24 | bucket_info = config.get("bucket_info") 25 | if bucket_name not in bucket_info: 26 | access_key, secret_key, storage_endpoint = bucket_info["[default]"] 27 | else: 28 | access_key, secret_key, storage_endpoint = bucket_info[bucket_name] 29 | 30 | if access_key is None or secret_key is None or storage_endpoint is None: 31 | raise Exception("ak, sk or endpoint not found in magic-doc.json") 32 | 33 | # logger.info(f"get_s3_config: ak={access_key}, sk={secret_key}, endpoint={storage_endpoint}") 34 | 35 | return access_key, secret_key, storage_endpoint 36 | 37 | -------------------------------------------------------------------------------- /.github/workflows/benchmark.yml: -------------------------------------------------------------------------------- 1 | name: Magic doc benchmark 2 | on: 3 | push: 4 | branches: 5 | - "main" 6 | paths-ignore: 7 | - "cmds/**" 8 | - "**.md" 9 | pull_request: 10 | branches: 11 | - "main" 12 | paths-ignore: 13 | - "cmds/**" 14 | - "**.md" 15 | workflow_dispatch: 16 | jobs: 17 | magic-doc-test: 18 | runs-on: doc 19 | timeout-minutes: 180 20 | strategy: 21 | fail-fast: true 22 | 23 | steps: 24 | - name: config-net 25 | run: | 26 | source activate magicdoc 27 | - name: pull code 28 | uses: actions/checkout@v3 29 | with: 30 | fetch-depth: 2 31 | - name: check-requirements 32 | run: | 33 | changed_files=$(git diff --name-only -r HEAD~1 HEAD) 34 | echo $changed_files 35 | if [[ $changed_files =~ "requirements.txt" ]]; then 36 | pip install -r requirements.txt 37 | fi 38 | - name: install dependencies 39 | run: | 40 | sudo su - 41 | yum install libreoffice 42 | pip install fairy-doc[cpu] 43 | - name: get-doc-benchmark-result 44 | run: | 45 | echo "start test" 46 | cd tools && python benchmark.py 47 | -------------------------------------------------------------------------------- /magic_doc/model/sub_modules/post_process.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | def layout_rm_equation(layout_res): 4 | rm_idxs = [] 5 | for idx, ele in enumerate(layout_res['layout_dets']): 6 | if ele['category_id'] == 10: 7 | rm_idxs.append(idx) 8 | 9 | for idx in rm_idxs[::-1]: 10 | del layout_res['layout_dets'][idx] 11 | return layout_res 12 | 13 | 14 | def get_croped_image(image_pil, bbox): 15 | x_min, y_min, x_max, y_max = bbox 16 | croped_img = image_pil.crop((x_min, y_min, x_max, y_max)) 17 | return croped_img 18 | 19 | 20 | def latex_rm_whitespace(s: str): 21 | """Remove unnecessary whitespace from LaTeX code. 22 | """ 23 | text_reg = r'(\\(operatorname|mathrm|text|mathbf)\s?\*? {.*?})' 24 | letter = '[a-zA-Z]' 25 | noletter = '[\W_^\d]' 26 | names = [x[0].replace(' ', '') for x in re.findall(text_reg, s)] 27 | s = re.sub(text_reg, lambda match: str(names.pop(0)), s) 28 | news = s 29 | while True: 30 | s = news 31 | news = re.sub(r'(?!\\ )(%s)\s+?(%s)' % (noletter, noletter), r'\1\2', s) 32 | news = re.sub(r'(?!\\ )(%s)\s+?(%s)' % (noletter, letter), r'\1\2', news) 33 | news = re.sub(r'(%s)\s+?(%s)' % (letter, noletter), r'\1\2', news) 34 | if news == s: 35 | break 36 | return s
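37 | 
38 | # Minimal usage sketch (not part of the original module); the expected output
39 | # shown below was worked out by hand from the regexes above.
40 | if __name__ == "__main__":
41 |     print(latex_rm_whitespace(r"\operatorname {span} ( a , b )"))
42 |     # -> \operatorname{span}(a,b)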
-------------------------------------------------------------------------------- /magic_doc/contrib/model/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import List, TypeAlias, TypedDict 3 | from werkzeug.datastructures import FileStorage 4 | 5 | 6 | class Content(TypedDict): 7 | # Type: image/text/md 8 | type: str 9 | 10 | # Data payload 11 | # image: an s3 path, e.g. s3://doc/xxx.png 12 | # text: a line of text 13 | # md: text in Markdown format 14 | data: str 15 | 16 | 17 | class Page(TypedDict): 18 | # 0-based page number 19 | page_no: int 20 | 21 | # List of contents on this page 22 | content_list: List[Content] 23 | 24 | 25 | ExtractResponse: TypeAlias = List[Page] 26 | 27 | 28 | if __name__ == "__main__": 29 | pages_data: ExtractResponse = [ 30 | { 31 | "page_no": 0, 32 | "content_list": [ 33 | { 34 | "type": "text", 35 | "data": "This is some text content.", 36 | }, 37 | { 38 | "type": "image", 39 | "data": "s3://somebucket/imagepath.jpg", 40 | }, 41 | ], 42 | } 43 | ] 44 | 45 | 46 | class Extractor(ABC): 47 | @abstractmethod 48 | def setup(): 49 | pass 50 | 51 | @abstractmethod 52 | def run(self, file_parse_id: str, r: FileStorage, skip_image: bool = True) -> ExtractResponse: 53 | pass 54 | -------------------------------------------------------------------------------- /magic_doc/restful_api/app.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | 4 | base_dir = Path(__file__).resolve().parent 5 | root_dir = base_dir.parent.parent 6 | sys.path.append(str(root_dir)) 7 | 8 | from api import create_app 9 | import yaml 10 | 11 | config_path = base_dir / "config/config.yaml" 12 | 13 | 14 | class ConfigMap(dict): 15 | __setattr__ = dict.__setitem__ 16 | __getattr__ = dict.__getitem__ 17 | 18 | 19 | with open(str(config_path), mode='r', encoding='utf-8') as fd: 20 | data = yaml.load(fd, Loader=yaml.FullLoader) 21 | _config = data.get(data.get("CurrentConfig", "DevelopmentConfig")) 22 | config = ConfigMap() 23 | for k, v in _config.items(): 24 | config[k] = v 25 | config['base_dir'] = base_dir 26 | database = _config.get("database") 27 | if database: 28 | if database.get("type") == "sqlite": 29 | database_uri = f'sqlite:///{base_dir}/{database.get("path")}' 30 | elif database.get("type") == "mysql": 31 | database_uri = f'mysql+pymysql://{database.get("user")}:{database.get("password")}@{database.get("host")}:{database.get("port")}/{database.get("database")}?'
32 | else: 33 | database_uri = '' 34 | config['SQLALCHEMY_DATABASE_URI'] = database_uri 35 | app = create_app(config) 36 | 37 | if __name__ == '__main__': 38 | app.run(host="0.0.0.0", port=5556, debug=True) -------------------------------------------------------------------------------- /magic_doc/model/sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig, AutoModel, AutoModelForTokenClassification, \ 2 | AutoModelForQuestionAnswering, AutoModelForSequenceClassification, AutoTokenizer 3 | from transformers.convert_slow_tokenizer import SLOW_TO_FAST_CONVERTERS, RobertaConverter 4 | 5 | from .configuration_layoutlmv3 import LayoutLMv3Config 6 | from .modeling_layoutlmv3 import ( 7 | LayoutLMv3ForTokenClassification, 8 | LayoutLMv3ForQuestionAnswering, 9 | LayoutLMv3ForSequenceClassification, 10 | LayoutLMv3Model, 11 | ) 12 | from .tokenization_layoutlmv3 import LayoutLMv3Tokenizer 13 | from .tokenization_layoutlmv3_fast import LayoutLMv3TokenizerFast 14 | 15 | 16 | #AutoConfig.register("layoutlmv3", LayoutLMv3Config) 17 | #AutoModel.register(LayoutLMv3Config, LayoutLMv3Model) 18 | #AutoModelForTokenClassification.register(LayoutLMv3Config, LayoutLMv3ForTokenClassification) 19 | #AutoModelForQuestionAnswering.register(LayoutLMv3Config, LayoutLMv3ForQuestionAnswering) 20 | #AutoModelForSequenceClassification.register(LayoutLMv3Config, LayoutLMv3ForSequenceClassification) 21 | #AutoTokenizer.register( 22 | # LayoutLMv3Config, slow_tokenizer_class=LayoutLMv3Tokenizer, fast_tokenizer_class=LayoutLMv3TokenizerFast 23 | #) 24 | SLOW_TO_FAST_CONVERTERS.update({"LayoutLMv3Tokenizer": RobertaConverter}) 25 | -------------------------------------------------------------------------------- /magic_doc/model/sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Tokenization classes for LayoutLMv3, refer to RoBERTa.""" 16 | 17 | from transformers.models.roberta import RobertaTokenizer 18 | from transformers.utils import logging 19 | 20 | 21 | logger = logging.get_logger(__name__) 22 | 23 | VOCAB_FILES_NAMES = { 24 | "vocab_file": "vocab.json", 25 | "merges_file": "merges.txt", 26 | } 27 | 28 | class LayoutLMv3Tokenizer(RobertaTokenizer): 29 | vocab_files_names = VOCAB_FILES_NAMES 30 | # pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 31 | # max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 32 | model_input_names = ["input_ids", "attention_mask"] 33 | -------------------------------------------------------------------------------- /test/test_docconv.py: -------------------------------------------------------------------------------- 1 | 2 | from magic_doc.docconv import ConvException, S3Config, DocConverter 3 | 4 | def test_conv_localfile(): 5 | convert = DocConverter(None) 6 | many_docs = ["/path/docs/mypdf.pdf", "/path/docs/mydoc.docx", "/path/docs/mydoc.doc", "/path/docs/mydoc.pptx", "/path/docs/mydoc.ppt"] 7 | for i, doc in enumerate(many_docs): 8 | try: 9 | markdown = convert.convert(doc, f"/path/progress/progress-{i}.txt") 10 | # do something with markdown 11 | except ConvException as e: 12 | assert False, f"Failed to convert {doc}, Reason: {e.message}" 13 | except Exception as e: 14 | assert False, f"Failed to convert {doc}: {e}" 15 | 16 | 17 | def test_conv_s3file(): 18 | s3cfg = S3Config("ak", "sk", "endpoint") 19 | convert = DocConverter(s3cfg) 20 | many_docs = ["s3://bucket/mypdf.pdf", "s3://bucket/mydoc.docx", "s3://bucket/mydoc.doc", "s3://bucket/mydoc.pptx", "s3://bucket/mydoc.ppt"] 21 | for i, doc in enumerate(many_docs): 22 | try: 23 | markdown = convert.convert(doc, f"/path/progress/progress-{i}.txt") 24 | # do something with markdown 25 | except ConvException as e: 26 | assert False, f"Failed to convert {doc}, Reason: {e.message}" 27 | except Exception as e: 28 | assert False, f"Failed to convert {doc}: {e}" 29 | -------------------------------------------------------------------------------- /magic_doc/contrib/office/ppt_extract.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | 4 | from pathlib import Path 5 | 6 | 7 | from werkzeug.datastructures import FileStorage 8 | 9 | from magic_doc.contrib.office import OfficeExtractor 10 | from magic_doc.contrib.model import ExtractResponse 11 | 12 | 13 | class PptExtractor(OfficeExtractor): 14 | def __init__(self) -> None: 15 | super().__init__() 16 | 17 | def setup(self): 18 | pass 19 | 20 | def extract( 21 | self, 22 | r: FileStorage | Path, 23 | id: str, 24 | dir: Path, 25 | media_dir: Path, 26 | skip_image: bool, 27 | ) -> ExtractResponse: 28 | 29 | if type(r) is FileStorage: 30 | data = r.stream.read() 31 | elif issubclass(type(r), Path): 32 | with open(r, "rb") as data_file: 33 | data = data_file.read() 34 | 35 | files = {"file": data} 36 | response = requests.post(f"{self.config.tika}/api/v1/parse", files=files) 37 | self.upload_background(id, {}) 38 | return response.json()["pages"] 39 | 40 | 41 | if __name__ == "__main__": 42 | e = PptExtractor() 43 | print( 44 | json.dumps( 45 | e.run( 46 | "def", 47 | Path( 48 | "/home/SENSETIME/wuziming/diclm/doc2docx/doc/【中繁-课件】物理学简介.ppt", 49 | ), 50 | ), 51 | ensure_ascii=False, 52 | indent=4, 53 | ) 54 | ) 55 | e.wait_all() 56 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: 
-------------------------------------------------------------------------------- 1 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 2 | 3 | name: doc-ci 4 | on: 5 | push: 6 | branches: 7 | - "main" 8 | paths-ignore: 9 | - "cmds/**" 10 | - "**.md" 11 | pull_request: 12 | branches: 13 | - "main" 14 | paths-ignore: 15 | - "cmds/**" 16 | - "**.md" 17 | workflow_dispatch: 18 | jobs: 19 | cli-test: 20 | runs-on: doc 21 | timeout-minutes: 40 22 | strategy: 23 | fail-fast: true 24 | 25 | steps: 26 | - name: config-net 27 | run: | 28 | source activate magicdoc 29 | - name: doc cli 30 | uses: actions/checkout@v3 31 | with: 32 | fetch-depth: 2 33 | 34 | - name: check-requirements 35 | run: | 36 | changed_files=$(git diff --name-only -r HEAD~1 HEAD) 37 | echo $changed_files 38 | if [[ $changed_files =~ "requirements.txt" ]]; then 39 | pip install -r requirements.txt 40 | fi 41 | 42 | - name: config-net-reset 43 | run: | 44 | export http_proxy="" 45 | export https_proxy="" 46 | - name: test_cli 47 | run: | 48 | echo $GITHUB_WORKSPACE 49 | cd $GITHUB_WORKSPACE && export PYTHONPATH=. && pytest -s -v tests/test_unit.py 50 | cd $GITHUB_WORKSPACE && pytest -s -v test/test_cli/test_cli.py 51 | 52 | -------------------------------------------------------------------------------- /magic_doc/contrib/magic_html/extractors/article_extractor.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from magic_doc.contrib.magic_html.utils import * 4 | from magic_doc.contrib.magic_html.extractors.base_extractor import BaseExtractor 5 | from magic_doc.contrib.magic_html.extractors.title_extractor import TitleExtractor 6 | 7 | 8 | class ArticleExtractor(BaseExtractor): 9 | def __init__(self) -> None: 10 | super().__init__() 11 | 12 | def extract(self, html="", base_url="") -> dict: 13 | html = html.replace("&#160;", " ").replace("&nbsp;", " ") 14 | tree = load_html(html) 15 | if tree is None: 16 | raise ValueError 17 | 18 | title = TitleExtractor().process(tree) 19 | 20 | # base_url 21 | base_href = tree.xpath("//base/@href") 22 | 23 | if base_href and "http" in base_href[0]: 24 | base_url = base_href[0] 25 | 26 | # Convert tags, with extra handling for math markup 27 | format_tree = self.convert_tags(tree, base_url=base_url) 28 | 29 | # Remove script/style and similar tags along with their content 30 | normal_tree = self.clean_tags(format_tree) 31 | 32 | subtree, xp_num, drop_list = self.xp_1_5(normal_tree) 33 | if xp_num == "others": 34 | subtree, drop_list = self.prune_unwanted_sections(normal_tree) 35 | body_html = self.get_content_html(subtree, xp_num, base_url) 36 | 37 | return { 38 | "xp_num": xp_num, 39 | "drop_list": drop_list, 40 | "html": body_html, 41 | "title": title, 42 | "base_url": base_url, 43 | } 44 | -------------------------------------------------------------------------------- /magic_doc/model/sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Fast Tokenization classes for LayoutLMv3, refer to RoBERTa.""" 16 | 17 | 18 | from transformers.models.roberta.tokenization_roberta_fast import RobertaTokenizerFast 19 | from transformers.utils import logging 20 | 21 | from .tokenization_layoutlmv3 import LayoutLMv3Tokenizer 22 | 23 | 24 | logger = logging.get_logger(__name__) 25 | 26 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"} 27 | 28 | 29 | class LayoutLMv3TokenizerFast(RobertaTokenizerFast): 30 | vocab_files_names = VOCAB_FILES_NAMES 31 | # pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 32 | # max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 33 | model_input_names = ["input_ids", "attention_mask"] 34 | slow_tokenizer_class = LayoutLMv3Tokenizer 35 | -------------------------------------------------------------------------------- /magic_doc/restful_api/api/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | from pathlib import Path 4 | from loguru import logger 5 | from .extentions import app, db, migrate, jwt, ma 6 | from magic_doc.restful_api.common.web_hook import before_request 7 | 8 | root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 9 | 10 | 11 | def init_app_log(config): 12 | """ 13 | Setup logging 14 | :param config: config file 15 | :return: 16 | """ 17 | log_path = os.path.join(Path(__file__).parent.parent, "log") 18 | if not Path(log_path).exists(): 19 | Path(log_path).mkdir(parents=True, exist_ok=True) 20 | log_level = config.get("LOG_LEVEL") 21 | log_name = f'log_{datetime.now().strftime("%Y-%m-%d")}.log' 22 | log_file_path = os.path.join(log_path, log_name) 23 | logger.add(str(log_file_path), rotation='00:00', encoding='utf-8', level=log_level, enqueue=True) 24 | return logger 25 | 26 | 27 | def _register_db(flask_app): 28 | db.init_app(flask_app) 29 | with app.app_context(): 30 | db.create_all() 31 | 32 | 33 | def create_app(config): 34 | """ 35 | Create and configure an instance of the Flask application 36 | :param config: 37 | :return: 38 | """ 39 | app.static_folder = os.path.join(root_dir, "static") 40 | if config is None: 41 | config = {} 42 | app.config.update(config) 43 | init_app_log(config) 44 | # _register_db(app) 45 | migrate.init_app(app=app, db=db) 46 | jwt.init_app(app=app) 47 | ma.init_app(app=app) 48 | from .analysis import analysis_blue 49 | app.register_blueprint(analysis_blue) 50 | 51 | app.before_request(before_request) 52 | 53 | return app 54 | -------------------------------------------------------------------------------- /magic_doc/model/parallel_paddle.py: -------------------------------------------------------------------------------- 1 | from magic_pdf.model.pp_structure_v2 import CustomPaddleModel 2 | from magic_doc.utils import split_to_chunks 3 | import paddle 4 | from concurrent.futures import ThreadPoolExecutor, as_completed 5 | import math 6 | 7 | class ParallelPaddle: 8 | def __init__(self, model_load_on_each_gpu_count=1): 9 | models = 
[] 10 | for _ in range(model_load_on_each_gpu_count): 11 | models.append(CustomPaddleModel(ocr=True, show_log=False)) 12 | self.models = models 13 | 14 | def __call__(self, params): 15 | """ 16 | params: list[(idx, image)] 17 | """ 18 | if len(params) == 0: 19 | return [] 20 | chunks = list(split_to_chunks(params, max(math.ceil(len(params) * 1.0 / len(self.models)), 1))) 21 | return self._run_ocr_concurrently(chunks) 22 | 23 | 24 | def _run_ocr_concurrently(self, chunks): 25 | results = [] 26 | def run_ocr(chunk, i): 27 | result = [] 28 | for idx, img in chunk: 29 | ocr_res = self.models[i](img) 30 | if ocr_res: 31 | result.append((idx, ocr_res)) 32 | return result 33 | 34 | with ThreadPoolExecutor(max_workers=len(chunks)) as executor: 35 | future_to_ocr = {executor.submit(run_ocr, chunk, i): i for i, chunk in enumerate(chunks)} 36 | for future in as_completed(future_to_ocr): 37 | try: 38 | data = future.result() 39 | results.extend(data) 40 | except Exception as exc: 41 | print("failed to process ocr, reason:", exc) 42 | return sorted(results, key=lambda x: x[0]) 43 | 44 | -------------------------------------------------------------------------------- /tools/scoring.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | from rapidfuzz import fuzz 4 | import re 5 | import regex 6 | from statistics import mean 7 | 8 | CHUNK_MIN_CHARS = 25 9 | 10 | def chunk_text(text, chunk_len=500): 11 | chunks = [text[i:i+chunk_len] for i in range(0, len(text), chunk_len)] 12 | chunks = [c for c in chunks if c.strip() and len(c) > CHUNK_MIN_CHARS] 13 | return chunks 14 | 15 | 16 | def overlap_score(hypothesis_chunks, reference_chunks): 17 | if len(reference_chunks) > 0: 18 | length_modifier = len(hypothesis_chunks) / len(reference_chunks) 19 | else: 20 | length_modifier = 0 21 | search_distance = max(len(reference_chunks) // 5, 10) 22 | chunk_scores = [] 23 | for i, hyp_chunk in enumerate(hypothesis_chunks): 24 | max_score = 0 25 | total_len = 0 26 | i_offset = int(i * length_modifier) 27 | chunk_range = range(max(0, i_offset-search_distance), min(len(reference_chunks), i_offset+search_distance)) 28 | for j in chunk_range: 29 | ref_chunk = reference_chunks[j] 30 | score = fuzz.ratio(hyp_chunk, ref_chunk, score_cutoff=30) / 100 31 | if score > max_score: 32 | max_score = score 33 | total_len = len(ref_chunk) 34 | chunk_scores.append(max_score) 35 | return chunk_scores 36 | 37 | 38 | def score_text(hypothesis, reference): 39 | # Returns a 0-1 alignment score 40 | hypothesis_chunks = chunk_text(hypothesis) 41 | reference_chunks = chunk_text(reference) 42 | chunk_scores = overlap_score(hypothesis_chunks, reference_chunks) 43 | if len(chunk_scores) > 0: 44 | mean_score = mean(chunk_scores) 45 | return mean_score 46 | else: 47 | return 0 48 | #return mean(chunk_scores)
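49 | 
50 | # Minimal usage sketch (not part of the original module): score_text returns a
51 | # 0-1 fuzzy alignment score between an extracted text and a reference text.
52 | if __name__ == "__main__":
53 |     ref = "The quick brown fox jumps over the lazy dog and runs away. " * 3
54 |     hyp = "The quick brown fox jumped over the lazy dog and ran away. " * 3
55 |     print(score_text(hyp, ref))  # close to 1.0 for near-identical texts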
-------------------------------------------------------------------------------- /magic_doc/bin/linux/share/antiword/fontnames.russian: -------------------------------------------------------------------------------- 1 | # Default fontnames translation table 2 | # for Cyrillic 3 | # 4 | # by: Dmitry Chernyak 5 | # 6 | # MS-Word fontname, Italic, Bold, PostScript fontname, Special 7 | Arial, 0, 0, ArialCyrMT, 0 8 | Arial, 0, 1, ArialCyrMT-Bold, 0 9 | Arial, 1, 0, ArialCyrMT-Italic, 0 10 | Arial, 1, 1, ArialCyrMT-BoldItalic, 0 11 | Courier, 0, 0, CourierCyrPS, 0 12 | Courier, 0, 1, CourierCyrPS-Bold, 0 13 | Courier, 1, 0, CourierCyrPS-Inclined, 0 14 | Courier, 1, 1, CourierCyrPS-BoldInclined, 0 15 | Courier New, 0, 0, CourierCyrPS, 0 16 | Courier New, 0, 1, CourierCyrPS-Bold, 0 17 | Courier New, 1, 0, CourierCyrPS-Inclined, 0 18 | Courier New, 1, 1, CourierCyrPS-BoldInclined, 0 19 | Fixedsys, 0, 0, CourierCyrPS, 0 20 | Fixedsys, 0, 1, CourierCyrPS-Bold, 0 21 | Fixedsys, 1, 0, CourierCyrPS-Inclined, 0 22 | Fixedsys, 1, 1, CourierCyrPS-BoldInclined, 0 23 | Helvetica, 0, 0, ArialCyrMT, 0 24 | Helvetica, 0, 1, ArialCyrMT-Bold, 0 25 | Helvetica, 1, 0, ArialCyrMT-Italic, 0 26 | Helvetica, 1, 1, ArialCyrMT-BoldItalic, 0 27 | Lucida Console, 0, 0, CourierCyrPS, 0 28 | Lucida Console, 0, 1, CourierCyrPS-Bold, 0 29 | Lucida Console, 1, 0, CourierCyrPS-Inclined, 0 30 | Lucida Console, 1, 1, CourierCyrPS-BoldInclined, 0 31 | Swiss, 0, 0, Helvetica, 0 32 | Swiss, 0, 1, Helvetica-Bold, 0 33 | Swiss, 1, 0, Helvetica-Oblique, 0 34 | Swiss, 1, 1, Helvetica-BoldOblique, 0 35 | Univers, 0, 0, Helvetica, 0 36 | Univers, 0, 1, Helvetica-Bold, 0 37 | Univers, 1, 0, Helvetica-Oblique, 0 38 | Univers, 1, 1, Helvetica-BoldOblique, 0 39 | # All the other fonts 40 | *, 0, 0, TimesNRCyrMT, 0 41 | *, 0, 1, TimesNRCyrMT-Bold, 0 42 | *, 1, 0, TimesNRCyrMT-Inclined, 0 43 | *, 1, 1, TimesNRCyrMT-BoldInclined, 0 44 | -------------------------------------------------------------------------------- /magic_doc/contrib/magic_html/mmltex/mmltex.xsl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/magicpdf/Magic-Doc/HEAD/magic_doc/contrib/magic_html/mmltex/mmltex.xsl -------------------------------------------------------------------------------- /magic_doc/contrib/office/formula/mml/xsl/mmltex.xsl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/magicpdf/Magic-Doc/HEAD/magic_doc/contrib/office/formula/mml/xsl/mmltex.xsl -------------------------------------------------------------------------------- /magic_doc/model/parallel_layout.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | 4 | from magic_doc.model.sub_modules.layoutlmv3.model_init import Layoutlmv3_Predictor 5 | from magic_doc.utils import split_to_chunks 6 | from concurrent.futures import ThreadPoolExecutor, as_completed 7 | import math 8 | 9 | class ParallelLayout: 10 | def __init__(self, config, model_load_on_each_gpu_count=1): 11 | models = [] 12 | for i in range(torch.cuda.device_count()): 13 | torch.cuda.set_device(i) 14 | for _ in range(model_load_on_each_gpu_count): 15 | models.append(Layoutlmv3_Predictor(config)) 16 | self.models = models 17 | 18 | def __call__(self, params): 19 | """ 20 | params: list[(idx, image)] 21 | """ 22 | if len(params) == 0: 23 | return [] 24 | chunks = list(split_to_chunks(params, max(math.ceil(len(params) * 1.0 / len(self.models)), 1))) 25 | return self._run_layout_concurrently(chunks) 26 | 27 | 28 | def _run_layout_concurrently(self, chunks): 29 | results = [] 30 | 31 | def run_layout(chunk, i): 32 | result = [] 33 | for idx, image in chunk: 34 | layout_res = self.models[i](image, ignore_catids=[]) 35 | result.append((idx, layout_res)) 36 | return result 37 | 38 | with ThreadPoolExecutor(max_workers=len(chunks)) as executor: 39 | future_to_ocr = {executor.submit(run_layout, chunk, i): i for i, chunk in enumerate(chunks)} 40 | for
future in as_completed(future_to_ocr): 41 | try: 42 | data = future.result() 43 | results.extend(data) 44 | except Exception as exc: 45 | print(f"failed to process layout, reason: {exc}") 46 | return sorted(results, key=lambda x: x[0]) 47 | 48 | -------------------------------------------------------------------------------- /magic_doc/model/parallel_ocr.py: -------------------------------------------------------------------------------- 1 | 2 | from magic_doc.model.sub_modules.self_modify import ModifiedPaddleOCR 3 | from magic_doc.utils import split_to_chunks 4 | import paddle 5 | from concurrent.futures import ThreadPoolExecutor, as_completed 6 | import math 7 | 8 | class ParallelOCR: 9 | def __init__(self, model_load_on_each_gpu_count=1): 10 | models = [] 11 | for i in range(paddle.device.cuda.device_count()): 12 | for _ in range(model_load_on_each_gpu_count): 13 | models.append(ModifiedPaddleOCR(use_gpu=True, show_log=False, gpu_id=i)) 14 | self.models = models 15 | 16 | def __call__(self, params): 17 | """ 18 | params: list[(idx, image, *args)] 19 | """ 20 | if len(params) == 0: 21 | return [] 22 | chunks = list(split_to_chunks(params, max(math.ceil(len(params) * 1.0 / len(self.models)), 1))) 23 | return self._run_ocr_concurrently(chunks) 24 | 25 | 26 | def _run_ocr_concurrently(self, chunks): 27 | results = [] 28 | 29 | def run_ocr(chunk, i): 30 | result = [] 31 | for idx, cropped_image, single_page_mfdetrec_res in chunk: 32 | ocr_res = self.models[i].ocr(cropped_image, mfd_res=single_page_mfdetrec_res)[0] 33 | if ocr_res: 34 | result.append((idx, ocr_res)) 35 | return result 36 | 37 | with ThreadPoolExecutor(max_workers=len(chunks)) as executor: 38 | future_to_ocr = {executor.submit(run_ocr, chunk, i): i for i, chunk in enumerate(chunks)} 39 | for future in as_completed(future_to_ocr): 40 | try: 41 | data = future.result() 42 | results.extend(data) 43 | except Exception as exc: 44 | print(f"failed to process ocr, reason: {exc}") 45 | return results -------------------------------------------------------------------------------- /magic_doc/contrib/magic_html/extractors/title_extractor.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from magic_doc.contrib.magic_html.utils import * 4 | from magic_doc.contrib.magic_html.config import * 5 | 6 | 7 | class TitleExtractor: 8 | def extract_by_meta(self, element: HtmlElement): 9 | for xpath in METAS: 10 | title = element.xpath(xpath) 11 | if title: 12 | return "".join(title) 13 | 14 | def extract_by_title(self, element: HtmlElement): 15 | return "".join(element.xpath("//title//text()")).strip() 16 | 17 | def extract_by_hs(self, element: HtmlElement): 18 | hs = element.xpath("//h1//text()|//h2//text()|//h3//text()") 19 | return hs or [] 20 | 21 | def extract_by_h(self, element: HtmlElement): 22 | for xpath in ["//h1", "//h2", "//h3"]: 23 | children = element.xpath(xpath) 24 | if not children: 25 | continue 26 | child = children[0] 27 | texts = child.xpath("./text()") 28 | if texts and len(texts): 29 | return texts[0].strip() 30 | 31 | def process(self, element: HtmlElement): 32 | title_extracted_by_meta = self.extract_by_meta(element) 33 | if title_extracted_by_meta: 34 | return title_extracted_by_meta 35 | title_extracted_by_h = self.extract_by_h(element) 36 | title_extracted_by_hs = self.extract_by_hs(element) 37 | title_extracted_by_title = self.extract_by_title(element) 38 | title_extracted_by_hs = sorted( 39 | title_extracted_by_hs, 40 | key=lambda x: similarity2(x, 
title_extracted_by_title), 41 | reverse=True, 42 | ) 43 | if title_extracted_by_hs: 44 | return lcs_of_2(title_extracted_by_hs[0], title_extracted_by_title) 45 | 46 | if title_extracted_by_title: 47 | return title_extracted_by_title 48 | 49 | return title_extracted_by_h 50 | -------------------------------------------------------------------------------- /magic_doc/contrib/magic_html/extractors/custom_extractor.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import re 3 | 4 | from magic_doc.contrib.magic_html.utils import * 5 | from magic_doc.contrib.magic_html.extractors.base_extractor import BaseExtractor 6 | from magic_doc.contrib.magic_html.extractors.title_extractor import TitleExtractor 7 | 8 | 9 | class CustomExtractor(BaseExtractor): 10 | def __init__(self) -> None: 11 | super().__init__() 12 | 13 | def use_clean_rule(self, tree, clean_rules): 14 | for clean_rule in clean_rules: 15 | for x in tree.xpath(clean_rule): 16 | self.remove_node(x) 17 | return tree 18 | 19 | def use_extract_rule(self, tree, extract_rule): 20 | if "/text()" in extract_rule["value"]: 21 | return "".join(tree.xpath(extract_rule["value"])).strip() 22 | return tree.xpath(extract_rule["value"])[0] 23 | 24 | def extract(self, html="", base_url="", rule={}) -> dict: 25 | tree = load_html(html) 26 | if tree is None: 27 | raise ValueError 28 | 29 | # base_url 30 | base_href = tree.xpath("//base/@href") 31 | 32 | if base_href and "http" in base_href[0]: 33 | base_url = base_href[0] 34 | 35 | if "clean" in rule: 36 | tree = self.use_clean_rule(tree, rule["clean"]) 37 | 38 | # extract the title 39 | if "title" not in rule: 40 | title = TitleExtractor().process(tree) 41 | else: 42 | title = self.use_extract_rule(tree, rule["title"]) 43 | 44 | # article content area 45 | try: 46 | body_tree = self.use_extract_rule(tree, rule["content"]) 47 | except Exception: 48 | raise ValueError 49 | body_html = tostring(body_tree, encoding=str) 50 | 51 | return { 52 | "xp_num": "custom", 53 | "drop_list": False, 54 | "html": body_html, 55 | "title": title, 56 | "base_url": base_url 57 | } 58 | -------------------------------------------------------------------------------- /magic_doc/conv/pptx_python_pptx.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | from pathlib import Path 3 | 4 | from loguru import logger 5 | 6 | from magic_doc.contrib.model import Page 7 | from magic_doc.contrib.office.pptx_extract import PptxExtractor 8 | from magic_doc.conv.base import BaseConv 9 | from magic_doc.progress.filepupdator import FileBaseProgressUpdator 10 | from magic_doc.progress.pupdator import ConvProgressUpdator 11 | 12 | 13 | class Pptx(BaseConv): 14 | def __init__(self): 15 | super().__init__() 16 | 17 | def to_md(self, bits: bytes, pupdator: ConvProgressUpdator) -> str: 18 | page_list = self.pptx_to_pagelist(bits, pupdator) 19 | md_content_list = [] 20 | total = len(page_list) 21 | for index, page in enumerate(page_list): 22 | progress = 50 + int(index / total * 50) 23 | # logger.info(f"progress: {progress}") 24 | page_content_list = page['content_list'] 25 | for content in page_content_list: 26 | pupdator.update(progress) 27 | if content['type'] == 'image': 28 | pass 29 | elif content['type'] == "text": 30 | data = content['data'] 31 | md_content_list.append(data) 32 | return "\n".join(md_content_list) 33 | 34 | def pptx_to_pagelist(self, bits, pupdator: ConvProgressUpdator) -> list[Page]: 35 | with tempfile.TemporaryDirectory() as temp_path: 36 | 
temp_dir = Path(temp_path) 37 | media_dir = temp_dir / "media" 38 | media_dir.mkdir() 39 | file_path = temp_dir / "tmp.pptx" 40 | file_path.write_bytes(bits) 41 | pptx_extractor = PptxExtractor() 42 | pages = pptx_extractor.extract(file_path, "tmp", temp_dir, media_dir, True) 43 | pupdator.update(50) 44 | return pages 45 | 46 | 47 | if __name__ == '__main__': 48 | pupdator = FileBaseProgressUpdator("/tmp/p.txt") 49 | pptx = Pptx() 50 | logger.info( 51 | pptx.to_md(open(r"D:\project\20240514magic_doc\doc_ppt\doc\【英文-模板】Professional Pack Standard.pptx", "rb").read(), pupdator)) 52 | -------------------------------------------------------------------------------- /magic_doc/conv/docx_xml_parse.py: -------------------------------------------------------------------------------- 1 | import io 2 | import tempfile 3 | import zipfile 4 | import xml.etree.ElementTree as ET 5 | from pathlib import Path 6 | 7 | from loguru import logger 8 | 9 | from magic_doc.contrib.model import Content, Page 10 | from magic_doc.contrib.office.docx_extract import DocxExtractor 11 | from magic_doc.conv.base import BaseConv 12 | from magic_doc.progress.filepupdator import FileBaseProgressUpdator 13 | from magic_doc.progress.pupdator import ConvProgressUpdator 14 | 15 | 16 | class Docx(BaseConv): 17 | def __init__(self): 18 | super().__init__() 19 | 20 | def to_md(self, bits: bytes, pupdator: ConvProgressUpdator) -> str: 21 | page_list = self.docx_to_pagelist(bits, pupdator) 22 | md_content_list = [] 23 | for page in page_list: 24 | page_content_list = page['content_list'] 25 | total = len(page_content_list) 26 | for index, content in enumerate(page_content_list): 27 | progress = 50 + int(index / total * 50) 28 | pupdator.update(progress) 29 | if content['type'] == 'image': 30 | pass 31 | elif content['type'] in ["text", "md"]: 32 | data = content['data'] 33 | md_content_list.append(data) 34 | return "\n".join(md_content_list) 35 | 36 | def docx_to_pagelist(self, bits, pupdator: ConvProgressUpdator) -> list[Page]: 37 | with tempfile.TemporaryDirectory() as temp_path: 38 | temp_dir = Path(temp_path) 39 | media_dir = temp_dir / "media" 40 | media_dir.mkdir() 41 | file_path = temp_dir / "tmp.docx" 42 | file_path.write_bytes(bits) 43 | docx_extractor = DocxExtractor() 44 | pages = docx_extractor.extract(file_path, "tmp", temp_dir, media_dir, True) 45 | pupdator.update(50) 46 | return pages 47 | 48 | 49 | if __name__ == '__main__': 50 | pupdator = FileBaseProgressUpdator("/tmp/p.txt") 51 | docx = Docx() 52 | logger.info(docx.to_md(open(r"D:\project\20240514magic_doc\doc_ppt\doc\demo\文本+表+图.docx", "rb").read(), pupdator)) 53 | -------------------------------------------------------------------------------- /magic_doc/conv/doc_antiword.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | from pathlib import Path 4 | 5 | from loguru import logger 6 | 7 | from magic_doc.contrib.model import Page 8 | from magic_doc.contrib.office.doc import DocExtractor 9 | from magic_doc.conv.base import BaseConv 10 | from magic_doc.progress.filepupdator import FileBaseProgressUpdator 11 | from magic_doc.progress.pupdator import ConvProgressUpdator 12 | 13 | 14 | class Doc(BaseConv): 15 | 16 | def __init__(self): 17 | super().__init__() 18 | 19 | def to_md(self, bits: bytes, pupdator:ConvProgressUpdator) -> str: 20 | page_list = self.doc_to_pagelist(bits, pupdator) 21 | md_content_list = [] 22 | for page in page_list: 23 | page_content_list = page['content_list'] 24 | total = 
len(page_content_list) 25 | for index, content in enumerate(page_content_list): 26 | progress = 50 + int(index / total * 50) 27 | pupdator.update(progress) 28 | if content['type'] == 'image': 29 | pass 30 | elif content['type'] == "text": 31 | data = content['data'] 32 | md_content_list.append(data) 33 | return "\n".join(md_content_list) 34 | 35 | def doc_to_pagelist(self, bits, pupdator: ConvProgressUpdator) -> list[Page]: 36 | with tempfile.TemporaryDirectory() as temp_path: 37 | temp_dir = Path(temp_path) 38 | media_dir = temp_dir / "media" 39 | media_dir.mkdir() 40 | file_path = temp_dir / "tmp.doc" 41 | file_path.write_bytes(bits) 42 | doc_extractor = DocExtractor() 43 | cwd_path = os.path.dirname(os.path.abspath(__file__)) / Path("../bin/linux") 44 | bin_path = cwd_path / "antiword" 45 | os.chmod(bin_path, 0o755) 46 | page_list = doc_extractor.extract(file_path, "tmp", temp_dir, media_dir, True, cwd_path=cwd_path) 47 | pupdator.update(50) 48 | return page_list 49 | 50 | 51 | if __name__ == '__main__': 52 | pupdator = FileBaseProgressUpdator("/tmp/p.txt") 53 | doc = Doc() 54 | logger.info(doc.to_md(Path("/home/myhloli/文本+表+图1.doc").read_bytes(), pupdator)) 55 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | from pathlib import Path 3 | from magic_doc.libs.version import __version__ 4 | 5 | 6 | def parse_requirements(filename): 7 | with open(filename) as f: 8 | lines = f.read().splitlines() 9 | 10 | requires = [] 11 | 12 | for line in lines: 13 | if "http" in line: 14 | pkg_name_without_url = line.split("@")[0].strip() 15 | requires.append(pkg_name_without_url) 16 | else: 17 | requires.append(line) 18 | 19 | return requires 20 | 21 | 22 | if __name__ == "__main__": 23 | with Path(Path(__file__).parent, 24 | 'README.md').open(encoding='utf-8') as file: 25 | long_description = file.read() 26 | setup( 27 | name="fairy_doc", # project name 28 | version=__version__, # version is derived automatically from the git tag 29 | packages=find_packages() + ["magic_doc.bin", "magic_doc.resources", "magic_doc/contrib/magic_html/mmltex"], # include all packages 30 | package_data={ 31 | "magic_doc.bin": ["**"], # include every file under magic_doc/bin 32 | "magic_doc.resources": ["**"], # include every file under magic_doc/resources 33 | "magic_doc.contrib.office.formula": ["**"], # include every file under magic_doc/contrib/office/formula 34 | "magic_doc/contrib/magic_html/mmltex": ["**"], 35 | }, 36 | license='Apache 2.0', 37 | extras_require={ 38 | "gpu": ["paddlepaddle-gpu==2.6.1", "paddleocr==2.7.3", "magic-pdf[gpu]>=0.5.10"], 39 | "cpu": ["paddlepaddle==2.5.2", "paddleocr==2.7.3", "magic-pdf[cpu]>=0.5.10"], 40 | }, 41 | description='A lightweight toolbox to manipulate documents', 42 | long_description=long_description, 43 | long_description_content_type='text/markdown', 44 | install_requires=parse_requirements("requirements.txt"), # third-party dependencies 45 | url="https://github.com/InternLM/magic-doc", 46 | python_requires=">=3.10", # required Python version 47 | entry_points={ 48 | "console_scripts": [ 49 | "magic-doc=magic_doc.cli:cli_conv", 50 | "pdf2md=magic_doc.cli:pdf_cli" 51 | ], 52 | }, 53 | include_package_data=True, 54 | zip_safe=False, # do not install as a zipped egg; generally kept False 55 | ) 56 | -------------------------------------------------------------------------------- /README_zh-CN.md: -------------------------------------------------------------------------------- 1 |
2 |
3 | 4 | [![license](https://img.shields.io/github/license/magicpdf/Magic-Doc.svg)](https://github.com/magicpdf/Magic-Doc/tree/main/LICENSE) 5 | [![issue resolution](https://img.shields.io/github/issues-closed-raw/magicpdf/Magic-Doc)](https://github.com/magicpdf/Magic-Doc/issues) 6 | [![open issues](https://img.shields.io/github/issues-raw/magicpdf/Magic-Doc)](https://github.com/magicpdf/Magic-Doc/issues) 7 | 8 | [English](README.md) | [简体中文](README_zh-CN.md) 9 | 10 |
11 | 12 |
13 | 14 |
15 | 16 | 17 | ### 安装 18 | 前置依赖:python 3.10+ 19 | 20 | 安装依赖 21 | 22 | **linux/osx** 23 | 24 | ```bash 25 | apt-get/yum/brew install libreoffice 26 | ``` 27 | 28 | **windows** 29 | ```text 30 | 安装 libreoffice 31 | 将 "install_dir\LibreOffice\program" 添加到环境变量 PATH 32 | ``` 33 | 34 | 35 | 安装 Magic-Doc 36 | 37 | 38 | ```bash 39 | pip install fairy-doc[cpu] # 安装 cpu 版本 40 | 或 41 | pip install fairy-doc[gpu] # 安装 gpu 版本 42 | ``` 43 | 44 | 45 | ## 简介 46 | 47 | Magic-Doc 是一个轻量级、开源的用于将多种格式的文档(PPT/PPTX/DOC/DOCX/PDF)转化为 markdown 格式的工具。支持转换本地文档或者位于 AWS S3 上的文件。 48 | 49 | 50 | ## 使用示例 51 | 52 | ```python 53 | # for local file 54 | from magic_doc.docconv import DocConverter, S3Config 55 | converter = DocConverter(s3_config=None) 56 | markdown_content, time_cost = converter.convert("some_doc.pptx", conv_timeout=300) 57 | ``` 58 | 59 | ```python 60 | # for remote file located in aws s3 61 | from magic_doc.docconv import DocConverter, S3Config 62 | 63 | s3_config = S3Config(ak='${ak}', sk='${sk}', endpoint='${endpoint}') 64 | converter = DocConverter(s3_config=s3_config) 65 | markdown_content, time_cost = converter.convert("s3://some_bucket/some_doc.pptx", conv_timeout=300) 66 | ``` 67 | 68 | 69 | ## 性能 70 | 环境:AMD EPYC 7742 64-Core Processor, NVIDIA A100, Centos 7 71 | 72 | | 文件类型 | 转化速度 | 73 | | ------------------ | -------- | 74 | | PDF (digital) | 347 (page/s) | 75 | | PDF (ocr) | 2.7 (page/s) | 76 | | PPT | 20 (page/s) | 77 | | PPTX | 149 (page/s) | 78 | | DOC | 600 (page/s) | 79 | | DOCX | 1482 (page/s) | 80 | 81 | 82 | 83 | ## 开源许可证 84 | 85 | 该项目采用[Apache 2.0 开源许可证](LICENSE)。 86 | 87 |

🔼 Back to top
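A minimal usage sketch (an addition, not part of the README): besides PPT/PPTX/DOC/DOCX/PDF, the repository also ships an HTML-to-markdown converter, `magic_doc/conv/conv_html.py`, which wraps the `magic_html` extractors. The `Html` class, its `to_md` signature, and `FileBaseProgressUpdator` are taken from this repository's own sources; the file paths and URL below are illustrative placeholders.

```python
# Minimal sketch of HTML-to-markdown conversion (internal API, not a documented
# entry point; Html.to_md's signature is taken from magic_doc/conv/conv_html.py,
# and the paths/URL below are placeholders).
from magic_doc.conv.conv_html import Html
from magic_doc.progress.filepupdator import FileBaseProgressUpdator

pupdator = FileBaseProgressUpdator("/tmp/p.txt")  # conversion progress is written to this file
with open("some_page.html", "r", encoding="utf-8") as f:
    html_text = f.read()
# html_type selects the extractor: "article", "forum", or "weixin"
md_content = Html().to_md(html_text, pupdator, base_url="https://example.com/", html_type="article")
```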

88 | -------------------------------------------------------------------------------- /magic_doc/conv/conv_html.py: -------------------------------------------------------------------------------- 1 | import json 2 | from magic_doc.conv.base import BaseConv 3 | from magic_doc.progress.filepupdator import FileBaseProgressUpdator 4 | from magic_doc.contrib.magic_html import GeneralExtractor 5 | from magic_doc.progress.pupdator import ConvProgressUpdator 6 | from loguru import logger 7 | 8 | extractor = GeneralExtractor() 9 | 10 | 11 | class Html(BaseConv): 12 | 13 | def __init__(self): 14 | super().__init__() 15 | 16 | @logger.catch 17 | def to_md(self, html: str, pupdator: ConvProgressUpdator, **kwargs) -> str: 18 | """ 19 | Extract the main content area from an HTML page 20 | :param html: the HTML text 21 | :param kwargs: optional arguments 22 | base_url: URL of the page 23 | html_type: page type (3 supported) 24 | 1. article: article pages 25 | 2. forum: forum pages 26 | 3. weixin: WeChat articles 27 | :return: { 28 | "base_url": "https://example.com/", 29 | "drop_list": false, 30 | "html": " [the remainder of conv_html.py was lost when HTML-like markup was stripped during extraction] -------------------------------------------------------------------------------- /magic_doc/contrib/office/formula/omml/__init__.py: -------------------------------------------------------------------------------- [the opening lines of this file (its imports and the start of the _template definition) were likewise lost to markup stripping] 36 | 37 | $omml_xml 38 | """.format( 39 | _xmlns_str 40 | ) 41 | ) 42 | 43 | 44 | transform = None 45 | 46 | _xslt_filename = os.path.join( 47 | os.path.dirname(os.path.abspath(__file__)), "OMML2MML.XSL" 48 | ) 49 | 50 | 51 | def omml2mml(omml_xml): 52 | xml_content = _template.safe_substitute(omml_xml=omml_xml) 53 | tree = ET.fromstring(xml_content) 54 | global transform 55 | if not transform: 56 | transform = ET.XSLT(ET.parse(_xslt_filename)) 57 | return str(transform(tree)) 58 | 59 | 60 | def omml2tex(omml_xml): 61 | mml_xml = omml2mml(omml_xml) 62 | return mml2tex(mml_xml) 63 | -------------------------------------------------------------------------------- /magic_doc/conv/pdf.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | 3 | from loguru import logger 4 | from werkzeug.datastructures import FileStorage 5 | 6 | from magic_doc.contrib.pdf.pdf_extractor import PDFExtractor 7 | from magic_doc.conv.base import BaseConv 8 | from magic_doc.progress.filepupdator import FileBaseProgressUpdator 9 | from magic_doc.progress.pupdator import ConvProgressUpdator 10 | from magic_doc.conv.base import ParseFailed 11 | from magic_doc.conv.pdf_pp_structurev2 import Pdf as liteOcr 12 | 13 | class Pdf(BaseConv): 14 | def __init__(self, allowed_failure=True): 15 | self.allowed_failure = allowed_failure 16 | 17 | def to_md(self, bits: bytes | str, pupdator: ConvProgressUpdator) -> str: 18 | pdf_extractor = PDFExtractor() 19 | buf = BytesIO(bits) # type: ignore 20 | content = pdf_extractor.run("stream io data", FileStorage(buf, "fake.pdf")) 21 | arr = [] 22 | pupdator.update(0) 23 | 24 | N = len(content) 25 | progress_h = {N * i // 100: 1 for i in range(10, 100, 10)} 26 | for idx, page in enumerate(content): 27 | if idx in progress_h: 28 | pupdator.update(idx * 100 // N) 29 | for record in page.get("content_list", []): 30 | arr.append(record.get("data", "")) 31 | 32 | text_all = "" 33 | for content in arr: 34 | text_all += content 35 | def calculate_not_printable_rate(text): 36 | printable = sum(1 for c in text if c.isprintable()) 37 | total = len(text) 38 | if total == 0: 39 | return 0 # avoid division by zero 40 | return (total - printable) / total 41 | not_printable_rate = calculate_not_printable_rate(text_all) 42 | if not_printable_rate > 0.02: 43 | if self.allowed_failure: 44 | raise ParseFailed 45 | else: 46 | liteOcrPdf = liteOcr() 47 | return liteOcrPdf.to_md(bits, pupdator) 48 | else: 49 | pupdator.update(100) 50 | return "\n\n".join(arr) 51 | 52 | 53 | if __name__ == "__main__": 54 | if 1: 55 | with open("/opt/data/pdf/20240423/pdf_test2/ol006018w.pdf", "rb") as f: 56 | bits_data = f.read() 57 | parser = Pdf() 58 | md_content = parser.to_md(bits_data, FileBaseProgressUpdator("debug/progress.txt")) 59 | 60 | with open("debug/pdf2md.md", "w") as f: 61 | f.write(md_content) 62 | -------------------------------------------------------------------------------- /test/test_cli/test_cli.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | from lib import common 4 | import logging 5 | 6 | code_path = "magic_doc" # assumed code path (normally set in a config file) 7 | output_path = "magic_doc/datas_new" # output directory 8 | 9 | class TestDocConversion: 10 | 11 | def test_convert_doc_to_md(self): 12 | """ 13 | Convert a DOC file to Markdown 14 | """ 15 | file_path = os.path.join(code_path, "datas/test01.doc") 16 | cmd = f"python {code_path}/cli.py --file-path {file_path} --output {output_path}" 17 | logging.info(cmd) 18 | 
common.check_shell(cmd) 19 | # further checks could be added here to validate the conversion result 20 | 21 | def test_convert_docx_to_md(self): 22 | """ 23 | Convert a DOCX file to Markdown 24 | """ 25 | file_path = os.path.join(code_path, "datas/test02.docx") 26 | cmd = f"python {code_path}/cli.py --file-path {file_path} --output {output_path}" 27 | logging.info(cmd) 28 | common.check_shell(cmd) 29 | # further checks could be added here to validate the conversion result 30 | 31 | def test_convert_html_to_md(self): 32 | """ 33 | Convert an HTML file to Markdown 34 | """ 35 | file_path = os.path.join(code_path, "datas/test03.html") 36 | cmd = f"python {code_path}/cli.py --file-path {file_path} --output {output_path}" 37 | logging.info(cmd) 38 | common.check_shell(cmd) 39 | # further checks could be added here to validate the conversion result 40 | 41 | def test_convert_pdf_to_md(self): 42 | """ 43 | Convert a PDF file to Markdown 44 | """ 45 | file_path = os.path.join(code_path, "datas/test04.pdf") 46 | cmd = f"python {code_path}/cli.py --file-path {file_path} --output {output_path}" 47 | logging.info(cmd) 48 | common.check_shell(cmd) 49 | # further checks could be added here to validate the conversion result 50 | 51 | def test_convert_ppt_to_md(self): 52 | """ 53 | Convert a PPT file to Markdown 54 | """ 55 | file_path = os.path.join(code_path, "datas/test05.ppt") 56 | cmd = f"python {code_path}/cli.py --file-path {file_path} --output {output_path}" 57 | logging.info(cmd) 58 | common.check_shell(cmd) 59 | # further checks could be added here to validate the conversion result 60 | 61 | def test_convert_pptx_to_md(self): 62 | """ 63 | Convert a PPTX file to Markdown 64 | """ 65 | file_path = os.path.join(code_path, "datas/test06.pptx") 66 | cmd = f"python {code_path}/cli.py --file-path {file_path} --output {output_path}" 67 | logging.info(cmd) 68 | common.check_shell(cmd) 69 | # further checks could be added here to validate the conversion result 70 | 71 | if __name__ == "__main__": 72 | pytest.main(["-v", __file__]) -------------------------------------------------------------------------------- /magic_doc/contrib/test_data/url_service/run.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, Response, request 2 | import requests 3 | 4 | app = Flask(__name__) 5 | 6 | 7 | @app.route("/path/<path:subpath>") 8 | def handle_path(subpath): 9 | include_content_type = request.args.get("ct", "false").lower() == "true" 10 | include_content_disposition = request.args.get("cd", "false").lower() == "true" 11 | 12 | if subpath.endswith(".html"): 13 | content_type, disposition = "text/html", "inline" 14 | elif subpath.endswith(".pdf"): 15 | content_type, disposition = ( 16 | "application/pdf", 17 | 'attachment; filename="document.pdf"', 18 | ) 19 | elif subpath.endswith(".doc"): 20 | content_type, disposition = ( 21 | "application/msword", 22 | 'attachment; filename="document.doc"', 23 | ) 24 | elif subpath.endswith(".docx"): 25 | content_type, disposition = ( 26 | "application/vnd.openxmlformats-officedocument.wordprocessingml.document", 27 | 'attachment; filename="document.docx"', 28 | ) 29 | elif subpath.endswith(".ppt"): 30 | content_type, disposition = ( 31 | "application/vnd.ms-powerpoint", 32 | 'attachment; filename="presentation.ppt"', 33 | ) 34 | elif subpath.endswith(".pptx"): 35 | content_type, disposition = ( 36 | "application/vnd.openxmlformats-officedocument.presentationml.presentation", 37 | 'attachment; filename="presentation.pptx"', 38 | ) 39 | elif subpath.endswith(".jpg") or subpath.endswith(".jpeg"): 40 | content_type, disposition = "image/jpeg", "inline" 41 | elif subpath.endswith(".png"): 42 | content_type, disposition = "image/png", "inline" 43 | else: 44 | content_type = "text/plain" 45 | disposition = 'attachment; filename="default.txt"' 46 | 47 | response = Response(f"Requested 
{subpath}") 48 | if include_content_type: 49 | response.headers["Content-Type"] = content_type 50 | if include_content_disposition: 51 | response.headers["Content-Disposition"] = disposition 52 | 53 | return response 54 | 55 | 56 | if __name__ == "__main__": 57 | # app.run(debug=True, port=6500) 58 | res = requests.get( 59 | "https://filesamples.com/samples/document/doc/sample2.doc", 60 | timeout=10, 61 | stream=True, 62 | ) 63 | if res.status_code not in [200]: 64 | res.raise_for_status() 65 | print(res.headers.get("Content-Type")) 66 | -------------------------------------------------------------------------------- /magic_doc/contrib/magic_html/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | from urllib.parse import urlparse 4 | from magic_doc.contrib.magic_html.extractors.article_extractor import ArticleExtractor 5 | from magic_doc.contrib.magic_html.extractors.weixin_extractor import WeixinExtractor 6 | from magic_doc.contrib.magic_html.extractors.forum_extractor import ForumExtractor 7 | from magic_doc.contrib.magic_html.extractors.custom_extractor import CustomExtractor 8 | 9 | 10 | class GeneralExtractor: 11 | def __init__(self, config_path=""): 12 | if config_path: 13 | """ 14 | demo rule config file json: 15 | { 16 | "www.***.com": { 17 | "clean": ["//script", "//style"], 18 | "title": { 19 | "mode": "xpath", 20 | "value": "//div[@class='media-body']/h4/text()" 21 | }, 22 | "content": { 23 | "mode": "xpath", 24 | "value": "//div[@class='message break-all']" 25 | } 26 | } 27 | } 28 | """ 29 | try: 30 | with open(config_path, 'r', encoding='utf-8') as f: 31 | self.rule = json.loads(f.read()) 32 | except: 33 | pass 34 | else: 35 | self.rule = {} 36 | 37 | def extract(self, html="", **kwargs) -> dict: 38 | base_url = kwargs.get("base_url", "") 39 | html_type = kwargs.pop("html_type", None) 40 | if html_type: 41 | if html_type == "forum": 42 | return ForumExtractor().extract(html=html, **kwargs) 43 | elif html_type == "weixin": 44 | return WeixinExtractor().extract(html=html, **kwargs) 45 | if base_url: 46 | netloc = urlparse(base_url).netloc 47 | if netloc in self.rule: 48 | try: 49 | new_kwargs = dict() 50 | new_kwargs["rule"] = self.rule[netloc] 51 | new_kwargs.update(kwargs) 52 | return CustomExtractor().extract(html=html, **new_kwargs) 53 | except: 54 | # 当自定义规则不能覆盖站点所有板块时,使用 55 | return ArticleExtractor().extract(html=html, **kwargs) 56 | if netloc == "mp.weixin.qq.com": 57 | return WeixinExtractor().extract(html=html, **kwargs) 58 | return ArticleExtractor().extract(html=html, **kwargs) 59 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 |
3 |
4 | 5 | [![license](https://img.shields.io/github/license/magicpdf/Magic-Doc.svg)](https://github.com/magicpdf/Magic-Doc/tree/main/LICENSE) 6 | [![issue resolution](https://img.shields.io/github/issues-closed-raw/magicpdf/Magic-Doc)](https://github.com/magicpdf/Magic-Doc/issues) 7 | [![open issues](https://img.shields.io/github/issues-raw/magicpdf/Magic-Doc)](https://github.com/magicpdf/Magic-Doc/issues) 8 | 9 | [English](README.md) | [简体中文](README_zh-CN.md) 10 | 11 | 
12 | 13 |
14 | 15 |
16 | 17 | ### Install 18 | 19 | Prerequisites: python 3.10+ 20 | 21 | Install Dependencies 22 | 23 | **linux/osx** 24 | 25 | ```bash 26 | apt-get/yum/brew install libreoffice 27 | ``` 28 | 29 | **windows** 30 | ```text 31 | install libreoffice 32 | append "install_dir\LibreOffice\program" to the PATH environment variable 33 | ``` 34 | 35 | 36 | Install Magic-Doc 37 | 38 | 39 | ```bash 40 | pip install fairy-doc[cpu] # cpu version 41 | or 42 | pip install fairy-doc[gpu] # gpu version 43 | ``` 44 | 45 | 46 | 47 | ## Introduction 48 | 49 | Magic-Doc is a lightweight, open-source tool that converts documents in multiple formats (PPT/PPTX/DOC/DOCX/PDF) to markdown. It supports both local files and files on S3. 50 | 51 | 52 | ## Example 53 | 54 | ```python 55 | # for local file 56 | from magic_doc.docconv import DocConverter, S3Config 57 | converter = DocConverter(s3_config=None) 58 | markdown_content, time_cost = converter.convert("some_doc.pptx", conv_timeout=300) 59 | ``` 60 | 61 | ```python 62 | # for remote file located in aws s3 63 | from magic_doc.docconv import DocConverter, S3Config 64 | 65 | s3_config = S3Config(ak='${ak}', sk='${sk}', endpoint='${endpoint}') 66 | converter = DocConverter(s3_config=s3_config) 67 | markdown_content, time_cost = converter.convert("s3://some_bucket/some_doc.pptx", conv_timeout=300) 68 | ``` 69 | 70 | ## Performance 71 | 72 | ENV: AMD EPYC 7742 64-Core Processor, NVIDIA A100, Centos 7 73 | 74 | | File Type | Speed | 75 | | ------------------ | -------- | 76 | | PDF (digital) | 347 (page/s) | 77 | | PDF (ocr) | 2.7 (page/s) | 78 | | PPT | 20 (page/s) | 79 | | PPTX | 149 (page/s) | 80 | | DOC | 600 (page/s) | 81 | | DOCX | 1482 (page/s) | 82 | 83 | 84 | ### All Thanks To Our Contributors: 85 | 86 | ![image](https://github.com/magicpdf/Magic-Doc/blob/main/assets/contributor.png) 87 | 88 | ## License 89 | 90 | This project is released under the [Apache 2.0 license](LICENSE). 91 | 92 | 

🔼 Back to top
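A minimal sketch (an addition, not part of the README): besides the `DocConverter` wrapper shown above, the per-format converter classes under `magic_doc/conv` can be driven directly, mirroring the `__main__` blocks scattered through this repository. The `Pdf` class, its `allowed_failure` flag, and `FileBaseProgressUpdator` come from `magic_doc/conv/pdf.py` and `magic_doc/progress/filepupdator.py`; the file paths are illustrative placeholders.

```python
# Minimal sketch of driving the internal PDF converter directly
# (signatures taken from magic_doc/conv/pdf.py; not the documented public API).
from pathlib import Path

from magic_doc.conv.pdf import Pdf
from magic_doc.progress.filepupdator import FileBaseProgressUpdator

pupdator = FileBaseProgressUpdator("/tmp/progress.txt")  # progress percentage is written to this file
pdf = Pdf(allowed_failure=False)  # fall back to the lite OCR pipeline on mostly non-printable text instead of raising ParseFailed
md_content = pdf.to_md(Path("some_doc.pdf").read_bytes(), pupdator)
Path("some_doc.md").write_text(md_content, encoding="utf-8")
```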

93 | -------------------------------------------------------------------------------- /magic_doc/contrib/pdf/pdf_extractor.py: -------------------------------------------------------------------------------- 1 | import random 2 | import fitz 3 | 4 | from magic_doc.contrib.model import ( 5 | ExtractResponse, 6 | Extractor, 7 | Page, 8 | Content, 9 | ) 10 | from magic_doc.contrib.wrapper_exceptions import NotSupportOcrPDFException 11 | 12 | from werkzeug.datastructures import FileStorage 13 | from loguru import logger 14 | 15 | 16 | class PDFExtractor(Extractor): 17 | def __init__(self) -> None: 18 | super().__init__() 19 | 20 | def setup(self): 21 | pass 22 | 23 | def is_digital(self, doc, check_page=10, text_len_thrs=100): 24 | sample_page_num = min(check_page, doc.page_count) 25 | page_ids = random.sample(range(doc.page_count), sample_page_num) 26 | page_text_len = [ 27 | len(doc[pno].get_text("text")) > text_len_thrs for pno in page_ids 28 | ] 29 | if any(page_text_len): 30 | return True 31 | return False 32 | 33 | # Guess kimi implementation 34 | def get_text_with_pymupdf(self, doc): 35 | pages = [] 36 | page_no = 0 37 | for page in doc: 38 | content_list = [] 39 | for block in page.get_text("blocks"): 40 | x0, y0, x1, y1, block_text, block_no, block_type = block 41 | lf_count = 0 42 | for ch in block_text: 43 | if ch == "\n": 44 | lf_count += 1 45 | block_text = ( 46 | block_text.replace("-\n", "") 47 | .replace("´\n", "´") 48 | .replace(" \n", " ") 49 | ) 50 | if lf_count >= 2: 51 | block_text = block_text.replace("\n", " ").strip() 52 | if len(block_text.strip()) == 0: 53 | continue 54 | content_list.append( 55 | Content( 56 | type="text", 57 | data=block_text, 58 | ) 59 | ) 60 | pages.append(Page(page_no=page_no, content_list=content_list)) 61 | page_no += 1 62 | return pages 63 | 64 | def run( 65 | self, file_parse_id: str, r: FileStorage, skip_image: bool = True 66 | ) -> ExtractResponse: 67 | file_content = r.stream.read() 68 | with fitz.open(stream=file_content) as doc: 69 | if self.is_digital(doc): 70 | logger.info(f"{file_parse_id} is digital pdf") 71 | return self.get_text_with_pymupdf(doc) 72 | raise NotSupportOcrPDFException 73 | 74 | 75 | if __name__ == "__main__": 76 | pdf_extractor = PDFExtractor() 77 | with open("magic_doc/contrib/test_data/pdf/test.pdf", "rb") as f: 78 | logger.info(pdf_extractor.run("test", FileStorage(f, filename="STL.pdf"))) 79 | -------------------------------------------------------------------------------- /magic_doc/conv/doc_libreoffice.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | from pathlib import Path 4 | from subprocess import Popen 5 | 6 | from loguru import logger 7 | 8 | from magic_doc.contrib.model import Page 9 | from magic_doc.contrib.office.docx_extract import DocxExtractor 10 | from magic_doc.conv.base import BaseConv 11 | from magic_doc.progress.filepupdator import FileBaseProgressUpdator 12 | from magic_doc.progress.pupdator import ConvProgressUpdator 13 | 14 | 15 | class Doc(BaseConv): 16 | 17 | def __init__(self): 18 | super().__init__() 19 | 20 | def to_md(self, bits: bytes, pupdator: ConvProgressUpdator) -> str: 21 | page_list = self.doc_to_pagelist(bits, pupdator) 22 | md_content_list = [] 23 | for page in page_list: 24 | page_content_list = page['content_list'] 25 | total = len(page_content_list) 26 | for index, content in enumerate(page_content_list): 27 | progress = 50 + int(index / total * 50) 28 | # logger.info(f"progress: {progress}") 29 | 
pupdator.update(progress) 30 | if content['type'] == 'image': 31 | pass 32 | elif content['type'] in ["text", "md"]: 33 | data = content['data'] 34 | md_content_list.append(data) 35 | return "\n".join(md_content_list) 36 | 37 | def doc_to_docx(self, doc_path: str, dir_path: str) -> str: 38 | cmd = f'soffice --headless --convert-to docx "{doc_path}" --outdir "{dir_path}"' 39 | logger.info(cmd) 40 | process = Popen(cmd, shell=True) 41 | process.wait() 42 | fname = str(Path(doc_path).stem) 43 | docx_path = os.path.join(os.path.dirname(doc_path), f'{fname}.docx') 44 | if not os.path.exists(docx_path): 45 | # logger.error(f"> !!! File conversion failed {doc_path} ==> {docx_path}") 46 | raise Exception(f"> !!! File conversion failed {doc_path} ==> {docx_path}") 47 | else: 48 | return docx_path 49 | 50 | def doc_to_pagelist(self, bits, pupdator: ConvProgressUpdator) -> list[Page]: 51 | with tempfile.TemporaryDirectory() as temp_path: 52 | temp_dir = Path(temp_path) 53 | media_dir = temp_dir / "media" 54 | media_dir.mkdir() 55 | file_path = temp_dir / "tmp.doc" 56 | file_path.write_bytes(bits) 57 | docx_file_path = self.doc_to_docx(str(file_path), str(temp_path)) 58 | pupdator.update(50) 59 | docx_extractor = DocxExtractor() 60 | pages = docx_extractor.extract(Path(docx_file_path), "tmp", temp_dir, media_dir, True) 61 | pupdator.update(80) 62 | return pages 63 | 64 | 65 | if __name__ == '__main__': 66 | pupdator = FileBaseProgressUpdator("/tmp/p.txt") 67 | doc = Doc() 68 | logger.info(doc.to_md(Path(r"D:\project\20240514magic_doc\doc_ppt\doc\demo\文本+表+图1.doc").read_bytes(), pupdator)) 69 | -------------------------------------------------------------------------------- /magic_doc/conv/ppt_libreoffice.py: -------------------------------------------------------------------------------- 1 | import os 2 | from subprocess import Popen 3 | import tempfile 4 | from pathlib import Path 5 | 6 | from loguru import logger 7 | 8 | from magic_doc.contrib.model import Page 9 | from magic_doc.contrib.office.pptx_extract import PptxExtractor 10 | from magic_doc.conv.base import BaseConv 11 | from magic_doc.progress.filepupdator import FileBaseProgressUpdator 12 | from magic_doc.progress.pupdator import ConvProgressUpdator 13 | 14 | 15 | class Ppt(BaseConv): 16 | def __init__(self): 17 | super().__init__() 18 | 19 | def to_md(self, bits: bytes, pupdator: ConvProgressUpdator) -> str: 20 | page_list = self.ppt_to_pagelist(bits, pupdator) 21 | md_content_list = [] 22 | total = len(page_list) 23 | for index, page in enumerate(page_list): 24 | progress = 80 + int(index / total * 20) 25 | # logger.info(f"progress: {progress}") 26 | page_content_list = page['content_list'] 27 | for content in page_content_list: 28 | pupdator.update(progress) 29 | if content['type'] == 'image': 30 | pass 31 | elif content['type'] == "text": 32 | data = content['data'] 33 | md_content_list.append(data) 34 | return "\n".join(md_content_list) 35 | 36 | def ppt_to_pptx(self, ppt_path: str, dir_path: str) -> str: 37 | cmd = f'soffice --headless --convert-to pptx "{ppt_path}" --outdir "{dir_path}"' 38 | logger.info(cmd) 39 | process = Popen(cmd, shell=True) 40 | process.wait() 41 | fname = str(Path(ppt_path).stem) 42 | pptx_path = os.path.join(os.path.dirname(ppt_path), f'{fname}.pptx') 43 | if not os.path.exists(pptx_path): 44 | # logger.error(f"> !!! File conversion failed {ppt_path} ==> {pptx_path}") 45 | raise Exception(f"> !!! 
File conversion failed {ppt_path} ==> {pptx_path}") 46 | else: 47 | return pptx_path 48 | 49 | def ppt_to_pagelist(self, bits, pupdator: ConvProgressUpdator) -> list[Page]: 50 | with tempfile.TemporaryDirectory() as temp_path: 51 | temp_dir = Path(temp_path) 52 | media_dir = temp_dir / "media" 53 | media_dir.mkdir() 54 | file_path = temp_dir / "tmp.ppt" 55 | file_path.write_bytes(bits) 56 | pptx_file_path = self.ppt_to_pptx(str(file_path), str(temp_path)) 57 | pupdator.update(50) 58 | pptx_extractor = PptxExtractor() 59 | pages = pptx_extractor.extract(Path(pptx_file_path), "tmp", temp_dir, media_dir, True) 60 | pupdator.update(80) 61 | return pages 62 | 63 | 64 | if __name__ == '__main__': 65 | pupdator = FileBaseProgressUpdator("/tmp/p.txt") 66 | ppt = Ppt() 67 | logger.info( 68 | ppt.to_md( 69 | open(r"D:\project\20240514magic_doc\doc_ppt\doc\【英文-课件】MIT15_082JF10_lec10.3MB.ppt", "rb").read(), pupdator)) 70 | -------------------------------------------------------------------------------- /magic_doc/contrib/office/__init__.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | 3 | from abc import ABC, abstractmethod 4 | 5 | from concurrent.futures import Future 6 | from concurrent.futures import ThreadPoolExecutor 7 | from datetime import datetime 8 | from pathlib import Path 9 | from typing import Tuple 10 | 11 | from loguru import logger 12 | from werkzeug.datastructures import FileStorage 13 | 14 | # from pedia_document_parser.config import Config 15 | from magic_doc.contrib.model import ExtractResponse, Extractor 16 | # from pedia_document_parser.s3.client import S3Client 17 | 18 | 19 | class OfficeExtractor(Extractor, ABC): 20 | def __init__(self) -> None: 21 | super().__init__() 22 | # self.config = Config() 23 | self.tpe = ThreadPoolExecutor(max_workers=30) 24 | self.counter = {} 25 | self.tmp_dir = Path("/tmp") 26 | self.max_text_count = 50_0000 27 | 28 | # def generate_img_path(self, id: str, image_name: str) -> str: 29 | # return f"s3://{self.config.s3_bucket}/{datetime.today().strftime('%Y-%m-%d')}/{id}/{image_name}" 30 | # 31 | # def upload(self, id: str, s3_path: str, path: Path) -> Tuple[str, str]: 32 | # cli = S3Client(self.config.s3_ak, self.config.s3_sk, self.config.s3_ep) 33 | # cli.upload_file(s3_path, path.absolute().as_posix()) 34 | # return (id, s3_path) 35 | 36 | # def upload_background(self, id: str, img_map: dict[Path, str]): 37 | # if len(img_map) == 0: 38 | # self.clean_up(id) 39 | # return 40 | # 41 | # self.counter[id] = len(img_map) 42 | # for src, dest in img_map.items(): 43 | # fut = self.tpe.submit(self.upload, id, dest, src) 44 | # fut.add_done_callback(self.on_upload_succ) 45 | 46 | def clean_up(self, id: str): 47 | dir = self.get_dir_by_id(id).absolute().as_posix() 48 | shutil.rmtree(dir) 49 | self.counter.pop(id, 0) 50 | logger.debug(f"del {dir}") 51 | 52 | def on_upload_succ(self, fut: Future[Tuple[str, str]]) -> None: 53 | id, s3_path = fut.result() 54 | logger.debug(f"upload {s3_path} succ") 55 | 56 | self.counter[id] -= 1 57 | if self.counter[id] == 0: 58 | self.clean_up(id) 59 | 60 | def wait_all(self): 61 | self.tpe.shutdown(wait=True) 62 | 63 | def get_dir_by_id(self, id: str) -> Path: 64 | return self.tmp_dir.joinpath(id) 65 | 66 | def run(self, id: str, r: FileStorage, skip_image: bool = True) -> ExtractResponse: 67 | dir = self.get_dir_by_id(id) 68 | 69 | dir.mkdir() 70 | media_dir = dir.joinpath("media") 71 | media_dir.mkdir() 72 | 73 | try: 74 | return self.extract(r, id, dir, media_dir, 
skip_image) 75 | except Exception as e: 76 | self.clean_up(id) 77 | raise e 78 | 79 | @abstractmethod 80 | def extract( 81 | self, 82 | r: FileStorage | Path, 83 | id: str, 84 | dir: Path, 85 | media_dir: Path, 86 | skip_image: bool, 87 | ) -> ExtractResponse: 88 | pass 89 | -------------------------------------------------------------------------------- /magic_doc/bin/linux/share/antiword/Example: -------------------------------------------------------------------------------- 1 | # An example of a fontnames translation table 2 | # 3 | # MS-Word fontname, Italic, Bold, Acorn fontname, Special 4 | Arial, 0, 0, Homerton.Medium, 0 5 | Arial, 0, 1, Homerton.Bold, 0 6 | Arial, 1, 0, Homerton.Medium.Oblique,0 7 | Arial, 1, 1, Homerton.Bold.Oblique, 0 8 | Arial Black, 0, 0, Homerton.Medium, 0 9 | Arial Black, 0, 1, Homerton.Bold, 0 10 | Arial Black, 1, 0, Homerton.Medium.Oblique,0 11 | Arial Black, 1, 1, Homerton.Bold.Oblique, 0 12 | AvantGarde, 0, 0, Clare.Medium, 0 13 | AvantGarde, 0, 1, Clare.Demi, 0 14 | AvantGarde, 1, 0, Clare.Medium.Oblique, 0 15 | AvantGarde, 1, 1, Clare.Demi.Oblique, 0 16 | Bookman, 0, 0, Robinson.Light, 0 17 | Bookman, 0, 1, Robinson.Demi, 0 18 | Bookman, 1, 0, Robinson.Light.Italic, 0 19 | Bookman, 1, 1, Robinson.Demi.Italic, 0 20 | Bookman Old Style, 0, 0, Robinson.Light, 0 21 | Bookman Old Style, 0, 1, Robinson.Demi, 0 22 | Bookman Old Style, 1, 0, Robinson.Light.Italic, 0 23 | Bookman Old Style, 1, 1, Robinson.Demi.Italic, 0 24 | Courier, 0, 0, Corpus.Medium, 0 25 | Courier, 0, 1, Corpus.Bold, 0 26 | Courier, 1, 0, Corpus.Medium.Oblique, 0 27 | Courier, 1, 1, Corpus.Bold.Oblique, 0 28 | Courier New, 0, 0, Corpus.Medium, 0 29 | Courier New, 0, 1, Corpus.Bold, 0 30 | Courier New, 1, 0, Corpus.Medium.Oblique, 0 31 | Courier New, 1, 1, Corpus.Bold.Oblique, 0 32 | Fixedsys, 0, 0, Corpus.Medium, 0 33 | Fixedsys, 0, 1, Corpus.Bold, 0 34 | Fixedsys, 1, 0, Corpus.Medium.Oblique, 0 35 | Fixedsys, 1, 1, Corpus.Bold.Oblique, 0 36 | Helvetica, 0, 0, Homerton.Medium, 0 37 | Helvetica, 0, 1, Homerton.Bold, 0 38 | Helvetica, 1, 0, Homerton.Medium.Oblique,0 39 | Helvetica, 1, 1, Homerton.Bold.Oblique, 0 40 | Lucida Console, 0, 0, Corpus.Medium, 0 41 | Lucida Console, 0, 1, Corpus.Bold, 0 42 | Lucida Console, 1, 0, Corpus.Medium.Oblique, 0 43 | Lucida Console, 1, 1, Corpus.Bold.Oblique, 0 44 | Palatino, 0, 0, Pembroke.Medium, 0 45 | Palatino, 0, 1, Pembroke.Bold, 0 46 | Palatino, 1, 0, Pembroke.Medium.Italic, 0 47 | Palatino, 1, 1, Pembroke.Bold.Italic, 0 48 | Swiss, 0, 0, Homerton.Medium, 0 49 | Swiss, 0, 1, Homerton.Bold, 0 50 | Swiss, 1, 0, Homerton.Medium.Oblique,0 51 | Swiss, 1, 1, Homerton.Bold.Oblique, 0 52 | Symbol, 0, 0, Sidney, 1 53 | Symbol, 0, 1, Sidney, 1 54 | Symbol, 1, 0, Sidney, 1 55 | Symbol, 1, 1, Sidney, 1 56 | Times, 0, 0, Trinity.Medium, 0 57 | Times, 0, 1, Trinity.Bold, 0 58 | Times, 1, 0, Trinity.Medium.Italic, 0 59 | Times, 1, 1, Trinity.Bold.Italic, 0 60 | Times New Roman, 0, 0, Trinity.Medium, 0 61 | Times New Roman, 0, 1, Trinity.Bold, 0 62 | Times New Roman, 1, 0, Trinity.Medium.Italic, 0 63 | Times New Roman, 1, 1, Trinity.Bold.Italic, 0 64 | Times Roman, 0, 0, Trinity.Medium, 0 65 | Times Roman, 0, 1, Trinity.Bold, 0 66 | Times Roman, 1, 0, Trinity.Medium.Italic, 0 67 | Times Roman, 1, 1, Trinity.Bold.Italic, 0 68 | Univers, 0, 0, Homerton.Medium, 0 69 | Univers, 0, 1, Homerton.Bold, 0 70 | Univers, 1, 0, Homerton.Medium.Oblique,0 71 | Univers, 1, 1, Homerton.Bold.Oblique, 0 72 | ZapfDingbats, 0, 0, Selwyn, 2 73 | ZapfDingbats, 0, 1, Selwyn, 2 74 | 
ZapfDingbats, 1, 0, Selwyn, 2 75 | ZapfDingbats, 1, 1, Selwyn, 2 76 | # All the other fonts 77 | *, 0, 0, Trinity.Medium, 0 78 | *, 0, 1, Trinity.Bold, 0 79 | *, 1, 0, Trinity.Medium.Italic, 0 80 | *, 1, 1, Trinity.Bold.Italic, 0 81 | -------------------------------------------------------------------------------- /magic_doc/conv/pdf_pp_structurev2.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import os 3 | 4 | from magic_pdf.dict2md.ocr_mkcontent import union_make 5 | from magic_pdf.libs.MakeContentConfig import MakeMode, DropMode 6 | from magic_pdf.libs.json_compressor import JsonCompressor 7 | from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze 8 | from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter 9 | from magic_pdf.pipe.UNIPipe import UNIPipe 10 | 11 | from loguru import logger 12 | 13 | from magic_doc.conv.base import BaseConv 14 | from magic_doc.progress.filepupdator import FileBaseProgressUpdator 15 | from magic_doc.progress.pupdator import ConvProgressUpdator 16 | from magic_doc.utils.null_writer import NullWriter 17 | from magic_doc.common.default_config import DEFAULT_CONFIG 18 | 19 | NULL_IMG_DIR = "/tmp" 20 | 21 | class SingletonModelWrapper: 22 | 23 | def __new__(cls): 24 | if not hasattr(cls, "instance"): 25 | from magic_doc.model.doc_analysis_by_pp import PaddleDocAnalysis 26 | cls.instance = super(SingletonModelWrapper, cls).__new__(cls) 27 | cls.instance.model = PaddleDocAnalysis(model_load_on_each_gpu_count=int(DEFAULT_CONFIG["pdf"]["fast"]["liteocrmodelinstance"])) 28 | return cls.instance 29 | 30 | def __call__(self, bytes: bytes): 31 | from magic_pdf.model.doc_analyze_by_custom_model import load_images_from_pdf 32 | images = load_images_from_pdf(bytes, dpi=200) 33 | return self.model(images) # type: ignore 34 | 35 | 36 | class Pdf(BaseConv): 37 | def to_md(self, bits: bytes | str, pupdator: ConvProgressUpdator) -> str: 38 | model = SingletonModelWrapper() 39 | model_list = model(bits) 40 | pupdator.update(50) 41 | jso_useful_key = { 42 | "_pdf_type": "ocr", 43 | "model_list": model_list, 44 | } 45 | image_writer = NullWriter() 46 | pipe = UNIPipe(bits, jso_useful_key, image_writer, is_debug=True) # type: ignore 47 | # pipe.pipe_classify() # no need to re-classify for the default OCR pipe; skipping it saves time 48 | pipe.pipe_parse() 49 | pupdator.update(100) 50 | 51 | pdf_mid_data = JsonCompressor.decompress_json(pipe.get_compress_pdf_mid_data()) 52 | pdf_info_list = pdf_mid_data["pdf_info"] 53 | md_content = union_make(pdf_info_list, MakeMode.NLP_MD, DropMode.NONE, NULL_IMG_DIR) 54 | return md_content # type: ignore 55 | 56 | def to_mid_result(self, image_writer: AbsReaderWriter, bits: bytes | str, pupdator: ConvProgressUpdator) \ 57 | -> list[dict] | dict: 58 | model = SingletonModelWrapper() 59 | pupdator.update(0) 60 | model_list = model(bits) 61 | pupdator.update(50) 62 | jso_useful_key = { 63 | "_pdf_type": "ocr", 64 | "model_list": model_list, 65 | } 66 | 67 | pipe = UNIPipe(bits, jso_useful_key, image_writer, is_debug=True) # type: ignore 68 | # pipe.pipe_classify() 69 | pipe.pipe_parse() 70 | pupdator.update(100) 71 | 72 | pdf_mid_data = JsonCompressor.decompress_json(pipe.get_compress_pdf_mid_data()) 73 | pdf_info_list = pdf_mid_data["pdf_info"] 74 | return pdf_info_list 75 | 76 | 77 | if __name__ == "__main__": 78 | pupdator = FileBaseProgressUpdator("/tmp/p.txt") 79 | pdf = Pdf() 80 | logger.info( 81 | 
pdf.to_md(Path(r"D:\project\20231108code-clean\linshixuqiu\pdf_dev\新模型\j.sna.2004.11.030.pdf").read_bytes(), pupdator)) 82 | -------------------------------------------------------------------------------- /magic_doc/contrib/magic_html/mmltex/README: -------------------------------------------------------------------------------- 1 | README for the XSLT MathML Library 2 | 3 | XSLT MathML Library is a set of XSLT stylesheets to transform 4 | MathML 2.0 to LaTeX. 5 | 6 | For more information, see 7 | http://www.raleigh.ru/MathML/mmltex/index.php?lang=en 8 | 9 | Manifest 10 | -------- 11 | 12 | README this file 13 | mmltex.xsl 14 | tokens.xsl 15 | glayout.xsl 16 | scripts.xsl 17 | tables.xsl 18 | entities.xsl 19 | cmarkup.xsl 20 | 21 | Use 22 | --- 23 | 24 | There are two ways of using the library: 25 | 26 | * Use a local copy of the library. 27 | 28 | 1. Download the distribution (see below). 29 | 30 | 2. Unpack the distribution, using unzip. 31 | 32 | 3. In your stylesheet import or include either the main 33 | stylesheet, mmltex.xsl, or the stylesheet module you 34 | wish to use, such as tokens.xsl. This example assumes 35 | that the distribution has been extracted into the same 36 | directory as your own stylesheet: 37 | 38 | 39 | 40 | * Import or include either the main stylesheet, or the 41 | stylesheet module you wish to use, directly from the library 42 | website; http://www.raleigh.ru/MathML/mmltex/. For example: 43 | 44 | 45 | 46 | Obtaining The Library 47 | --------------------- 48 | 49 | The XSLT MathML Library is available for download as: 50 | 51 | * Zip file: http://www.raleigh.ru/MathML/mmltex/mmltex.zip 52 | 53 | Copyright 54 | --------- 55 | 56 | Copyright (C) 2001, 2002 Vasil Yaroshevich 57 | 58 | Permission is hereby granted, free of charge, to any person 59 | obtaining a copy of this software and associated documentation 60 | files (the ``Software''), to deal in the Software without 61 | restriction, including without limitation the rights to use, 62 | copy, modify, merge, publish, distribute, sublicense, and/or 63 | sell copies of the Software, and to permit persons to whom the 64 | Software is furnished to do so, subject to the following 65 | conditions: 66 | 67 | The above copyright notice and this permission notice shall be 68 | included in all copies or substantial portions of the Software. 69 | 70 | Except as contained in this notice, the names of individuals 71 | credited with contribution to this software shall not be used in 72 | advertising or otherwise to promote the sale, use or other 73 | dealings in this Software without prior written authorization 74 | from the individuals in question. 75 | 76 | Any stylesheet derived from this Software that is publically 77 | distributed will be identified with a different name and the 78 | version strings in any derived Software will be changed so that 79 | no possibility of confusion between the derived package and this 80 | Software will exist. 81 | 82 | Warranty 83 | -------- 84 | 85 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 86 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 87 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 88 | NONINFRINGEMENT. IN NO EVENT SHALL NORMAN WALSH OR ANY OTHER 89 | CONTRIBUTOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 90 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 91 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 92 | OTHER DEALINGS IN THE SOFTWARE. 
93 | 94 | Contacting the Author 95 | --------------------- 96 | 97 | These stylesheets are maintained by Vasil Yaroshevich, . 98 | -------------------------------------------------------------------------------- /magic_doc/contrib/office/formula/mml/xsl/README: -------------------------------------------------------------------------------- 1 | README for the XSLT MathML Library 2 | 3 | XSLT MathML Library is a set of XSLT stylesheets to transform 4 | MathML 2.0 to LaTeX. 5 | 6 | For more information, see 7 | http://www.raleigh.ru/MathML/mmltex/index.php?lang=en 8 | 9 | Manifest 10 | -------- 11 | 12 | README this file 13 | mmltex.xsl 14 | tokens.xsl 15 | glayout.xsl 16 | scripts.xsl 17 | tables.xsl 18 | entities.xsl 19 | cmarkup.xsl 20 | 21 | Use 22 | --- 23 | 24 | There are two ways of using the library: 25 | 26 | * Use a local copy of the library. 27 | 28 | 1. Download the distribution (see below). 29 | 30 | 2. Unpack the distribution, using unzip. 31 | 32 | 3. In your stylesheet import or include either the main 33 | stylesheet, mmltex.xsl, or the stylesheet module you 34 | wish to use, such as tokens.xsl. This example assumes 35 | that the distribution has been extracted into the same 36 | directory as your own stylesheet: 37 | 38 | 39 | 40 | * Import or include either the main stylesheet, or the 41 | stylesheet module you wish to use, directly from the library 42 | website; http://www.raleigh.ru/MathML/mmltex/. For example: 43 | 44 | 45 | 46 | Obtaining The Library 47 | --------------------- 48 | 49 | The XSLT MathML Library is available for download as: 50 | 51 | * Zip file: http://www.raleigh.ru/MathML/mmltex/mmltex.zip 52 | 53 | Copyright 54 | --------- 55 | 56 | Copyright (C) 2001, 2002 Vasil Yaroshevich 57 | 58 | Permission is hereby granted, free of charge, to any person 59 | obtaining a copy of this software and associated documentation 60 | files (the ``Software''), to deal in the Software without 61 | restriction, including without limitation the rights to use, 62 | copy, modify, merge, publish, distribute, sublicense, and/or 63 | sell copies of the Software, and to permit persons to whom the 64 | Software is furnished to do so, subject to the following 65 | conditions: 66 | 67 | The above copyright notice and this permission notice shall be 68 | included in all copies or substantial portions of the Software. 69 | 70 | Except as contained in this notice, the names of individuals 71 | credited with contribution to this software shall not be used in 72 | advertising or otherwise to promote the sale, use or other 73 | dealings in this Software without prior written authorization 74 | from the individuals in question. 75 | 76 | Any stylesheet derived from this Software that is publically 77 | distributed will be identified with a different name and the 78 | version strings in any derived Software will be changed so that 79 | no possibility of confusion between the derived package and this 80 | Software will exist. 81 | 82 | Warranty 83 | -------- 84 | 85 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 86 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 87 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 88 | NONINFRINGEMENT. IN NO EVENT SHALL NORMAN WALSH OR ANY OTHER 89 | CONTRIBUTOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 90 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 91 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 92 | OTHER DEALINGS IN THE SOFTWARE. 
93 | 94 | Contacting the Author 95 | --------------------- 96 | 97 | These stylesheets are maintained by Vasil Yaroshevich, . 98 | -------------------------------------------------------------------------------- /magic_doc/bin/linux/share/antiword/Default: -------------------------------------------------------------------------------- 1 | # Default fontnames translation table 2 | # uses only fonts present in the RISC OS 3 ROMs 3 | # 4 | # MS-Word fontname, Italic, Bold, Acorn fontname, Special 5 | Arial, 0, 0, Homerton.Medium, 0 6 | Arial, 0, 1, Homerton.Bold, 0 7 | Arial, 1, 0, Homerton.Medium.Oblique,0 8 | Arial, 1, 1, Homerton.Bold.Oblique, 0 9 | Arial Black, 0, 0, Homerton.Medium, 0 10 | Arial Black, 0, 1, Homerton.Bold, 0 11 | Arial Black, 1, 0, Homerton.Medium.Oblique,0 12 | Arial Black, 1, 1, Homerton.Bold.Oblique, 0 13 | Arial CE, 0, 0, Homerton.Medium, 0 14 | Arial CE, 0, 1, Homerton.Bold, 0 15 | Arial CE, 1, 0, Homerton.Medium.Oblique,0 16 | Arial CE, 1, 1, Homerton.Bold.Oblique, 0 17 | Arial Narrow, 0, 0, Homerton.Medium, 0 18 | Arial Narrow, 0, 1, Homerton.Bold, 0 19 | Arial Narrow, 1, 0, Homerton.Medium.Oblique,0 20 | Arial Narrow, 1, 1, Homerton.Bold.Oblique, 0 21 | Comic Sans MS, 0, 0, Homerton.Medium, 0 22 | Comic Sans MS, 0, 1, Homerton.Bold, 0 23 | Comic Sans MS, 1, 0, Homerton.Medium.Oblique,0 24 | Comic Sans MS, 1, 1, Homerton.Bold.Oblique, 0 25 | Courier, 0, 0, Corpus.Medium, 0 26 | Courier, 0, 1, Corpus.Bold, 0 27 | Courier, 1, 0, Corpus.Medium.Oblique, 0 28 | Courier, 1, 1, Corpus.Bold.Oblique, 0 29 | Courier New, 0, 0, Corpus.Medium, 0 30 | Courier New, 0, 1, Corpus.Bold, 0 31 | Courier New, 1, 0, Corpus.Medium.Oblique, 0 32 | Courier New, 1, 1, Corpus.Bold.Oblique, 0 33 | Fixedsys, 0, 0, Corpus.Medium, 0 34 | Fixedsys, 0, 1, Corpus.Bold, 0 35 | Fixedsys, 1, 0, Corpus.Medium.Oblique, 0 36 | Fixedsys, 1, 1, Corpus.Bold.Oblique, 0 37 | Helvetica, 0, 0, Homerton.Medium, 0 38 | Helvetica, 0, 1, Homerton.Bold, 0 39 | Helvetica, 1, 0, Homerton.Medium.Oblique,0 40 | Helvetica, 1, 1, Homerton.Bold.Oblique, 0 41 | Helvetica-Narrow, 0, 0, Homerton.Medium, 0 42 | Helvetica-Narrow, 0, 1, Homerton.Bold, 0 43 | Helvetica-Narrow, 1, 0, Homerton.Medium.Oblique,0 44 | Helvetica-Narrow, 1, 1, Homerton.Bold.Oblique, 0 45 | Lucida Console, 0, 0, Corpus.Medium, 0 46 | Lucida Console, 0, 1, Corpus.Bold, 0 47 | Lucida Console, 1, 0, Corpus.Medium.Oblique, 0 48 | Lucida Console, 1, 1, Corpus.Bold.Oblique, 0 49 | Monotype.com, 0, 0, Corpus.Medium, 0 50 | Monotype.com, 0, 1, Corpus.Bold, 0 51 | Monotype.com, 1, 0, Corpus.Medium.Oblique, 0 52 | Monotype.com, 1, 1, Corpus.Bold.Oblique, 0 53 | MS Sans Serif, 0, 0, Homerton.Medium, 0 54 | MS Sans Serif, 0, 1, Homerton.Bold, 0 55 | MS Sans Serif, 1, 0, Homerton.Medium.Oblique,0 56 | MS Sans Serif, 1, 1, Homerton.Bold.Oblique, 0 57 | Swiss, 0, 0, Homerton.Medium, 0 58 | Swiss, 0, 1, Homerton.Bold, 0 59 | Swiss, 1, 0, Homerton.Medium.Oblique,0 60 | Swiss, 1, 1, Homerton.Bold.Oblique, 0 61 | Tahoma, 0, 0, Homerton.Medium, 0 62 | Tahoma, 0, 1, Homerton.Bold, 0 63 | Tahoma, 1, 0, Homerton.Medium.Oblique,0 64 | Tahoma, 1, 1, Homerton.Bold.Oblique, 0 65 | Trebuchet MS, 0, 0, Homerton.Medium, 0 66 | Trebuchet MS, 0, 1, Homerton.Bold, 0 67 | Trebuchet MS, 1, 0, Homerton.Medium.Oblique,0 68 | Trebuchet MS, 1, 1, Homerton.Bold.Oblique, 0 69 | Verdana, 0, 0, Homerton.Medium, 0 70 | Verdana, 0, 1, Homerton.Bold, 0 71 | Verdana, 1, 0, Homerton.Medium.Oblique,0 72 | Verdana, 1, 1, Homerton.Bold.Oblique, 0 73 | Univers, 0, 0, Homerton.Medium, 0 74 | Univers, 0, 
1, Homerton.Bold, 0
75 | Univers, 1, 0, Homerton.Medium.Oblique,0
76 | Univers, 1, 1, Homerton.Bold.Oblique, 0
77 | # All the other fonts
78 | *, 0, 0, Trinity.Medium, 0
79 | *, 0, 1, Trinity.Bold, 0
80 | *, 1, 0, Trinity.Medium.Italic, 0
81 | *, 1, 1, Trinity.Bold.Italic, 0
82 | 
--------------------------------------------------------------------------------
/tools/benchmark.py:
--------------------------------------------------------------------------------
1 | import zipfile
2 | import os
3 | import shutil
4 | import json
5 | import markdown_calculate
6 | code_path = os.environ.get('GITHUB_WORKSPACE')
7 | #code_path = "/home/quyuan/actions-runner/_work/Magic-PDF/Magic-PDF.bk"
8 | # directory holding the evaluation data set
9 | pdf_dev_path = "/home/quyuan/data"
10 | # output directory of the magicpdf test run
11 | pdf_res_path = "/tmp/magic-pdf"
12 | file_types = ["academic_literature", "atlas", "courseware", "colorful_textbook", "historical_documents", "notes", "ordinary_books", "ordinary_exam_paper", "ordinary_textbook", "research_report", "special_exam_paper"]
13 | #file_types = ["academic_literature"]
14 | 
15 | def test_cli():
16 |     magicpdf_path = os.path.join(pdf_dev_path, "output")
17 |     rm_cmd = "rm -rf %s" % (pdf_res_path)
18 |     os.system(rm_cmd)
19 |     os.makedirs(pdf_res_path)
20 |     cmd = 'cd %s && export PYTHONPATH=. && find %s -type f -name "*.pdf" | xargs -I{} python magic_pdf/cli/magicpdf.py pdf-command --pdf {}' % (code_path, magicpdf_path)
21 |     os.system(cmd)
22 |     for root, dirs, files in os.walk(pdf_res_path):
23 |         for magic_file in files:
24 |             for file_type in file_types:
25 |                 target_dir = os.path.join(pdf_dev_path, "ci", file_type, "magicpdf")
26 |                 if magic_file.endswith(".md") and magic_file.startswith(file_type):
27 |                     source_file = os.path.join(root, magic_file)
28 |                     target_file = os.path.join(pdf_dev_path, "ci", file_type, "magicpdf", magic_file)
29 |                     if not os.path.exists(target_dir):
30 |                         os.makedirs(target_dir)
31 |                     shutil.copy(source_file, target_file)
32 | 
33 | def calculate_score():
34 |     data_path = os.path.join(pdf_dev_path, "ci")
35 |     cmd = "cd %s && export PYTHONPATH=. && python tools/clean_photo.py --tool_name annotations --download_dir %s" % (code_path, data_path)
36 |     os.system(cmd)
37 |     cmd = "cd %s && export PYTHONPATH=. && python tools/clean_photo.py --tool_name magicpdf --download_dir %s" % (code_path, data_path)
38 |     os.system(cmd)
39 |     score = markdown_calculate.Scoring(os.path.join(data_path, "result.json"))
40 |     score.calculate_similarity_total("magicpdf", file_types, data_path)
41 |     res = score.summary_scores()
42 |     return res
43 | 
44 | 
45 | def extract_zip(zip_file_path, extract_to_path):
46 |     if zipfile.is_zipfile(zip_file_path):
47 |         with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
48 |             zip_ref.extractall(extract_to_path)
49 |             print(f'Files extracted to {extract_to_path}')
50 |     else:
51 |         print(f'{zip_file_path} is not a zip file')
52 | 
53 | 
54 | def ci_ben():
55 |     fr = open(os.path.join(pdf_dev_path, "ci", "result.json"), "r")
56 |     lines = fr.readlines()
57 |     last_line = lines[-1].strip()
58 |     last_score = json.loads(last_line)
59 |     print("last_score:", last_score)
60 |     last_simscore = last_score["average_sim_score"]
61 |     last_editdistance = last_score["average_edit_distance"]
62 |     last_bleu = last_score["average_bleu_score"]
63 |     extract_zip(os.path.join(pdf_dev_path, 'output.zip'), os.path.join(pdf_dev_path))
64 |     test_cli()
65 |     now_score = calculate_score()
66 |     print("now_score:", now_score)
67 |     now_simscore = now_score["average_sim_score"]
68 |     now_editdistance = now_score["average_edit_distance"]
69 |     now_bleu = now_score["average_bleu_score"]
70 |     assert last_simscore <= now_simscore
71 |     assert last_editdistance <= now_editdistance
72 |     assert last_bleu <= now_bleu
73 | 
74 | 
75 | if __name__ == "__main__":
76 |     ci_ben()
77 | 
--------------------------------------------------------------------------------
/magic_doc/contrib/magic_html/extractors/weixin_extractor.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | 
3 | from magic_doc.contrib.magic_html.utils import *
4 | from magic_doc.contrib.magic_html.extractors.base_extractor import BaseExtractor
5 | from magic_doc.contrib.magic_html.extractors.title_extractor import TitleExtractor
6 | 
7 | 
8 | class WeixinExtractor(BaseExtractor):
9 |     def __init__(self) -> None:
10 |         super().__init__()
11 | 
12 |     def extract(self, html="", base_url="") -> dict:
13 |         html = html.replace(" ", " ")
14 |         tree = load_html(html)
15 |         if tree is None:
16 |             raise ValueError
17 | 
18 |         # extract the title
19 |         title = TitleExtractor().process(tree)
20 | 
21 |         # base_url
22 |         base_href = tree.xpath("//base/@href")
23 | 
24 |         if base_href and "http" in base_href[0]:
25 |             base_url = base_href[0]
26 | 
27 |         # locate the article body
28 |         try:
29 |             body_tree = tree.xpath('.//*[@id="img-content"]')[0]
30 |         except:
31 |             raise ValueError
32 | 
33 |         # strip script, style and comment nodes
34 |         for script in body_tree.xpath(".//script"):
35 |             self.remove_node(script)
36 |         for style in body_tree.xpath(".//style"):
37 |             self.remove_node(style)
38 |         for comment in body_tree.xpath(".//comment()"):
39 |             self.remove_node(comment)
40 | 
41 |         # drop all official-account introduction blocks
42 |         for mp in body_tree.xpath('.//div[@id="meta_content"]'):
43 |             self.remove_node(mp)
44 |         for mp in body_tree.xpath('.//div[@id="js_tags"]'):
45 |             self.remove_node(mp)
46 |         for mp in body_tree.xpath('.//div[@class="original_area_primary"]'):
47 |             self.remove_node(mp)
48 |         # hidden banned-account notice
49 |         for mp in body_tree.xpath('.//section[@class="wx_profile_card_inner"]'):
50 |             self.remove_node(mp)
51 |         # special WeChat profile-card blocks
52 |         for mp in body_tree.xpath(
53 |             ".//section[contains(@class, 'wx_profile_msg_inner')]"
54 |         ):
55 |             self.remove_node(mp)
56 | 
57 |         # strip cluttered, visually hidden content
58 |         all_raga = body_tree.xpath(
59 |             ".//*[contains(@style, 'color: rgba(255, 255, 255, 0)')] | .//*[contains(@style, 'color: rgba(255 255 255 0)')]"
60 |         )
61 | 
62 |         for mp in all_raga:
63 |             flag_have_color_rgb, detail_style = self.ensure_have_color_rgb(
64 |                 mp.attrib["style"]
65 |             )
66 | 
67 |             if not flag_have_color_rgb:
68 |                 continue
69 |             self.remove_node(mp)
70 | 
71 |         for img in body_tree.xpath(".//img"):
72 | 
73 |             if "data-src" not in img.attrib:
74 |                 continue
75 | 
76 |             try:
77 |                 img.set("src", img.attrib["data-src"])
78 |             except Exception as e:
79 |                 continue
80 | 
81 |         for h1 in body_tree.xpath(".//h1"):
82 |             if not h1.text:
83 |                 continue
84 |             h1.text = h1.text.replace("\n", "").strip()
85 | 
86 |         body_html = tostring(body_tree, encoding=str)
87 | 
88 |         return {
89 |             "xp_num": "weixin",
90 |             "drop_list": False,
91 |             "html": body_html,
92 |             "title": title,
93 |             "base_url": base_url
94 |         }
95 | 
96 |     @staticmethod
97 |     def ensure_have_color_rgb(htmlstr):
98 |         pattern = r"(?
--------------------------------------------------------------------------------
/magic_doc/contrib/office/doc.py:
--------------------------------------------------------------------------------
1 | import json
2 | import shutil
3 | 
4 | from pathlib import Path
5 | from subprocess import PIPE, Popen
6 | 
7 | from loguru import logger
8 | from werkzeug.datastructures import FileStorage
9 | 
10 | from magic_doc.contrib.model import Content, ExtractResponse, Page
11 | from magic_doc.contrib.office import OfficeExtractor
12 | 
13 | 
14 | class DocExtractor(OfficeExtractor):
15 |     def __init__(self) -> None:
16 |         super().__init__()
17 | 
18 |     def setup(self):
19 |         pass
20 | 
21 |     def extract(
22 |         self,
23 |         r: FileStorage | Path,
24 |         id: str,
25 |         dir: Path,
26 |         media_dir: Path,
27 |         skip_image: bool,
28 |         cwd_path="/opt/antiword"
29 |     ) -> ExtractResponse:
30 |         doc_path = dir.joinpath("its.doc")
31 | 
32 |         if type(r) is FileStorage:
33 |             r.save(doc_path)
34 |         else:
35 |             shutil.copyfile(r, doc_path)
36 | 
37 |         if skip_image:
38 |             cmd = f"./antiword -f -i 1 -o {dir.as_posix()} {doc_path.as_posix()}"
39 |         else:
40 |             cmd = f"./antiword -f -i 3 -o {dir.as_posix()} {doc_path.as_posix()}"
41 |         logger.info(f"cmd: {cmd}")
42 |         process = Popen(cmd, shell=True, cwd=Path(cwd_path), stdout=PIPE, stderr=PIPE)
43 |         stdout, stderr = process.communicate()
44 |         process.wait()
45 | 
46 |         shutil.rmtree(media_dir.absolute().as_posix())
47 |         shutil.move(
48 |             dir.joinpath("pic").absolute().as_posix(), media_dir.absolute().as_posix()
49 |         )
50 |         code = process.returncode
51 |         if code != 0:
52 |             err = stderr.decode()
53 |             raise Exception(f"parse doc failed: {err}")
54 | 
55 |         pure_text_path = dir.joinpath("text")
56 | 
57 |         with open(pure_text_path, "r") as f:
58 |             content = f.read()
59 | 
60 |         # build the image-name to upload-path map consumed below
61 |         img_map: dict[Path, str] = {}
62 |         for img in media_dir.glob("*"):
63 |             img_map[img] = self.generate_img_path(id, img.name)
64 | 
65 |         # self.upload_background(id, img_map)
66 | 
67 |         pages = [
68 |             Page(page_no=idx, content=x)
69 |             for idx, x in enumerate(content.split("[pedia-page]"))
70 |         ]
71 | 
72 |         for page in pages:
73 |             content: str = page.pop("content")
74 |             content_list = [
75 |                 Content(data=x.strip(), type="text")
76 |                 for x in content.split("\n")
77 |                 if x.strip()
78 |             ]
79 | 
80 |             for content in content_list:
81 |                 if not content["data"].startswith("[pedia-"):
82 |                     continue
83 |                 if content["data"] == "[pedia-badpic]":
84 |                     content["data"] = ""
85 |                     content["type"] = "image"
86 |                 elif content["data"].startswith("[pedia-pic"):
87 |                     content["type"] = "image"
88 |                     img_name = content["data"][len("[pedia-") : -1]
89 |                     img_path = media_dir.joinpath(img_name)
90 |                     content["data"] = img_map[img_path]
91 |                 else:
92 |                     content["data"] = content["data"] + "\n"
93 | 
94 |             page["content_list"] = content_list
95 | 
96 |         return pages
97 | 
98 | 
99 | if __name__ == "__main__":
100 |     e = DocExtractor()
101 |     print(
102 |         json.dumps(
103 |             e.run("abc", Path("/home/SENSETIME/wuziming/diclm/doc2docx/doc/md4.doc")),
104 |             ensure_ascii=False,
105 |             indent=4,
106 |         ),
107 |     )
108 |     e.wait_all()
109 | 
--------------------------------------------------------------------------------
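A minimal driving sketch for the extractor above, mirroring its __main__ block; it assumes antiword is unpacked at the default cwd_path and uses a placeholder input path:

    from pathlib import Path

    from magic_doc.contrib.office.doc import DocExtractor

    extractor = DocExtractor()
    # run() wraps extract(), as in the __main__ block above; "example.doc" is a placeholder
    pages = extractor.run("demo-id", Path("example.doc"))
    # flatten the per-page content lists into plain text, skipping image entries
    for page in pages:
        for content in page["content_list"]:
            if content["type"] == "text":
                print(content["data"], end="")
    extractor.wait_all()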
/.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: Python package 5 | 6 | on: 7 | push: 8 | tags: 9 | - '*released' 10 | workflow_dispatch: 11 | 12 | 13 | jobs: 14 | 15 | update-version: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - name: Checkout repository 19 | uses: actions/checkout@v4 20 | with: 21 | ref: main 22 | fetch-depth: 0 23 | 24 | - name: Set up Python 25 | uses: actions/setup-python@v5 26 | with: 27 | python-version: "3.10" 28 | 29 | - name: Update version.py 30 | run: | 31 | python update_version.py 32 | 33 | - name: Verify version.py 34 | run: | 35 | ls -l magic_doc/libs/version.py 36 | cat magic_doc/libs/version.py 37 | 38 | - name: Commit changes 39 | run: | 40 | git config --local user.email "moe@myhloli.com" 41 | git config --local user.name "myhloli" 42 | git add magic_doc/libs/version.py 43 | if git diff-index --quiet HEAD; then 44 | echo "No changes to commit" 45 | else 46 | git commit -m "Update version.py with new version" 47 | fi 48 | id: commit_changes 49 | 50 | - name: Push changes 51 | if: steps.commit_changes.outcome == 'success' 52 | env: 53 | GITHUB_TOKEN: ${{ secrets.RELEASE_TOKEN }} 54 | run: | 55 | git push origin HEAD:main 56 | 57 | build: 58 | needs: [ update-version ] 59 | runs-on: ubuntu-latest 60 | strategy: 61 | fail-fast: false 62 | matrix: 63 | python-version: ["3.10"] 64 | 65 | steps: 66 | - name: Checkout code 67 | uses: actions/checkout@v4 68 | with: 69 | ref: main 70 | fetch-depth: 0 71 | 72 | - name: Verify version.py 73 | run: | 74 | ls -l magic_doc/libs/version.py 75 | cat magic_doc/libs/version.py 76 | 77 | - name: Set up Python ${{ matrix.python-version }} 78 | uses: actions/setup-python@v5 79 | with: 80 | python-version: ${{ matrix.python-version }} 81 | 82 | - name: Install dependencies 83 | run: | 84 | python -m pip install --upgrade pip 85 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 86 | 87 | - name: Install wheel 88 | run: | 89 | python -m pip install wheel 90 | 91 | - name: Build wheel 92 | run: | 93 | python setup.py bdist_wheel 94 | 95 | - name: Upload artifact 96 | uses: actions/upload-artifact@v4 97 | with: 98 | name: wheel-file 99 | path: dist/*.whl 100 | retention-days: 30 101 | 102 | release: 103 | needs: [ build ] 104 | runs-on: ubuntu-latest 105 | steps: 106 | - name: Checkout code 107 | uses: actions/checkout@v4 108 | 109 | - name: Download artifact 110 | uses: actions/download-artifact@v4 111 | with: 112 | name: wheel-file 113 | path: dist 114 | 115 | - name: Create and Upload Release 116 | id: create_release 117 | uses: softprops/action-gh-release@4634c16e79c963813287e889244c50009e7f0981 118 | with: 119 | files: './dist/*.whl' 120 | env: 121 | GITHUB_TOKEN: ${{ secrets.RELEASE_TOKEN }} 122 | 123 | - name: Publish distribution to PyPI 124 | run: | 125 | pip install twine 126 | twine upload dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }} 127 | -------------------------------------------------------------------------------- /magic_doc/restful_api/common/oss/oss.py: -------------------------------------------------------------------------------- 1 | import oss2 2 | from magic_doc.restful_api.common.ext import singleton_func 3 | from loguru import logger 4 | 5 | 6 | @singleton_func 7 | 
class Oss(object):
8 |     def __init__(self, access_key_id, access_secret_key, bucket_name, endpoint, expires=60):
9 |         self.access_key_id = access_key_id
10 |         self.access_secret_key = access_secret_key
11 |         self.bucket_name = bucket_name
12 |         self.endpoint = endpoint
13 |         self.expires = expires
14 |         self.auth = oss2.Auth(self.access_key_id, self.access_secret_key)
15 | 
16 |     def create_bucket(self, bucket_name=None):
17 |         """
18 |         Create a bucket.
19 |         :param bucket_name: bucket name
20 |         :return:
21 |         """
22 |         bucket = oss2.Bucket(self.auth, self.endpoint, bucket_name if bucket_name else self.bucket_name)
23 |         # create the bucket with a private read/write ACL
24 |         # bucket.create_bucket(oss2.models.BUCKET_ACL_PRIVATE)
25 |         bucket.create_bucket()
26 |         return True
27 | 
28 |     def delete_bucket(self, bucket_name=None):
29 |         """
30 |         Delete a bucket.
31 |         :param bucket_name: bucket name
32 |         :return:
33 |         """
34 |         bucket = oss2.Bucket(self.auth, self.endpoint, bucket_name if bucket_name else self.bucket_name)
35 |         try:
36 |             bucket.delete_bucket()
37 |             return True
38 |         except oss2.exceptions.BucketNotEmpty:
39 |             logger.error('bucket is not empty.')
40 |             return False
41 |         except oss2.exceptions.NoSuchBucket:
42 |             logger.error('bucket does not exist')
43 |             return False
44 | 
45 |     def pub_object(self, bucket_name=None, object_name=None, object_data=None):
46 |         """
47 |         Upload an object from memory; accepts
48 |         str,
49 |         bytes,
50 |         unicode,
51 |         or a file-like stream.
52 |         :param bucket_name: bucket name
53 |         :param object_name: full object key, excluding the bucket name
54 |         :param object_data:
55 |         :return:
56 |         """
57 |         bucket = oss2.Bucket(self.auth, self.endpoint, bucket_name if bucket_name else self.bucket_name)
58 |         result = bucket.put_object(object_name, object_data)
59 |         file_link = bucket.sign_url('GET', object_name, self.expires, slash_safe=True)
60 |         return {
61 |             "status": result.status,
62 |             "request_id": result.request_id,
63 |             "etag": result.etag,
64 |             "date": result.headers['date'],
65 |             "file_link": file_link
66 |         }
67 | 
68 |     def put_file(self, bucket_name=None, object_name=None, file_path=None):
69 |         """
70 |         Upload an object
71 |         from a local file.
72 |         :param bucket_name: bucket name
73 |         :param object_name: full object key, excluding the bucket name
74 |         :param file_path: local file path
75 |         :return:
76 |         """
77 |         bucket = oss2.Bucket(self.auth, self.endpoint, bucket_name if bucket_name else self.bucket_name)
78 |         result = bucket.put_object_from_file(object_name, file_path)
79 |         file_link = bucket.sign_url('GET', object_name, self.expires, slash_safe=True)
80 |         return {
81 |             "status": result.status,
82 |             "request_id": result.request_id,
83 |             "etag": result.etag,
84 |             "date": result.headers['date'],
85 |             "file_link": file_link
86 |         }
87 | 
88 |     def delete_objects(self, bucket_name=None, object_name=None):
89 |         """
90 |         Batch-delete objects.
91 |         :param bucket_name: bucket name
92 |         :param object_name: list of full object keys, excluding the bucket name
93 |         :return:
94 |         """
95 |         if object_name is None:
96 |             object_name = []
97 |         bucket = oss2.Bucket(self.auth, self.endpoint, bucket_name if bucket_name else self.bucket_name)
98 |         result = bucket.batch_delete_objects(object_name)
99 |         return result.deleted_keys
100 | 
101 |     def download_file(self, bucket_name=None, object_name=None, save_path=None):
102 |         """
103 |         Download an object to a local file.
104 |         :param bucket_name: bucket name
105 |         :param object_name: full object key, excluding the bucket name
106 |         :param save_path: local save path
107 |         :return:
108 |         """
109 |         bucket = oss2.Bucket(self.auth, self.endpoint, bucket_name if bucket_name else self.bucket_name)
110 |         bucket.get_object_to_file(object_name, save_path)
111 | 
--------------------------------------------------------------------------------
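A hedged usage sketch for the wrapper above; every credential, bucket, and endpoint value below is a placeholder, not real configuration:

    from magic_doc.restful_api.common.oss.oss import Oss

    # @singleton_func means the first instantiation fixes the configuration
    # for the whole process; later calls return the same client.
    oss_client = Oss(
        "<ACCESS_KEY_ID>",
        "<ACCESS_KEY_SECRET>",
        "<BUCKET_NAME>",
        "<ENDPOINT>",
        expires=600,  # lifetime of the signed GET URLs, in seconds
    )
    result = oss_client.put_file(object_name="pdf/demo/0.md", file_path="/tmp/0.md")
    print(result["file_link"])  # pre-signed download link for the uploaded object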
/magic_doc/restful_api/api/analysis/magic_pdf_view.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | import time 4 | import requests 5 | from flask import request, current_app 6 | from flask_restful import Resource 7 | from marshmallow import ValidationError 8 | from pathlib import Path 9 | from magic_doc.pdf_transform import DocConverter, S3Config 10 | from .serialization import MagicPdfSchema 11 | from magic_pdf.dict2md.ocr_mkcontent import ocr_mk_mm_markdown_with_para_and_pagination 12 | from magic_doc.restful_api.common.oss.oss import Oss 13 | from .ext import upload_image_to_oss, upload_md_to_oss 14 | from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED 15 | from magic_doc.restful_api.common.custom_response import generate_response 16 | from loguru import logger 17 | 18 | executor = ThreadPoolExecutor() 19 | 20 | 21 | class MagicPdfView(Resource): 22 | @logger.catch 23 | def post(self): 24 | """ 25 | PDF解析,将markdown结果上传至服务器 26 | """ 27 | t0 = time.time() 28 | magic_pdf_schema = MagicPdfSchema() 29 | try: 30 | params = magic_pdf_schema.load(request.get_json()) 31 | except ValidationError as err: 32 | return generate_response(code=400, msg=err.messages) 33 | pdf_path = params.get('pageUrl') 34 | # ############ pdf解析 ############### 35 | file_name = str(Path(pdf_path).stem) 36 | pf_path = f"/tmp/{file_name}.txt" 37 | pdf_dir = f"{current_app.static_folder}/pdf/{file_name}" 38 | NULL_IMG_DIR = f"{current_app.static_folder}/pdf/{file_name}" 39 | app_config = current_app.config 40 | if not Path(NULL_IMG_DIR).exists(): 41 | Path(NULL_IMG_DIR).mkdir(parents=True, exist_ok=True) 42 | if pdf_path.startswith("http://") or pdf_path.startswith("https://"): 43 | download_pdf = requests.get(pdf_path, stream=True) 44 | pdf_path = f"{pdf_dir}/{file_name}.pdf" 45 | with open(pdf_path, "wb") as wf: 46 | wf.write(download_pdf.content) 47 | doc_conv = DocConverter(None) 48 | elif pdf_path.startswith("s3://"): 49 | s3_config = S3Config(app_config["S3AK"], app_config["S3SK"], app_config["S3ENDPOINT"]) 50 | doc_conv = DocConverter(s3_config) 51 | else: 52 | doc_conv = DocConverter(None) 53 | t1 = time.time() 54 | logger.info(f"param init cost_time:{t1 - t0}") 55 | result = doc_conv.convert_to_mid_result(pdf_path, pf_path, 60) 56 | t2 = time.time() 57 | logger.info(f"pdf doc_conv cost_time:{t2 - t1}") 58 | md_content = json.dumps(ocr_mk_mm_markdown_with_para_and_pagination(result[0], NULL_IMG_DIR), ensure_ascii=False) 59 | t3 = time.time() 60 | logger.info(f"make markdown cost_time:{t3 - t2}") 61 | # local_md_path = f"{pdf_dir}/{file_name}.md" 62 | # with open(local_md_path, "w", encoding="utf-8") as f: 63 | # f.write(md_content) 64 | # t4 = time.time() 65 | # logger.info(f"save markdown cost_time:{t4 - t3}") 66 | _t0 = time.time() 67 | oss_client = Oss( 68 | app_config["AccessKeyID"], 69 | app_config["AccessKeySecret"], 70 | app_config["BucketName"], 71 | app_config["Endpoint"], 72 | app_config["UrlExpires"] 73 | ) 74 | img_list = Path(f"{NULL_IMG_DIR}/images").glob('*') if Path(f"{NULL_IMG_DIR}/images").exists() else [] 75 | all_task = [executor.submit(upload_image_to_oss, oss_client, file_name, img_path, NULL_IMG_DIR, app_config["BucketName"]) for img_path in img_list] 76 | wait(all_task, return_when=ALL_COMPLETED) 77 | for task in all_task: 78 | task_result = task.result() 79 | regex = re.compile(fr'.*\((.*?{Path(task_result[0]).name})') 80 | regex_result = regex.search(md_content) 81 | if regex_result: 82 | md_content 
= md_content.replace(regex_result.group(1), task_result[1])
83 |         _t1 = time.time()
84 |         logger.info(f"upload img cost_time:{_t1 - _t0}")
85 | 
86 |         all_md_task = [executor.submit(upload_md_to_oss, oss_client, app_config["BucketName"], f"pdf/{file_name}/{md.get('page_no', n)}.md", md["md_content"]) for n, md in enumerate(json.loads(md_content))]
87 |         wait(all_md_task, return_when=ALL_COMPLETED)
88 |         md_link_list = []
89 |         for task in all_md_task:
90 |             task_result = task.result()
91 |             md_link_list.append(task_result)
92 |         _t2 = time.time()
93 |         logger.info(f"upload md cost_time:{_t2 - _t1}")
94 | 
95 |         return generate_response(markDownUrl=md_link_list)
96 | 
--------------------------------------------------------------------------------
/magic_doc/contrib/magic_html/mmltex/tables.xsl:
--------------------------------------------------------------------------------
[XSLT markup lost during repository flattening; only LaTeX output fragments survive. The templates in this stylesheet render MathML tables as LaTeX: header cells become \multicolumn{...}{c}{...}, cells are joined with & and aligned with \hfill, rows end with \\, and tables are wrapped in \begin{array}{|...|} ... \hline ... \end{array}.]
--------------------------------------------------------------------------------
/magic_doc/contrib/office/formula/mml/xsl/tables.xsl:
--------------------------------------------------------------------------------
[Identical copy of the mmltex/tables.xsl stylesheet above; its markup was lost the same way.]
--------------------------------------------------------------------------------
/magic_doc/conv/pdf_magicpdf.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
4 | from magic_pdf.pipe.UNIPipe import UNIPipe
5 | from magic_pdf.pipe.OCRPipe import OCRPipe
6 | from magic_pdf.pipe.TXTPipe import TXTPipe
7 | from magic_doc.conv.base import BaseConv
8 | 
9 | from magic_doc.progress.filepupdator import FileBaseProgressUpdator
10 | from magic_doc.progress.pupdator import ConvProgressUpdator
11 | from magic_doc.utils import get_repo_directory
12 | from magic_doc.utils.null_writer import NullWriter
13 | from magic_pdf.dict2md.ocr_mkcontent import union_make
14 | from magic_pdf.libs.json_compressor import JsonCompressor
15 | from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
16 | from magic_doc.common.default_config import DEFAULT_CONFIG, PdfHqParseMethod
17 | 
18 | 19 | NULL_IMG_DIR = "/tmp" 20 | 21 | class SingletonModelWrapper: 22 | 23 | def __new__(cls): 24 | if not hasattr(cls, "instance"): 25 | from magic_doc.model.doc_analysis import DocAnalysis 26 | apply_ocr = os.getenv("APPLY_OCR", "TRUE") == "TRUE" 27 | apply_layout = os.getenv("APPLY_LAYOUT", "TRUE") == "TRUE" 28 | apply_formula = os.getenv("APPLY_FORMULA", "FALSE") == "TRUE" 29 | 30 | cls.instance = super(SingletonModelWrapper, cls).__new__(cls) 31 | cls.instance.doc_analysis = DocAnalysis( # type: ignore 32 | configs=os.path.join( 33 | get_repo_directory(), "resources/model/model_configs.yaml" 34 | ), 35 | apply_ocr=apply_ocr, apply_layout=apply_layout, apply_formula=apply_formula, 36 | ) 37 | return cls.instance 38 | 39 | def __call__(self, bits: bytes): 40 | from magic_doc.model.doc_analysis import load_images_from_pdf 41 | images = load_images_from_pdf(bits, dpi=200) 42 | return self.doc_analysis(images) # type: ignore 43 | 44 | class Pdf(BaseConv): 45 | 46 | def __construct_pdf_pipe(self, bits, model_list, image_writer): 47 | if DEFAULT_CONFIG["pdf"]["hq"]["parsemethod"] == PdfHqParseMethod.AUTO: 48 | pipe = UNIPipe(bits, model_list, image_writer, is_debug=True) # type: ignore 49 | elif DEFAULT_CONFIG["pdf"]["hq"]["parsemethod"] == PdfHqParseMethod.OCR: 50 | pipe = OCRPipe(bits, model_list, image_writer, is_debug=True) # type: ignore 51 | elif DEFAULT_CONFIG["pdf"]["hq"]["parsemethod"] == PdfHqParseMethod.TXT: 52 | pipe = TXTPipe(bits, model_list, image_writer, is_debug=True) # type: ignore 53 | else: 54 | raise Exception("unknown parse method under hq mode") 55 | return pipe 56 | 57 | 58 | def to_md(self, bits: bytes | str, pupdator: ConvProgressUpdator) -> str: 59 | model_proc = SingletonModelWrapper() 60 | pupdator.update(0) 61 | 62 | model_list = model_proc(bits) # type: ignore 63 | pupdator.update(50) 64 | # jso_useful_key = { 65 | # "_pdf_type": "", 66 | # "model_list": model_list, 67 | # } 68 | image_writer = NullWriter() 69 | pipe = self.__construct_pdf_pipe(bits, model_list, image_writer) 70 | # pipe.pipe_classify() # 默认ocrpipe的时候不需要再做分类,可以节省时间 71 | pipe.pipe_parse() 72 | pupdator.update(100) 73 | 74 | pdf_mid_data = JsonCompressor.decompress_json(pipe.get_compress_pdf_mid_data()) 75 | pdf_info_list = pdf_mid_data["pdf_info"] 76 | md_content = union_make(pdf_info_list, MakeMode.NLP_MD, DropMode.NONE, NULL_IMG_DIR) 77 | return md_content # type: ignore 78 | 79 | def to_mid_result(self, image_writer: AbsReaderWriter, bits: bytes | str, pupdator: ConvProgressUpdator) -> list[dict] | dict: 80 | model_proc = SingletonModelWrapper() 81 | pupdator.update(0) 82 | 83 | model_list = model_proc(bits) # type: ignore 84 | pupdator.update(50) 85 | # jso_useful_key = { 86 | # "_pdf_type": "", 87 | # "model_list": model_list, 88 | # } 89 | pipe = self.__construct_pdf_pipe(bits, model_list, image_writer) 90 | # pipe.pipe_classify() 91 | pipe.pipe_parse() 92 | pupdator.update(100) 93 | 94 | pdf_mid_data = JsonCompressor.decompress_json(pipe.get_compress_pdf_mid_data()) 95 | pdf_info_list = pdf_mid_data["pdf_info"] 96 | return pdf_info_list 97 | 98 | if __name__ == "__main__": 99 | with open("/opt/data/pdf/20240423/pdf_test2/ol006018w.pdf", "rb") as f: 100 | bits_data = f.read() 101 | parser = Pdf() 102 | md_content = parser.to_md( 103 | bits_data, FileBaseProgressUpdator("debug/progress.txt") 104 | ) 105 | with open("debug/pdf2md.by_model.md", "w") as f: 106 | f.write(md_content) # type: ignore 107 | 108 | -------------------------------------------------------------------------------- 
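A minimal driving sketch for the converter above, mirroring its __main__ block; the file paths are placeholders, and the APPLY_* switches are read only once, when the singleton model wrapper is first built:

    import os

    from magic_doc.conv.pdf_magicpdf import Pdf
    from magic_doc.progress.filepupdator import FileBaseProgressUpdator

    # must be set before the first conversion; later changes have no effect
    os.environ.setdefault("APPLY_FORMULA", "FALSE")

    with open("example.pdf", "rb") as f:  # placeholder input path
        pdf_bytes = f.read()

    md_content = Pdf().to_md(pdf_bytes, FileBaseProgressUpdator("/tmp/progress.txt"))
    print(md_content[:500])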
/tools/clean_photo.py:
--------------------------------------------------------------------------------
1 | import pypandoc
2 | import re
3 | import htmltabletomd
4 | import os
5 | import argparse
6 | import zipfile
7 | 
8 | parser = argparse.ArgumentParser(description="get tool type")
9 | parser.add_argument(
10 |     "--tool_name",
11 |     type=str,
12 |     required=True,
13 |     help="input tool name",
14 | )
15 | parser.add_argument(
16 |     "--download_dir",
17 |     type=str,
18 |     required=True,
19 |     help="input download dir",
20 | )
21 | args = parser.parse_args()
22 | 
23 | def clean_markdown_images(content):
24 |     pattern = re.compile(r'!\[[^\]]*\]\([^)]*\)', re.IGNORECASE)
25 |     cleaned_content = pattern.sub('', content)
26 |     return cleaned_content
27 | 
28 | def clean_ocrmath_photo(content):
29 |     pattern = re.compile(r'\\includegraphics\[.*?\]\{.*?\}', re.IGNORECASE)
30 |     cleaned_content = pattern.sub('', content)
31 |     return cleaned_content
32 | 
33 | def convert_html_table_to_md(html_table):
34 |     lines = html_table.strip().split('\n')
35 |     md_table = ''
36 |     if lines and '<thead>' in lines[0]:
37 |         in_thead = True
38 |         for line in lines:
39 |             if '<th>' in line:
40 |                 cells = re.findall(r'<th>(.*?)</th>', line)
41 |                 md_table += '| ' + ' | '.join(cells) + ' |\n'
42 |                 in_thead = False
43 |             elif '<td>' in line and not in_thead:
44 |                 cells = re.findall(r'<td>(.*?)</td>', line)
45 |                 md_table += '| ' + ' | '.join(cells) + ' |\n'
46 |         md_table = md_table.rstrip() + '\n'
47 |     return md_table
48 | 
49 | def convert_latex_to_md(content):
50 |     tables = re.findall(r'\\begin\{tabular\}(.*?)\\end\{tabular\}', content, re.DOTALL)
51 |     placeholders = []
52 |     for table in tables:
53 |         placeholder = f"<latex_table_{len(placeholders)}>"  # unique marker for this table
54 |         replace_str = f"\\begin{{tabular}}{table}\\end{{tabular}}"
55 |         content = content.replace(replace_str, placeholder)
56 |         try:
57 |             pypandoc.convert_text(replace_str, format="latex", to="md", outputfile="output.md", encoding="utf-8")
58 |         except:
59 |             markdown_string = replace_str
60 |         else:
61 |             markdown_string = open('output.md', 'r', encoding='utf-8').read()
62 |         placeholders.append((placeholder, markdown_string))
63 |     new_content = content
64 |     for placeholder, md_table in placeholders:
65 |         new_content = new_content.replace(placeholder, md_table)
66 |     # return the content with tables converted to markdown
67 |     return new_content
68 | 
69 | 
70 | def convert_htmltable_to_md(content):
71 |     tables = re.findall(r'<table>(.*?)</table>', content, re.DOTALL)
72 |     placeholders = []
73 |     for table in tables:
74 |         placeholder = f"<html_table_{len(placeholders)}>"  # unique marker for this table
75 |         content = content.replace(f"<table>{table}</table>", placeholder)
76 |         try:
77 |             convert_table = htmltabletomd.convert_table(table)
78 |         except:
79 |             convert_table = table
80 |         placeholders.append((placeholder, convert_table))
81 |     new_content = content
82 |     for placeholder, md_table in placeholders:
83 |         new_content = new_content.replace(placeholder, md_table)
84 |     # return the content with tables converted to markdown
85 |     return new_content
86 | 
87 | def clean_data(prod_type, download_dir):
88 |     file_type = ["academic_literature", "atlas", "courseware", "colorful_textbook", "historical_documents", "notes", "ordinary_books", "ordinary_exam_paper", "ordinary_textbook", "research_report", "special_exam_paper"]
89 |     for filetype in file_type:
90 |         tgt_dir = os.path.join(download_dir, filetype, prod_type, "cleaned")
91 |         if not os.path.exists(tgt_dir):
92 |             os.makedirs(tgt_dir)
93 |         source_dir = os.path.join(download_dir, filetype, prod_type)
94 |         filenames = os.listdir(source_dir)
95 |         for filename in filenames:
96 |             if filename.endswith('.md'):
97 |                 input_file = os.path.join(source_dir, filename)
98 |                 output_file = os.path.join(tgt_dir, "cleaned_" + filename)
99 |                 with open(input_file, 'r', encoding='utf-8') as fr:
100 |                     content = fr.read()
101 |                     new_content = convert_htmltable_to_md(content)
102 |                     new_content = clean_markdown_images(new_content)
103 |                     new_content = clean_ocrmath_photo(new_content)
104 |                     new_content = convert_latex_to_md(new_content)
105 |                     with open(output_file, 'w', encoding='utf-8') as fw:
106 |                         fw.write(new_content)
107 | 
108 | 
109 | if __name__ == '__main__':
110 |     tool_type = args.tool_name
111 |     download_dir = args.download_dir
112 |     clean_data(tool_type, download_dir)
113 | 
--------------------------------------------------------------------------------
/magic_doc/contrib/magic_html/extractors/forum_extractor.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import re
3 | 
4 | from magic_doc.contrib.magic_html.config import Forum_XPATH, Unique_ID
5 | from magic_doc.contrib.magic_html.utils import *
6 | from magic_doc.contrib.magic_html.extractors.base_extractor import BaseExtractor
7 | from magic_doc.contrib.magic_html.extractors.title_extractor import TitleExtractor
8 | 
9 | 
10 | class ForumExtractor(BaseExtractor):
11 |     def __init__(self) -> None:
12 |         super().__init__()
13 | 
14 |     def extract(self, html="", base_url="") -> dict:
15 |         self.need_comment = True
16 |         html = html.replace(" ", " ").replace(" ", " ")
17 |         tree = load_html(html)
18 |         if tree is None:
19 |             raise ValueError
20 | 
21 |         # extract the title
22 |         title = TitleExtractor().process(tree)
23 | 
24 |         # base_url
25 |         base_href = tree.xpath("//base/@href")
26 | 
27 |         if base_href and "http" in base_href[0]:
28 |             base_url = base_href[0]
29 |         self.generate_unique_id(tree)
30 | 
31 |         format_tree = self.convert_tags(tree, base_url=base_url)
32 | 
33 |         normal_tree = self.clean_tags(format_tree)
34 | 
35 |         subtree, xp_num, drop_list = self.xp_1_5(normal_tree)
36 |         if xp_num == "others":
37 |             subtree, drop_list = self.prune_unwanted_sections(normal_tree)
38 |         body_html = self.get_content_html(subtree, xp_num, base_url)
39 | 
40 |         # forum-specific handling
41 |         body_html_tree = fromstring(body_html)
42 |         try:
43 |             body_tree = body_html_tree.body
44 |         except:
45 |             body_tree = Element("body")
46 |             body_tree.extend(body_html_tree)
47 |         main_ids = body_tree.xpath(f".//@{Unique_ID}")
48 | 
49 |         for main_id in main_ids:
50 |             main_tree = normal_tree.xpath(
51 |                 f".//*[@{Unique_ID}={main_id}]"
52 |             )
53 |             if main_tree:
54 |                 self.remove_node(main_tree[0])
55 |         if not main_ids:
56 |             
main_ids = [-1] 57 | 58 | if xp_num != "others": 59 | normal_tree, _ = self.prune_unwanted_sections(normal_tree) 60 | for c_xpath in Forum_XPATH: 61 | while normal_tree.xpath(c_xpath): 62 | x = normal_tree.xpath(c_xpath)[0] 63 | self.remove_node(x) 64 | if "'post-'" in c_xpath: 65 | if not (re.findall('post-\d+', x.attrib.get("id", "").lower()) or re.findall('post_\d+', 66 | x.attrib.get("id", 67 | "").lower())): 68 | continue 69 | if ( 70 | "header" in x.attrib.get("class", "").lower() 71 | or "header" in x.attrib.get("id", "").lower() 72 | ): 73 | continue 74 | try: 75 | if int(x.attrib.get(Unique_ID, "0")) > int( 76 | main_ids[-1] 77 | ): 78 | body_tree.append(x) 79 | else: 80 | prefix_div = Element("div") 81 | suffix_div = Element("div") 82 | need_prefix = False 83 | need_suffix = False 84 | while x.xpath( 85 | f".//*[number(@{Unique_ID}) > {int(main_ids[-1])}]" 86 | ): 87 | tmp_x = x.xpath( 88 | f".//*[number(@{Unique_ID}) > {int(main_ids[-1])}]" 89 | )[0] 90 | self.remove_node(tmp_x) 91 | suffix_div.append(tmp_x) 92 | need_suffix = True 93 | while x.xpath( 94 | f".//*[number(@{Unique_ID}) < {int(main_ids[-1])}]" 95 | ): 96 | tmp_x = x.xpath( 97 | f".//*[number(@{Unique_ID}) < {int(main_ids[-1])}]" 98 | )[0] 99 | self.remove_node(tmp_x) 100 | prefix_div.append(tmp_x) 101 | need_prefix = True 102 | if need_prefix: 103 | body_tree.insert(0, prefix_div) 104 | if need_suffix: 105 | body_tree.append(suffix_div) 106 | 107 | except: 108 | pass 109 | 110 | body_html = re.sub( 111 | f' {Unique_ID}="\d+"', 112 | "", 113 | tostring(body_tree, encoding=str), 114 | ) 115 | 116 | return { 117 | "xp_num": xp_num, 118 | "drop_list": drop_list, 119 | "html": body_html, 120 | "title": title, 121 | "base_url": base_url 122 | } 123 | -------------------------------------------------------------------------------- /magic_doc/bin/linux/share/antiword/fontnames: -------------------------------------------------------------------------------- 1 | # Default fontnames translation table 2 | # uses only Standard PostScript (TM) fonts 3 | # 4 | # MS-Word fontname, Italic, Bold, PostScript fontname, Special 5 | Arial, 0, 0, Helvetica, 0 6 | Arial, 0, 1, Helvetica-Bold, 0 7 | Arial, 1, 0, Helvetica-Oblique, 0 8 | Arial, 1, 1, Helvetica-BoldOblique, 0 9 | Arial Black, 0, 0, Helvetica, 0 10 | Arial Black, 0, 1, Helvetica-Bold, 0 11 | Arial Black, 1, 0, Helvetica-Oblique, 0 12 | Arial Black, 1, 1, Helvetica-BoldOblique, 0 13 | Arial CE, 0, 0, Helvetica, 0 14 | Arial CE, 0, 1, Helvetica-Bold, 0 15 | Arial CE, 1, 0, Helvetica-Oblique, 0 16 | Arial CE, 1, 1, Helvetica-BoldOblique, 0 17 | Arial Narrow, 0, 0, Helvetica-Narrow, 0 18 | Arial Narrow, 0, 1, Helvetica-Narrow-Bold, 0 19 | Arial Narrow, 1, 0, Helvetica-Narrow-Oblique, 0 20 | Arial Narrow, 1, 1, Helvetica-Narrow-BoldOblique, 0 21 | AvantGarde, 0, 0, AvantGarde-Book, 0 22 | AvantGarde, 0, 1, AvantGarde-Demi, 0 23 | AvantGarde, 1, 0, AvantGarde-BookOblique, 0 24 | AvantGarde, 1, 1, AvantGarde-DemiOblique, 0 25 | Bookman Old Style, 0, 0, Bookman-Light, 0 26 | Bookman Old Style, 0, 1, Bookman-Demi, 0 27 | Bookman Old Style, 1, 0, Bookman-LightItalic, 0 28 | Bookman Old Style, 1, 1, Bookman-DemiItalic, 0 29 | Century Schoolbook, 0, 0, NewCenturySchlbk-Roman, 0 30 | Century Schoolbook, 0, 1, NewCenturySchlbk-Bold, 0 31 | Century Schoolbook, 1, 0, NewCenturySchlbk-Italic, 0 32 | Century Schoolbook, 1, 1, NewCenturySchlbk-BoldItalic, 0 33 | CG Omega, 0, 0, Helvetica, 0 34 | CG Omega, 0, 1, Helvetica-Bold, 0 35 | CG Omega, 1, 0, Helvetica-Oblique, 0 36 | CG Omega, 1, 1, 
Helvetica-BoldOblique, 0 37 | Comic Sans MS, 0, 0, Helvetica, 0 38 | Comic Sans MS, 0, 1, Helvetica-Bold, 0 39 | Comic Sans MS, 1, 0, Helvetica-Oblique, 0 40 | Comic Sans MS, 1, 1, Helvetica-BoldOblique, 0 41 | Courier, 0, 0, Courier, 0 42 | Courier, 0, 1, Courier-Bold, 0 43 | Courier, 1, 0, Courier-Oblique, 0 44 | Courier, 1, 1, Courier-BoldOblique, 0 45 | Courier New, 0, 0, Courier, 0 46 | Courier New, 0, 1, Courier-Bold, 0 47 | Courier New, 1, 0, Courier-Oblique, 0 48 | Courier New, 1, 1, Courier-BoldOblique, 0 49 | Fixedsys, 0, 0, Courier, 0 50 | Fixedsys, 0, 1, Courier-Bold, 0 51 | Fixedsys, 1, 0, Courier-Oblique, 0 52 | Fixedsys, 1, 1, Courier-BoldOblique, 0 53 | Helvetica, 0, 0, Helvetica, 0 54 | Helvetica, 0, 1, Helvetica-Bold, 0 55 | Helvetica, 1, 0, Helvetica-Oblique, 0 56 | Helvetica, 1, 1, Helvetica-BoldOblique, 0 57 | Helvetica-Narrow, 0, 0, Helvetica-Narrow, 0 58 | Helvetica-Narrow, 0, 1, Helvetica-Narrow-Bold, 0 59 | Helvetica-Narrow, 1, 0, Helvetica-Narrow-Oblique, 0 60 | Helvetica-Narrow, 1, 1, Helvetica-Narrow-BoldOblique, 0 61 | ITC Bookman, 0, 0, Bookman-Light, 0 62 | ITC Bookman, 0, 1, Bookman-Demi, 0 63 | ITC Bookman, 1, 0, Bookman-LightItalic, 0 64 | ITC Bookman, 1, 1, Bookman-DemiItalic, 0 65 | Lucida Console, 0, 0, Courier, 0 66 | Lucida Console, 0, 1, Courier-Bold, 0 67 | Lucida Console, 1, 0, Courier-Oblique, 0 68 | Lucida Console, 1, 1, Courier-BoldOblique, 0 69 | Lucida Sans Typewriter, 0, 0, Courier, 0 70 | Lucida Sans Typewriter, 0, 1, Courier-Bold, 0 71 | Lucida Sans Typewriter, 1, 0, Courier-Oblique, 0 72 | Lucida Sans Typewriter, 1, 1, Courier-BoldOblique, 0 73 | Monotype.com, 0, 0, Courier, 0 74 | Monotype.com, 0, 1, Courier-Bold, 0 75 | Monotype.com, 1, 0, Courier-Oblique, 0 76 | Monotype.com, 1, 1, Courier-BoldOblique, 0 77 | MS Sans Serif, 0, 0, Helvetica, 0 78 | MS Sans Serif, 0, 1, Helvetica-Bold, 0 79 | MS Sans Serif, 1, 0, Helvetica-Oblique, 0 80 | MS Sans Serif, 1, 1, Helvetica-BoldOblique, 0 81 | New Century Schlbk, 0, 0, NewCenturySchlbk-Roman, 0 82 | New Century Schlbk, 0, 1, NewCenturySchlbk-Bold, 0 83 | New Century Schlbk, 1, 0, NewCenturySchlbk-Italic, 0 84 | New Century Schlbk, 1, 1, NewCenturySchlbk-BoldItalic, 0 85 | NewCenturySchlbk, 0, 0, NewCenturySchlbk-Roman, 0 86 | NewCenturySchlbk, 0, 1, NewCenturySchlbk-Bold, 0 87 | NewCenturySchlbk, 1, 0, NewCenturySchlbk-Italic, 0 88 | NewCenturySchlbk, 1, 1, NewCenturySchlbk-BoldItalic, 0 89 | Palatino, 0, 0, Palatino-Roman, 0 90 | Palatino, 0, 1, Palatino-Bold, 0 91 | Palatino, 1, 0, Palatino-Italic, 0 92 | Palatino, 1, 1, Palatino-BoldItalic, 0 93 | Swiss, 0, 0, Helvetica, 0 94 | Swiss, 0, 1, Helvetica-Bold, 0 95 | Swiss, 1, 0, Helvetica-Oblique, 0 96 | Swiss, 1, 1, Helvetica-BoldOblique, 0 97 | Tahoma, 0, 0, Helvetica, 0 98 | Tahoma, 0, 1, Helvetica-Bold, 0 99 | Tahoma, 1, 0, Helvetica-Oblique, 0 100 | Tahoma, 1, 1, Helvetica-BoldOblique, 0 101 | Trebuchet MS, 0, 0, Helvetica, 0 102 | Trebuchet MS, 0, 1, Helvetica-Bold, 0 103 | Trebuchet MS, 1, 0, Helvetica-Oblique, 0 104 | Trebuchet MS, 1, 1, Helvetica-BoldOblique, 0 105 | Univers, 0, 0, Helvetica, 0 106 | Univers, 0, 1, Helvetica-Bold, 0 107 | Univers, 1, 0, Helvetica-Oblique, 0 108 | Univers, 1, 1, Helvetica-BoldOblique, 0 109 | Verdana, 0, 0, Helvetica, 0 110 | Verdana, 0, 1, Helvetica-Bold, 0 111 | Verdana, 1, 0, Helvetica-Oblique, 0 112 | Verdana, 1, 1, Helvetica-BoldOblique, 0 113 | # All the other fonts 114 | *, 0, 0, Times-Roman, 0 115 | *, 0, 1, Times-Bold, 0 116 | *, 1, 0, Times-Italic, 0 117 | *, 1, 1, Times-BoldItalic, 0 118 | 
-------------------------------------------------------------------------------- /magic_doc/model/sub_modules/layoutlmv3/model_init.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .visualizer import Visualizer 3 | from .rcnn_vl import * 4 | from .backbone import * 5 | 6 | from detectron2.config import get_cfg 7 | from detectron2.config import CfgNode as CN 8 | from detectron2.data import MetadataCatalog, DatasetCatalog 9 | from detectron2.data.datasets import register_coco_instances 10 | from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, launch, DefaultPredictor 11 | from magic_doc.utils import get_repo_directory 12 | 13 | def add_vit_config(cfg): 14 | """ 15 | Add config for VIT. 16 | """ 17 | _C = cfg 18 | 19 | _C.MODEL.VIT = CN() 20 | 21 | # CoaT model name. 22 | _C.MODEL.VIT.NAME = "" 23 | 24 | # Output features from CoaT backbone. 25 | _C.MODEL.VIT.OUT_FEATURES = ["layer3", "layer5", "layer7", "layer11"] 26 | 27 | _C.MODEL.VIT.IMG_SIZE = [224, 224] 28 | 29 | _C.MODEL.VIT.POS_TYPE = "shared_rel" 30 | 31 | _C.MODEL.VIT.DROP_PATH = 0. 32 | 33 | _C.MODEL.VIT.MODEL_KWARGS = "{}" 34 | 35 | _C.SOLVER.OPTIMIZER = "ADAMW" 36 | 37 | _C.SOLVER.BACKBONE_MULTIPLIER = 1.0 38 | 39 | _C.AUG = CN() 40 | 41 | _C.AUG.DETR = False 42 | 43 | _C.MODEL.IMAGE_ONLY = True 44 | _C.PUBLAYNET_DATA_DIR_TRAIN = "" 45 | _C.PUBLAYNET_DATA_DIR_TEST = "" 46 | _C.FOOTNOTE_DATA_DIR_TRAIN = "" 47 | _C.FOOTNOTE_DATA_DIR_VAL = "" 48 | _C.SCIHUB_DATA_DIR_TRAIN = "" 49 | _C.SCIHUB_DATA_DIR_TEST = "" 50 | _C.JIAOCAI_DATA_DIR_TRAIN = "" 51 | _C.JIAOCAI_DATA_DIR_TEST = "" 52 | _C.ICDAR_DATA_DIR_TRAIN = "" 53 | _C.ICDAR_DATA_DIR_TEST = "" 54 | _C.M6DOC_DATA_DIR_TEST = "" 55 | _C.DOCSTRUCTBENCH_DATA_DIR_TEST = "" 56 | _C.DOCSTRUCTBENCHv2_DATA_DIR_TEST = "" 57 | _C.CACHE_DIR = "" 58 | _C.MODEL.CONFIG_PATH = "" 59 | 60 | # effective update steps would be MAX_ITER/GRADIENT_ACCUMULATION_STEPS 61 | # maybe need to set MAX_ITER *= GRADIENT_ACCUMULATION_STEPS 62 | _C.SOLVER.GRADIENT_ACCUMULATION_STEPS = 1 63 | 64 | 65 | def setup(args): 66 | """ 67 | Create configs and perform basic setups. 68 | """ 69 | cfg = get_cfg() 70 | # add_coat_config(cfg) 71 | add_vit_config(cfg) 72 | cfg.merge_from_file(args.config_file) 73 | cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.2 # set threshold for this model 74 | cfg.merge_from_list(args.opts) 75 | cfg.freeze() 76 | default_setup(cfg, args) 77 | 78 | """ 79 | #TODO: 可以去掉? 
80 | register_coco_instances( 81 | "scihub_train", 82 | {}, 83 | cfg.SCIHUB_DATA_DIR_TRAIN + ".json", 84 | cfg.SCIHUB_DATA_DIR_TRAIN 85 | ) 86 | """ 87 | 88 | return cfg 89 | 90 | 91 | class DotDict(dict): 92 | def __init__(self, *args, **kwargs): 93 | super(DotDict, self).__init__(*args, **kwargs) 94 | 95 | def __getattr__(self, key): 96 | if key not in self.keys(): 97 | return None 98 | value = self[key] 99 | if isinstance(value, dict): 100 | value = DotDict(value) 101 | return value 102 | 103 | def __setattr__(self, key, value): 104 | self[key] = value 105 | 106 | class Layoutlmv3_Predictor(object): 107 | def __init__(self, weights): 108 | layout_args = { 109 | "config_file": os.path.join(get_repo_directory(), "resources/model/layoutlmv3/layoutlmv3_base_inference.yaml"), # TODO 修改配置路径 110 | "resume": False, 111 | "eval_only": False, 112 | "num_gpus": 1, 113 | "num_machines": 1, 114 | "machine_rank": 0, 115 | "dist_url": "tcp://127.0.0.1:57823", 116 | "opts": ["MODEL.WEIGHTS", weights], 117 | } 118 | layout_args = DotDict(layout_args) 119 | 120 | cfg = setup(layout_args) 121 | self.mapping = ["title", "plain text", "abandon", "figure", "figure_caption", "table", "table_caption", "table_footnote", "isolate_formula", "formula_caption"] 122 | MetadataCatalog.get(cfg.DATASETS.TRAIN[0]).thing_classes = self.mapping 123 | self.predictor = DefaultPredictor(cfg) 124 | 125 | def __call__(self, image, ignore_catids=[]): 126 | page_layout_result = { 127 | "layout_dets": [] 128 | } 129 | outputs = self.predictor(image) 130 | boxes = outputs["instances"].to("cpu")._fields["pred_boxes"].tensor.tolist() 131 | labels = outputs["instances"].to("cpu")._fields["pred_classes"].tolist() 132 | scores = outputs["instances"].to("cpu")._fields["scores"].tolist() 133 | for bbox_idx in range(len(boxes)): 134 | if labels[bbox_idx] in ignore_catids: 135 | continue 136 | page_layout_result["layout_dets"].append({ 137 | "category_id": labels[bbox_idx], 138 | "poly": [ 139 | boxes[bbox_idx][0], boxes[bbox_idx][1], 140 | boxes[bbox_idx][2], boxes[bbox_idx][1], 141 | boxes[bbox_idx][2], boxes[bbox_idx][3], 142 | boxes[bbox_idx][0], boxes[bbox_idx][3], 143 | ], 144 | "score": scores[bbox_idx] 145 | }) 146 | return page_layout_result -------------------------------------------------------------------------------- /magic_doc/contrib/office/pptx_extract.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | 4 | from pathlib import Path 5 | from typing import List 6 | 7 | from loguru import logger 8 | from pptx import Presentation 9 | from pptx.enum.shapes import MSO_SHAPE_TYPE 10 | from pptx.parts.image import Image 11 | from pptx.presentation import Presentation as ppt 12 | from pptx.shapes.autoshape import Shape 13 | from pptx.shapes.picture import Picture 14 | from pptx.shapes.graphfrm import GraphicFrame 15 | from pptx.table import Table, _Row, _Cell 16 | from pptx.slide import Slide 17 | from pptx.shapes.group import GroupShape 18 | from werkzeug.datastructures import FileStorage 19 | 20 | from magic_doc.contrib.office import OfficeExtractor 21 | from magic_doc.contrib.model import ExtractResponse, Page, Content 22 | 23 | 24 | class PptxExtractor(OfficeExtractor): 25 | def __init__(self) -> None: 26 | super().__init__() 27 | 28 | def setup(self): 29 | pass 30 | 31 | def handle_shape( 32 | self, 33 | shape: Shape, 34 | content_list: List[Content], 35 | media_dir: Path, 36 | img_map: dict[Path, str], 37 | id: str, 38 | skip_image: bool, 39 | ): 40 | if 
shape.has_text_frame: 41 | for paragraph in shape.text_frame.paragraphs: 42 | content_list.append( 43 | Content( 44 | type="text", 45 | data=paragraph.text + "\n", 46 | ) 47 | ) 48 | elif shape.shape_type == MSO_SHAPE_TYPE.PICTURE and not skip_image: 49 | shape: Picture 50 | image: Image = shape.image 51 | image_bytes = image.blob 52 | img_path = media_dir.joinpath(f"pic-{len(img_map)}.{image.ext}") 53 | img_s3_path = self.generate_img_path(id, img_path.name) 54 | img_map[img_path] = img_s3_path 55 | content_list.append(Content(type="image", data=img_s3_path)) 56 | with open(img_path, "wb") as file: 57 | file.write(image_bytes) 58 | elif shape.shape_type == MSO_SHAPE_TYPE.TABLE: 59 | shape: GraphicFrame 60 | table: Table = shape.table 61 | md = "\n" 62 | for row_no, row in enumerate(table.rows): 63 | row: _Row 64 | md += "|" 65 | if row_no == 1: 66 | for col in row.cells: 67 | md += "---|" 68 | md += "\n|" 69 | for col in row.cells: 70 | cell: _Cell = col 71 | md += " " + cell.text.replace("\r", " ").replace("\n", " ") + " |" 72 | md += "\n" 73 | md += "\n" 74 | content_list.append(Content(type="md", data=md)) 75 | elif shape.shape_type == MSO_SHAPE_TYPE.GROUP: 76 | shape: GroupShape 77 | for sub_shape in shape.shapes: 78 | self.handle_shape(sub_shape, content_list, media_dir, img_map, id, skip_image) 79 | else: 80 | # print(shape.shape_type, type(shape), file=sys.stderr) 81 | pass 82 | 83 | def extract( 84 | self, 85 | r: FileStorage | Path, 86 | id: str, 87 | dir: Path, 88 | media_dir: Path, 89 | skip_image: bool, 90 | ) -> ExtractResponse: 91 | pages = [] 92 | img_map = {} 93 | 94 | presentation: ppt = Presentation(r) 95 | for page_no, slide in enumerate(presentation.slides): 96 | slide: Slide 97 | page = Page(page_no=page_no, content_list=[]) 98 | for shape in slide.shapes: 99 | self.handle_shape( 100 | shape, 101 | page["content_list"], 102 | media_dir, 103 | img_map, 104 | id, 105 | skip_image, 106 | ) 107 | 108 | pages.append(page) 109 | 110 | # self.upload_background(id, img_map) 111 | 112 | return pages 113 | 114 | 115 | if __name__ == "__main__": 116 | e = PptxExtractor() 117 | # from pedia_document_parser.s3.client import get_s3_client 118 | 119 | # cli = get_s3_client() 120 | 121 | # data = cli.read_object( 122 | # "s3://pedia-document-parser/office-doucments/【英文-模板】Professional Pack Standard.pptx" 123 | # ) 124 | # with open("1.pptx", "wb") as f: 125 | # f.write(data.read()) 126 | 127 | x = e.run( 128 | "ghi", 129 | Path("test_data/doc/商业项目市场分析与产品定位报告.pptx"), 130 | ) 131 | content = "" 132 | for p in x: 133 | content += f"\n====== page {p['page_no']} ======\n" 134 | for pp in p["content_list"]: 135 | content += pp["data"] 136 | 137 | print(content) 138 | 139 | # cli.read_object("s3://pedia-document-parser/office-doucments/【英文-课件】MIT15_082JF10_av.pptx") 140 | 141 | # print( 142 | # json.dumps( 143 | # e.run( 144 | # "ghi", 145 | # Path( 146 | # "/home/SENSETIME/wuziming/doc/doc/【英文-模板】Professional Pack Standard.pptx", 147 | # ), 148 | # ), 149 | # ensure_ascii=False, 150 | # indent=4, 151 | # ) 152 | # ) 153 | e.wait_all() 154 | -------------------------------------------------------------------------------- /tools/markdown_calculate.py: -------------------------------------------------------------------------------- 1 | import os 2 | from Levenshtein import distance 3 | from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction, corpus_bleu 4 | from nltk.tokenize import word_tokenize 5 | import json 6 | import re 7 | import scoring 8 | import argparse 9 | 10 | # 
初始化列表来存储编辑距离和BLEU分数 11 | class Scoring: 12 | def __init__(self, result_path): 13 | self.edit_distances = [] 14 | self.bleu_scores = [] 15 | self.sim_scores = [] 16 | self.filenames = [] 17 | self.score_dict = {} 18 | self.anntion_cnt = 0 19 | self.fw = open(result_path, "w+") 20 | def simple_bleu_score(self, candidate, reference): 21 | candidate_tokens = word_tokenize(candidate) 22 | reference_tokens = word_tokenize(reference) 23 | return sentence_bleu([reference_tokens], candidate_tokens, smoothing_function=SmoothingFunction().method1) 24 | 25 | 26 | def preprocess_string(self, s): 27 | sub_enter = re.sub(r'\n+', '\n', s) 28 | return re.sub(r' ', ' ', sub_enter) 29 | 30 | def calculate_similarity(self, annotion, actual, tool_type): 31 | class_dict = {} 32 | edit_distances = [] 33 | bleu_scores = [] 34 | sim_scores = list() 35 | total_file = 0 36 | for filename in os.listdir(annotion): 37 | if filename.endswith('.md') and not filename.startswith('.'): # 忽略隐藏文件 38 | total_file = total_file + 1 39 | # 读取A目录中的文件 40 | with open(os.path.join(annotion, filename), 'r', encoding='utf-8') as file_a: 41 | content_a = file_a.read() 42 | self.anntion_cnt = self.anntion_cnt + 1 43 | filepath_b = os.path.join(actual, filename) 44 | if os.path.exists(filepath_b): 45 | with open(filepath_b, 'r', encoding='utf-8') as file_b: 46 | content_b = file_b.read() 47 | self.filenames.append(filename) 48 | # 计算编辑距离 49 | edit_dist = distance(self.preprocess_string(content_b),self.preprocess_string(content_a)) / max(len(content_a), len(content_b)) 50 | self.edit_distances.append(edit_dist) 51 | edit_distances.append(edit_dist) 52 | #计算BLUE分数 53 | bleu_score = self.simple_bleu_score(content_b, content_a) 54 | bleu_scores.append(bleu_score) 55 | self.bleu_scores.append(bleu_score) 56 | #计算marker分数 57 | score = scoring.score_text(content_b, content_a) 58 | sim_scores.append(score) 59 | self.sim_scores.append(score) 60 | class_dict[filename] = {"edit_dist": edit_dist, "bleu_score": bleu_score, "sim_score": score} 61 | self.score_dict[filename] = {"edit_dist": edit_dist, "bleu_score": bleu_score, "sim_score": score} 62 | else: 63 | print(f"File {filename} not found in actual directory.") 64 | # 计算每类平均值 65 | class_average_edit_distance = sum(edit_distances) / len(edit_distances) if edit_distances else 0 66 | class_average_bleu_score = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0 67 | class_average_sim_score = sum(sim_scores) / len(sim_scores) if sim_scores else 0 68 | self.fw.write(json.dumps(class_dict, ensure_ascii=False) + "\n") 69 | ratio = len(class_dict)/total_file 70 | self.fw.write(f"{tool_type} extract ratio: {ratio}" + "\n") 71 | self.fw.write(f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}" + "\n") 72 | self.fw.write(f"{tool_type} Average BLEU Score: {class_average_bleu_score}" + "\n") 73 | self.fw.write(f"{tool_type} Average Sim Score: {class_average_sim_score}" + "\n") 74 | 75 | print (f"{tool_type} extract ratio: {ratio}") 76 | print (f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}") 77 | print (f"{tool_type} Average BLEU Score: {class_average_bleu_score}") 78 | print (f"{tool_type} Average Sim Score: {class_average_sim_score}") 79 | return self.score_dict 80 | 81 | def summary_scores(self): 82 | # 计算整体平均值 83 | over_all_dict = dict() 84 | average_edit_distance = sum(self.edit_distances) / len(self.edit_distances) if self.edit_distances else 0 85 | average_bleu_score = sum(self.bleu_scores) / len(self.bleu_scores) if self.bleu_scores else 0 86 | 
average_sim_score = sum(self.sim_scores) / len(self.sim_scores) if self.sim_scores else 0 87 | over_all_dict["average_edit_distance"] = average_edit_distance 88 | over_all_dict["average_bleu_score"] = average_bleu_score 89 | over_all_dict["average_sim_score"] = average_sim_score 90 | self.fw.write(json.dumps(over_all_dict, ensure_ascii=False) + "\n") 91 | return over_all_dict 92 | 93 | def calculate_similarity_total(self, tool_type, file_types, download_dir): 94 | for file_type in file_types: 95 | annotion = os.path.join(download_dir, file_type, "annotations", "cleaned") 96 | actual = os.path.join(download_dir, file_type, tool_type, "cleaned") 97 | self.calculate_similarity(annotion, actual, file_type) 98 | 99 | -------------------------------------------------------------------------------- /magic_doc/model/sub_modules/layoutlmv3/layoutlmft/data/funsd.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | ''' 3 | Reference: https://huggingface.co/datasets/nielsr/funsd/blob/main/funsd.py 4 | ''' 5 | import json 6 | import os 7 | 8 | import datasets 9 | 10 | from .image_utils import load_image, normalize_bbox 11 | 12 | 13 | logger = datasets.logging.get_logger(__name__) 14 | 15 | 16 | _CITATION = """\ 17 | @article{Jaume2019FUNSDAD, 18 | title={FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents}, 19 | author={Guillaume Jaume and H. K. Ekenel and J. Thiran}, 20 | journal={2019 International Conference on Document Analysis and Recognition Workshops (ICDARW)}, 21 | year={2019}, 22 | volume={2}, 23 | pages={1-6} 24 | } 25 | """ 26 | 27 | _DESCRIPTION = """\ 28 | https://guillaumejaume.github.io/FUNSD/ 29 | """ 30 | 31 | 32 | class FunsdConfig(datasets.BuilderConfig): 33 | """BuilderConfig for FUNSD""" 34 | 35 | def __init__(self, **kwargs): 36 | """BuilderConfig for FUNSD. 37 | 38 | Args: 39 | **kwargs: keyword arguments forwarded to super. 
40 | """ 41 | super(FunsdConfig, self).__init__(**kwargs) 42 | 43 | 44 | class Funsd(datasets.GeneratorBasedBuilder): 45 | """Conll2003 dataset.""" 46 | 47 | BUILDER_CONFIGS = [ 48 | FunsdConfig(name="funsd", version=datasets.Version("1.0.0"), description="FUNSD dataset"), 49 | ] 50 | 51 | def _info(self): 52 | return datasets.DatasetInfo( 53 | description=_DESCRIPTION, 54 | features=datasets.Features( 55 | { 56 | "id": datasets.Value("string"), 57 | "tokens": datasets.Sequence(datasets.Value("string")), 58 | "bboxes": datasets.Sequence(datasets.Sequence(datasets.Value("int64"))), 59 | "ner_tags": datasets.Sequence( 60 | datasets.features.ClassLabel( 61 | names=["O", "B-HEADER", "I-HEADER", "B-QUESTION", "I-QUESTION", "B-ANSWER", "I-ANSWER"] 62 | ) 63 | ), 64 | "image": datasets.Array3D(shape=(3, 224, 224), dtype="uint8"), 65 | "image_path": datasets.Value("string"), 66 | } 67 | ), 68 | supervised_keys=None, 69 | homepage="https://guillaumejaume.github.io/FUNSD/", 70 | citation=_CITATION, 71 | ) 72 | 73 | def _split_generators(self, dl_manager): 74 | """Returns SplitGenerators.""" 75 | downloaded_file = dl_manager.download_and_extract("https://guillaumejaume.github.io/FUNSD/dataset.zip") 76 | return [ 77 | datasets.SplitGenerator( 78 | name=datasets.Split.TRAIN, gen_kwargs={"filepath": f"{downloaded_file}/dataset/training_data/"} 79 | ), 80 | datasets.SplitGenerator( 81 | name=datasets.Split.TEST, gen_kwargs={"filepath": f"{downloaded_file}/dataset/testing_data/"} 82 | ), 83 | ] 84 | 85 | def get_line_bbox(self, bboxs): 86 | x = [bboxs[i][j] for i in range(len(bboxs)) for j in range(0, len(bboxs[i]), 2)] 87 | y = [bboxs[i][j] for i in range(len(bboxs)) for j in range(1, len(bboxs[i]), 2)] 88 | 89 | x0, y0, x1, y1 = min(x), min(y), max(x), max(y) 90 | 91 | assert x1 >= x0 and y1 >= y0 92 | bbox = [[x0, y0, x1, y1] for _ in range(len(bboxs))] 93 | return bbox 94 | 95 | def _generate_examples(self, filepath): 96 | logger.info("⏳ Generating examples from = %s", filepath) 97 | ann_dir = os.path.join(filepath, "annotations") 98 | img_dir = os.path.join(filepath, "images") 99 | for guid, file in enumerate(sorted(os.listdir(ann_dir))): 100 | tokens = [] 101 | bboxes = [] 102 | ner_tags = [] 103 | 104 | file_path = os.path.join(ann_dir, file) 105 | with open(file_path, "r", encoding="utf8") as f: 106 | data = json.load(f) 107 | image_path = os.path.join(img_dir, file) 108 | image_path = image_path.replace("json", "png") 109 | image, size = load_image(image_path) 110 | for item in data["form"]: 111 | cur_line_bboxes = [] 112 | words, label = item["words"], item["label"] 113 | words = [w for w in words if w["text"].strip() != ""] 114 | if len(words) == 0: 115 | continue 116 | if label == "other": 117 | for w in words: 118 | tokens.append(w["text"]) 119 | ner_tags.append("O") 120 | cur_line_bboxes.append(normalize_bbox(w["box"], size)) 121 | else: 122 | tokens.append(words[0]["text"]) 123 | ner_tags.append("B-" + label.upper()) 124 | cur_line_bboxes.append(normalize_bbox(words[0]["box"], size)) 125 | for w in words[1:]: 126 | tokens.append(w["text"]) 127 | ner_tags.append("I-" + label.upper()) 128 | cur_line_bboxes.append(normalize_bbox(w["box"], size)) 129 | # by default: --segment_level_layout 1 130 | # if do not want to use segment_level_layout, comment the following line 131 | cur_line_bboxes = self.get_line_bbox(cur_line_bboxes) 132 | # box = normalize_bbox(item["box"], size) 133 | # cur_line_bboxes = [box for _ in range(len(words))] 134 | bboxes.extend(cur_line_bboxes) 135 | yield guid, {"id": 
str(guid), "tokens": tokens, "bboxes": bboxes, "ner_tags": ner_tags,
136 | "image": image, "image_path": image_path}
--------------------------------------------------------------------------------
/magic_doc/contrib/office/docx_extract.py:
--------------------------------------------------------------------------------
1 | import xml.etree.ElementTree as ET
2 | import zipfile
3 | 
4 | from pathlib import Path
5 | from magic_doc.contrib.model import ExtractResponse, Content, Page
6 | from magic_doc.contrib.office import OfficeExtractor
7 | from typing import IO
8 | from io import BytesIO
9 | from werkzeug.datastructures import FileStorage
10 | 
11 | from magic_doc.contrib.office.formula.omml import omml2tex
12 | 
13 | 
14 | class DocxExtractor(OfficeExtractor):
15 | def __init__(self) -> None:
16 | super().__init__()
17 | 
18 | def setup(self):
19 | pass
20 | 
21 | def __word2markdown(
22 | self,
23 | id: str,
24 | docx_file_stream: IO[bytes],
25 | save_fig_dir,
26 | ):
27 | tag_w = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
28 | tag_body = f"{tag_w}body"
29 | 
30 | content_list = []
31 | text_count = 0
32 | with zipfile.ZipFile(docx_file_stream, "r") as docx:
33 | xml_content = docx.read("word/document.xml")
34 | tree = ET.XML(xml_content)
35 | body = tree.find(tag_body)
36 | 
37 | for child in body:
38 | tag = child.tag.split("}")[-1]
39 | if text_count >= self.max_text_count:
40 | break
41 | 
42 | match tag:
43 | case "p":
44 | text = ""
45 | for ele in child.iter():
46 | if "math" in ele.tag:
47 | if ele.tag.endswith("oMath"):
48 | math_xml = BytesIO()
49 | ET.ElementTree(ele).write(
50 | math_xml,
51 | encoding="utf-8",
52 | xml_declaration=True,
53 | )
54 | math_xml = math_xml.getvalue().decode("utf-8")
55 | math_xml = "\n".join(math_xml.split("\n")[1:])  # drop the XML declaration line emitted by ElementTree
56 | math_formula = "\n" + omml2tex(math_xml) + "\n"
57 | 
58 | text = text.strip()
59 | if len(text) > 0:
60 | text_count += len(text) + 1
61 | content_list.append(
62 | Content(type="text", data=text)
63 | )
64 | text = ""
65 | 
66 | text_count += len(math_formula)
67 | content_list.append(
68 | Content(type="md", data=math_formula)
69 | )
70 | continue
71 | if t := ele.text:
72 | if len(t) > 0:
73 | text += t
74 | text = text.strip()
75 | if len(text) > 0:
76 | text_count += len(text) + 1
77 | content_list.append(Content(type="text", data=text))
78 | text = ""
79 | case "tbl":
80 | col_size = len(list(child.find(f"{tag_w}tblGrid")))
81 | md = "\n"
82 | for idx, row in enumerate(child.iter(f"{tag_w}tr")):
83 | if idx == 1:
84 | md += "|"
85 | for _ in range(col_size):
86 | md += "---|"
87 | md += "\n"
88 | md += "|"
89 | # print(row)
90 | for cell in row.iter(f"{tag_w}tc"):
91 | t = ""
92 | for cell_ele in cell.itertext():
93 | t += (
94 | cell_ele.strip()
95 | .replace("\r", "")
96 | .replace("\n", "")
97 | )
98 | md += f" {t} |"
99 | md += "\n"
100 | md += "\n"
101 | text_count += len(md)
102 | content_list.append(Content(type="md", data=md))
103 | case "sectPr":
104 | # docx section properties (page layout); no body content to extract
105 | pass
106 | case unknown:
107 | pass
108 | # print(unknown)
109 | return content_list
110 | 
111 | def extract(
112 | self,
113 | r: FileStorage | Path,
114 | id: str,
115 | dir: Path,
116 | media_dir: Path,
117 | skip_image: bool,
118 | ) -> ExtractResponse:
119 | if type(r) is FileStorage:
120 | page = Page(
121 | page_no=0,
122 | content_list=self.__word2markdown(id, r.stream, media_dir),
123 | )
124 | else:
125 | page = Page(
126 | page_no=0,
127 | content_list=self.__word2markdown(id, open(r, "rb"),
media_dir),
128 | )
129 | # self.clean_up(id)
130 | return [page]
131 | 
132 | 
133 | if __name__ == "__main__":
134 | e = DocxExtractor()
135 | 
136 | res = e.run(
137 | "def",
138 | Path(
139 | "test_data/doc/【中简】模电自测第四版.docx",
140 | ),
141 | )
142 | 
143 | print(res)
144 | 
145 | e.wait_all()
146 | 
--------------------------------------------------------------------------------
/magic_doc/model/sub_modules/layoutlmv3/layoutlmft/data/data_collator.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from dataclasses import dataclass
3 | from typing import Any, Dict, List, Optional, Tuple, Union
4 | 
5 | from transformers import BatchEncoding, PreTrainedTokenizerBase
6 | from transformers.data.data_collator import (
7 | DataCollatorMixin,
8 | _torch_collate_batch,
9 | )
10 | from transformers.file_utils import PaddingStrategy
11 | 
12 | from typing import NewType
13 | InputDataClass = NewType("InputDataClass", Any)
14 | 
15 | def pre_calc_rel_mat(segment_ids):
16 | valid_span = torch.zeros((segment_ids.shape[0], segment_ids.shape[1], segment_ids.shape[1]),
17 | device=segment_ids.device, dtype=torch.bool)
18 | for i in range(segment_ids.shape[0]):
19 | for j in range(segment_ids.shape[1]):
20 | valid_span[i, j, :] = segment_ids[i, :] == segment_ids[i, j]
21 | 
22 | return valid_span
23 | 
24 | @dataclass
25 | class DataCollatorForKeyValueExtraction(DataCollatorMixin):
26 | """
27 | Data collator that will dynamically pad the inputs received, as well as the labels.
28 | Args:
29 | tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
30 | The tokenizer used for encoding the data.
31 | padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
32 | Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
33 | among:
34 | * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
35 | sequence is provided).
36 | * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
37 | maximum acceptable input length for the model if that argument is not provided.
38 | * :obj:`False` or :obj:`'do_not_pad'`: No padding (i.e., can output a batch with sequences of
39 | different lengths).
40 | max_length (:obj:`int`, `optional`):
41 | Maximum length of the returned list and optionally padding length (see above).
42 | pad_to_multiple_of (:obj:`int`, `optional`):
43 | If set will pad the sequence to a multiple of the provided value.
44 | This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
45 | 7.5 (Volta).
46 | label_pad_token_id (:obj:`int`, `optional`, defaults to -100):
47 | The id to use when padding the labels (-100 will be automatically ignored by PyTorch loss functions).
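    Example (hypothetical usage; ``tokenizer`` and ``features`` are assumed to
    come from a LayoutLMv3 preprocessing pipeline and are not defined here):

        collator = DataCollatorForKeyValueExtraction(
            tokenizer=tokenizer,
            padding="max_length",
            max_length=512,
            pad_to_multiple_of=8,
        )
        batch = collator(features)  # dict of padded torch tensors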
48 | """
49 | 
50 | tokenizer: PreTrainedTokenizerBase
51 | padding: Union[bool, str, PaddingStrategy] = True
52 | max_length: Optional[int] = None
53 | pad_to_multiple_of: Optional[int] = None
54 | label_pad_token_id: int = -100
55 | 
56 | def __call__(self, features):
57 | label_name = "label" if "label" in features[0].keys() else "labels"
58 | labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None
59 | 
60 | images = None
61 | if "images" in features[0]:
62 | images = torch.stack([torch.tensor(d.pop("images")) for d in features])
63 | IMAGE_LEN = int(images.shape[-1] / 16) * int(images.shape[-1] / 16) + 1  # (side/16)**2 visual patches + 1 CLS token, assuming square images
64 | 
65 | batch = self.tokenizer.pad(
66 | features,
67 | padding=self.padding,
68 | max_length=self.max_length,
69 | pad_to_multiple_of=self.pad_to_multiple_of,
70 | # Conversion to tensors will fail if we have labels as they are not of the same length yet.
71 | return_tensors="pt" if labels is None else None,
72 | )
73 | 
74 | if images is not None:
75 | batch["images"] = images
76 | batch = {k: torch.tensor(v, dtype=torch.int64) if isinstance(v[0], list) and k == 'attention_mask' else v
77 | for k, v in batch.items()}
78 | visual_attention_mask = torch.ones((len(batch['input_ids']), IMAGE_LEN), dtype=torch.long)
79 | batch["attention_mask"] = torch.cat([batch['attention_mask'], visual_attention_mask], dim=1)
80 | 
81 | if labels is None:
82 | return batch
83 | 
84 | # tokenizer.pad only pads input_ids/attention_mask, so labels, bboxes and
85 | # position ids are padded manually below, respecting the padding side.
86 | has_bbox_input = "bbox" in features[0]
87 | has_position_input = "position_ids" in features[0]
88 | padding_idx = self.tokenizer.pad_token_id
89 | sequence_length = torch.tensor(batch["input_ids"]).shape[1]
90 | padding_side = self.tokenizer.padding_side
91 | if padding_side == "right":
92 | batch["labels"] = [label + [self.label_pad_token_id] * (sequence_length - len(label)) for label in labels]
93 | if has_bbox_input:
94 | batch["bbox"] = [bbox + [[0, 0, 0, 0]] * (sequence_length - len(bbox)) for bbox in batch["bbox"]]
95 | if has_position_input:
96 | batch["position_ids"] = [position_id + [padding_idx] * (sequence_length - len(position_id))
97 | for position_id in batch["position_ids"]]
98 | 
99 | else:
100 | batch["labels"] = [[self.label_pad_token_id] * (sequence_length - len(label)) + label for label in labels]
101 | if has_bbox_input:
102 | batch["bbox"] = [[[0, 0, 0, 0]] * (sequence_length - len(bbox)) + bbox for bbox in batch["bbox"]]
103 | if has_position_input:
104 | batch["position_ids"] = [[padding_idx] * (sequence_length - len(position_id))
105 | + position_id for position_id in batch["position_ids"]]
106 | 
107 | if 'segment_ids' in batch:
108 | assert 'position_ids' in batch
109 | for i in range(len(batch['segment_ids'])):
110 | batch['segment_ids'][i] = batch['segment_ids'][i] + [batch['segment_ids'][i][-1] + 1] * (sequence_length - len(batch['segment_ids'][i])) + [
111 | batch['segment_ids'][i][-1] + 2] * IMAGE_LEN
112 | 
113 | batch = {k: torch.tensor(v, dtype=torch.int64) if isinstance(v[0], list) else v for k, v in batch.items()}
114 | 
115 | if 'segment_ids' in batch:
116 | valid_span = pre_calc_rel_mat(
117 | segment_ids=batch['segment_ids']
118 | )
119 | batch['valid_span'] = valid_span
120 | del batch['segment_ids']
121 | 
122 | if images is not None:
123 | visual_labels = torch.ones((len(batch['input_ids']), IMAGE_LEN), dtype=torch.long) * -100
124 | batch["labels"] = torch.cat([batch['labels'], visual_labels], dim=1)
125 | 
126 | return batch
127 | 
--------------------------------------------------------------------------------
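A minimal, self-contained sketch (not part of the repo) of what pre_calc_rel_mat above computes: for each row in the batch, a boolean matrix marking the token pairs that share a segment id.

import torch

# one batch row with two segments: tokens 0-1 in segment 0, tokens 2-3 in segment 1
segment_ids = torch.tensor([[0, 0, 1, 1]])
valid_span = torch.zeros((1, 4, 4), dtype=torch.bool)
for j in range(4):
    valid_span[0, j, :] = segment_ids[0, :] == segment_ids[0, j]
# valid_span[0] is block-diagonal: True within each segment, False across segments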
--------------------------------------------------------------------------------
/magic_doc/contrib/magic_html/mmltex/glayout.xsl:
--------------------------------------------------------------------------------
[glayout.xsl: XSLT templates mapping MathML layout schemata to LaTeX. The stylesheet markup is not recoverable; only the LaTeX fragments it emits survive: \genfrac{}{}{...} and \frac{...}{...} with \hfill alignment and ex-based line thicknesses (fractions), \sqrt[...]{...} and \sqrt{...} with an "exception 25: \text{exception 25:}" fallback (roots), \left( ... \right) with "," separators (fenced expressions), \phantom{...}, \overline{...\hspace{.2em}|} and \overline{)...} (enclosures), and \colorbox[rgb]{...}{$...$} / \textcolor[rgb]{...}{...} (colour).]
--------------------------------------------------------------------------------
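A minimal sketch (not part of the repo) of how the mmltex stylesheets in this directory could be driven; the entry point mmltex.xsl, the companion-sheet relationship, and the sample MathML input are assumptions, and lxml is used purely for illustration.

from lxml import etree

# compile the stylesheet; mmltex.xsl is assumed to pull in its companion
# sheets (tables.xsl, glayout.xsl, ...) via XSLT includes
transform = etree.XSLT(etree.parse("magic_doc/contrib/magic_html/mmltex/mmltex.xsl"))

mathml = etree.fromstring(
    '<math xmlns="http://www.w3.org/1998/Math/MathML">'
    "<mfrac><mi>a</mi><mi>b</mi></mfrac>"
    "</math>"
)
print(str(transform(mathml)))  # expected LaTeX along the lines of $\frac{a}{b}$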