├── magic_doc ├── __init__.py ├── common │ ├── __init__.py │ └── default_config.py ├── conv │ ├── __init__.py │ ├── base.py │ ├── pptx_python_pptx.py │ ├── docx_xml_parse.py │ ├── doc_antiword.py │ ├── conv_html.py │ ├── pdf.py │ ├── doc_libreoffice.py │ ├── ppt_libreoffice.py │ ├── pdf_pp_structurev2.py │ └── pdf_magicpdf.py ├── libs │ ├── __init__.py │ └── version.py ├── model │ ├── __init__.py │ ├── sub_modules │ │ ├── __init__.py │ │ ├── UniMERNet │ │ │ └── __init__.py │ │ ├── layoutlmv3 │ │ │ ├── __init__.py │ │ │ ├── layoutlmft │ │ │ │ ├── data │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── funsd.py │ │ │ │ │ └── data_collator.py │ │ │ │ ├── __init__.py │ │ │ │ └── models │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── layoutlmv3 │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── tokenization_layoutlmv3.py │ │ │ │ │ ├── tokenization_layoutlmv3_fast.py │ │ │ │ │ └── configuration_layoutlmv3.py │ │ │ └── model_init.py │ │ └── post_process.py │ ├── seq_layout.py │ ├── seq_paddle.py │ ├── seq_ocr.py │ ├── doc_analysis_by_pp.py │ ├── parallel_paddle.py │ ├── parallel_layout.py │ └── parallel_ocr.py ├── contrib │ ├── __init__.py │ ├── pdf │ │ ├── __init__.py │ │ └── pdf_extractor.py │ ├── office │ │ ├── formula │ │ │ ├── __init__.py │ │ │ ├── mml │ │ │ │ ├── __init__.py │ │ │ │ └── xsl │ │ │ │ │ ├── mmltex.xsl │ │ │ │ │ ├── README │ │ │ │ │ └── tables.xsl │ │ │ └── omml │ │ │ │ └── __init__.py │ │ ├── ppt_extract.py │ │ ├── __init__.py │ │ ├── doc.py │ │ ├── pptx_extract.py │ │ └── docx_extract.py │ ├── magic_html │ │ ├── extractors │ │ │ ├── __init__.py │ │ │ ├── article_extractor.py │ │ │ ├── title_extractor.py │ │ │ ├── custom_extractor.py │ │ │ ├── weixin_extractor.py │ │ │ └── forum_extractor.py │ │ ├── mmltex │ │ │ ├── mmltex.xsl │ │ │ ├── README │ │ │ ├── tables.xsl │ │ │ └── glayout.xsl │ │ └── __init__.py │ ├── wrapper_exceptions.py │ ├── test_data │ │ ├── doc │ │ │ └── test.doc │ │ └── url_service │ │ │ └── run.py │ └── model │ │ └── __init__.py ├── progress │ ├── __init__.py │ ├── pupdator.py │ └── filepupdator.py ├── restful_api │ ├── __init__.py │ ├── common │ │ ├── __init__.py │ │ ├── import_models.py │ │ ├── oss │ │ │ ├── __init__.py │ │ │ └── oss.py │ │ ├── web_hook.py │ │ ├── ext.py │ │ └── custom_response.py │ ├── config │ │ ├── __init__.py │ │ └── config.yaml │ ├── api │ │ ├── analysis │ │ │ ├── __init__.py │ │ │ ├── ext.py │ │ │ ├── serialization.py │ │ │ ├── magic_html_view.py │ │ │ └── magic_pdf_view.py │ │ ├── extentions.py │ │ └── __init__.py │ └── app.py ├── bin │ └── linux │ │ ├── antiword │ │ └── share │ │ └── antiword │ │ ├── UTF-8.txt │ │ ├── fontnames.russian │ │ ├── Example │ │ ├── Default │ │ └── fontnames ├── utils │ ├── null_writer.py │ ├── yaml_load.py │ ├── __init__.py │ ├── path_utils.py │ └── config.py └── resources │ └── model │ ├── model_configs.yaml │ └── UniMERNet │ ├── demo.yaml │ └── demo_old.yaml ├── assets ├── contributor.png └── license.svg ├── magic-doc-template.json ├── test ├── test_cli │ ├── conf │ │ └── conf.py │ └── test_cli.py └── test_docconv.py ├── .gitignore ├── requirements.txt ├── update_version.py ├── .github └── workflows │ ├── benchmark.yml │ ├── ci.yml │ └── python-package.yml ├── tools ├── scoring.py ├── benchmark.py ├── clean_photo.py └── markdown_calculate.py ├── setup.py ├── README_zh-CN.md └── README.md /magic_doc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /magic_doc/common/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /magic_doc/conv/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /magic_doc/libs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /magic_doc/model/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /magic_doc/contrib/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /magic_doc/contrib/pdf/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /magic_doc/progress/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /magic_doc/restful_api/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /magic_doc/model/sub_modules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /magic_doc/restful_api/common/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /magic_doc/restful_api/config/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /magic_doc/contrib/office/formula/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /magic_doc/restful_api/common/import_models.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /magic_doc/restful_api/common/oss/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /magic_doc/libs/version.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.1.38" 2 | -------------------------------------------------------------------------------- /magic_doc/model/sub_modules/UniMERNet/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /magic_doc/model/sub_modules/layoutlmv3/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /magic_doc/contrib/magic_html/extractors/__init__.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | -------------------------------------------------------------------------------- /assets/contributor.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/magicpdf/Magic-Doc/HEAD/assets/contributor.png -------------------------------------------------------------------------------- /magic_doc/bin/linux/antiword: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/magicpdf/Magic-Doc/HEAD/magic_doc/bin/linux/antiword -------------------------------------------------------------------------------- /magic-doc-template.json: -------------------------------------------------------------------------------- 1 | { 2 | "s3_config": { 3 | "ak": "", 4 | "sk": "", 5 | "endpoint": "" 6 | } 7 | } -------------------------------------------------------------------------------- /magic_doc/contrib/wrapper_exceptions.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | class NotSupportOcrPDFException(BaseException): 5 | pass 6 | 7 | -------------------------------------------------------------------------------- /magic_doc/contrib/test_data/doc/test.doc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/magicpdf/Magic-Doc/HEAD/magic_doc/contrib/test_data/doc/test.doc -------------------------------------------------------------------------------- /magic_doc/utils/null_writer.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class NullWriter: 4 | def write(self, *args, **kwargs): 5 | return None 6 | 7 | -------------------------------------------------------------------------------- /magic_doc/restful_api/common/web_hook.py: -------------------------------------------------------------------------------- 1 | from flask import request 2 | 3 | 4 | # @jwt_required() 5 | def before_request(): 6 | return None 7 | -------------------------------------------------------------------------------- /magic_doc/model/sub_modules/layoutlmv3/layoutlmft/data/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from .data_collator import DataCollatorForKeyValueExtraction 3 | -------------------------------------------------------------------------------- /magic_doc/bin/linux/share/antiword/UTF-8.txt: -------------------------------------------------------------------------------- 1 | # UTF-8 to Unicode 2 | # This file is a dummy. 3 | # The conversion is done algorithmically, not by a table look-up.
4 | -------------------------------------------------------------------------------- /test/test_cli/conf/conf.py: -------------------------------------------------------------------------------- 1 | import os 2 | conf = { 3 | "code_path": os.environ.get('GITHUB_WORKSPACE'), 4 | "pdf_dev_path": os.environ.get('GITHUB_WORKSPACE') + "/test/test_cli/pdf_dev", 5 | "pdf_res_path": "/home/quyuan/code/Magic-PDF/Magic-PDF/Magic-PDF/ci" 6 | 7 | } -------------------------------------------------------------------------------- /magic_doc/model/sub_modules/layoutlmv3/layoutlmft/__init__.py: -------------------------------------------------------------------------------- 1 | from .models import ( 2 | LayoutLMv3Config, 3 | LayoutLMv3ForTokenClassification, 4 | LayoutLMv3ForQuestionAnswering, 5 | LayoutLMv3ForSequenceClassification, 6 | LayoutLMv3Tokenizer, 7 | ) 8 | -------------------------------------------------------------------------------- /magic_doc/model/sub_modules/layoutlmv3/layoutlmft/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .layoutlmv3 import ( 2 | LayoutLMv3Config, 3 | LayoutLMv3ForTokenClassification, 4 | LayoutLMv3ForQuestionAnswering, 5 | LayoutLMv3ForSequenceClassification, 6 | LayoutLMv3Tokenizer, 7 | ) 8 | -------------------------------------------------------------------------------- /magic_doc/restful_api/common/ext.py: -------------------------------------------------------------------------------- 1 | def singleton_func(cls): 2 | instance = {} 3 | 4 | def _singleton(*args, **kwargs): 5 | if cls not in instance: 6 | instance[cls] = cls(*args, **kwargs) 7 | return instance[cls] 8 | 9 | return _singleton 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.ipynb* 2 | *.ipynb 3 | *.png 4 | *.jpg 5 | *.pdf 6 | 7 | # python 8 | .ipynb_checkpoints 9 | *.ipynb 10 | **/__pycache__/ 11 | 12 | # vscode 13 | .vscode 14 | 15 | # logs 16 | *.log 17 | *.out 18 | 19 | # debug directory 20 | debug/ 21 | source.dev.env 22 | 23 | # pycharm 24 | .idea/ 25 | 26 | -------------------------------------------------------------------------------- /magic_doc/restful_api/api/analysis/__init__.py: -------------------------------------------------------------------------------- 1 | from flask import Blueprint 2 | from .magic_pdf_view import * 3 | from .magic_html_view import * 4 | from ..extentions import Api 5 | 6 | analysis_blue = Blueprint('analysis', __name__, url_prefix='/analysis') 7 | 8 | api = Api(analysis_blue) 9 | api.add_resource(MagicPdfView, '/pdf') 10 | api.add_resource(MagicHtmlView, '/html') -------------------------------------------------------------------------------- /magic_doc/progress/pupdator.py: -------------------------------------------------------------------------------- 1 | 2 | from abc import ABC, abstractmethod 3 | 4 | 5 | class ConvProgressUpdator(ABC): 6 | def __init__(self):
 7 | pass 8 | 9 | def update(self, progress: int) -> bool: 10 | # TODO: rate limit 11 | return self.do_update(progress) 12 | 13 | @abstractmethod 14 | def do_update(self, progress: int): 15 | pass 16 | 17 | 18 | -------------------------------------------------------------------------------- /magic_doc/resources/model/model_configs.yaml: -------------------------------------------------------------------------------- 1 | models: 2 | layout: True 3 | formula: False 4 | ocr: False 5 | 6 | weights: 7 | layout:
/mnt/hwfile/opendatalab/wufan/weights/ocr_pipeline/Layoutlmv3/model_final.pth 8 | mfd: /mnt/hwfile/opendatalab/wufan/weights/ocr_pipeline/yolov8/withouscihubtrain_addr4_epoch91.pt 9 | mfr: /mnt/hwfile/opendatalab/wufan/weights/ocr_pipeline/UniMERNet/models_old 10 | 11 | 12 | -------------------------------------------------------------------------------- /magic_doc/contrib/office/formula/mml/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from lxml import etree as ET 4 | 5 | transform = None 6 | 7 | _xslt_filename = os.path.join( 8 | os.path.dirname(os.path.abspath(__file__)), "xsl/mmltex.xsl" 9 | ) 10 | 11 | 12 | def mml2tex(mml_xml): 13 | tree = ET.fromstring(mml_xml) 14 | global transform 15 | if not transform: 16 | transform = ET.XSLT(ET.parse(_xslt_filename)) 17 | return str(transform(tree)) 18 | -------------------------------------------------------------------------------- /magic_doc/progress/filepupdator.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from magic_doc.progress.pupdator import ConvProgressUpdator 4 | 5 | 6 | class FileBaseProgressUpdator(ConvProgressUpdator): 7 | def __init__(self, progress_file_path:str): 8 | self.__progress_file_path = progress_file_path 9 | 10 | def do_update(self, progress:int) -> bool: 11 | with open(self.__progress_file_path, 'w', encoding='utf-8') as fout: 12 | fout.write(str(int(progress))) 13 | 14 | return True -------------------------------------------------------------------------------- /magic_doc/model/seq_layout.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from magic_doc.model.sub_modules.layoutlmv3.model_init import Layoutlmv3_Predictor 4 | 5 | class SeqLayout: 6 | def __init__(self, config): 7 | self.model = Layoutlmv3_Predictor(config) 8 | 9 | def __call__(self, params): 10 | """ 11 | params: list[(idx, image)] 12 | """ 13 | if len(params) == 0: 14 | return [] 15 | 16 | results = [] 17 | for idx, image in params: 18 | layout_res = self.model(image) 19 | results.append((idx, layout_res)) 20 | return results 21 | 22 | -------------------------------------------------------------------------------- /magic_doc/conv/base.py: -------------------------------------------------------------------------------- 1 | 2 | from abc import ABC, abstractmethod 3 | 4 | from magic_doc.progress.pupdator import ConvProgressUpdator 5 | from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter 6 | class BaseConv(ABC): 7 | def __init__(self, *args, **kwargs): 8 | pass 9 | 10 | @abstractmethod 11 | def to_md(self, bits: bytes | str, pupdator:ConvProgressUpdator) -> str: 12 | return NotImplemented 13 | 14 | def to_mid_result(self, rw: AbsReaderWriter, bits: bytes | str, pupdator:ConvProgressUpdator) -> list[dict] | dict: 15 | pupdator.update(100) 16 | return {} 17 | 18 | 19 | class ParseFailed(BaseException): 20 | pass -------------------------------------------------------------------------------- /magic_doc/model/seq_paddle.py: -------------------------------------------------------------------------------- 1 | 2 | from magic_pdf.model.pp_structure_v2 import CustomPaddleModel 3 | from magic_doc.utils import split_to_chunks 4 | import paddle 5 | from concurrent.futures import ThreadPoolExecutor, as_completed 6 | 7 | 8 | class SeqPaddle: 9 | def __init__(self, **kwargs): 10 | self.model = CustomPaddleModel(ocr=True, show_log=False) 11 | 12 | def __call__(self, params): 13 | """ 14 | params: 
list[(idx, image)] 15 | """ 16 | results = [] 17 | for idx, img in params: 18 | ocr_res = self.model(img) 19 | results.append((idx, ocr_res)) 20 | 21 | return results 22 | -------------------------------------------------------------------------------- /magic_doc/common/default_config.py: -------------------------------------------------------------------------------- 1 | 2 | from magic_doc.utils.yaml_load import patch_dict_with_env 3 | 4 | class PdfFastParseMethod: 5 | AUTO = "auto" 6 | FAST = "fast" 7 | LITEOCR = "lite_ocr" 8 | 9 | class PdfHqParseMethod: 10 | AUTO = "auto" 11 | OCR = "ocr" 12 | TXT = "txt" 13 | 14 | 15 | DEFAULT_CONFIG = { 16 | "pdf": { 17 | "fast": { 18 | "parsemethod": PdfFastParseMethod.AUTO, 19 | "liteocrmodelinstance": 1, 20 | }, 21 | "hq": { 22 | "parsemethod": PdfHqParseMethod.OCR, 23 | } 24 | } 25 | } 26 | 27 | 28 | DEFAULT_CONFIG = patch_dict_with_env("filter", DEFAULT_CONFIG) 29 | 30 | -------------------------------------------------------------------------------- /magic_doc/restful_api/common/custom_response.py: -------------------------------------------------------------------------------- 1 | from flask import jsonify 2 | 3 | 4 | class ResponseCode: 5 | SUCCESS = 200 6 | PARAM_WARING = 400 7 | MESSAGE = "success" 8 | 9 | 10 | def generate_response(data=None, code=ResponseCode.SUCCESS, msg=ResponseCode.MESSAGE, **kwargs): 11 | """ 12 | Build a custom JSON response 13 | :param code: status code 14 | :param data: response payload 15 | :param msg: response message 16 | :param kwargs: 17 | :return: 18 | """ 19 | msg = msg or 'success' if code == 200 else msg or 'fail' 20 | success = True if code == 200 else False 21 | res = jsonify(dict(code=code, success=success, data=data, msg=msg, **kwargs)) 22 | res.status_code = 200 23 | return res 24 | -------------------------------------------------------------------------------- /magic_doc/restful_api/config/config.yaml: -------------------------------------------------------------------------------- 1 | # Base config 2 | BaseConfig: &base 3 | DEBUG: true 4 | LOG_LEVEL: "DEBUG" 5 | SQLALCHEMY_TRACK_MODIFICATIONS: true 6 | SQLALCHEMY_DATABASE_URI: "" 7 | SECRET_KEY: "#$%^&**$##*(*^%%$**((&" 8 | JWT_SECRET_KEY: "#$%^&**$##*(*^%%$**((&" 9 | JWT_ACCESS_TOKEN_EXPIRES: 300 10 | AccessKeyID: "" 11 | AccessKeySecret: "" 12 | Endpoint: "" 13 | BucketName: "" 14 | UrlExpires: 60 15 | 16 | S3AK: "" 17 | S3SK: "" 18 | S3ENDPOINT: "" 19 | 20 | 21 | # Development config 22 | DevelopmentConfig: 23 | <<: *base 24 | 25 | # Production config 26 | ProductionConfig: 27 | <<: *base 28 | 29 | # Testing config 30 | TestingConfig: 31 | <<: *base 32 | 33 | # Currently active config 34 | CurrentConfig: "DevelopmentConfig" 35 | -------------------------------------------------------------------------------- /magic_doc/model/seq_ocr.py: -------------------------------------------------------------------------------- 1 | 2 | from magic_doc.model.sub_modules.self_modify import ModifiedPaddleOCR 3 | from magic_doc.utils import split_to_chunks 4 | import paddle 5 | from concurrent.futures import ThreadPoolExecutor, as_completed 6 | 7 | 8 | class SeqOCR: 9 | def __init__(self, **kwargs): 10 | self.model = ModifiedPaddleOCR(show_log=False, **kwargs) 11 | 12 | def __call__(self, params): 13 | """ 14 | params: list[(idx, cropped_image, mfdetrec_res)] 15 | """ 16 | results = [] 17 | for idx, cropped_image, single_page_mfdetrec_res in params: 18 | ocr_res = self.model.ocr(cropped_image, mfd_res=single_page_mfdetrec_res)[0] 19 | if ocr_res: 20 | results.append((idx, ocr_res)) 21 | 22 | return results 23 | 24 | 
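25 | # Minimal usage sketch (not part of the original module): "demo_page.png" is a
26 | # placeholder path, and PaddleOCR weights are assumed to be available locally.
27 | if __name__ == "__main__":
28 |     import cv2
29 | 
30 |     page = cv2.imread("demo_page.png")
31 |     # An empty mfd list means no math-formula regions are masked out of the OCR pass.
32 |     for idx, ocr_res in SeqOCR()([(0, page, [])]):
33 |         print(idx, ocr_res)
34 | 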
-------------------------------------------------------------------------------- /magic_doc/restful_api/api/extentions.py: -------------------------------------------------------------------------------- 1 | from flask import Flask 2 | from flask_restful import Api 3 | from flask_cors import CORS 4 | from flask_sqlalchemy import SQLAlchemy as _SQLAlchemy 5 | from flask_migrate import Migrate 6 | from contextlib import contextmanager 7 | from flask_jwt_extended import JWTManager 8 | from flask_marshmallow import Marshmallow 9 | 10 | 11 | class SQLAlchemy(_SQLAlchemy): 12 | @contextmanager 13 | def auto_commit(self): 14 | try: 15 | yield 16 | db.session.commit() 17 | db.session.flush() 18 | except Exception as e: 19 | db.session.rollback() 20 | raise e 21 | 22 | 23 | app = Flask(__name__) 24 | CORS(app, supports_credentials=True) 25 | db = SQLAlchemy() 26 | migrate = Migrate() 27 | jwt = JWTManager() 28 | ma = Marshmallow() -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | alembic==1.13.1 2 | aniso8601==9.0.1 3 | blinker==1.8.2 4 | cchardet==2.1.7 5 | certifi==2024.2.2 6 | charset-normalizer==3.3.2 7 | docopt==0.6.2 8 | Flask==3.0.3 9 | Flask-Cors==4.0.1 10 | Flask-JWT-Extended==4.6.0 11 | flask-marshmallow==1.2.1 12 | Flask-Migrate==4.0.7 13 | Flask-RESTful==0.3.10 14 | Flask-SQLAlchemy==3.1.1 15 | func-timeout==4.3.5 16 | greenlet==3.0.3 17 | idna==3.7 18 | itsdangerous==2.2.0 19 | Jinja2==3.1.4 20 | lark-parser==0.12.0 21 | lxml==5.1.1 22 | Mako==1.3.5 23 | MarkupSafe==2.1.5 24 | marshmallow==3.21.2 25 | marshmallow-sqlalchemy==1.0.0 26 | packaging==24.0 27 | py-asciimath==0.3.0 28 | PyJWT==2.8.0 29 | pytz==2024.1 30 | PyYAML==6.0.1 31 | requests==2.32.2 32 | six==1.16.0 33 | SQLAlchemy==2.0.30 34 | typing_extensions==4.11.0 35 | urllib3 36 | Werkzeug==3.0.3 37 | python-pptx 38 | s3pathlib 39 | PyMuPDF==1.24.5 40 | smart-open[s3] 41 | -------------------------------------------------------------------------------- /magic_doc/restful_api/api/analysis/ext.py: -------------------------------------------------------------------------------- 1 | import time 2 | from pathlib import Path 3 | from loguru import logger 4 | 5 | 6 | def upload_image_to_oss(oss_client, file_name, img_path, NULL_IMG_DIR, bucket_name): 7 | img_object_name = f"pdf/{file_name}/{Path(img_path).name}" 8 | local_img_path = f"{NULL_IMG_DIR}/images/{Path(img_path).name}" 9 | t3 = time.time() 10 | oss_rep = oss_client.put_file(bucket_name, img_object_name, local_img_path) 11 | t4 = time.time() 12 | logger.info(f"upload img:{t4 - t3}") 13 | file_link = oss_rep["file_link"] 14 | return str(img_path), file_link 15 | 16 | 17 | def upload_md_to_oss(oss_client, bucket_name, md_object_name, md_content): 18 | t3 = time.time() 19 | oss_rep = oss_client.pub_object(bucket_name, md_object_name, md_content) 20 | t4 = time.time() 21 | logger.info(f"upload md:{t4 - t3}") 22 | md_link = oss_rep["file_link"] 23 | return md_link 24 | 25 | -------------------------------------------------------------------------------- /magic_doc/restful_api/api/analysis/serialization.py: -------------------------------------------------------------------------------- 1 | import lxml.html 2 | from marshmallow import Schema, fields, validates, ValidationError 3 | 4 | 5 | class MagicHtmlSchema(Schema): 6 | pageUrl = fields.Str() 7 | html = fields.Str(required=True) 8 | html_type = fields.Str() 9 | 10 | @validates('html') 11 | def
validate_html(self, data, **kwargs): 12 | if not data: 13 | raise ValidationError('HTML cannot be empty') 14 | else: 15 | if lxml.html.fromstring(data).find('.//*') is None: 16 | raise ValidationError('Content is not HTML') 17 | return data 18 | 19 | 20 | class MagicPdfSchema(Schema): 21 | pageUrl = fields.Str(required=True) 22 | 23 | @validates('pageUrl') 24 | def validate_url(self, data, **kwargs): 25 | if not data: 26 | raise ValidationError('pageUrl cannot be empty') 27 | else: 28 | return data 29 | -------------------------------------------------------------------------------- /magic_doc/utils/yaml_load.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import yaml 4 | from collections import deque 5 | 6 | 7 | def patch_dict_with_env(env_namespace, configs): 8 | for env_var in os.environ: 9 | arr = deque(map(lambda x: x.lower(), env_var.split("_"))) 10 | if arr[0] != env_namespace: 11 | continue 12 | arr.popleft() 13 | d = configs 14 | while arr: 15 | if arr[0] not in d: 16 | break 17 | if len(arr) > 1: 18 | d = d[arr[0]] 19 | arr.popleft() 20 | else: 21 | d[arr[0]] = os.environ[env_var] 22 | break 23 | return configs 24 | 25 | 26 | def patch_yaml_load_with_env(yaml_file, env_namespace, loader=yaml.FullLoader): 27 | with open(yaml_file, "r") as f: 28 | configs = yaml.load(f, Loader=loader) 29 | 30 | return patch_dict_with_env(env_namespace, configs) 31 | 32 | 
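33 | # Minimal usage sketch (not part of the original module): an env var named
34 | # <NAMESPACE>_<KEY>_<SUBKEY>... overrides the matching nested config entry.
35 | if __name__ == "__main__":
36 |     os.environ["FILTER_PDF_FAST_PARSEMETHOD"] = "fast"
37 |     cfg = {"pdf": {"fast": {"parsemethod": "auto"}}}
38 |     print(patch_dict_with_env("filter", cfg))
39 |     # -> {'pdf': {'fast': {'parsemethod': 'fast'}}}
40 | 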
-------------------------------------------------------------------------------- /update_version.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | 4 | 5 | def get_version(): 6 | command = ["git", "describe", "--tags"] 7 | try: 8 | version = subprocess.check_output(command).decode().strip() 9 | version_parts = version.split("-") 10 | if len(version_parts) > 1 and version_parts[0].startswith("magic_doc"): 11 | return version_parts[1] 12 | else: 13 | raise ValueError(f"Invalid version tag {version}. Expected format is magic_doc-<version>-released.") 14 | except Exception as e: 15 | print(e) 16 | return "0.0.0" 17 | 18 | 19 | def write_version_to_commons(version): 20 | commons_path = os.path.join(os.path.dirname(__file__), 'magic_doc', 'libs', 'version.py') 21 | with open(commons_path, 'w') as f: 22 | f.write(f'__version__ = "{version}"\n') 23 | 24 | 25 | if __name__ == '__main__': 26 | version_name = get_version() 27 | write_version_to_commons(version_name) 28 | -------------------------------------------------------------------------------- /magic_doc/model/doc_analysis_by_pp.py: -------------------------------------------------------------------------------- 1 | from magic_doc.model.parallel_paddle import ParallelPaddle 2 | 3 | 4 | class PaddleDocAnalysis: 5 | def __init__(self, **kwargs): 6 | self.model = ParallelPaddle(**kwargs) 7 | 8 | def __call__(self, image_dicts): 9 | images = [(i, image_dicts[i]["img"]) for i in range(len(image_dicts))] 10 | results = sorted(self.model(images), key=lambda x: x[0]) 11 | if len(results) != len(image_dicts): 12 | raise Exception("fatal error: failed to run inference with paddleocr") 13 | 14 | model_json = [] 15 | for index, img_dict in enumerate(image_dicts): 16 | img = img_dict["img"] 17 | page_width = img_dict["width"] 18 | page_height = img_dict["height"] 19 | page_info = {"page_no": index, "height": page_height, "width": page_width} 20 | page_dict = {"layout_dets": results[index][1], "page_info": page_info} 21 | model_json.append(page_dict) 22 | return model_json 23 | 24 | -------------------------------------------------------------------------------- /magic_doc/resources/model/UniMERNet/demo.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: unimernet 3 | model_type: unimernet 4 | model_config: 5 | model_name: ./models 6 | max_seq_len: 1024 7 | length_aware: True 8 | load_pretrained: True 9 | pretrained: ./models/pytorch_model.bin 10 | tokenizer_config: 11 | path: ./models 12 | 13 | datasets: 14 | formula_rec_eval: 15 | vis_processor: 16 | eval: 17 | name: "formula_image_eval" 18 | image_size: 19 | - 192 20 | - 672 21 | 22 | run: 23 | runner: runner_iter 24 | task: unimernet_train 25 | 26 | batch_size_train: 64 27 | batch_size_eval: 64 28 | num_workers: 1 29 | 30 | iters_per_inner_epoch: 2000 31 | max_iters: 60000 32 | 33 | seed: 42 34 | output_dir: "../output/demo" 35 | 36 | evaluate: True 37 | test_splits: [ "eval" ] 38 | 39 | device: "cuda" 40 | world_size: 1 41 | dist_url: "env://" 42 | distributed: True 43 | distributed_type: ddp # or fsdp when training an llm 44 | 45 | generate_cfg: 46 | temperature: 0.0 47 | -------------------------------------------------------------------------------- /magic_doc/resources/model/UniMERNet/demo_old.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: unimernet 3 | model_type: unimernet 4 | model_config: 5 | model_name: ./models 6 | max_seq_len: 1024 7 | length_aware: False 8 | load_pretrained: True 9 | pretrained: ./models/pytorch_model.bin 10 | tokenizer_config: 11 | path: ./models 12 | 13 | datasets: 14 | formula_rec_eval: 15 | vis_processor: 16 | eval: 17 | name: "formula_image_eval" 18 | image_size: 19 | - 192 20 | - 672 21 | 22 | run: 23 | runner: runner_iter 24 | task: unimernet_train 25 | 26 | batch_size_train: 64 27 | batch_size_eval: 64 28 | num_workers: 1 29 | 30 | iters_per_inner_epoch: 2000 31 | max_iters: 60000 32 | 33 | seed: 42 34 | output_dir: "../output/demo" 35 | 36 | evaluate: True 37 | 
test_splits: [ "eval" ] 38 | 39 | device: "cuda" 40 | world_size: 1 41 | dist_url: "env://" 42 | distributed: True 43 | distributed_type: ddp # or fsdp when training an llm 44 | 45 | generate_cfg: 46 | temperature: 0.0 47 | -------------------------------------------------------------------------------- /magic_doc/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from magic_pdf.libs.pdf_check import detect_invalid_chars 2 | 3 | import magic_doc 4 | import os 5 | import random 6 | import fitz 7 | 8 | def get_repo_directory(): 9 | return os.path.dirname(magic_doc.__file__) 10 | 11 | 12 | def is_digital(bits: bytes) -> bool: 13 | def _is_digital(doc, check_page=10, text_len_thrs=100) -> bool: 14 | sample_page_num = min(check_page, doc.page_count) 15 | page_ids = random.sample(range(doc.page_count), sample_page_num) 16 | page_text_len = [ 17 | len(doc[pno].get_text("text")) > text_len_thrs for pno in page_ids 18 | ] 19 | if any(page_text_len): 20 | return True 21 | return False 22 | 23 | def _check_invalid_chars(pdf_bytes: bytes) -> bool: 24 | return detect_invalid_chars(pdf_bytes) 25 | 26 | with fitz.open(stream=bits) as doc: 27 | return _is_digital(doc) and _check_invalid_chars(bits) 28 | 29 | 30 | def split_to_chunks(lst, n): 31 | for i in range(0, len(lst), n): 32 | yield lst[i:i + n] -------------------------------------------------------------------------------- /magic_doc/restful_api/api/analysis/magic_html_view.py: -------------------------------------------------------------------------------- 1 | from flask import request 2 | from flask_restful import Resource 3 | from .serialization import MagicHtmlSchema 4 | from marshmallow import ValidationError 5 | from magic_doc.restful_api.common.custom_response import generate_response 6 | from magic_doc.contrib.magic_html import GeneralExtractor 7 | from loguru import logger 8 | 9 | extractor = GeneralExtractor() 10 | 11 | 12 | class MagicHtmlView(Resource): 13 | @logger.catch 14 | def post(self): 15 | """ 16 | Extract the main content from a web page 17 | :return: 18 | """ 19 | magic_html_schema = MagicHtmlSchema() 20 | try: 21 | params = magic_html_schema.load(request.get_json()) 22 | except ValidationError as err: 23 | return generate_response(code=400, msg=err.messages) 24 | url = params.get("pageUrl", "") 25 | html_type = params.get("html_type") 26 | html = params.get("html") 27 | data = extractor.extract(html, base_url=url, html_type=html_type) 28 | return generate_response(data=data) 29 | -------------------------------------------------------------------------------- /magic_doc/utils/path_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from s3pathlib import S3Path 4 | 5 | from magic_doc.utils.config import read_config 6 | 7 | 8 | def get_local_dir(): 9 | config = read_config() 10 | return config.get("temp-output-dir", "/tmp") 11 | 12 | 13 | def prepare_env(doc_file_name, doc_type="") -> str: 14 | if doc_type == "": 15 | doc_type = "unknown" 16 | local_parent_dir = os.path.join( 17 | get_local_dir(), "magic-doc", doc_type, doc_file_name 18 | ) 19 | 20 | # local_image_dir = os.path.join(local_parent_dir, "images") 21 | local_md_dir = local_parent_dir 22 | # os.makedirs(local_image_dir, exist_ok=True) 23 | os.makedirs(local_md_dir, exist_ok=True) 24 | return str(local_md_dir) 25 | 26 | 27 | def remove_non_official_s3_args(s3path): 28 | """ 29 | example: s3://abc/xxxx.json?bytes=0,81350 ==> s3://abc/xxxx.json 30 | """ 31 | arr = s3path.split("?") 32 | return arr[0] 33 | 34 | 35 | def parse_s3path(s3path: str): 36 | p = S3Path(remove_non_official_s3_args(s3path)) 37 | return p.bucket, p.key 38 | 
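39 | # Minimal usage sketch (not part of the original module):
40 | if __name__ == "__main__":
41 |     print(remove_non_official_s3_args("s3://abc/xxxx.json?bytes=0,81350"))  # s3://abc/xxxx.json
42 |     print(parse_s3path("s3://abc/dir/xxxx.json"))  # ('abc', 'dir/xxxx.json')
43 | 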
-------------------------------------------------------------------------------- /assets/license.svg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/magicpdf/Magic-Doc/HEAD/assets/license.svg -------------------------------------------------------------------------------- /magic_doc/utils/config.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | 5 | def read_config(): 6 | home_dir = os.path.expanduser("~") 7 | 8 | config_file = os.path.join(home_dir, "magic-doc.json") 9 | 10 | if not os.path.exists(config_file): 11 | raise Exception(f"{config_file} not found") 12 | 13 | with open(config_file, "r") as f: 14 | config = json.load(f) 15 | return config 16 | 17 | 18 | def get_s3_config(bucket_name: str): 19 | """ 20 | Read the S3 config from ~/magic-doc.json 21 | """ 22 | config = read_config() 23 | 24 | bucket_info = config.get("bucket_info") 25 | if bucket_name not in bucket_info: 26 | access_key, secret_key, storage_endpoint = bucket_info["[default]"] 27 | else: 28 | access_key, secret_key, storage_endpoint = bucket_info[bucket_name] 29 | 30 | if access_key is None or secret_key is None or storage_endpoint is None: 31 | raise Exception("ak, sk or endpoint not found in magic-doc.json") 32 | 33 | # logger.info(f"get_s3_config: ak={access_key}, sk={secret_key}, endpoint={storage_endpoint}") 34 | 35 | return access_key, secret_key, storage_endpoint 36 | 37 | -------------------------------------------------------------------------------- /.github/workflows/benchmark.yml: -------------------------------------------------------------------------------- 1 | name: Magic doc benchmark 2 | on: 3 | push: 4 | branches: 5 | - "main" 6 | paths-ignore: 7 | - "cmds/**" 8 | - "**.md" 9 | pull_request: 10 | branches: 11 | - "main" 12 | paths-ignore: 13 | - "cmds/**" 14 | - "**.md" 15 | workflow_dispatch: 16 | jobs: 17 | magic-doc-test: 18 | runs-on: doc 19 | timeout-minutes: 180 20 | strategy: 21 | fail-fast: true 22 | 23 | steps: 24 | - name: config-net 25 | run: | 26 | source activate magicdoc 27 | - name: pull code 28 | uses: actions/checkout@v3 29 | with: 30 | fetch-depth: 2 31 | - name: check-requirements 32 | run: | 33 | changed_files=$(git diff --name-only -r HEAD~1 HEAD) 34 | echo $changed_files 35 | if [[ $changed_files =~ "requirements.txt" ]]; then 36 | pip install -r requirements.txt 37 | fi 38 | - name: install dependencies 39 | run: | 40 | sudo su - 41 | yum install libreoffice 42 | pip install fairy-doc[cpu] 43 | - name: get-doc-benchmark-result 44 | run: | 45 | echo "start test" 46 | cd tools && python benchmark.py 47 | -------------------------------------------------------------------------------- /magic_doc/model/sub_modules/post_process.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | def layout_rm_equation(layout_res): 4 | rm_idxs = [] 5 | for idx, ele in enumerate(layout_res['layout_dets']): 6 | if ele['category_id'] == 10: 7 | rm_idxs.append(idx) 8 | 9 | for idx in rm_idxs[::-1]: 10 | del layout_res['layout_dets'][idx] 11 | return layout_res 12 | 13 | 14 | def get_croped_image(image_pil, bbox): 15 | x_min, y_min, x_max, y_max = bbox 16 | croped_img = image_pil.crop((x_min, y_min, x_max, y_max)) 17 | return croped_img 18 | 19 | 20 | def latex_rm_whitespace(s: str): 21 | """Remove unnecessary whitespace from LaTeX code. 22 | """ 23 | text_reg = r'(\\(operatorname|mathrm|text|mathbf)\s?\*? {.*?})' 24 | letter = '[a-zA-Z]' 25 | noletter = '[\W_^\d]' 26 | names = [x[0].replace(' ', '') for x in re.findall(text_reg, s)] 27 | s = re.sub(text_reg, lambda match: str(names.pop(0)), s) 28 | news = s 29 | while True: 30 | s = news 31 | news = re.sub(r'(?!\\ )(%s)\s+?(%s)' % (noletter, noletter), r'\1\2', s) 32 | news = re.sub(r'(?!\\ )(%s)\s+?(%s)' % (noletter, letter), r'\1\2', news) 33 | news = re.sub(r'(%s)\s+?(%s)' % (letter, noletter), r'\1\2', news) 34 | if news == s: 35 | break 36 | return s
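37 | 
38 | # Minimal usage sketch (not part of the original module); the expected output
39 | # shown below was worked out by hand from the regexes above.
40 | if __name__ == "__main__":
41 |     print(latex_rm_whitespace(r"\operatorname {span} ( a , b )"))
42 |     # -> \operatorname{span}(a,b)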
-------------------------------------------------------------------------------- /magic_doc/contrib/model/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import List, TypeAlias, TypedDict 3 | from werkzeug.datastructures import FileStorage 4 | 5 | 6 | class Content(TypedDict): 7 | # Type: image/text/md 8 | type: str 9 | 10 | # Data payload 11 | # image: an s3 path, e.g. s3://doc/xxx.png 12 | # text: a line of text 13 | # md: text in Markdown format 14 | data: str 15 | 16 | 17 | class Page(TypedDict): 18 | # 0-based page number 19 | page_no: int 20 | 21 | # List of contents on this page 22 | content_list: List[Content] 23 | 24 | 25 | ExtractResponse: TypeAlias = List[Page] 26 | 27 | 28 | if __name__ == "__main__": 29 | pages_data: ExtractResponse = [ 30 | { 31 | "page_no": 0, 32 | "content_list": [ 33 | { 34 | "type": "text", 35 | "data": "This is some text content.", 36 | }, 37 | { 38 | "type": "image", 39 | "data": "s3://somebucket/imagepath.jpg", 40 | }, 41 | ], 42 | } 43 | ] 44 | 45 | 46 | class Extractor(ABC): 47 | @abstractmethod 48 | def setup(): 49 | pass 50 | 51 | @abstractmethod 52 | def run(self, file_parse_id: str, r: FileStorage, skip_image: bool = True) -> ExtractResponse: 53 | pass 54 | -------------------------------------------------------------------------------- /magic_doc/restful_api/app.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | 4 | base_dir = Path(__file__).resolve().parent 5 | root_dir = base_dir.parent.parent 6 | sys.path.append(str(root_dir)) 7 | 8 | from api import create_app 9 | import yaml 10 | 11 | config_path = base_dir / "config/config.yaml" 12 | 13 | 14 | class ConfigMap(dict): 15 | __setattr__ = dict.__setitem__ 16 | __getattr__ = dict.__getitem__ 17 | 18 | 19 | with open(str(config_path), mode='r', encoding='utf-8') as fd: 20 | data = yaml.load(fd, Loader=yaml.FullLoader) 21 | _config = data.get(data.get("CurrentConfig", "DevelopmentConfig")) 22 | config = ConfigMap() 23 | for k, v in _config.items(): 24 | config[k] = v 25 | config['base_dir'] = base_dir 26 | database = _config.get("database") 27 | if database: 28 | if database.get("type") == "sqlite": 29 | database_uri = f'sqlite:///{base_dir}/{database.get("path")}' 30 | elif database.get("type") == "mysql": 31 | database_uri = f'mysql+pymysql://{database.get("user")}:{database.get("password")}@{database.get("host")}:{database.get("port")}/{database.get("database")}?'
32 | else: 33 | database_uri = '' 34 | config['SQLALCHEMY_DATABASE_URI'] = database_uri 35 | app = create_app(config) 36 | 37 | if __name__ == '__main__': 38 | app.run(host="0.0.0.0", port=5556, debug=True) -------------------------------------------------------------------------------- /magic_doc/model/sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/__init__.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig, AutoModel, AutoModelForTokenClassification, \ 2 | AutoModelForQuestionAnswering, AutoModelForSequenceClassification, AutoTokenizer 3 | from transformers.convert_slow_tokenizer import SLOW_TO_FAST_CONVERTERS, RobertaConverter 4 | 5 | from .configuration_layoutlmv3 import LayoutLMv3Config 6 | from .modeling_layoutlmv3 import ( 7 | LayoutLMv3ForTokenClassification, 8 | LayoutLMv3ForQuestionAnswering, 9 | LayoutLMv3ForSequenceClassification, 10 | LayoutLMv3Model, 11 | ) 12 | from .tokenization_layoutlmv3 import LayoutLMv3Tokenizer 13 | from .tokenization_layoutlmv3_fast import LayoutLMv3TokenizerFast 14 | 15 | 16 | #AutoConfig.register("layoutlmv3", LayoutLMv3Config) 17 | #AutoModel.register(LayoutLMv3Config, LayoutLMv3Model) 18 | #AutoModelForTokenClassification.register(LayoutLMv3Config, LayoutLMv3ForTokenClassification) 19 | #AutoModelForQuestionAnswering.register(LayoutLMv3Config, LayoutLMv3ForQuestionAnswering) 20 | #AutoModelForSequenceClassification.register(LayoutLMv3Config, LayoutLMv3ForSequenceClassification) 21 | #AutoTokenizer.register( 22 | # LayoutLMv3Config, slow_tokenizer_class=LayoutLMv3Tokenizer, fast_tokenizer_class=LayoutLMv3TokenizerFast 23 | #) 24 | SLOW_TO_FAST_CONVERTERS.update({"LayoutLMv3Tokenizer": RobertaConverter}) 25 | -------------------------------------------------------------------------------- /magic_doc/model/sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Tokenization classes for LayoutLMv3, refer to RoBERTa.""" 16 | 17 | from transformers.models.roberta import RobertaTokenizer 18 | from transformers.utils import logging 19 | 20 | 21 | logger = logging.get_logger(__name__) 22 | 23 | VOCAB_FILES_NAMES = { 24 | "vocab_file": "vocab.json", 25 | "merges_file": "merges.txt", 26 | } 27 | 28 | class LayoutLMv3Tokenizer(RobertaTokenizer): 29 | vocab_files_names = VOCAB_FILES_NAMES 30 | # pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 31 | # max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 32 | model_input_names = ["input_ids", "attention_mask"] 33 | -------------------------------------------------------------------------------- /test/test_docconv.py: -------------------------------------------------------------------------------- 1 | 2 | from magic_doc.docconv import ConvException, S3Config, DocConverter 3 | 4 | def test_conv_localfile(): 5 | convert = DocConverter(None) 6 | many_docs = ["/path/docs/mypdf.pdf", "/path/docs/mydoc.docx", "/path/docs/mydoc.doc", "/path/docs/mydoc.pptx", "/path/docs/mydoc.ppt"] 7 | for i, doc in enumerate(many_docs): 8 | try: 9 | markdown = convert.convert(doc, f"/path/progress/progress-{i}.txt") 10 | # do something with markdown 11 | except ConvException as e: 12 | assert False, f"Failed to convert {doc}, Reason: {e.message}" 13 | except Exception as e: 14 | assert False, f"Failed to convert {doc}: {e}" 15 | 16 | 17 | def test_conv_s3file(): 18 | s3cfg = S3Config("ak", "sk", "endpoint") 19 | convert = DocConverter(s3cfg) 20 | many_docs = ["s3://bucket/mypdf.pdf", "s3://bucket/mydoc.docx", "s3://bucket/mydoc.doc", "s3://bucket/mydoc.pptx", "s3://bucket/mydoc.ppt"] 21 | for i, doc in enumerate(many_docs): 22 | try: 23 | markdown = convert.convert(doc, f"/path/progress/progress-{i}.txt") 24 | # do something with markdown 25 | except ConvException as e: 26 | assert False, f"Failed to convert {doc}, Reason: {e.message}" 27 | except Exception as e: 28 | assert False, f"Failed to convert {doc}: {e}" 29 | -------------------------------------------------------------------------------- /magic_doc/contrib/office/ppt_extract.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | 4 | from pathlib import Path 5 | 6 | 7 | from werkzeug.datastructures import FileStorage 8 | 9 | from magic_doc.contrib.office import OfficeExtractor 10 | from magic_doc.contrib.model import ExtractResponse 11 | 12 | 13 | class PptExtractor(OfficeExtractor): 14 | def __init__(self) -> None: 15 | super().__init__() 16 | 17 | def setup(self): 18 | pass 19 | 20 | def extract( 21 | self, 22 | r: FileStorage | Path, 23 | id: str, 24 | dir: Path, 25 | media_dir: Path, 26 | skip_image: bool, 27 | ) -> ExtractResponse: 28 | 29 | if type(r) is FileStorage: 30 | data = r.stream.read() 31 | elif issubclass(type(r), Path): 32 | with open(r, "rb") as data_file: 33 | data = data_file.read() 34 | 35 | files = {"file": data} 36 | response = requests.post(f"{self.config.tika}/api/v1/parse", files=files) 37 | self.upload_background(id, {}) 38 | return response.json()["pages"] 39 | 40 | 41 | if __name__ == "__main__": 42 | e = PptExtractor() 43 | print( 44 | json.dumps( 45 | e.run( 46 | "def", 47 | Path( 48 | "/home/SENSETIME/wuziming/diclm/doc2docx/doc/【中繁-课件】物理学简介.ppt", 49 | ), 50 | ), 51 | ensure_ascii=False, 52 | indent=4, 53 | ) 54 | ) 55 | e.wait_all() 56 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: 
-------------------------------------------------------------------------------- 1 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 2 | 3 | name: doc-ci 4 | on: 5 | push: 6 | branches: 7 | - "main" 8 | paths-ignore: 9 | - "cmds/**" 10 | - "**.md" 11 | pull_request: 12 | branches: 13 | - "main" 14 | paths-ignore: 15 | - "cmds/**" 16 | - "**.md" 17 | workflow_dispatch: 18 | jobs: 19 | cli-test: 20 | runs-on: doc 21 | timeout-minutes: 40 22 | strategy: 23 | fail-fast: true 24 | 25 | steps: 26 | - name: config-net 27 | run: | 28 | source activate magicdoc 29 | - name: doc cli 30 | uses: actions/checkout@v3 31 | with: 32 | fetch-depth: 2 33 | 34 | - name: check-requirements 35 | run: | 36 | changed_files=$(git diff --name-only -r HEAD~1 HEAD) 37 | echo $changed_files 38 | if [[ $changed_files =~ "requirements.txt" ]]; then 39 | pip install -r requirements.txt 40 | fi 41 | 42 | - name: config-net-reset 43 | run: | 44 | export http_proxy="" 45 | export https_proxy="" 46 | - name: test_cli 47 | run: | 48 | echo $GITHUB_WORKSPACE 49 | cd $GITHUB_WORKSPACE && export PYTHONPATH=. && pytest -s -v tests/test_unit.py 50 | cd $GITHUB_WORKSPACE && pytest -s -v test/test_cli/test_cli.py 51 | 52 | -------------------------------------------------------------------------------- /magic_doc/contrib/magic_html/extractors/article_extractor.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from magic_doc.contrib.magic_html.utils import * 4 | from magic_doc.contrib.magic_html.extractors.base_extractor import BaseExtractor 5 | from magic_doc.contrib.magic_html.extractors.title_extractor import TitleExtractor 6 | 7 | 8 | class ArticleExtractor(BaseExtractor): 9 | def __init__(self) -> None: 10 | super().__init__() 11 | 12 | def extract(self, html="", base_url="") -> dict: 13 | html = html.replace("&#160;", " ").replace("&nbsp;", " ") 14 | tree = load_html(html) 15 | if tree is None: 16 | raise ValueError 17 | 18 | title = TitleExtractor().process(tree) 19 | 20 | # base_url 21 | base_href = tree.xpath("//base/@href") 22 | 23 | if base_href and "http" in base_href[0]: 24 | base_url = base_href[0] 25 | 26 | # Convert tags, with extra handling for math markup 27 | format_tree = self.convert_tags(tree, base_url=base_url) 28 | 29 | # Remove script/style and similar tags along with their content 30 | normal_tree = self.clean_tags(format_tree) 31 | 32 | subtree, xp_num, drop_list = self.xp_1_5(normal_tree) 33 | if xp_num == "others": 34 | subtree, drop_list = self.prune_unwanted_sections(normal_tree) 35 | body_html = self.get_content_html(subtree, xp_num, base_url) 36 | 37 | return { 38 | "xp_num": xp_num, 39 | "drop_list": drop_list, 40 | "html": body_html, 41 | "title": title, 42 | "base_url": base_url, 43 | } 44 | -------------------------------------------------------------------------------- /magic_doc/model/sub_modules/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Fast Tokenization classes for LayoutLMv3, refer to RoBERTa.""" 16 | 17 | 18 | from transformers.models.roberta.tokenization_roberta_fast import RobertaTokenizerFast 19 | from transformers.utils import logging 20 | 21 | from .tokenization_layoutlmv3 import LayoutLMv3Tokenizer 22 | 23 | 24 | logger = logging.get_logger(__name__) 25 | 26 | VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"} 27 | 28 | 29 | class LayoutLMv3TokenizerFast(RobertaTokenizerFast): 30 | vocab_files_names = VOCAB_FILES_NAMES 31 | # pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP 32 | # max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES 33 | model_input_names = ["input_ids", "attention_mask"] 34 | slow_tokenizer_class = LayoutLMv3Tokenizer 35 | -------------------------------------------------------------------------------- /magic_doc/restful_api/api/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | from pathlib import Path 4 | from loguru import logger 5 | from .extentions import app, db, migrate, jwt, ma 6 | from magic_doc.restful_api.common.web_hook import before_request 7 | 8 | root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 9 | 10 | 11 | def init_app_log(config): 12 | """ 13 | Setup logging 14 | :param config: config file 15 | :return: 16 | """ 17 | log_path = os.path.join(Path(__file__).parent.parent, "log") 18 | if not Path(log_path).exists(): 19 | Path(log_path).mkdir(parents=True, exist_ok=True) 20 | log_level = config.get("LOG_LEVEL") 21 | log_name = f'log_{datetime.now().strftime("%Y-%m-%d")}.log' 22 | log_file_path = os.path.join(log_path, log_name) 23 | logger.add(str(log_file_path), rotation='00:00', encoding='utf-8', level=log_level, enqueue=True) 24 | return logger 25 | 26 | 27 | def _register_db(flask_app): 28 | db.init_app(flask_app) 29 | with app.app_context(): 30 | db.create_all() 31 | 32 | 33 | def create_app(config): 34 | """ 35 | Create and configure an instance of the Flask application 36 | :param config: 37 | :return: 38 | """ 39 | app.static_folder = os.path.join(root_dir, "static") 40 | if config is None: 41 | config = {} 42 | app.config.update(config) 43 | init_app_log(config) 44 | # _register_db(app) 45 | migrate.init_app(app=app, db=db) 46 | jwt.init_app(app=app) 47 | ma.init_app(app=app) 48 | from .analysis import analysis_blue 49 | app.register_blueprint(analysis_blue) 50 | 51 | app.before_request(before_request) 52 | 53 | return app 54 | -------------------------------------------------------------------------------- /magic_doc/model/parallel_paddle.py: -------------------------------------------------------------------------------- 1 | from magic_pdf.model.pp_structure_v2 import CustomPaddleModel 2 | from magic_doc.utils import split_to_chunks 3 | import paddle 4 | from concurrent.futures import ThreadPoolExecutor, as_completed 5 | import math 6 | 7 | class ParallelPaddle: 8 | def __init__(self, model_load_on_each_gpu_count=1): 9 | models = 
[] 10 | for _ in range(model_load_on_each_gpu_count): 11 | models.append(CustomPaddleModel(ocr=True, show_log=False)) 12 | self.models = models 13 | 14 | def __call__(self, params): 15 | """ 16 | params: list[(idx, image)] 17 | """ 18 | if len(params) == 0: 19 | return [] 20 | chunks = list(split_to_chunks(params, max(math.ceil(len(params) * 1.0 / len(self.models)), 1))) 21 | return self._run_ocr_concurrently(chunks) 22 | 23 | 24 | def _run_ocr_concurrently(self, chunks): 25 | results = [] 26 | def run_ocr(chunk, i): 27 | result = [] 28 | for idx, img in chunk: 29 | ocr_res = self.models[i](img) 30 | if ocr_res: 31 | result.append((idx, ocr_res)) 32 | return result 33 | 34 | with ThreadPoolExecutor(max_workers=len(chunks)) as executor: 35 | future_to_ocr = {executor.submit(run_ocr, chunk, i): i for i, chunk in enumerate(chunks)} 36 | for future in as_completed(future_to_ocr): 37 | try: 38 | data = future.result() 39 | results.extend(data) 40 | except Exception as exc: 41 | print("failed to process ocr, reason:", exc) 42 | return sorted(results, key=lambda x: x[0]) 43 | 44 | -------------------------------------------------------------------------------- /tools/scoring.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | from rapidfuzz import fuzz 4 | import re 5 | import regex 6 | from statistics import mean 7 | 8 | CHUNK_MIN_CHARS = 25 9 | 10 | def chunk_text(text, chunk_len=500): 11 | chunks = [text[i:i+chunk_len] for i in range(0, len(text), chunk_len)] 12 | chunks = [c for c in chunks if c.strip() and len(c) > CHUNK_MIN_CHARS] 13 | return chunks 14 | 15 | 16 | def overlap_score(hypothesis_chunks, reference_chunks): 17 | if len(reference_chunks) > 0: 18 | length_modifier = len(hypothesis_chunks) / len(reference_chunks) 19 | else: 20 | length_modifier = 0 21 | search_distance = max(len(reference_chunks) // 5, 10) 22 | chunk_scores = [] 23 | for i, hyp_chunk in enumerate(hypothesis_chunks): 24 | max_score = 0 25 | total_len = 0 26 | i_offset = int(i * length_modifier) 27 | chunk_range = range(max(0, i_offset-search_distance), min(len(reference_chunks), i_offset+search_distance)) 28 | for j in chunk_range: 29 | ref_chunk = reference_chunks[j] 30 | score = fuzz.ratio(hyp_chunk, ref_chunk, score_cutoff=30) / 100 31 | if score > max_score: 32 | max_score = score 33 | total_len = len(ref_chunk) 34 | chunk_scores.append(max_score) 35 | return chunk_scores 36 | 37 | 38 | def score_text(hypothesis, reference): 39 | # Returns a 0-1 alignment score 40 | hypothesis_chunks = chunk_text(hypothesis) 41 | reference_chunks = chunk_text(reference) 42 | chunk_scores = overlap_score(hypothesis_chunks, reference_chunks) 43 | if len(chunk_scores) > 0: 44 | mean_score = mean(chunk_scores) 45 | return mean_score 46 | else: 47 | return 0 48 | #return mean(chunk_scores)
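49 | 
50 | # Minimal usage sketch (not part of the original module): score_text returns a
51 | # 0-1 fuzzy alignment score between an extracted text and a reference text.
52 | if __name__ == "__main__":
53 |     ref = "The quick brown fox jumps over the lazy dog and runs away. " * 3
54 |     hyp = "The quick brown fox jumped over the lazy dog and ran away. " * 3
55 |     print(score_text(hyp, ref))  # close to 1.0 for near-identical texts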
-------------------------------------------------------------------------------- /magic_doc/bin/linux/share/antiword/fontnames.russian: -------------------------------------------------------------------------------- 1 | # Default fontnames translation table 2 | # for Cyrillic 3 | # 4 | # by: Dmitry Chernyak 5 | # 6 | # MS-Word fontname, Italic, Bold, PostScript fontname, Special 7 | Arial, 0, 0, ArialCyrMT, 0 8 | Arial, 0, 1, ArialCyrMT-Bold, 0 9 | Arial, 1, 0, ArialCyrMT-Italic, 0 10 | Arial, 1, 1, ArialCyrMT-BoldItalic, 0 11 | Courier, 0, 0, CourierCyrPS, 0 12 | Courier, 0, 1, CourierCyrPS-Bold, 0 13 | Courier, 1, 0, CourierCyrPS-Inclined, 0 14 | Courier, 1, 1, CourierCyrPS-BoldInclined, 0 15 | Courier New, 0, 0, CourierCyrPS, 0 16 | Courier New, 0, 1, CourierCyrPS-Bold, 0 17 | Courier New, 1, 0, CourierCyrPS-Inclined, 0 18 | Courier New, 1, 1, CourierCyrPS-BoldInclined, 0 19 | Fixedsys, 0, 0, CourierCyrPS, 0 20 | Fixedsys, 0, 1, CourierCyrPS-Bold, 0 21 | Fixedsys, 1, 0, CourierCyrPS-Inclined, 0 22 | Fixedsys, 1, 1, CourierCyrPS-BoldInclined, 0 23 | Helvetica, 0, 0, ArialCyrMT, 0 24 | Helvetica, 0, 1, ArialCyrMT-Bold, 0 25 | Helvetica, 1, 0, ArialCyrMT-Italic, 0 26 | Helvetica, 1, 1, ArialCyrMT-BoldItalic, 0 27 | Lucida Console, 0, 0, CourierCyrPS, 0 28 | Lucida Console, 0, 1, CourierCyrPS-Bold, 0 29 | Lucida Console, 1, 0, CourierCyrPS-Inclined, 0 30 | Lucida Console, 1, 1, CourierCyrPS-BoldInclined, 0 31 | Swiss, 0, 0, Helvetica, 0 32 | Swiss, 0, 1, Helvetica-Bold, 0 33 | Swiss, 1, 0, Helvetica-Oblique, 0 34 | Swiss, 1, 1, Helvetica-BoldOblique, 0 35 | Univers, 0, 0, Helvetica, 0 36 | Univers, 0, 1, Helvetica-Bold, 0 37 | Univers, 1, 0, Helvetica-Oblique, 0 38 | Univers, 1, 1, Helvetica-BoldOblique, 0 39 | # All the other fonts 40 | *, 0, 0, TimesNRCyrMT, 0 41 | *, 0, 1, TimesNRCyrMT-Bold, 0 42 | *, 1, 0, TimesNRCyrMT-Inclined, 0 43 | *, 1, 1, TimesNRCyrMT-BoldInclined, 0 44 | -------------------------------------------------------------------------------- /magic_doc/contrib/magic_html/mmltex/mmltex.xsl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/magicpdf/Magic-Doc/HEAD/magic_doc/contrib/magic_html/mmltex/mmltex.xsl -------------------------------------------------------------------------------- /magic_doc/contrib/office/formula/mml/xsl/mmltex.xsl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/magicpdf/Magic-Doc/HEAD/magic_doc/contrib/office/formula/mml/xsl/mmltex.xsl -------------------------------------------------------------------------------- /magic_doc/model/parallel_layout.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | 4 | from magic_doc.model.sub_modules.layoutlmv3.model_init import Layoutlmv3_Predictor 5 | from magic_doc.utils import split_to_chunks 6 | from concurrent.futures import ThreadPoolExecutor, as_completed 7 | import math 8 | 9 | class ParallelLayout: 10 | def __init__(self, config, model_load_on_each_gpu_count=1): 11 | models = [] 12 | for i in range(torch.cuda.device_count()): 13 | torch.cuda.set_device(i) 14 | for _ in range(model_load_on_each_gpu_count): 15 | models.append(Layoutlmv3_Predictor(config)) 16 | self.models = models 17 | 18 | def __call__(self, params): 19 | """ 20 | params: list[(idx, image)] 21 | """ 22 | if len(params) == 0: 23 | return [] 24 | chunks = list(split_to_chunks(params, max(math.ceil(len(params) * 1.0 / len(self.models)), 1))) 25 | return self._run_layout_concurrently(chunks) 26 | 27 | 28 | def _run_layout_concurrently(self, chunks): 29 | results = [] 30 | 31 | def run_layout(chunk, i): 32 | result = [] 33 | for idx, image in chunk: 34 | layout_res = self.models[i](image, ignore_catids=[]) 35 | result.append((idx, layout_res)) 36 | return result 37 | 38 | with ThreadPoolExecutor(max_workers=len(chunks)) as executor: 39 | future_to_ocr = {executor.submit(run_layout, chunk, i): i for i, chunk in enumerate(chunks)} 40 | for
future in as_completed(future_to_ocr): 41 | try: 42 | data = future.result() 43 | results.extend(data) 44 | except Exception as exc: 45 | print(f"failed to process layout, reason: {exc}") 46 | return sorted(results, key=lambda x: x[0]) 47 | 48 | -------------------------------------------------------------------------------- /magic_doc/model/parallel_ocr.py: -------------------------------------------------------------------------------- 1 | 2 | from magic_doc.model.sub_modules.self_modify import ModifiedPaddleOCR 3 | from magic_doc.utils import split_to_chunks 4 | import paddle 5 | from concurrent.futures import ThreadPoolExecutor, as_completed 6 | import math 7 | 8 | class ParallelOCR: 9 | def __init__(self, model_load_on_each_gpu_count=1): 10 | models = [] 11 | for i in range(paddle.device.cuda.device_count()): 12 | for _ in range(model_load_on_each_gpu_count): 13 | models.append(ModifiedPaddleOCR(use_gpu=True, show_log=False, gpu_id=i)) 14 | self.models = models 15 | 16 | def __call__(self, params): 17 | """ 18 | params: list[(idx, image, *args)] 19 | """ 20 | if len(params) == 0: 21 | return [] 22 | chunks = list(split_to_chunks(params, max(math.ceil(len(params) * 1.0 / len(self.models)), 1))) 23 | return self._run_ocr_concurrently(chunks) 24 | 25 | 26 | def _run_ocr_concurrently(self, chunks): 27 | results = [] 28 | 29 | def run_ocr(chunk, i): 30 | result = [] 31 | for idx, cropped_image, single_page_mfdetrec_res in chunk: 32 | ocr_res = self.models[i].ocr(cropped_image, mfd_res=single_page_mfdetrec_res)[0] 33 | if ocr_res: 34 | result.append((idx, ocr_res)) 35 | return result 36 | 37 | with ThreadPoolExecutor(max_workers=len(chunks)) as executor: 38 | future_to_ocr = {executor.submit(run_ocr, chunk, i): i for i, chunk in enumerate(chunks)} 39 | for future in as_completed(future_to_ocr): 40 | try: 41 | data = future.result() 42 | results.extend(data) 43 | except Exception as exc: 44 | print(f"failed to process ocr, reason: {exc}") 45 | return results -------------------------------------------------------------------------------- /magic_doc/contrib/magic_html/extractors/title_extractor.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | from magic_doc.contrib.magic_html.utils import * 4 | from magic_doc.contrib.magic_html.config import * 5 | 6 | 7 | class TitleExtractor: 8 | def extract_by_meta(self, element: HtmlElement): 9 | for xpath in METAS: 10 | title = element.xpath(xpath) 11 | if title: 12 | return "".join(title) 13 | 14 | def extract_by_title(self, element: HtmlElement): 15 | return "".join(element.xpath("//title//text()")).strip() 16 | 17 | def extract_by_hs(self, element: HtmlElement): 18 | hs = element.xpath("//h1//text()|//h2//text()|//h3//text()") 19 | return hs or [] 20 | 21 | def extract_by_h(self, element: HtmlElement): 22 | for xpath in ["//h1", "//h2", "//h3"]: 23 | children = element.xpath(xpath) 24 | if not children: 25 | continue 26 | child = children[0] 27 | texts = child.xpath("./text()") 28 | if texts and len(texts): 29 | return texts[0].strip() 30 | 31 | def process(self, element: HtmlElement): 32 | title_extracted_by_meta = self.extract_by_meta(element) 33 | if title_extracted_by_meta: 34 | return title_extracted_by_meta 35 | title_extracted_by_h = self.extract_by_h(element) 36 | title_extracted_by_hs = self.extract_by_hs(element) 37 | title_extracted_by_title = self.extract_by_title(element) 38 | title_extracted_by_hs = sorted( 39 | title_extracted_by_hs, 40 | key=lambda x: similarity2(x, 
title_extracted_by_title), 41 | reverse=True, 42 | ) 43 | if title_extracted_by_hs: 44 | return lcs_of_2(title_extracted_by_hs[0], title_extracted_by_title) 45 | 46 | if title_extracted_by_title: 47 | return title_extracted_by_title 48 | 49 | return title_extracted_by_h 50 | -------------------------------------------------------------------------------- /magic_doc/contrib/magic_html/extractors/custom_extractor.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import re 3 | 4 | from magic_doc.contrib.magic_html.utils import * 5 | from magic_doc.contrib.magic_html.extractors.base_extractor import BaseExtractor 6 | from magic_doc.contrib.magic_html.extractors.title_extractor import TitleExtractor 7 | 8 | 9 | class CustomExtractor(BaseExtractor): 10 | def __init__(self) -> None: 11 | super().__init__() 12 | 13 | def use_clean_rule(self, tree, clean_rules): 14 | for clean_rule in clean_rules: 15 | for x in tree.xpath(clean_rule): 16 | self.remove_node(x) 17 | return tree 18 | 19 | def use_extract_rule(self, tree, extract_rule): 20 | if "/text()" in extract_rule["value"]: 21 | return "".join(tree.xpath(extract_rule["value"])).strip() 22 | return tree.xpath(extract_rule["value"])[0] 23 | 24 | def extract(self, html="", base_url="", rule={}) -> dict: 25 | tree = load_html(html) 26 | if tree is None: 27 | raise ValueError 28 | 29 | # base_url 30 | base_href = tree.xpath("//base/@href") 31 | 32 | if base_href and "http" in base_href[0]: 33 | base_url = base_href[0] 34 | 35 | if "clean" in rule: 36 | tree = self.use_clean_rule(tree, rule["clean"]) 37 | 38 | # extract the title 39 | if "title" not in rule: 40 | title = TitleExtractor().process(tree) 41 | else: 42 | title = self.use_extract_rule(tree, rule["title"]) 43 | 44 | # article content area 45 | try: 46 | body_tree = self.use_extract_rule(tree, rule["content"]) 47 | except Exception: 48 | raise ValueError 49 | body_html = tostring(body_tree, encoding=str) 50 | 51 | return { 52 | "xp_num": "custom", 53 | "drop_list": False, 54 | "html": body_html, 55 | "title": title, 56 | "base_url": base_url 57 | } 58 | -------------------------------------------------------------------------------- /magic_doc/conv/pptx_python_pptx.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | from pathlib import Path 3 | 4 | from loguru import logger 5 | 6 | from magic_doc.contrib.model import Page 7 | from magic_doc.contrib.office.pptx_extract import PptxExtractor 8 | from magic_doc.conv.base import BaseConv 9 | from magic_doc.progress.filepupdator import FileBaseProgressUpdator 10 | from magic_doc.progress.pupdator import ConvProgressUpdator 11 | 12 | 13 | class Pptx(BaseConv): 14 | def __init__(self): 15 | super().__init__() 16 | 17 | def to_md(self, bits: bytes, pupdator: ConvProgressUpdator) -> str: 18 | page_list = self.pptx_to_pagelist(bits, pupdator) 19 | md_content_list = [] 20 | total = len(page_list) 21 | for index, page in enumerate(page_list): 22 | progress = 50 + int(index / total * 50) 23 | # logger.info(f"progress: {progress}") 24 | page_content_list = page['content_list'] 25 | for content in page_content_list: 26 | pupdator.update(progress) 27 | if content['type'] == 'image': 28 | pass 29 | elif content['type'] == "text": 30 | data = content['data'] 31 | md_content_list.append(data) 32 | return "\n".join(md_content_list) 33 | 34 | def pptx_to_pagelist(self, bits, pupdator: ConvProgressUpdator) -> list[Page]: 35 | with tempfile.TemporaryDirectory() as temp_path: 36 | 
temp_dir = Path(temp_path) 37 | media_dir = temp_dir / "media" 38 | media_dir.mkdir() 39 | file_path = temp_dir / "tmp.pptx" 40 | file_path.write_bytes(bits) 41 | pptx_extractor = PptxExtractor() 42 | pages = pptx_extractor.extract(file_path, "tmp", temp_dir, media_dir, True) 43 | pupdator.update(50) 44 | return pages 45 | 46 | 47 | if __name__ == '__main__': 48 | pupdator = FileBaseProgressUpdator("/tmp/p.txt") 49 | pptx = Pptx() 50 | logger.info( 51 | pptx.to_md(open(r"D:\project\20240514magic_doc\doc_ppt\doc\【英文-模板】Professional Pack Standard.pptx", "rb").read(), pupdator)) 52 | -------------------------------------------------------------------------------- /magic_doc/conv/docx_xml_parse.py: -------------------------------------------------------------------------------- 1 | import io 2 | import tempfile 3 | import zipfile 4 | import xml.etree.ElementTree as ET 5 | from pathlib import Path 6 | 7 | from loguru import logger 8 | 9 | from magic_doc.contrib.model import Content, Page 10 | from magic_doc.contrib.office.docx_extract import DocxExtractor 11 | from magic_doc.conv.base import BaseConv 12 | from magic_doc.progress.filepupdator import FileBaseProgressUpdator 13 | from magic_doc.progress.pupdator import ConvProgressUpdator 14 | 15 | 16 | class Docx(BaseConv): 17 | def __init__(self): 18 | super().__init__() 19 | 20 | def to_md(self, bits: bytes, pupdator: ConvProgressUpdator) -> str: 21 | page_list = self.docx_to_pagelist(bits, pupdator) 22 | md_content_list = [] 23 | for page in page_list: 24 | page_content_list = page['content_list'] 25 | total = len(page_content_list) 26 | for index, content in enumerate(page_content_list): 27 | progress = 50 + int(index / total * 50) 28 | pupdator.update(progress) 29 | if content['type'] == 'image': 30 | pass 31 | elif content['type'] in ["text", "md"]: 32 | data = content['data'] 33 | md_content_list.append(data) 34 | return "\n".join(md_content_list) 35 | 36 | def docx_to_pagelist(self, bits, pupdator: ConvProgressUpdator) -> list[Page]: 37 | with tempfile.TemporaryDirectory() as temp_path: 38 | temp_dir = Path(temp_path) 39 | media_dir = temp_dir / "media" 40 | media_dir.mkdir() 41 | file_path = temp_dir / "tmp.docx" 42 | file_path.write_bytes(bits) 43 | docx_extractor = DocxExtractor() 44 | pages = docx_extractor.extract(file_path, "tmp", temp_dir, media_dir, True) 45 | pupdator.update(50) 46 | return pages 47 | 48 | 49 | if __name__ == '__main__': 50 | pupdator = FileBaseProgressUpdator("/tmp/p.txt") 51 | docx = Docx() 52 | logger.info(docx.to_md(open(r"D:\project\20240514magic_doc\doc_ppt\doc\demo\文本+表+图.docx", "rb").read(), pupdator)) 53 | -------------------------------------------------------------------------------- /magic_doc/conv/doc_antiword.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | from pathlib import Path 4 | 5 | from loguru import logger 6 | 7 | from magic_doc.contrib.model import Page 8 | from magic_doc.contrib.office.doc import DocExtractor 9 | from magic_doc.conv.base import BaseConv 10 | from magic_doc.progress.filepupdator import FileBaseProgressUpdator 11 | from magic_doc.progress.pupdator import ConvProgressUpdator 12 | 13 | 14 | class Doc(BaseConv): 15 | 16 | def __init__(self): 17 | super().__init__() 18 | 19 | def to_md(self, bits: bytes, pupdator:ConvProgressUpdator) -> str: 20 | page_list = self.doc_to_pagelist(bits, pupdator) 21 | md_content_list = [] 22 | for page in page_list: 23 | page_content_list = page['content_list'] 24 | total = 
len(page_content_list) 25 | for index, content in enumerate(page_content_list): 26 | progress = 50 + int(index / total * 50) 27 | pupdator.update(progress) 28 | if content['type'] == 'image': 29 | pass 30 | elif content['type'] == "text": 31 | data = content['data'] 32 | md_content_list.append(data) 33 | return "\n".join(md_content_list) 34 | 35 | def doc_to_pagelist(self, bits, pupdator: ConvProgressUpdator) -> list[Page]: 36 | with tempfile.TemporaryDirectory() as temp_path: 37 | temp_dir = Path(temp_path) 38 | media_dir = temp_dir / "media" 39 | media_dir.mkdir() 40 | file_path = temp_dir / "tmp.doc" 41 | file_path.write_bytes(bits) 42 | doc_extractor = DocExtractor() 43 | cwd_path = os.path.dirname(os.path.abspath(__file__)) / Path("../bin/linux") 44 | bin_path = cwd_path / "antiword" 45 | os.chmod(bin_path, 0o755) 46 | page_list = doc_extractor.extract(file_path, "tmp", temp_dir, media_dir, True, cwd_path=cwd_path) 47 | pupdator.update(50) 48 | return page_list 49 | 50 | 51 | if __name__ == '__main__': 52 | pupdator = FileBaseProgressUpdator("/tmp/p.txt") 53 | doc = Doc() 54 | logger.info(doc.to_md(Path("/home/myhloli/文本+表+图1.doc").read_bytes(), pupdator)) 55 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | from pathlib import Path 3 | from magic_doc.libs.version import __version__ 4 | 5 | 6 | def parse_requirements(filename): 7 | with open(filename) as f: 8 | lines = f.read().splitlines() 9 | 10 | requires = [] 11 | 12 | for line in lines: 13 | if "http" in line: 14 | pkg_name_without_url = line.split("@")[0].strip() 15 | requires.append(pkg_name_without_url) 16 | else: 17 | requires.append(line) 18 | 19 | return requires 20 | 21 | 22 | if __name__ == "__main__": 23 | with Path(Path(__file__).parent, 24 | 'README.md').open(encoding='utf-8') as file: 25 | long_description = file.read() 26 | setup( 27 | name="fairy_doc", # project name 28 | version=__version__, # version is derived automatically from the git tag 29 | packages=find_packages() + ["magic_doc.bin", "magic_doc.resources", "magic_doc/contrib/magic_html/mmltex"], # include all packages 30 | package_data={ 31 | "magic_doc.bin": ["**"], # include every file under magic_doc/bin 32 | "magic_doc.resources": ["**"], # include every file under magic_doc/resources 33 | "magic_doc.contrib.office.formula": ["**"], # include every file under magic_doc/contrib/office/formula 34 | "magic_doc/contrib/magic_html/mmltex": ["**"], 35 | }, 36 | license='Apache 2.0', 37 | extras_require={ 38 | "gpu": ["paddlepaddle-gpu==2.6.1", "paddleocr==2.7.3", "magic-pdf[gpu]>=0.5.10"], 39 | "cpu": ["paddlepaddle==2.5.2", "paddleocr==2.7.3", "magic-pdf[cpu]>=0.5.10"], 40 | }, 41 | description='A lightweight toolbox to manipulate documents', 42 | long_description=long_description, 43 | long_description_content_type='text/markdown', 44 | install_requires=parse_requirements("requirements.txt"), # third-party dependencies 45 | url="https://github.com/InternLM/magic-doc", 46 | python_requires=">=3.10", # required Python version 47 | entry_points={ 48 | "console_scripts": [ 49 | "magic-doc=magic_doc.cli:cli_conv", 50 | "pdf2md=magic_doc.cli:pdf_cli" 51 | ], 52 | }, 53 | include_package_data=True, 54 | zip_safe=False, # do not install as a zipped egg; generally kept False 55 | ) 56 | -------------------------------------------------------------------------------- /README_zh-CN.md: -------------------------------------------------------------------------------- 1 |
2 |
3 | 4 | [![license](https://img.shields.io/github/license/magicpdf/Magic-Doc.svg)](https://github.com/magicpdf/Magic-Doc/tree/main/LICENSE) 5 | [![issue resolution](https://img.shields.io/github/issues-closed-raw/magicpdf/Magic-Doc)](https://github.com/magicpdf/Magic-Doc/issues) 6 | [![open issues](https://img.shields.io/github/issues-raw/magicpdf/Magic-Doc)](https://github.com/magicpdf/Magic-Doc/issues) 7 | 8 | [English](README.md) | [简体中文](README_zh-CN.md) 9 | 10 |
11 | 12 |
13 | 14 |
15 | 16 | 17 | ### 安装 18 | 前置依赖:python 3.10+ 19 | 20 | 安装依赖 21 | 22 | **linux/osx** 23 | 24 | ```bash 25 | apt-get/yum/brew install libreoffice 26 | ``` 27 | 28 | **windows** 29 | ```text 30 | 安装 libreoffice 31 | 将 "install_dir\LibreOffice\program" 添加到环境变量 PATH 32 | ``` 33 | 34 | 35 | 安装 Magic-Doc 36 | 37 | 38 | ```bash 39 | pip install fairy-doc[cpu] # 安装 cpu 版本 40 | 或 41 | pip install fairy-doc[gpu] # 安装 gpu 版本 42 | ``` 43 | 44 | 45 | ## 简介 46 | 47 | Magic-Doc 是一个轻量级、开源的用于将多种格式的文档(PPT/PPTX/DOC/DOCX/PDF)转化为 markdown 格式的工具。支持转换本地文档或者位于 AWS S3 上的文件。 48 | 49 | 50 | ## 使用示例 51 | 52 | ```python 53 | # for local file 54 | from magic_doc.docconv import DocConverter, S3Config 55 | converter = DocConverter(s3_config=None) 56 | markdown_content, time_cost = converter.convert("some_doc.pptx", conv_timeout=300) 57 | ``` 58 | 59 | ```python 60 | # for remote file located in aws s3 61 | from magic_doc.docconv import DocConverter, S3Config 62 | 63 | s3_config = S3Config(ak='${ak}', sk='${sk}', endpoint='${endpoint}') 64 | converter = DocConverter(s3_config=s3_config) 65 | markdown_content, time_cost = converter.convert("s3://some_bucket/some_doc.pptx", conv_timeout=300) 66 | ``` 67 | 68 | 69 | ## 性能 70 | 环境:AMD EPYC 7742 64-Core Processor, NVIDIA A100, Centos 7 71 | 72 | | 文件类型 | 转化速度 | 73 | | ------------------ | -------- | 74 | | PDF (digital) | 347 (page/s) | 75 | | PDF (ocr) | 2.7 (page/s) | 76 | | PPT | 20 (page/s) | 77 | | PPTX | 149 (page/s) | 78 | | DOC | 600 (page/s) | 79 | | DOCX | 1482 (page/s) | 80 | 81 | 82 | 83 | ## 开源许可证 84 | 85 | 该项目采用[Apache 2.0 开源许可证](LICENSE)。 86 | 87 |

🔼 Back to top
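A minimal usage sketch (an addition, not part of the README): besides PPT/PPTX/DOC/DOCX/PDF, the repository also ships an HTML-to-markdown converter, `magic_doc/conv/conv_html.py`, which wraps the `magic_html` extractors. The `Html` class, its `to_md` signature, and `FileBaseProgressUpdator` are taken from this repository's own sources; the file paths and URL below are illustrative placeholders.

```python
# Minimal sketch of HTML-to-markdown conversion (internal API, not a documented
# entry point; Html.to_md's signature is taken from magic_doc/conv/conv_html.py,
# and the paths/URL below are placeholders).
from magic_doc.conv.conv_html import Html
from magic_doc.progress.filepupdator import FileBaseProgressUpdator

pupdator = FileBaseProgressUpdator("/tmp/p.txt")  # conversion progress is written to this file
with open("some_page.html", "r", encoding="utf-8") as f:
    html_text = f.read()
# html_type selects the extractor: "article", "forum", or "weixin"
md_content = Html().to_md(html_text, pupdator, base_url="https://example.com/", html_type="article")
```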

88 | -------------------------------------------------------------------------------- /magic_doc/conv/conv_html.py: -------------------------------------------------------------------------------- 1 | import json 2 | from magic_doc.conv.base import BaseConv 3 | from magic_doc.progress.filepupdator import FileBaseProgressUpdator 4 | from magic_doc.contrib.magic_html import GeneralExtractor 5 | from magic_doc.progress.pupdator import ConvProgressUpdator 6 | from loguru import logger 7 | 8 | extractor = GeneralExtractor() 9 | 10 | 11 | class Html(BaseConv): 12 | 13 | def __init__(self): 14 | super().__init__() 15 | 16 | @logger.catch 17 | def to_md(self, html: str, pupdator: ConvProgressUpdator, **kwargs) -> str: 18 | """ 19 | Extract the main content area from an HTML page 20 | :param html: the HTML text 21 | :param kwargs: optional arguments 22 | base_url: URL of the page 23 | html_type: page type (3 supported) 24 | 1. article: article pages 25 | 2. forum: forum pages 26 | 3. weixin: WeChat articles 27 | :return: { 28 | "base_url": "https://example.com/", 29 | "drop_list": false, 30 | "html": " [the remainder of conv_html.py was lost when HTML-like markup was stripped during extraction] -------------------------------------------------------------------------------- /magic_doc/contrib/office/formula/omml/__init__.py: -------------------------------------------------------------------------------- [the opening lines of this file (its imports and the start of the _template definition) were likewise lost to markup stripping] 36 | 37 | $omml_xml 38 | """.format( 39 | _xmlns_str 40 | ) 41 | ) 42 | 43 | 44 | transform = None 45 | 46 | _xslt_filename = os.path.join( 47 | os.path.dirname(os.path.abspath(__file__)), "OMML2MML.XSL" 48 | ) 49 | 50 | 51 | def omml2mml(omml_xml): 52 | xml_content = _template.safe_substitute(omml_xml=omml_xml) 53 | tree = ET.fromstring(xml_content) 54 | global transform 55 | if not transform: 56 | transform = ET.XSLT(ET.parse(_xslt_filename)) 57 | return str(transform(tree)) 58 | 59 | 60 | def omml2tex(omml_xml): 61 | mml_xml = omml2mml(omml_xml) 62 | return mml2tex(mml_xml) 63 | -------------------------------------------------------------------------------- /magic_doc/conv/pdf.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | 3 | from loguru import logger 4 | from werkzeug.datastructures import FileStorage 5 | 6 | from magic_doc.contrib.pdf.pdf_extractor import PDFExtractor 7 | from magic_doc.conv.base import BaseConv 8 | from magic_doc.progress.filepupdator import FileBaseProgressUpdator 9 | from magic_doc.progress.pupdator import ConvProgressUpdator 10 | from magic_doc.conv.base import ParseFailed 11 | from magic_doc.conv.pdf_pp_structurev2 import Pdf as liteOcr 12 | 13 | class Pdf(BaseConv): 14 | def __init__(self, allowed_failure=True): 15 | self.allowed_failure = allowed_failure 16 | 17 | def to_md(self, bits: bytes | str, pupdator: ConvProgressUpdator) -> str: 18 | pdf_extractor = PDFExtractor() 19 | buf = BytesIO(bits) # type: ignore 20 | content = pdf_extractor.run("stream io data", FileStorage(buf, "fake.pdf")) 21 | arr = [] 22 | pupdator.update(0) 23 | 24 | N = len(content) 25 | progress_h = {N * i // 100: 1 for i in range(10, 100, 10)} 26 | for idx, page in enumerate(content): 27 | if idx in progress_h: 28 | pupdator.update(idx * 100 // N) 29 | for record in page.get("content_list", []): 30 | arr.append(record.get("data", "")) 31 | 32 | text_all = "" 33 | for content in arr: 34 | text_all += content 35 | def calculate_not_printable_rate(text): 36 | printable = sum(1 for c in text if c.isprintable()) 37 | total = len(text) 38 | if total == 0: 39 | return 0 # avoid division by zero 40 | return (total - printable) / total 41 | not_printable_rate = calculate_not_printable_rate(text_all) 42 | if not_printable_rate > 0.02: 43 | if self.allowed_failure: 44 | raise ParseFailed 45 | else: 46 | liteOcrPdf = liteOcr() 47 | return liteOcrPdf.to_md(bits, pupdator) 48 | else: 49 | pupdator.update(100) 50 | return "\n\n".join(arr) 51 | 52 | 53 | if __name__ == "__main__": 54 | if 1: 55 | with open("/opt/data/pdf/20240423/pdf_test2/ol006018w.pdf", "rb") as f: 56 | bits_data = f.read() 57 | parser = Pdf() 58 | md_content = parser.to_md(bits_data, FileBaseProgressUpdator("debug/progress.txt")) 59 | 60 | with open("debug/pdf2md.md", "w") as f: 61 | f.write(md_content) 62 | -------------------------------------------------------------------------------- /test/test_cli/test_cli.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import os 3 | from lib import common 4 | import logging 5 | 6 | code_path = "magic_doc" # assumed code path (normally set in a config file) 7 | output_path = "magic_doc/datas_new" # output directory 8 | 9 | class TestDocConversion: 10 | 11 | def test_convert_doc_to_md(self): 12 | """ 13 | Convert a DOC file to Markdown 14 | """ 15 | file_path = os.path.join(code_path, "datas/test01.doc") 16 | cmd = f"python {code_path}/cli.py --file-path {file_path} --output {output_path}" 17 | logging.info(cmd) 18 | 
common.check_shell(cmd) 19 | # further checks could be added here to validate the conversion result 20 | 21 | def test_convert_docx_to_md(self): 22 | """ 23 | Convert a DOCX file to Markdown 24 | """ 25 | file_path = os.path.join(code_path, "datas/test02.docx") 26 | cmd = f"python {code_path}/cli.py --file-path {file_path} --output {output_path}" 27 | logging.info(cmd) 28 | common.check_shell(cmd) 29 | # further checks could be added here to validate the conversion result 30 | 31 | def test_convert_html_to_md(self): 32 | """ 33 | Convert an HTML file to Markdown 34 | """ 35 | file_path = os.path.join(code_path, "datas/test03.html") 36 | cmd = f"python {code_path}/cli.py --file-path {file_path} --output {output_path}" 37 | logging.info(cmd) 38 | common.check_shell(cmd) 39 | # further checks could be added here to validate the conversion result 40 | 41 | def test_convert_pdf_to_md(self): 42 | """ 43 | Convert a PDF file to Markdown 44 | """ 45 | file_path = os.path.join(code_path, "datas/test04.pdf") 46 | cmd = f"python {code_path}/cli.py --file-path {file_path} --output {output_path}" 47 | logging.info(cmd) 48 | common.check_shell(cmd) 49 | # further checks could be added here to validate the conversion result 50 | 51 | def test_convert_ppt_to_md(self): 52 | """ 53 | Convert a PPT file to Markdown 54 | """ 55 | file_path = os.path.join(code_path, "datas/test05.ppt") 56 | cmd = f"python {code_path}/cli.py --file-path {file_path} --output {output_path}" 57 | logging.info(cmd) 58 | common.check_shell(cmd) 59 | # further checks could be added here to validate the conversion result 60 | 61 | def test_convert_pptx_to_md(self): 62 | """ 63 | Convert a PPTX file to Markdown 64 | """ 65 | file_path = os.path.join(code_path, "datas/test06.pptx") 66 | cmd = f"python {code_path}/cli.py --file-path {file_path} --output {output_path}" 67 | logging.info(cmd) 68 | common.check_shell(cmd) 69 | # further checks could be added here to validate the conversion result 70 | 71 | if __name__ == "__main__": 72 | pytest.main(["-v", __file__]) -------------------------------------------------------------------------------- /magic_doc/contrib/test_data/url_service/run.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, Response, request 2 | import requests 3 | 4 | app = Flask(__name__) 5 | 6 | 7 | @app.route("/path/<path:subpath>") 8 | def handle_path(subpath): 9 | include_content_type = request.args.get("ct", "false").lower() == "true" 10 | include_content_disposition = request.args.get("cd", "false").lower() == "true" 11 | 12 | if subpath.endswith(".html"): 13 | content_type, disposition = "text/html", "inline" 14 | elif subpath.endswith(".pdf"): 15 | content_type, disposition = ( 16 | "application/pdf", 17 | 'attachment; filename="document.pdf"', 18 | ) 19 | elif subpath.endswith(".doc"): 20 | content_type, disposition = ( 21 | "application/msword", 22 | 'attachment; filename="document.doc"', 23 | ) 24 | elif subpath.endswith(".docx"): 25 | content_type, disposition = ( 26 | "application/vnd.openxmlformats-officedocument.wordprocessingml.document", 27 | 'attachment; filename="document.docx"', 28 | ) 29 | elif subpath.endswith(".ppt"): 30 | content_type, disposition = ( 31 | "application/vnd.ms-powerpoint", 32 | 'attachment; filename="presentation.ppt"', 33 | ) 34 | elif subpath.endswith(".pptx"): 35 | content_type, disposition = ( 36 | "application/vnd.openxmlformats-officedocument.presentationml.presentation", 37 | 'attachment; filename="presentation.pptx"', 38 | ) 39 | elif subpath.endswith(".jpg") or subpath.endswith(".jpeg"): 40 | content_type, disposition = "image/jpeg", "inline" 41 | elif subpath.endswith(".png"): 42 | content_type, disposition = "image/png", "inline" 43 | else: 44 | content_type = "text/plain" 45 | disposition = 'attachment; filename="default.txt"' 46 | 47 | response = Response(f"Requested 
{subpath}") 48 | if include_content_type: 49 | response.headers["Content-Type"] = content_type 50 | if include_content_disposition: 51 | response.headers["Content-Disposition"] = disposition 52 | 53 | return response 54 | 55 | 56 | if __name__ == "__main__": 57 | # app.run(debug=True, port=6500) 58 | res = requests.get( 59 | "https://filesamples.com/samples/document/doc/sample2.doc", 60 | timeout=10, 61 | stream=True, 62 | ) 63 | if res.status_code not in [200]: 64 | res.raise_for_status() 65 | print(res.headers.get("Content-Type")) 66 | -------------------------------------------------------------------------------- /magic_doc/contrib/magic_html/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | from urllib.parse import urlparse 4 | from magic_doc.contrib.magic_html.extractors.article_extractor import ArticleExtractor 5 | from magic_doc.contrib.magic_html.extractors.weixin_extractor import WeixinExtractor 6 | from magic_doc.contrib.magic_html.extractors.forum_extractor import ForumExtractor 7 | from magic_doc.contrib.magic_html.extractors.custom_extractor import CustomExtractor 8 | 9 | 10 | class GeneralExtractor: 11 | def __init__(self, config_path=""): 12 | if config_path: 13 | """ 14 | demo rule config file json: 15 | { 16 | "www.***.com": { 17 | "clean": ["//script", "//style"], 18 | "title": { 19 | "mode": "xpath", 20 | "value": "//div[@class='media-body']/h4/text()" 21 | }, 22 | "content": { 23 | "mode": "xpath", 24 | "value": "//div[@class='message break-all']" 25 | } 26 | } 27 | } 28 | """ 29 | try: 30 | with open(config_path, 'r', encoding='utf-8') as f: 31 | self.rule = json.loads(f.read()) 32 | except: 33 | pass 34 | else: 35 | self.rule = {} 36 | 37 | def extract(self, html="", **kwargs) -> dict: 38 | base_url = kwargs.get("base_url", "") 39 | html_type = kwargs.pop("html_type", None) 40 | if html_type: 41 | if html_type == "forum": 42 | return ForumExtractor().extract(html=html, **kwargs) 43 | elif html_type == "weixin": 44 | return WeixinExtractor().extract(html=html, **kwargs) 45 | if base_url: 46 | netloc = urlparse(base_url).netloc 47 | if netloc in self.rule: 48 | try: 49 | new_kwargs = dict() 50 | new_kwargs["rule"] = self.rule[netloc] 51 | new_kwargs.update(kwargs) 52 | return CustomExtractor().extract(html=html, **new_kwargs) 53 | except: 54 | # 当自定义规则不能覆盖站点所有板块时,使用 55 | return ArticleExtractor().extract(html=html, **kwargs) 56 | if netloc == "mp.weixin.qq.com": 57 | return WeixinExtractor().extract(html=html, **kwargs) 58 | return ArticleExtractor().extract(html=html, **kwargs) 59 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 |
3 |
4 | 5 | [![license](https://img.shields.io/github/license/magicpdf/Magic-Doc.svg)](https://github.com/magicpdf/Magic-Doc/tree/main/LICENSE) 6 | [![issue resolution](https://img.shields.io/github/issues-closed-raw/magicpdf/Magic-Doc)](https://github.com/magicpdf/Magic-Doc/issues) 7 | [![open issues](https://img.shields.io/github/issues-raw/magicpdf/Magic-Doc)](https://github.com/magicpdf/Magic-Doc/issues) 8 | 9 | [English](README.md) | [简体中文](README_zh-CN.md) 10 | 11 | 
12 | 13 |
14 | 15 |
16 | 17 | ### Install 18 | 19 | Prerequisites: python 3.10+ 20 | 21 | Install Dependencies 22 | 23 | **linux/osx** 24 | 25 | ```bash 26 | apt-get/yum/brew install libreoffice 27 | ``` 28 | 29 | **windows** 30 | ```text 31 | install libreoffice 32 | append "install_dir\LibreOffice\program" to the PATH environment variable 33 | ``` 34 | 35 | 36 | Install Magic-Doc 37 | 38 | 39 | ```bash 40 | pip install fairy-doc[cpu] # cpu version 41 | or 42 | pip install fairy-doc[gpu] # gpu version 43 | ``` 44 | 45 | 46 | 47 | ## Introduction 48 | 49 | Magic-Doc is a lightweight, open-source tool that converts documents in multiple formats (PPT/PPTX/DOC/DOCX/PDF) to markdown. It supports both local files and files on S3. 50 | 51 | 52 | ## Example 53 | 54 | ```python 55 | # for local file 56 | from magic_doc.docconv import DocConverter, S3Config 57 | converter = DocConverter(s3_config=None) 58 | markdown_content, time_cost = converter.convert("some_doc.pptx", conv_timeout=300) 59 | ``` 60 | 61 | ```python 62 | # for remote file located in aws s3 63 | from magic_doc.docconv import DocConverter, S3Config 64 | 65 | s3_config = S3Config(ak='${ak}', sk='${sk}', endpoint='${endpoint}') 66 | converter = DocConverter(s3_config=s3_config) 67 | markdown_content, time_cost = converter.convert("s3://some_bucket/some_doc.pptx", conv_timeout=300) 68 | ``` 69 | 70 | ## Performance 71 | 72 | ENV: AMD EPYC 7742 64-Core Processor, NVIDIA A100, Centos 7 73 | 74 | | File Type | Speed | 75 | | ------------------ | -------- | 76 | | PDF (digital) | 347 (page/s) | 77 | | PDF (ocr) | 2.7 (page/s) | 78 | | PPT | 20 (page/s) | 79 | | PPTX | 149 (page/s) | 80 | | DOC | 600 (page/s) | 81 | | DOCX | 1482 (page/s) | 82 | 83 | 84 | ### All Thanks To Our Contributors: 85 | 86 | ![image](https://github.com/magicpdf/Magic-Doc/blob/main/assets/contributor.png) 87 | 88 | ## License 89 | 90 | This project is released under the [Apache 2.0 license](LICENSE). 91 | 92 | 

🔼 Back to top
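A minimal sketch (an addition, not part of the README): besides the `DocConverter` wrapper shown above, the per-format converter classes under `magic_doc/conv` can be driven directly, mirroring the `__main__` blocks scattered through this repository. The `Pdf` class, its `allowed_failure` flag, and `FileBaseProgressUpdator` come from `magic_doc/conv/pdf.py` and `magic_doc/progress/filepupdator.py`; the file paths are illustrative placeholders.

```python
# Minimal sketch of driving the internal PDF converter directly
# (signatures taken from magic_doc/conv/pdf.py; not the documented public API).
from pathlib import Path

from magic_doc.conv.pdf import Pdf
from magic_doc.progress.filepupdator import FileBaseProgressUpdator

pupdator = FileBaseProgressUpdator("/tmp/progress.txt")  # progress percentage is written to this file
pdf = Pdf(allowed_failure=False)  # fall back to the lite OCR pipeline on mostly non-printable text instead of raising ParseFailed
md_content = pdf.to_md(Path("some_doc.pdf").read_bytes(), pupdator)
Path("some_doc.md").write_text(md_content, encoding="utf-8")
```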

93 | -------------------------------------------------------------------------------- /magic_doc/contrib/pdf/pdf_extractor.py: -------------------------------------------------------------------------------- 1 | import random 2 | import fitz 3 | 4 | from magic_doc.contrib.model import ( 5 | ExtractResponse, 6 | Extractor, 7 | Page, 8 | Content, 9 | ) 10 | from magic_doc.contrib.wrapper_exceptions import NotSupportOcrPDFException 11 | 12 | from werkzeug.datastructures import FileStorage 13 | from loguru import logger 14 | 15 | 16 | class PDFExtractor(Extractor): 17 | def __init__(self) -> None: 18 | super().__init__() 19 | 20 | def setup(self): 21 | pass 22 | 23 | def is_digital(self, doc, check_page=10, text_len_thrs=100): 24 | sample_page_num = min(check_page, doc.page_count) 25 | page_ids = random.sample(range(doc.page_count), sample_page_num) 26 | page_text_len = [ 27 | len(doc[pno].get_text("text")) > text_len_thrs for pno in page_ids 28 | ] 29 | if any(page_text_len): 30 | return True 31 | return False 32 | 33 | # Guess kimi implementation 34 | def get_text_with_pymupdf(self, doc): 35 | pages = [] 36 | page_no = 0 37 | for page in doc: 38 | content_list = [] 39 | for block in page.get_text("blocks"): 40 | x0, y0, x1, y1, block_text, block_no, block_type = block 41 | lf_count = 0 42 | for ch in block_text: 43 | if ch == "\n": 44 | lf_count += 1 45 | block_text = ( 46 | block_text.replace("-\n", "") 47 | .replace("´\n", "´") 48 | .replace(" \n", " ") 49 | ) 50 | if lf_count >= 2: 51 | block_text = block_text.replace("\n", " ").strip() 52 | if len(block_text.strip()) == 0: 53 | continue 54 | content_list.append( 55 | Content( 56 | type="text", 57 | data=block_text, 58 | ) 59 | ) 60 | pages.append(Page(page_no=page_no, content_list=content_list)) 61 | page_no += 1 62 | return pages 63 | 64 | def run( 65 | self, file_parse_id: str, r: FileStorage, skip_image: bool = True 66 | ) -> ExtractResponse: 67 | file_content = r.stream.read() 68 | with fitz.open(stream=file_content) as doc: 69 | if self.is_digital(doc): 70 | logger.info(f"{file_parse_id} is digital pdf") 71 | return self.get_text_with_pymupdf(doc) 72 | raise NotSupportOcrPDFException 73 | 74 | 75 | if __name__ == "__main__": 76 | pdf_extractor = PDFExtractor() 77 | with open("magic_doc/contrib/test_data/pdf/test.pdf", "rb") as f: 78 | logger.info(pdf_extractor.run("test", FileStorage(f, filename="STL.pdf"))) 79 | -------------------------------------------------------------------------------- /magic_doc/conv/doc_libreoffice.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | from pathlib import Path 4 | from subprocess import Popen 5 | 6 | from loguru import logger 7 | 8 | from magic_doc.contrib.model import Page 9 | from magic_doc.contrib.office.docx_extract import DocxExtractor 10 | from magic_doc.conv.base import BaseConv 11 | from magic_doc.progress.filepupdator import FileBaseProgressUpdator 12 | from magic_doc.progress.pupdator import ConvProgressUpdator 13 | 14 | 15 | class Doc(BaseConv): 16 | 17 | def __init__(self): 18 | super().__init__() 19 | 20 | def to_md(self, bits: bytes, pupdator: ConvProgressUpdator) -> str: 21 | page_list = self.doc_to_pagelist(bits, pupdator) 22 | md_content_list = [] 23 | for page in page_list: 24 | page_content_list = page['content_list'] 25 | total = len(page_content_list) 26 | for index, content in enumerate(page_content_list): 27 | progress = 50 + int(index / total * 50) 28 | # logger.info(f"progress: {progress}") 29 | 
pupdator.update(progress) 30 | if content['type'] == 'image': 31 | pass 32 | elif content['type'] in ["text", "md"]: 33 | data = content['data'] 34 | md_content_list.append(data) 35 | return "\n".join(md_content_list) 36 | 37 | def doc_to_docx(self, doc_path: str, dir_path: str) -> str: 38 | cmd = f'soffice --headless --convert-to docx "{doc_path}" --outdir "{dir_path}"' 39 | logger.info(cmd) 40 | process = Popen(cmd, shell=True) 41 | process.wait() 42 | fname = str(Path(doc_path).stem) 43 | docx_path = os.path.join(os.path.dirname(doc_path), f'{fname}.docx') 44 | if not os.path.exists(docx_path): 45 | # logger.error(f"> !!! File conversion failed {doc_path} ==> {docx_path}") 46 | raise Exception(f"> !!! File conversion failed {doc_path} ==> {docx_path}") 47 | else: 48 | return docx_path 49 | 50 | def doc_to_pagelist(self, bits, pupdator: ConvProgressUpdator) -> list[Page]: 51 | with tempfile.TemporaryDirectory() as temp_path: 52 | temp_dir = Path(temp_path) 53 | media_dir = temp_dir / "media" 54 | media_dir.mkdir() 55 | file_path = temp_dir / "tmp.doc" 56 | file_path.write_bytes(bits) 57 | docx_file_path = self.doc_to_docx(str(file_path), str(temp_path)) 58 | pupdator.update(50) 59 | docx_extractor = DocxExtractor() 60 | pages = docx_extractor.extract(Path(docx_file_path), "tmp", temp_dir, media_dir, True) 61 | pupdator.update(80) 62 | return pages 63 | 64 | 65 | if __name__ == '__main__': 66 | pupdator = FileBaseProgressUpdator("/tmp/p.txt") 67 | doc = Doc() 68 | logger.info(doc.to_md(Path(r"D:\project\20240514magic_doc\doc_ppt\doc\demo\文本+表+图1.doc").read_bytes(), pupdator)) 69 | -------------------------------------------------------------------------------- /magic_doc/conv/ppt_libreoffice.py: -------------------------------------------------------------------------------- 1 | import os 2 | from subprocess import Popen 3 | import tempfile 4 | from pathlib import Path 5 | 6 | from loguru import logger 7 | 8 | from magic_doc.contrib.model import Page 9 | from magic_doc.contrib.office.pptx_extract import PptxExtractor 10 | from magic_doc.conv.base import BaseConv 11 | from magic_doc.progress.filepupdator import FileBaseProgressUpdator 12 | from magic_doc.progress.pupdator import ConvProgressUpdator 13 | 14 | 15 | class Ppt(BaseConv): 16 | def __init__(self): 17 | super().__init__() 18 | 19 | def to_md(self, bits: bytes, pupdator: ConvProgressUpdator) -> str: 20 | page_list = self.ppt_to_pagelist(bits, pupdator) 21 | md_content_list = [] 22 | total = len(page_list) 23 | for index, page in enumerate(page_list): 24 | progress = 80 + int(index / total * 20) 25 | # logger.info(f"progress: {progress}") 26 | page_content_list = page['content_list'] 27 | for content in page_content_list: 28 | pupdator.update(progress) 29 | if content['type'] == 'image': 30 | pass 31 | elif content['type'] == "text": 32 | data = content['data'] 33 | md_content_list.append(data) 34 | return "\n".join(md_content_list) 35 | 36 | def ppt_to_pptx(self, ppt_path: str, dir_path: str) -> str: 37 | cmd = f'soffice --headless --convert-to pptx "{ppt_path}" --outdir "{dir_path}"' 38 | logger.info(cmd) 39 | process = Popen(cmd, shell=True) 40 | process.wait() 41 | fname = str(Path(ppt_path).stem) 42 | pptx_path = os.path.join(os.path.dirname(ppt_path), f'{fname}.pptx') 43 | if not os.path.exists(pptx_path): 44 | # logger.error(f"> !!! File conversion failed {ppt_path} ==> {pptx_path}") 45 | raise Exception(f"> !!! 
File conversion failed {ppt_path} ==> {pptx_path}") 46 | else: 47 | return pptx_path 48 | 49 | def ppt_to_pagelist(self, bits, pupdator: ConvProgressUpdator) -> list[Page]: 50 | with tempfile.TemporaryDirectory() as temp_path: 51 | temp_dir = Path(temp_path) 52 | media_dir = temp_dir / "media" 53 | media_dir.mkdir() 54 | file_path = temp_dir / "tmp.ppt" 55 | file_path.write_bytes(bits) 56 | pptx_file_path = self.ppt_to_pptx(str(file_path), str(temp_path)) 57 | pupdator.update(50) 58 | pptx_extractor = PptxExtractor() 59 | pages = pptx_extractor.extract(Path(pptx_file_path), "tmp", temp_dir, media_dir, True) 60 | pupdator.update(80) 61 | return pages 62 | 63 | 64 | if __name__ == '__main__': 65 | pupdator = FileBaseProgressUpdator("/tmp/p.txt") 66 | ppt = Ppt() 67 | logger.info( 68 | ppt.to_md( 69 | open(r"D:\project\20240514magic_doc\doc_ppt\doc\【英文-课件】MIT15_082JF10_lec10.3MB.ppt", "rb").read(), pupdator)) 70 | -------------------------------------------------------------------------------- /magic_doc/contrib/office/__init__.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | 3 | from abc import ABC, abstractmethod 4 | 5 | from concurrent.futures import Future 6 | from concurrent.futures import ThreadPoolExecutor 7 | from datetime import datetime 8 | from pathlib import Path 9 | from typing import Tuple 10 | 11 | from loguru import logger 12 | from werkzeug.datastructures import FileStorage 13 | 14 | # from pedia_document_parser.config import Config 15 | from magic_doc.contrib.model import ExtractResponse, Extractor 16 | # from pedia_document_parser.s3.client import S3Client 17 | 18 | 19 | class OfficeExtractor(Extractor, ABC): 20 | def __init__(self) -> None: 21 | super().__init__() 22 | # self.config = Config() 23 | self.tpe = ThreadPoolExecutor(max_workers=30) 24 | self.counter = {} 25 | self.tmp_dir = Path("/tmp") 26 | self.max_text_count = 50_0000 27 | 28 | # def generate_img_path(self, id: str, image_name: str) -> str: 29 | # return f"s3://{self.config.s3_bucket}/{datetime.today().strftime('%Y-%m-%d')}/{id}/{image_name}" 30 | # 31 | # def upload(self, id: str, s3_path: str, path: Path) -> Tuple[str, str]: 32 | # cli = S3Client(self.config.s3_ak, self.config.s3_sk, self.config.s3_ep) 33 | # cli.upload_file(s3_path, path.absolute().as_posix()) 34 | # return (id, s3_path) 35 | 36 | # def upload_background(self, id: str, img_map: dict[Path, str]): 37 | # if len(img_map) == 0: 38 | # self.clean_up(id) 39 | # return 40 | # 41 | # self.counter[id] = len(img_map) 42 | # for src, dest in img_map.items(): 43 | # fut = self.tpe.submit(self.upload, id, dest, src) 44 | # fut.add_done_callback(self.on_upload_succ) 45 | 46 | def clean_up(self, id: str): 47 | dir = self.get_dir_by_id(id).absolute().as_posix() 48 | shutil.rmtree(dir) 49 | self.counter.pop(id, 0) 50 | logger.debug(f"del {dir}") 51 | 52 | def on_upload_succ(self, fut: Future[Tuple[str, str]]) -> None: 53 | id, s3_path = fut.result() 54 | logger.debug(f"upload {s3_path} succ") 55 | 56 | self.counter[id] -= 1 57 | if self.counter[id] == 0: 58 | self.clean_up(id) 59 | 60 | def wait_all(self): 61 | self.tpe.shutdown(wait=True) 62 | 63 | def get_dir_by_id(self, id: str) -> Path: 64 | return self.tmp_dir.joinpath(id) 65 | 66 | def run(self, id: str, r: FileStorage, skip_image: bool = True) -> ExtractResponse: 67 | dir = self.get_dir_by_id(id) 68 | 69 | dir.mkdir() 70 | media_dir = dir.joinpath("media") 71 | media_dir.mkdir() 72 | 73 | try: 74 | return self.extract(r, id, dir, media_dir, 
skip_image) 75 | except Exception as e: 76 | self.clean_up(id) 77 | raise e 78 | 79 | @abstractmethod 80 | def extract( 81 | self, 82 | r: FileStorage | Path, 83 | id: str, 84 | dir: Path, 85 | media_dir: Path, 86 | skip_image: bool, 87 | ) -> ExtractResponse: 88 | pass 89 | -------------------------------------------------------------------------------- /magic_doc/bin/linux/share/antiword/Example: -------------------------------------------------------------------------------- 1 | # An example of a fontnames translation table 2 | # 3 | # MS-Word fontname, Italic, Bold, Acorn fontname, Special 4 | Arial, 0, 0, Homerton.Medium, 0 5 | Arial, 0, 1, Homerton.Bold, 0 6 | Arial, 1, 0, Homerton.Medium.Oblique,0 7 | Arial, 1, 1, Homerton.Bold.Oblique, 0 8 | Arial Black, 0, 0, Homerton.Medium, 0 9 | Arial Black, 0, 1, Homerton.Bold, 0 10 | Arial Black, 1, 0, Homerton.Medium.Oblique,0 11 | Arial Black, 1, 1, Homerton.Bold.Oblique, 0 12 | AvantGarde, 0, 0, Clare.Medium, 0 13 | AvantGarde, 0, 1, Clare.Demi, 0 14 | AvantGarde, 1, 0, Clare.Medium.Oblique, 0 15 | AvantGarde, 1, 1, Clare.Demi.Oblique, 0 16 | Bookman, 0, 0, Robinson.Light, 0 17 | Bookman, 0, 1, Robinson.Demi, 0 18 | Bookman, 1, 0, Robinson.Light.Italic, 0 19 | Bookman, 1, 1, Robinson.Demi.Italic, 0 20 | Bookman Old Style, 0, 0, Robinson.Light, 0 21 | Bookman Old Style, 0, 1, Robinson.Demi, 0 22 | Bookman Old Style, 1, 0, Robinson.Light.Italic, 0 23 | Bookman Old Style, 1, 1, Robinson.Demi.Italic, 0 24 | Courier, 0, 0, Corpus.Medium, 0 25 | Courier, 0, 1, Corpus.Bold, 0 26 | Courier, 1, 0, Corpus.Medium.Oblique, 0 27 | Courier, 1, 1, Corpus.Bold.Oblique, 0 28 | Courier New, 0, 0, Corpus.Medium, 0 29 | Courier New, 0, 1, Corpus.Bold, 0 30 | Courier New, 1, 0, Corpus.Medium.Oblique, 0 31 | Courier New, 1, 1, Corpus.Bold.Oblique, 0 32 | Fixedsys, 0, 0, Corpus.Medium, 0 33 | Fixedsys, 0, 1, Corpus.Bold, 0 34 | Fixedsys, 1, 0, Corpus.Medium.Oblique, 0 35 | Fixedsys, 1, 1, Corpus.Bold.Oblique, 0 36 | Helvetica, 0, 0, Homerton.Medium, 0 37 | Helvetica, 0, 1, Homerton.Bold, 0 38 | Helvetica, 1, 0, Homerton.Medium.Oblique,0 39 | Helvetica, 1, 1, Homerton.Bold.Oblique, 0 40 | Lucida Console, 0, 0, Corpus.Medium, 0 41 | Lucida Console, 0, 1, Corpus.Bold, 0 42 | Lucida Console, 1, 0, Corpus.Medium.Oblique, 0 43 | Lucida Console, 1, 1, Corpus.Bold.Oblique, 0 44 | Palatino, 0, 0, Pembroke.Medium, 0 45 | Palatino, 0, 1, Pembroke.Bold, 0 46 | Palatino, 1, 0, Pembroke.Medium.Italic, 0 47 | Palatino, 1, 1, Pembroke.Bold.Italic, 0 48 | Swiss, 0, 0, Homerton.Medium, 0 49 | Swiss, 0, 1, Homerton.Bold, 0 50 | Swiss, 1, 0, Homerton.Medium.Oblique,0 51 | Swiss, 1, 1, Homerton.Bold.Oblique, 0 52 | Symbol, 0, 0, Sidney, 1 53 | Symbol, 0, 1, Sidney, 1 54 | Symbol, 1, 0, Sidney, 1 55 | Symbol, 1, 1, Sidney, 1 56 | Times, 0, 0, Trinity.Medium, 0 57 | Times, 0, 1, Trinity.Bold, 0 58 | Times, 1, 0, Trinity.Medium.Italic, 0 59 | Times, 1, 1, Trinity.Bold.Italic, 0 60 | Times New Roman, 0, 0, Trinity.Medium, 0 61 | Times New Roman, 0, 1, Trinity.Bold, 0 62 | Times New Roman, 1, 0, Trinity.Medium.Italic, 0 63 | Times New Roman, 1, 1, Trinity.Bold.Italic, 0 64 | Times Roman, 0, 0, Trinity.Medium, 0 65 | Times Roman, 0, 1, Trinity.Bold, 0 66 | Times Roman, 1, 0, Trinity.Medium.Italic, 0 67 | Times Roman, 1, 1, Trinity.Bold.Italic, 0 68 | Univers, 0, 0, Homerton.Medium, 0 69 | Univers, 0, 1, Homerton.Bold, 0 70 | Univers, 1, 0, Homerton.Medium.Oblique,0 71 | Univers, 1, 1, Homerton.Bold.Oblique, 0 72 | ZapfDingbats, 0, 0, Selwyn, 2 73 | ZapfDingbats, 0, 1, Selwyn, 2 74 | 
ZapfDingbats, 1, 0, Selwyn, 2 75 | ZapfDingbats, 1, 1, Selwyn, 2 76 | # All the other fonts 77 | *, 0, 0, Trinity.Medium, 0 78 | *, 0, 1, Trinity.Bold, 0 79 | *, 1, 0, Trinity.Medium.Italic, 0 80 | *, 1, 1, Trinity.Bold.Italic, 0 81 | -------------------------------------------------------------------------------- /magic_doc/conv/pdf_pp_structurev2.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import os 3 | 4 | from magic_pdf.dict2md.ocr_mkcontent import union_make 5 | from magic_pdf.libs.MakeContentConfig import MakeMode, DropMode 6 | from magic_pdf.libs.json_compressor import JsonCompressor 7 | from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze 8 | from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter 9 | from magic_pdf.pipe.UNIPipe import UNIPipe 10 | 11 | from loguru import logger 12 | 13 | from magic_doc.conv.base import BaseConv 14 | from magic_doc.progress.filepupdator import FileBaseProgressUpdator 15 | from magic_doc.progress.pupdator import ConvProgressUpdator 16 | from magic_doc.utils.null_writer import NullWriter 17 | from magic_doc.common.default_config import DEFAULT_CONFIG 18 | 19 | NULL_IMG_DIR = "/tmp" 20 | 21 | class SingletonModelWrapper: 22 | 23 | def __new__(cls): 24 | if not hasattr(cls, "instance"): 25 | from magic_doc.model.doc_analysis_by_pp import PaddleDocAnalysis 26 | cls.instance = super(SingletonModelWrapper, cls).__new__(cls) 27 | cls.instance.model = PaddleDocAnalysis(model_load_on_each_gpu_count=int(DEFAULT_CONFIG["pdf"]["fast"]["liteocrmodelinstance"])) 28 | return cls.instance 29 | 30 | def __call__(self, bytes: bytes): 31 | from magic_pdf.model.doc_analyze_by_custom_model import load_images_from_pdf 32 | images = load_images_from_pdf(bytes, dpi=200) 33 | return self.model(images) # type: ignore 34 | 35 | 36 | class Pdf(BaseConv): 37 | def to_md(self, bits: bytes | str, pupdator: ConvProgressUpdator) -> str: 38 | model = SingletonModelWrapper() 39 | model_list = model(bits) 40 | pupdator.update(50) 41 | jso_useful_key = { 42 | "_pdf_type": "ocr", 43 | "model_list": model_list, 44 | } 45 | image_writer = NullWriter() 46 | pipe = UNIPipe(bits, jso_useful_key, image_writer, is_debug=True) # type: ignore 47 | # pipe.pipe_classify() # no need to re-classify for the default OCR pipe; skipping it saves time 48 | pipe.pipe_parse() 49 | pupdator.update(100) 50 | 51 | pdf_mid_data = JsonCompressor.decompress_json(pipe.get_compress_pdf_mid_data()) 52 | pdf_info_list = pdf_mid_data["pdf_info"] 53 | md_content = union_make(pdf_info_list, MakeMode.NLP_MD, DropMode.NONE, NULL_IMG_DIR) 54 | return md_content # type: ignore 55 | 56 | def to_mid_result(self, image_writer: AbsReaderWriter, bits: bytes | str, pupdator: ConvProgressUpdator) \ 57 | -> list[dict] | dict: 58 | model = SingletonModelWrapper() 59 | pupdator.update(0) 60 | model_list = model(bits) 61 | pupdator.update(50) 62 | jso_useful_key = { 63 | "_pdf_type": "ocr", 64 | "model_list": model_list, 65 | } 66 | 67 | pipe = UNIPipe(bits, jso_useful_key, image_writer, is_debug=True) # type: ignore 68 | # pipe.pipe_classify() 69 | pipe.pipe_parse() 70 | pupdator.update(100) 71 | 72 | pdf_mid_data = JsonCompressor.decompress_json(pipe.get_compress_pdf_mid_data()) 73 | pdf_info_list = pdf_mid_data["pdf_info"] 74 | return pdf_info_list 75 | 76 | 77 | if __name__ == "__main__": 78 | pupdator = FileBaseProgressUpdator("/tmp/p.txt") 79 | pdf = Pdf() 80 | logger.info( 81 | 
pdf.to_md(Path(r"D:\project\20231108code-clean\linshixuqiu\pdf_dev\新模型\j.sna.2004.11.030.pdf").read_bytes(), pupdator)) 82 | -------------------------------------------------------------------------------- /magic_doc/contrib/magic_html/mmltex/README: -------------------------------------------------------------------------------- 1 | README for the XSLT MathML Library 2 | 3 | XSLT MathML Library is a set of XSLT stylesheets to transform 4 | MathML 2.0 to LaTeX. 5 | 6 | For more information, see 7 | http://www.raleigh.ru/MathML/mmltex/index.php?lang=en 8 | 9 | Manifest 10 | -------- 11 | 12 | README this file 13 | mmltex.xsl 14 | tokens.xsl 15 | glayout.xsl 16 | scripts.xsl 17 | tables.xsl 18 | entities.xsl 19 | cmarkup.xsl 20 | 21 | Use 22 | --- 23 | 24 | There are two ways of using the library: 25 | 26 | * Use a local copy of the library. 27 | 28 | 1. Download the distribution (see below). 29 | 30 | 2. Unpack the distribution, using unzip. 31 | 32 | 3. In your stylesheet import or include either the main 33 | stylesheet, mmltex.xsl, or the stylesheet module you 34 | wish to use, such as tokens.xsl. This example assumes 35 | that the distribution has been extracted into the same 36 | directory as your own stylesheet: 37 | 38 | 39 | 40 | * Import or include either the main stylesheet, or the 41 | stylesheet module you wish to use, directly from the library 42 | website; http://www.raleigh.ru/MathML/mmltex/. For example: 43 | 44 | 45 | 46 | Obtaining The Library 47 | --------------------- 48 | 49 | The XSLT MathML Library is available for download as: 50 | 51 | * Zip file: http://www.raleigh.ru/MathML/mmltex/mmltex.zip 52 | 53 | Copyright 54 | --------- 55 | 56 | Copyright (C) 2001, 2002 Vasil Yaroshevich 57 | 58 | Permission is hereby granted, free of charge, to any person 59 | obtaining a copy of this software and associated documentation 60 | files (the ``Software''), to deal in the Software without 61 | restriction, including without limitation the rights to use, 62 | copy, modify, merge, publish, distribute, sublicense, and/or 63 | sell copies of the Software, and to permit persons to whom the 64 | Software is furnished to do so, subject to the following 65 | conditions: 66 | 67 | The above copyright notice and this permission notice shall be 68 | included in all copies or substantial portions of the Software. 69 | 70 | Except as contained in this notice, the names of individuals 71 | credited with contribution to this software shall not be used in 72 | advertising or otherwise to promote the sale, use or other 73 | dealings in this Software without prior written authorization 74 | from the individuals in question. 75 | 76 | Any stylesheet derived from this Software that is publically 77 | distributed will be identified with a different name and the 78 | version strings in any derived Software will be changed so that 79 | no possibility of confusion between the derived package and this 80 | Software will exist. 81 | 82 | Warranty 83 | -------- 84 | 85 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 86 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 87 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 88 | NONINFRINGEMENT. IN NO EVENT SHALL NORMAN WALSH OR ANY OTHER 89 | CONTRIBUTOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 90 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 91 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 92 | OTHER DEALINGS IN THE SOFTWARE. 
93 | 94 | Contacting the Author 95 | --------------------- 96 | 97 | These stylesheets are maintained by Vasil Yaroshevich, . 98 | -------------------------------------------------------------------------------- /magic_doc/contrib/office/formula/mml/xsl/README: -------------------------------------------------------------------------------- 1 | README for the XSLT MathML Library 2 | 3 | XSLT MathML Library is a set of XSLT stylesheets to transform 4 | MathML 2.0 to LaTeX. 5 | 6 | For more information, see 7 | http://www.raleigh.ru/MathML/mmltex/index.php?lang=en 8 | 9 | Manifest 10 | -------- 11 | 12 | README this file 13 | mmltex.xsl 14 | tokens.xsl 15 | glayout.xsl 16 | scripts.xsl 17 | tables.xsl 18 | entities.xsl 19 | cmarkup.xsl 20 | 21 | Use 22 | --- 23 | 24 | There are two ways of using the library: 25 | 26 | * Use a local copy of the library. 27 | 28 | 1. Download the distribution (see below). 29 | 30 | 2. Unpack the distribution, using unzip. 31 | 32 | 3. In your stylesheet import or include either the main 33 | stylesheet, mmltex.xsl, or the stylesheet module you 34 | wish to use, such as tokens.xsl. This example assumes 35 | that the distribution has been extracted into the same 36 | directory as your own stylesheet: 37 | 38 | 39 | 40 | * Import or include either the main stylesheet, or the 41 | stylesheet module you wish to use, directly from the library 42 | website; http://www.raleigh.ru/MathML/mmltex/. For example: 43 | 44 | 45 | 46 | Obtaining The Library 47 | --------------------- 48 | 49 | The XSLT MathML Library is available for download as: 50 | 51 | * Zip file: http://www.raleigh.ru/MathML/mmltex/mmltex.zip 52 | 53 | Copyright 54 | --------- 55 | 56 | Copyright (C) 2001, 2002 Vasil Yaroshevich 57 | 58 | Permission is hereby granted, free of charge, to any person 59 | obtaining a copy of this software and associated documentation 60 | files (the ``Software''), to deal in the Software without 61 | restriction, including without limitation the rights to use, 62 | copy, modify, merge, publish, distribute, sublicense, and/or 63 | sell copies of the Software, and to permit persons to whom the 64 | Software is furnished to do so, subject to the following 65 | conditions: 66 | 67 | The above copyright notice and this permission notice shall be 68 | included in all copies or substantial portions of the Software. 69 | 70 | Except as contained in this notice, the names of individuals 71 | credited with contribution to this software shall not be used in 72 | advertising or otherwise to promote the sale, use or other 73 | dealings in this Software without prior written authorization 74 | from the individuals in question. 75 | 76 | Any stylesheet derived from this Software that is publically 77 | distributed will be identified with a different name and the 78 | version strings in any derived Software will be changed so that 79 | no possibility of confusion between the derived package and this 80 | Software will exist. 81 | 82 | Warranty 83 | -------- 84 | 85 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 86 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 87 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 88 | NONINFRINGEMENT. IN NO EVENT SHALL NORMAN WALSH OR ANY OTHER 89 | CONTRIBUTOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 90 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 91 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 92 | OTHER DEALINGS IN THE SOFTWARE. 
93 | 94 | Contacting the Author 95 | --------------------- 96 | 97 | These stylesheets are maintained by Vasil Yaroshevich, . 98 | -------------------------------------------------------------------------------- /magic_doc/bin/linux/share/antiword/Default: -------------------------------------------------------------------------------- 1 | # Default fontnames translation table 2 | # uses only fonts present in the RISC OS 3 ROMs 3 | # 4 | # MS-Word fontname, Italic, Bold, Acorn fontname, Special 5 | Arial, 0, 0, Homerton.Medium, 0 6 | Arial, 0, 1, Homerton.Bold, 0 7 | Arial, 1, 0, Homerton.Medium.Oblique,0 8 | Arial, 1, 1, Homerton.Bold.Oblique, 0 9 | Arial Black, 0, 0, Homerton.Medium, 0 10 | Arial Black, 0, 1, Homerton.Bold, 0 11 | Arial Black, 1, 0, Homerton.Medium.Oblique,0 12 | Arial Black, 1, 1, Homerton.Bold.Oblique, 0 13 | Arial CE, 0, 0, Homerton.Medium, 0 14 | Arial CE, 0, 1, Homerton.Bold, 0 15 | Arial CE, 1, 0, Homerton.Medium.Oblique,0 16 | Arial CE, 1, 1, Homerton.Bold.Oblique, 0 17 | Arial Narrow, 0, 0, Homerton.Medium, 0 18 | Arial Narrow, 0, 1, Homerton.Bold, 0 19 | Arial Narrow, 1, 0, Homerton.Medium.Oblique,0 20 | Arial Narrow, 1, 1, Homerton.Bold.Oblique, 0 21 | Comic Sans MS, 0, 0, Homerton.Medium, 0 22 | Comic Sans MS, 0, 1, Homerton.Bold, 0 23 | Comic Sans MS, 1, 0, Homerton.Medium.Oblique,0 24 | Comic Sans MS, 1, 1, Homerton.Bold.Oblique, 0 25 | Courier, 0, 0, Corpus.Medium, 0 26 | Courier, 0, 1, Corpus.Bold, 0 27 | Courier, 1, 0, Corpus.Medium.Oblique, 0 28 | Courier, 1, 1, Corpus.Bold.Oblique, 0 29 | Courier New, 0, 0, Corpus.Medium, 0 30 | Courier New, 0, 1, Corpus.Bold, 0 31 | Courier New, 1, 0, Corpus.Medium.Oblique, 0 32 | Courier New, 1, 1, Corpus.Bold.Oblique, 0 33 | Fixedsys, 0, 0, Corpus.Medium, 0 34 | Fixedsys, 0, 1, Corpus.Bold, 0 35 | Fixedsys, 1, 0, Corpus.Medium.Oblique, 0 36 | Fixedsys, 1, 1, Corpus.Bold.Oblique, 0 37 | Helvetica, 0, 0, Homerton.Medium, 0 38 | Helvetica, 0, 1, Homerton.Bold, 0 39 | Helvetica, 1, 0, Homerton.Medium.Oblique,0 40 | Helvetica, 1, 1, Homerton.Bold.Oblique, 0 41 | Helvetica-Narrow, 0, 0, Homerton.Medium, 0 42 | Helvetica-Narrow, 0, 1, Homerton.Bold, 0 43 | Helvetica-Narrow, 1, 0, Homerton.Medium.Oblique,0 44 | Helvetica-Narrow, 1, 1, Homerton.Bold.Oblique, 0 45 | Lucida Console, 0, 0, Corpus.Medium, 0 46 | Lucida Console, 0, 1, Corpus.Bold, 0 47 | Lucida Console, 1, 0, Corpus.Medium.Oblique, 0 48 | Lucida Console, 1, 1, Corpus.Bold.Oblique, 0 49 | Monotype.com, 0, 0, Corpus.Medium, 0 50 | Monotype.com, 0, 1, Corpus.Bold, 0 51 | Monotype.com, 1, 0, Corpus.Medium.Oblique, 0 52 | Monotype.com, 1, 1, Corpus.Bold.Oblique, 0 53 | MS Sans Serif, 0, 0, Homerton.Medium, 0 54 | MS Sans Serif, 0, 1, Homerton.Bold, 0 55 | MS Sans Serif, 1, 0, Homerton.Medium.Oblique,0 56 | MS Sans Serif, 1, 1, Homerton.Bold.Oblique, 0 57 | Swiss, 0, 0, Homerton.Medium, 0 58 | Swiss, 0, 1, Homerton.Bold, 0 59 | Swiss, 1, 0, Homerton.Medium.Oblique,0 60 | Swiss, 1, 1, Homerton.Bold.Oblique, 0 61 | Tahoma, 0, 0, Homerton.Medium, 0 62 | Tahoma, 0, 1, Homerton.Bold, 0 63 | Tahoma, 1, 0, Homerton.Medium.Oblique,0 64 | Tahoma, 1, 1, Homerton.Bold.Oblique, 0 65 | Trebuchet MS, 0, 0, Homerton.Medium, 0 66 | Trebuchet MS, 0, 1, Homerton.Bold, 0 67 | Trebuchet MS, 1, 0, Homerton.Medium.Oblique,0 68 | Trebuchet MS, 1, 1, Homerton.Bold.Oblique, 0 69 | Verdana, 0, 0, Homerton.Medium, 0 70 | Verdana, 0, 1, Homerton.Bold, 0 71 | Verdana, 1, 0, Homerton.Medium.Oblique,0 72 | Verdana, 1, 1, Homerton.Bold.Oblique, 0 73 | Univers, 0, 0, Homerton.Medium, 0 74 | Univers, 0, 
1, Homerton.Bold, 0
75 | Univers, 1, 0, Homerton.Medium.Oblique,0
76 | Univers, 1, 1, Homerton.Bold.Oblique, 0
77 | # All the other fonts
78 | *, 0, 0, Trinity.Medium, 0
79 | *, 0, 1, Trinity.Bold, 0
80 | *, 1, 0, Trinity.Medium.Italic, 0
81 | *, 1, 1, Trinity.Bold.Italic, 0
82 | 
--------------------------------------------------------------------------------
/tools/benchmark.py:
--------------------------------------------------------------------------------
1 | import zipfile
2 | import os
3 | import shutil
4 | import json
5 | import markdown_calculate
6 | code_path = os.environ.get('GITHUB_WORKSPACE')
7 | #code_path = "/home/quyuan/actions-runner/_work/Magic-PDF/Magic-PDF.bk"
8 | # directory holding the evaluation data set
9 | pdf_dev_path = "/home/quyuan/data"
10 | # output directory of the magicpdf test run
11 | pdf_res_path = "/tmp/magic-pdf"
12 | file_types = ["academic_literature", "atlas", "courseware", "colorful_textbook", "historical_documents", "notes", "ordinary_books", "ordinary_exam_paper", "ordinary_textbook", "research_report", "special_exam_paper"]
13 | #file_types = ["academic_literature"]
14 | 
15 | def test_cli():
16 |     magicpdf_path = os.path.join(pdf_dev_path, "output")
17 |     rm_cmd = "rm -rf %s" % (pdf_res_path)
18 |     os.system(rm_cmd)
19 |     os.makedirs(pdf_res_path)
20 |     cmd = 'cd %s && export PYTHONPATH=. && find %s -type f -name "*.pdf" | xargs -I{} python magic_pdf/cli/magicpdf.py pdf-command --pdf {}' % (code_path, magicpdf_path)
21 |     os.system(cmd)
22 |     for root, dirs, files in os.walk(pdf_res_path):
23 |         for magic_file in files:
24 |             for file_type in file_types:
25 |                 target_dir = os.path.join(pdf_dev_path, "ci", file_type, "magicpdf")
26 |                 if magic_file.endswith(".md") and magic_file.startswith(file_type):
27 |                     source_file = os.path.join(root, magic_file)
28 |                     target_file = os.path.join(pdf_dev_path, "ci", file_type, "magicpdf", magic_file)
29 |                     if not os.path.exists(target_dir):
30 |                         os.makedirs(target_dir)
31 |                     shutil.copy(source_file, target_file)
32 | 
33 | def calculate_score():
34 |     data_path = os.path.join(pdf_dev_path, "ci")
35 |     cmd = "cd %s && export PYTHONPATH=. && python tools/clean_photo.py --tool_name annotations --download_dir %s" % (code_path, data_path)
36 |     os.system(cmd)
37 |     cmd = "cd %s && export PYTHONPATH=. && python tools/clean_photo.py --tool_name magicpdf --download_dir %s" % (code_path, data_path)
38 |     os.system(cmd)
39 |     score = markdown_calculate.Scoring(os.path.join(data_path, "result.json"))
40 |     score.calculate_similarity_total("magicpdf", file_types, data_path)
41 |     res = score.summary_scores()
42 |     return res
43 | 
44 | 
45 | def extract_zip(zip_file_path, extract_to_path):
46 |     if zipfile.is_zipfile(zip_file_path):
47 |         with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
48 |             zip_ref.extractall(extract_to_path)
49 |             print(f'Files extracted to {extract_to_path}')
50 |     else:
51 |         print(f'{zip_file_path} is not a zip file')
52 | 
53 | 
54 | def ci_ben():
55 |     fr = open(os.path.join(pdf_dev_path, "ci", "result.json"), "r")
56 |     lines = fr.readlines()
57 |     last_line = lines[-1].strip()
58 |     last_score = json.loads(last_line)
59 |     print("last_score:", last_score)
60 |     last_simscore = last_score["average_sim_score"]
61 |     last_editdistance = last_score["average_edit_distance"]
62 |     last_bleu = last_score["average_bleu_score"]
63 |     extract_zip(os.path.join(pdf_dev_path, 'output.zip'), os.path.join(pdf_dev_path))
64 |     test_cli()
65 |     now_score = calculate_score()
66 |     print("now_score:", now_score)
67 |     now_simscore = now_score["average_sim_score"]
68 |     now_editdistance = now_score["average_edit_distance"]
69 |     now_bleu = now_score["average_bleu_score"]
70 |     assert last_simscore <= now_simscore
71 |     assert last_editdistance <= now_editdistance
72 |     assert last_bleu <= now_bleu
73 | 
74 | 
75 | if __name__ == "__main__":
76 |     ci_ben()
77 | 
--------------------------------------------------------------------------------
/magic_doc/contrib/magic_html/extractors/weixin_extractor.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | 
3 | from magic_doc.contrib.magic_html.utils import *
4 | from magic_doc.contrib.magic_html.extractors.base_extractor import BaseExtractor
5 | from magic_doc.contrib.magic_html.extractors.title_extractor import TitleExtractor
6 | 
7 | 
8 | class WeixinExtractor(BaseExtractor):
9 |     def __init__(self) -> None:
10 |         super().__init__()
11 | 
12 |     def extract(self, html="", base_url="") -> dict:
13 |         html = html.replace(" ", " ")
14 |         tree = load_html(html)
15 |         if tree is None:
16 |             raise ValueError
17 | 
18 |         # extract the title
19 |         title = TitleExtractor().process(tree)
20 | 
21 |         # base_url
22 |         base_href = tree.xpath("//base/@href")
23 | 
24 |         if base_href and "http" in base_href[0]:
25 |             base_url = base_href[0]
26 | 
27 |         # locate the article body
28 |         try:
29 |             body_tree = tree.xpath('.//*[@id="img-content"]')[0]
30 |         except:
31 |             raise ValueError
32 | 
33 |         # strip script, style and comment nodes
34 |         for script in body_tree.xpath(".//script"):
35 |             self.remove_node(script)
36 |         for style in body_tree.xpath(".//style"):
37 |             self.remove_node(style)
38 |         for comment in body_tree.xpath(".//comment()"):
39 |             self.remove_node(comment)
40 | 
41 |         # drop all official-account introduction blocks
42 |         for mp in body_tree.xpath('.//div[@id="meta_content"]'):
43 |             self.remove_node(mp)
44 |         for mp in body_tree.xpath('.//div[@id="js_tags"]'):
45 |             self.remove_node(mp)
46 |         for mp in body_tree.xpath('.//div[@class="original_area_primary"]'):
47 |             self.remove_node(mp)
48 |         # hidden banned-account notice
49 |         for mp in body_tree.xpath('.//section[@class="wx_profile_card_inner"]'):
50 |             self.remove_node(mp)
51 |         # special WeChat profile-card blocks
52 |         for mp in body_tree.xpath(
53 |             ".//section[contains(@class, 'wx_profile_msg_inner')]"
54 |         ):
55 |             self.remove_node(mp)
56 | 
57 |         # strip cluttered, visually hidden content
58 |         all_raga = body_tree.xpath(
59 |             ".//*[contains(@style, 'color: rgba(255, 255, 255, 0)')] | .//*[contains(@style, 'color: rgba(255 255 255 0)')]"
60 |         )
61 | 
62 |         for mp in all_raga:
63 |             flag_have_color_rgb, detail_style = self.ensure_have_color_rgb(
64 |                 mp.attrib["style"]
65 |             )
66 | 
67 |             if not flag_have_color_rgb:
68 |                 continue
69 |             self.remove_node(mp)
70 | 
71 |         for img in body_tree.xpath(".//img"):
72 | 
73 |             if "data-src" not in img.attrib:
74 |                 continue
75 | 
76 |             try:
77 |                 img.set("src", img.attrib["data-src"])
78 |             except Exception as e:
79 |                 continue
80 | 
81 |         for h1 in body_tree.xpath(".//h1"):
82 |             if not h1.text:
83 |                 continue
84 |             h1.text = h1.text.replace("\n", "").strip()
85 | 
86 |         body_html = tostring(body_tree, encoding=str)
87 | 
88 |         return {
89 |             "xp_num": "weixin",
90 |             "drop_list": False,
91 |             "html": body_html,
92 |             "title": title,
93 |             "base_url": base_url
94 |         }
95 | 
96 |     @staticmethod
97 |     def ensure_have_color_rgb(htmlstr):
98 |         pattern = r"(?
--------------------------------------------------------------------------------
/magic_doc/contrib/office/doc.py:
--------------------------------------------------------------------------------
1 | import json
2 | import shutil
3 | 
4 | from pathlib import Path
5 | from subprocess import PIPE, Popen
6 | 
7 | from loguru import logger
8 | from werkzeug.datastructures import FileStorage
9 | 
10 | from magic_doc.contrib.model import Content, ExtractResponse, Page
11 | from magic_doc.contrib.office import OfficeExtractor
12 | 
13 | 
14 | class DocExtractor(OfficeExtractor):
15 |     def __init__(self) -> None:
16 |         super().__init__()
17 | 
18 |     def setup(self):
19 |         pass
20 | 
21 |     def extract(
22 |         self,
23 |         r: FileStorage | Path,
24 |         id: str,
25 |         dir: Path,
26 |         media_dir: Path,
27 |         skip_image: bool,
28 |         cwd_path="/opt/antiword"
29 |     ) -> ExtractResponse:
30 |         doc_path = dir.joinpath("its.doc")
31 | 
32 |         if type(r) is FileStorage:
33 |             r.save(doc_path)
34 |         else:
35 |             shutil.copyfile(r, doc_path)
36 | 
37 |         if skip_image:
38 |             cmd = f"./antiword -f -i 1 -o {dir.as_posix()} {doc_path.as_posix()}"
39 |         else:
40 |             cmd = f"./antiword -f -i 3 -o {dir.as_posix()} {doc_path.as_posix()}"
41 |         logger.info(f"cmd: {cmd}")
42 |         process = Popen(cmd, shell=True, cwd=Path(cwd_path), stdout=PIPE, stderr=PIPE)
43 |         stdout, stderr = process.communicate()
44 |         process.wait()
45 | 
46 |         shutil.rmtree(media_dir.absolute().as_posix())
47 |         shutil.move(
48 |             dir.joinpath("pic").absolute().as_posix(), media_dir.absolute().as_posix()
49 |         )
50 |         code = process.returncode
51 |         if code != 0:
52 |             err = stderr.decode()
53 |             raise Exception(f"parse doc failed: {err}")
54 | 
55 |         pure_text_path = dir.joinpath("text")
56 | 
57 |         with open(pure_text_path, "r") as f:
58 |             content = f.read()
59 | 
60 |         # build the image-name to upload-path map consumed below
61 |         img_map: dict[Path, str] = {}
62 |         for img in media_dir.glob("*"):
63 |             img_map[img] = self.generate_img_path(id, img.name)
64 | 
65 |         # self.upload_background(id, img_map)
66 | 
67 |         pages = [
68 |             Page(page_no=idx, content=x)
69 |             for idx, x in enumerate(content.split("[pedia-page]"))
70 |         ]
71 | 
72 |         for page in pages:
73 |             content: str = page.pop("content")
74 |             content_list = [
75 |                 Content(data=x.strip(), type="text")
76 |                 for x in content.split("\n")
77 |                 if x.strip()
78 |             ]
79 | 
80 |             for content in content_list:
81 |                 if not content["data"].startswith("[pedia-"):
82 |                     continue
83 |                 if content["data"] == "[pedia-badpic]":
84 |                     content["data"] = ""
85 |                     content["type"] = "image"
86 |                 elif content["data"].startswith("[pedia-pic"):
87 |                     content["type"] = "image"
88 |                     img_name = content["data"][len("[pedia-") : -1]
89 |                     img_path = media_dir.joinpath(img_name)
90 |                     content["data"] = img_map[img_path]
91 |                 else:
92 |                     content["data"] = content["data"] + "\n"
93 | 
94 |             page["content_list"] = content_list
95 | 
96 |         return pages
97 | 
98 | 
99 | if __name__ == "__main__":
100 |     e = DocExtractor()
101 |     print(
102 |         json.dumps(
103 |             e.run("abc", Path("/home/SENSETIME/wuziming/diclm/doc2docx/doc/md4.doc")),
104 |             ensure_ascii=False,
105 |             indent=4,
106 |         ),
107 |     )
108 |     e.wait_all()
109 | 
--------------------------------------------------------------------------------
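A minimal driving sketch for the extractor above, mirroring its __main__ block; it assumes antiword is unpacked at the default cwd_path and uses a placeholder input path:

    from pathlib import Path

    from magic_doc.contrib.office.doc import DocExtractor

    extractor = DocExtractor()
    # run() wraps extract(), as in the __main__ block above; "example.doc" is a placeholder
    pages = extractor.run("demo-id", Path("example.doc"))
    # flatten the per-page content lists into plain text, skipping image entries
    for page in pages:
        for content in page["content_list"]:
            if content["type"] == "text":
                print(content["data"], end="")
    extractor.wait_all()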
/.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: Python package 5 | 6 | on: 7 | push: 8 | tags: 9 | - '*released' 10 | workflow_dispatch: 11 | 12 | 13 | jobs: 14 | 15 | update-version: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - name: Checkout repository 19 | uses: actions/checkout@v4 20 | with: 21 | ref: main 22 | fetch-depth: 0 23 | 24 | - name: Set up Python 25 | uses: actions/setup-python@v5 26 | with: 27 | python-version: "3.10" 28 | 29 | - name: Update version.py 30 | run: | 31 | python update_version.py 32 | 33 | - name: Verify version.py 34 | run: | 35 | ls -l magic_doc/libs/version.py 36 | cat magic_doc/libs/version.py 37 | 38 | - name: Commit changes 39 | run: | 40 | git config --local user.email "moe@myhloli.com" 41 | git config --local user.name "myhloli" 42 | git add magic_doc/libs/version.py 43 | if git diff-index --quiet HEAD; then 44 | echo "No changes to commit" 45 | else 46 | git commit -m "Update version.py with new version" 47 | fi 48 | id: commit_changes 49 | 50 | - name: Push changes 51 | if: steps.commit_changes.outcome == 'success' 52 | env: 53 | GITHUB_TOKEN: ${{ secrets.RELEASE_TOKEN }} 54 | run: | 55 | git push origin HEAD:main 56 | 57 | build: 58 | needs: [ update-version ] 59 | runs-on: ubuntu-latest 60 | strategy: 61 | fail-fast: false 62 | matrix: 63 | python-version: ["3.10"] 64 | 65 | steps: 66 | - name: Checkout code 67 | uses: actions/checkout@v4 68 | with: 69 | ref: main 70 | fetch-depth: 0 71 | 72 | - name: Verify version.py 73 | run: | 74 | ls -l magic_doc/libs/version.py 75 | cat magic_doc/libs/version.py 76 | 77 | - name: Set up Python ${{ matrix.python-version }} 78 | uses: actions/setup-python@v5 79 | with: 80 | python-version: ${{ matrix.python-version }} 81 | 82 | - name: Install dependencies 83 | run: | 84 | python -m pip install --upgrade pip 85 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 86 | 87 | - name: Install wheel 88 | run: | 89 | python -m pip install wheel 90 | 91 | - name: Build wheel 92 | run: | 93 | python setup.py bdist_wheel 94 | 95 | - name: Upload artifact 96 | uses: actions/upload-artifact@v4 97 | with: 98 | name: wheel-file 99 | path: dist/*.whl 100 | retention-days: 30 101 | 102 | release: 103 | needs: [ build ] 104 | runs-on: ubuntu-latest 105 | steps: 106 | - name: Checkout code 107 | uses: actions/checkout@v4 108 | 109 | - name: Download artifact 110 | uses: actions/download-artifact@v4 111 | with: 112 | name: wheel-file 113 | path: dist 114 | 115 | - name: Create and Upload Release 116 | id: create_release 117 | uses: softprops/action-gh-release@4634c16e79c963813287e889244c50009e7f0981 118 | with: 119 | files: './dist/*.whl' 120 | env: 121 | GITHUB_TOKEN: ${{ secrets.RELEASE_TOKEN }} 122 | 123 | - name: Publish distribution to PyPI 124 | run: | 125 | pip install twine 126 | twine upload dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }} 127 | -------------------------------------------------------------------------------- /magic_doc/restful_api/common/oss/oss.py: -------------------------------------------------------------------------------- 1 | import oss2 2 | from magic_doc.restful_api.common.ext import singleton_func 3 | from loguru import logger 4 | 5 | 6 | @singleton_func 7 | 
class Oss(object):
8 |     def __init__(self, access_key_id, access_secret_key, bucket_name, endpoint, expires=60):
9 |         self.access_key_id = access_key_id
10 |         self.access_secret_key = access_secret_key
11 |         self.bucket_name = bucket_name
12 |         self.endpoint = endpoint
13 |         self.expires = expires
14 |         self.auth = oss2.Auth(self.access_key_id, self.access_secret_key)
15 | 
16 |     def create_bucket(self, bucket_name=None):
17 |         """
18 |         Create a bucket.
19 |         :param bucket_name: bucket name
20 |         :return:
21 |         """
22 |         bucket = oss2.Bucket(self.auth, self.endpoint, bucket_name if bucket_name else self.bucket_name)
23 |         # create the bucket with a private read/write ACL
24 |         # bucket.create_bucket(oss2.models.BUCKET_ACL_PRIVATE)
25 |         bucket.create_bucket()
26 |         return True
27 | 
28 |     def delete_bucket(self, bucket_name=None):
29 |         """
30 |         Delete a bucket.
31 |         :param bucket_name: bucket name
32 |         :return:
33 |         """
34 |         bucket = oss2.Bucket(self.auth, self.endpoint, bucket_name if bucket_name else self.bucket_name)
35 |         try:
36 |             bucket.delete_bucket()
37 |             return True
38 |         except oss2.exceptions.BucketNotEmpty:
39 |             logger.error('bucket is not empty.')
40 |             return False
41 |         except oss2.exceptions.NoSuchBucket:
42 |             logger.error('bucket does not exist')
43 |             return False
44 | 
45 |     def pub_object(self, bucket_name=None, object_name=None, object_data=None):
46 |         """
47 |         Upload an object from memory; accepts
48 |         str,
49 |         bytes,
50 |         unicode,
51 |         or a file-like stream.
52 |         :param bucket_name: bucket name
53 |         :param object_name: full object key, excluding the bucket name
54 |         :param object_data:
55 |         :return:
56 |         """
57 |         bucket = oss2.Bucket(self.auth, self.endpoint, bucket_name if bucket_name else self.bucket_name)
58 |         result = bucket.put_object(object_name, object_data)
59 |         file_link = bucket.sign_url('GET', object_name, self.expires, slash_safe=True)
60 |         return {
61 |             "status": result.status,
62 |             "request_id": result.request_id,
63 |             "etag": result.etag,
64 |             "date": result.headers['date'],
65 |             "file_link": file_link
66 |         }
67 | 
68 |     def put_file(self, bucket_name=None, object_name=None, file_path=None):
69 |         """
70 |         Upload an object
71 |         from a local file.
72 |         :param bucket_name: bucket name
73 |         :param object_name: full object key, excluding the bucket name
74 |         :param file_path: local file path
75 |         :return:
76 |         """
77 |         bucket = oss2.Bucket(self.auth, self.endpoint, bucket_name if bucket_name else self.bucket_name)
78 |         result = bucket.put_object_from_file(object_name, file_path)
79 |         file_link = bucket.sign_url('GET', object_name, self.expires, slash_safe=True)
80 |         return {
81 |             "status": result.status,
82 |             "request_id": result.request_id,
83 |             "etag": result.etag,
84 |             "date": result.headers['date'],
85 |             "file_link": file_link
86 |         }
87 | 
88 |     def delete_objects(self, bucket_name=None, object_name=None):
89 |         """
90 |         Batch-delete objects.
91 |         :param bucket_name: bucket name
92 |         :param object_name: list of full object keys, excluding the bucket name
93 |         :return:
94 |         """
95 |         if object_name is None:
96 |             object_name = []
97 |         bucket = oss2.Bucket(self.auth, self.endpoint, bucket_name if bucket_name else self.bucket_name)
98 |         result = bucket.batch_delete_objects(object_name)
99 |         return result.deleted_keys
100 | 
101 |     def download_file(self, bucket_name=None, object_name=None, save_path=None):
102 |         """
103 |         Download an object to a local file.
104 |         :param bucket_name: bucket name
105 |         :param object_name: full object key, excluding the bucket name
106 |         :param save_path: local save path
107 |         :return:
108 |         """
109 |         bucket = oss2.Bucket(self.auth, self.endpoint, bucket_name if bucket_name else self.bucket_name)
110 |         bucket.get_object_to_file(object_name, save_path)
111 | 
--------------------------------------------------------------------------------
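A hedged usage sketch for the wrapper above; every credential, bucket, and endpoint value below is a placeholder, not real configuration:

    from magic_doc.restful_api.common.oss.oss import Oss

    # @singleton_func means the first instantiation fixes the configuration
    # for the whole process; later calls return the same client.
    oss_client = Oss(
        "<ACCESS_KEY_ID>",
        "<ACCESS_KEY_SECRET>",
        "<BUCKET_NAME>",
        "<ENDPOINT>",
        expires=600,  # lifetime of the signed GET URLs, in seconds
    )
    result = oss_client.put_file(object_name="pdf/demo/0.md", file_path="/tmp/0.md")
    print(result["file_link"])  # pre-signed download link for the uploaded object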
/magic_doc/restful_api/api/analysis/magic_pdf_view.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | import time 4 | import requests 5 | from flask import request, current_app 6 | from flask_restful import Resource 7 | from marshmallow import ValidationError 8 | from pathlib import Path 9 | from magic_doc.pdf_transform import DocConverter, S3Config 10 | from .serialization import MagicPdfSchema 11 | from magic_pdf.dict2md.ocr_mkcontent import ocr_mk_mm_markdown_with_para_and_pagination 12 | from magic_doc.restful_api.common.oss.oss import Oss 13 | from .ext import upload_image_to_oss, upload_md_to_oss 14 | from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED 15 | from magic_doc.restful_api.common.custom_response import generate_response 16 | from loguru import logger 17 | 18 | executor = ThreadPoolExecutor() 19 | 20 | 21 | class MagicPdfView(Resource): 22 | @logger.catch 23 | def post(self): 24 | """ 25 | PDF解析,将markdown结果上传至服务器 26 | """ 27 | t0 = time.time() 28 | magic_pdf_schema = MagicPdfSchema() 29 | try: 30 | params = magic_pdf_schema.load(request.get_json()) 31 | except ValidationError as err: 32 | return generate_response(code=400, msg=err.messages) 33 | pdf_path = params.get('pageUrl') 34 | # ############ pdf解析 ############### 35 | file_name = str(Path(pdf_path).stem) 36 | pf_path = f"/tmp/{file_name}.txt" 37 | pdf_dir = f"{current_app.static_folder}/pdf/{file_name}" 38 | NULL_IMG_DIR = f"{current_app.static_folder}/pdf/{file_name}" 39 | app_config = current_app.config 40 | if not Path(NULL_IMG_DIR).exists(): 41 | Path(NULL_IMG_DIR).mkdir(parents=True, exist_ok=True) 42 | if pdf_path.startswith("http://") or pdf_path.startswith("https://"): 43 | download_pdf = requests.get(pdf_path, stream=True) 44 | pdf_path = f"{pdf_dir}/{file_name}.pdf" 45 | with open(pdf_path, "wb") as wf: 46 | wf.write(download_pdf.content) 47 | doc_conv = DocConverter(None) 48 | elif pdf_path.startswith("s3://"): 49 | s3_config = S3Config(app_config["S3AK"], app_config["S3SK"], app_config["S3ENDPOINT"]) 50 | doc_conv = DocConverter(s3_config) 51 | else: 52 | doc_conv = DocConverter(None) 53 | t1 = time.time() 54 | logger.info(f"param init cost_time:{t1 - t0}") 55 | result = doc_conv.convert_to_mid_result(pdf_path, pf_path, 60) 56 | t2 = time.time() 57 | logger.info(f"pdf doc_conv cost_time:{t2 - t1}") 58 | md_content = json.dumps(ocr_mk_mm_markdown_with_para_and_pagination(result[0], NULL_IMG_DIR), ensure_ascii=False) 59 | t3 = time.time() 60 | logger.info(f"make markdown cost_time:{t3 - t2}") 61 | # local_md_path = f"{pdf_dir}/{file_name}.md" 62 | # with open(local_md_path, "w", encoding="utf-8") as f: 63 | # f.write(md_content) 64 | # t4 = time.time() 65 | # logger.info(f"save markdown cost_time:{t4 - t3}") 66 | _t0 = time.time() 67 | oss_client = Oss( 68 | app_config["AccessKeyID"], 69 | app_config["AccessKeySecret"], 70 | app_config["BucketName"], 71 | app_config["Endpoint"], 72 | app_config["UrlExpires"] 73 | ) 74 | img_list = Path(f"{NULL_IMG_DIR}/images").glob('*') if Path(f"{NULL_IMG_DIR}/images").exists() else [] 75 | all_task = [executor.submit(upload_image_to_oss, oss_client, file_name, img_path, NULL_IMG_DIR, app_config["BucketName"]) for img_path in img_list] 76 | wait(all_task, return_when=ALL_COMPLETED) 77 | for task in all_task: 78 | task_result = task.result() 79 | regex = re.compile(fr'.*\((.*?{Path(task_result[0]).name})') 80 | regex_result = regex.search(md_content) 81 | if regex_result: 82 | md_content 
= md_content.replace(regex_result.group(1), task_result[1])
83 |         _t1 = time.time()
84 |         logger.info(f"upload img cost_time:{_t1 - _t0}")
85 | 
86 |         all_md_task = [executor.submit(upload_md_to_oss, oss_client, app_config["BucketName"], f"pdf/{file_name}/{md.get('page_no', n)}.md", md["md_content"]) for n, md in enumerate(json.loads(md_content))]
87 |         wait(all_md_task, return_when=ALL_COMPLETED)
88 |         md_link_list = []
89 |         for task in all_md_task:
90 |             task_result = task.result()
91 |             md_link_list.append(task_result)
92 |         _t2 = time.time()
93 |         logger.info(f"upload md cost_time:{_t2 - _t1}")
94 | 
95 |         return generate_response(markDownUrl=md_link_list)
96 | 
--------------------------------------------------------------------------------
/magic_doc/contrib/magic_html/mmltex/tables.xsl:
--------------------------------------------------------------------------------
[XSLT markup lost during repository flattening; only LaTeX output fragments survive. The templates in this stylesheet render MathML tables as LaTeX: header cells become \multicolumn{...}{c}{...}, cells are joined with & and aligned with \hfill, rows end with \\, and tables are wrapped in \begin{array}{|...|} ... \hline ... \end{array}.]
--------------------------------------------------------------------------------
/magic_doc/contrib/office/formula/mml/xsl/tables.xsl:
--------------------------------------------------------------------------------
[Identical copy of the mmltex/tables.xsl stylesheet above; its markup was lost the same way.]
--------------------------------------------------------------------------------
/magic_doc/conv/pdf_magicpdf.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
4 | from magic_pdf.pipe.UNIPipe import UNIPipe
5 | from magic_pdf.pipe.OCRPipe import OCRPipe
6 | from magic_pdf.pipe.TXTPipe import TXTPipe
7 | from magic_doc.conv.base import BaseConv
8 | 
9 | from magic_doc.progress.filepupdator import FileBaseProgressUpdator
10 | from magic_doc.progress.pupdator import ConvProgressUpdator
11 | from magic_doc.utils import get_repo_directory
12 | from magic_doc.utils.null_writer import NullWriter
13 | from magic_pdf.dict2md.ocr_mkcontent import union_make
14 | from magic_pdf.libs.json_compressor import JsonCompressor
15 | from magic_pdf.rw.AbsReaderWriter import AbsReaderWriter
16 | from magic_doc.common.default_config import DEFAULT_CONFIG, PdfHqParseMethod
17 | 
18 | 19 | NULL_IMG_DIR = "/tmp" 20 | 21 | class SingletonModelWrapper: 22 | 23 | def __new__(cls): 24 | if not hasattr(cls, "instance"): 25 | from magic_doc.model.doc_analysis import DocAnalysis 26 | apply_ocr = os.getenv("APPLY_OCR", "TRUE") == "TRUE" 27 | apply_layout = os.getenv("APPLY_LAYOUT", "TRUE") == "TRUE" 28 | apply_formula = os.getenv("APPLY_FORMULA", "FALSE") == "TRUE" 29 | 30 | cls.instance = super(SingletonModelWrapper, cls).__new__(cls) 31 | cls.instance.doc_analysis = DocAnalysis( # type: ignore 32 | configs=os.path.join( 33 | get_repo_directory(), "resources/model/model_configs.yaml" 34 | ), 35 | apply_ocr=apply_ocr, apply_layout=apply_layout, apply_formula=apply_formula, 36 | ) 37 | return cls.instance 38 | 39 | def __call__(self, bits: bytes): 40 | from magic_doc.model.doc_analysis import load_images_from_pdf 41 | images = load_images_from_pdf(bits, dpi=200) 42 | return self.doc_analysis(images) # type: ignore 43 | 44 | class Pdf(BaseConv): 45 | 46 | def __construct_pdf_pipe(self, bits, model_list, image_writer): 47 | if DEFAULT_CONFIG["pdf"]["hq"]["parsemethod"] == PdfHqParseMethod.AUTO: 48 | pipe = UNIPipe(bits, model_list, image_writer, is_debug=True) # type: ignore 49 | elif DEFAULT_CONFIG["pdf"]["hq"]["parsemethod"] == PdfHqParseMethod.OCR: 50 | pipe = OCRPipe(bits, model_list, image_writer, is_debug=True) # type: ignore 51 | elif DEFAULT_CONFIG["pdf"]["hq"]["parsemethod"] == PdfHqParseMethod.TXT: 52 | pipe = TXTPipe(bits, model_list, image_writer, is_debug=True) # type: ignore 53 | else: 54 | raise Exception("unknown parse method under hq mode") 55 | return pipe 56 | 57 | 58 | def to_md(self, bits: bytes | str, pupdator: ConvProgressUpdator) -> str: 59 | model_proc = SingletonModelWrapper() 60 | pupdator.update(0) 61 | 62 | model_list = model_proc(bits) # type: ignore 63 | pupdator.update(50) 64 | # jso_useful_key = { 65 | # "_pdf_type": "", 66 | # "model_list": model_list, 67 | # } 68 | image_writer = NullWriter() 69 | pipe = self.__construct_pdf_pipe(bits, model_list, image_writer) 70 | # pipe.pipe_classify() # 默认ocrpipe的时候不需要再做分类,可以节省时间 71 | pipe.pipe_parse() 72 | pupdator.update(100) 73 | 74 | pdf_mid_data = JsonCompressor.decompress_json(pipe.get_compress_pdf_mid_data()) 75 | pdf_info_list = pdf_mid_data["pdf_info"] 76 | md_content = union_make(pdf_info_list, MakeMode.NLP_MD, DropMode.NONE, NULL_IMG_DIR) 77 | return md_content # type: ignore 78 | 79 | def to_mid_result(self, image_writer: AbsReaderWriter, bits: bytes | str, pupdator: ConvProgressUpdator) -> list[dict] | dict: 80 | model_proc = SingletonModelWrapper() 81 | pupdator.update(0) 82 | 83 | model_list = model_proc(bits) # type: ignore 84 | pupdator.update(50) 85 | # jso_useful_key = { 86 | # "_pdf_type": "", 87 | # "model_list": model_list, 88 | # } 89 | pipe = self.__construct_pdf_pipe(bits, model_list, image_writer) 90 | # pipe.pipe_classify() 91 | pipe.pipe_parse() 92 | pupdator.update(100) 93 | 94 | pdf_mid_data = JsonCompressor.decompress_json(pipe.get_compress_pdf_mid_data()) 95 | pdf_info_list = pdf_mid_data["pdf_info"] 96 | return pdf_info_list 97 | 98 | if __name__ == "__main__": 99 | with open("/opt/data/pdf/20240423/pdf_test2/ol006018w.pdf", "rb") as f: 100 | bits_data = f.read() 101 | parser = Pdf() 102 | md_content = parser.to_md( 103 | bits_data, FileBaseProgressUpdator("debug/progress.txt") 104 | ) 105 | with open("debug/pdf2md.by_model.md", "w") as f: 106 | f.write(md_content) # type: ignore 107 | 108 | -------------------------------------------------------------------------------- 
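A minimal driving sketch for the converter above, mirroring its __main__ block; the file paths are placeholders, and the APPLY_* switches are read only once, when the singleton model wrapper is first built:

    import os

    from magic_doc.conv.pdf_magicpdf import Pdf
    from magic_doc.progress.filepupdator import FileBaseProgressUpdator

    # must be set before the first conversion; later changes have no effect
    os.environ.setdefault("APPLY_FORMULA", "FALSE")

    with open("example.pdf", "rb") as f:  # placeholder input path
        pdf_bytes = f.read()

    md_content = Pdf().to_md(pdf_bytes, FileBaseProgressUpdator("/tmp/progress.txt"))
    print(md_content[:500])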
/tools/clean_photo.py:
--------------------------------------------------------------------------------
1 | import pypandoc
2 | import re
3 | import htmltabletomd
4 | import os
5 | import argparse
6 | import zipfile
7 | 
8 | parser = argparse.ArgumentParser(description="get tool type")
9 | parser.add_argument(
10 |     "--tool_name",
11 |     type=str,
12 |     required=True,
13 |     help="input tool name",
14 | )
15 | parser.add_argument(
16 |     "--download_dir",
17 |     type=str,
18 |     required=True,
19 |     help="input download dir",
20 | )
21 | args = parser.parse_args()
22 | 
23 | def clean_markdown_images(content):
24 |     pattern = re.compile(r'!\[[^\]]*\]\([^)]*\)', re.IGNORECASE)
25 |     cleaned_content = pattern.sub('', content)
26 |     return cleaned_content
27 | 
28 | def clean_ocrmath_photo(content):
29 |     pattern = re.compile(r'\\includegraphics\[.*?\]\{.*?\}', re.IGNORECASE)
30 |     cleaned_content = pattern.sub('', content)
31 |     return cleaned_content
32 | 
33 | def convert_html_table_to_md(html_table):
34 |     lines = html_table.strip().split('\n')
35 |     md_table = ''
36 |     if lines and '<thead>' in lines[0]:
37 |         in_thead = True
38 |         for line in lines:
39 |             if '<th>' in line:
40 |                 cells = re.findall(r'<th>(.*?)</th>', line)
41 |                 md_table += '| ' + ' | '.join(cells) + ' |\n'
42 |                 in_thead = False
43 |             elif '<td>' in line and not in_thead:
44 |                 cells = re.findall(r'<td>(.*?)</td>', line)
45 |                 md_table += '| ' + ' | '.join(cells) + ' |\n'
46 |         md_table = md_table.rstrip() + '\n'
47 |     return md_table
48 | 
49 | def convert_latex_to_md(content):
50 |     tables = re.findall(r'\\begin\{tabular\}(.*?)\\end\{tabular\}', content, re.DOTALL)
51 |     placeholders = []
52 |     for table in tables:
53 |         placeholder = f"<latex_table_{len(placeholders)}>"  # unique marker for this table
54 |         replace_str = f"\\begin{{tabular}}{table}\\end{{tabular}}"
55 |         content = content.replace(replace_str, placeholder)
56 |         try:
57 |             pypandoc.convert_text(replace_str, format="latex", to="md", outputfile="output.md", encoding="utf-8")
58 |         except:
59 |             markdown_string = replace_str
60 |         else:
61 |             markdown_string = open('output.md', 'r', encoding='utf-8').read()
62 |         placeholders.append((placeholder, markdown_string))
63 |     new_content = content
64 |     for placeholder, md_table in placeholders:
65 |         new_content = new_content.replace(placeholder, md_table)
66 |     # return the content with tables converted to markdown
67 |     return new_content
68 | 
69 | 
70 | def convert_htmltable_to_md(content):
71 |     tables = re.findall(r'<table>(.*?)</table>', content, re.DOTALL)
72 |     placeholders = []
73 |     for table in tables:
74 |         placeholder = f"<html_table_{len(placeholders)}>"  # unique marker for this table
75 |         content = content.replace(f"<table>{table}</table>", placeholder)
76 |         try:
77 |             convert_table = htmltabletomd.convert_table(table)
78 |         except:
79 |             convert_table = table
80 |         placeholders.append((placeholder, convert_table))
81 |     new_content = content
82 |     for placeholder, md_table in placeholders:
83 |         new_content = new_content.replace(placeholder, md_table)
84 |     # return the content with tables converted to markdown
85 |     return new_content
86 | 
87 | def clean_data(prod_type, download_dir):
88 |     file_type = ["academic_literature", "atlas", "courseware", "colorful_textbook", "historical_documents", "notes", "ordinary_books", "ordinary_exam_paper", "ordinary_textbook", "research_report", "special_exam_paper"]
89 |     for filetype in file_type:
90 |         tgt_dir = os.path.join(download_dir, filetype, prod_type, "cleaned")
91 |         if not os.path.exists(tgt_dir):
92 |             os.makedirs(tgt_dir)
93 |         source_dir = os.path.join(download_dir, filetype, prod_type)
94 |         filenames = os.listdir(source_dir)
95 |         for filename in filenames:
96 |             if filename.endswith('.md'):
97 |                 input_file = os.path.join(source_dir, filename)
98 |                 output_file = os.path.join(tgt_dir, "cleaned_" + filename)
99 |                 with open(input_file, 'r', encoding='utf-8') as fr:
100 |                     content = fr.read()
101 |                     new_content = convert_htmltable_to_md(content)
102 |                     new_content = clean_markdown_images(new_content)
103 |                     new_content = clean_ocrmath_photo(new_content)
104 |                     new_content = convert_latex_to_md(new_content)
105 |                     with open(output_file, 'w', encoding='utf-8') as fw:
106 |                         fw.write(new_content)
107 | 
108 | 
109 | if __name__ == '__main__':
110 |     tool_type = args.tool_name
111 |     download_dir = args.download_dir
112 |     clean_data(tool_type, download_dir)
113 | 
--------------------------------------------------------------------------------
/magic_doc/contrib/magic_html/extractors/forum_extractor.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import re
3 | 
4 | from magic_doc.contrib.magic_html.config import Forum_XPATH, Unique_ID
5 | from magic_doc.contrib.magic_html.utils import *
6 | from magic_doc.contrib.magic_html.extractors.base_extractor import BaseExtractor
7 | from magic_doc.contrib.magic_html.extractors.title_extractor import TitleExtractor
8 | 
9 | 
10 | class ForumExtractor(BaseExtractor):
11 |     def __init__(self) -> None:
12 |         super().__init__()
13 | 
14 |     def extract(self, html="", base_url="") -> dict:
15 |         self.need_comment = True
16 |         html = html.replace(" ", " ").replace(" ", " ")
17 |         tree = load_html(html)
18 |         if tree is None:
19 |             raise ValueError
20 | 
21 |         # extract the title
22 |         title = TitleExtractor().process(tree)
23 | 
24 |         # base_url
25 |         base_href = tree.xpath("//base/@href")
26 | 
27 |         if base_href and "http" in base_href[0]:
28 |             base_url = base_href[0]
29 |         self.generate_unique_id(tree)
30 | 
31 |         format_tree = self.convert_tags(tree, base_url=base_url)
32 | 
33 |         normal_tree = self.clean_tags(format_tree)
34 | 
35 |         subtree, xp_num, drop_list = self.xp_1_5(normal_tree)
36 |         if xp_num == "others":
37 |             subtree, drop_list = self.prune_unwanted_sections(normal_tree)
38 |         body_html = self.get_content_html(subtree, xp_num, base_url)
39 | 
40 |         # forum-specific handling
41 |         body_html_tree = fromstring(body_html)
42 |         try:
43 |             body_tree = body_html_tree.body
44 |         except:
45 |             body_tree = Element("body")
46 |             body_tree.extend(body_html_tree)
47 |         main_ids = body_tree.xpath(f".//@{Unique_ID}")
48 | 
49 |         for main_id in main_ids:
50 |             main_tree = normal_tree.xpath(
51 |                 f".//*[@{Unique_ID}={main_id}]"
52 |             )
53 |             if main_tree:
54 |                 self.remove_node(main_tree[0])
55 |         if not main_ids:
56 |             
main_ids = [-1] 57 | 58 | if xp_num != "others": 59 | normal_tree, _ = self.prune_unwanted_sections(normal_tree) 60 | for c_xpath in Forum_XPATH: 61 | while normal_tree.xpath(c_xpath): 62 | x = normal_tree.xpath(c_xpath)[0] 63 | self.remove_node(x) 64 | if "'post-'" in c_xpath: 65 | if not (re.findall('post-\d+', x.attrib.get("id", "").lower()) or re.findall('post_\d+', 66 | x.attrib.get("id", 67 | "").lower())): 68 | continue 69 | if ( 70 | "header" in x.attrib.get("class", "").lower() 71 | or "header" in x.attrib.get("id", "").lower() 72 | ): 73 | continue 74 | try: 75 | if int(x.attrib.get(Unique_ID, "0")) > int( 76 | main_ids[-1] 77 | ): 78 | body_tree.append(x) 79 | else: 80 | prefix_div = Element("div") 81 | suffix_div = Element("div") 82 | need_prefix = False 83 | need_suffix = False 84 | while x.xpath( 85 | f".//*[number(@{Unique_ID}) > {int(main_ids[-1])}]" 86 | ): 87 | tmp_x = x.xpath( 88 | f".//*[number(@{Unique_ID}) > {int(main_ids[-1])}]" 89 | )[0] 90 | self.remove_node(tmp_x) 91 | suffix_div.append(tmp_x) 92 | need_suffix = True 93 | while x.xpath( 94 | f".//*[number(@{Unique_ID}) < {int(main_ids[-1])}]" 95 | ): 96 | tmp_x = x.xpath( 97 | f".//*[number(@{Unique_ID}) < {int(main_ids[-1])}]" 98 | )[0] 99 | self.remove_node(tmp_x) 100 | prefix_div.append(tmp_x) 101 | need_prefix = True 102 | if need_prefix: 103 | body_tree.insert(0, prefix_div) 104 | if need_suffix: 105 | body_tree.append(suffix_div) 106 | 107 | except: 108 | pass 109 | 110 | body_html = re.sub( 111 | f' {Unique_ID}="\d+"', 112 | "", 113 | tostring(body_tree, encoding=str), 114 | ) 115 | 116 | return { 117 | "xp_num": xp_num, 118 | "drop_list": drop_list, 119 | "html": body_html, 120 | "title": title, 121 | "base_url": base_url 122 | } 123 | -------------------------------------------------------------------------------- /magic_doc/bin/linux/share/antiword/fontnames: -------------------------------------------------------------------------------- 1 | # Default fontnames translation table 2 | # uses only Standard PostScript (TM) fonts 3 | # 4 | # MS-Word fontname, Italic, Bold, PostScript fontname, Special 5 | Arial, 0, 0, Helvetica, 0 6 | Arial, 0, 1, Helvetica-Bold, 0 7 | Arial, 1, 0, Helvetica-Oblique, 0 8 | Arial, 1, 1, Helvetica-BoldOblique, 0 9 | Arial Black, 0, 0, Helvetica, 0 10 | Arial Black, 0, 1, Helvetica-Bold, 0 11 | Arial Black, 1, 0, Helvetica-Oblique, 0 12 | Arial Black, 1, 1, Helvetica-BoldOblique, 0 13 | Arial CE, 0, 0, Helvetica, 0 14 | Arial CE, 0, 1, Helvetica-Bold, 0 15 | Arial CE, 1, 0, Helvetica-Oblique, 0 16 | Arial CE, 1, 1, Helvetica-BoldOblique, 0 17 | Arial Narrow, 0, 0, Helvetica-Narrow, 0 18 | Arial Narrow, 0, 1, Helvetica-Narrow-Bold, 0 19 | Arial Narrow, 1, 0, Helvetica-Narrow-Oblique, 0 20 | Arial Narrow, 1, 1, Helvetica-Narrow-BoldOblique, 0 21 | AvantGarde, 0, 0, AvantGarde-Book, 0 22 | AvantGarde, 0, 1, AvantGarde-Demi, 0 23 | AvantGarde, 1, 0, AvantGarde-BookOblique, 0 24 | AvantGarde, 1, 1, AvantGarde-DemiOblique, 0 25 | Bookman Old Style, 0, 0, Bookman-Light, 0 26 | Bookman Old Style, 0, 1, Bookman-Demi, 0 27 | Bookman Old Style, 1, 0, Bookman-LightItalic, 0 28 | Bookman Old Style, 1, 1, Bookman-DemiItalic, 0 29 | Century Schoolbook, 0, 0, NewCenturySchlbk-Roman, 0 30 | Century Schoolbook, 0, 1, NewCenturySchlbk-Bold, 0 31 | Century Schoolbook, 1, 0, NewCenturySchlbk-Italic, 0 32 | Century Schoolbook, 1, 1, NewCenturySchlbk-BoldItalic, 0 33 | CG Omega, 0, 0, Helvetica, 0 34 | CG Omega, 0, 1, Helvetica-Bold, 0 35 | CG Omega, 1, 0, Helvetica-Oblique, 0 36 | CG Omega, 1, 1, 
Helvetica-BoldOblique, 0 37 | Comic Sans MS, 0, 0, Helvetica, 0 38 | Comic Sans MS, 0, 1, Helvetica-Bold, 0 39 | Comic Sans MS, 1, 0, Helvetica-Oblique, 0 40 | Comic Sans MS, 1, 1, Helvetica-BoldOblique, 0 41 | Courier, 0, 0, Courier, 0 42 | Courier, 0, 1, Courier-Bold, 0 43 | Courier, 1, 0, Courier-Oblique, 0 44 | Courier, 1, 1, Courier-BoldOblique, 0 45 | Courier New, 0, 0, Courier, 0 46 | Courier New, 0, 1, Courier-Bold, 0 47 | Courier New, 1, 0, Courier-Oblique, 0 48 | Courier New, 1, 1, Courier-BoldOblique, 0 49 | Fixedsys, 0, 0, Courier, 0 50 | Fixedsys, 0, 1, Courier-Bold, 0 51 | Fixedsys, 1, 0, Courier-Oblique, 0 52 | Fixedsys, 1, 1, Courier-BoldOblique, 0 53 | Helvetica, 0, 0, Helvetica, 0 54 | Helvetica, 0, 1, Helvetica-Bold, 0 55 | Helvetica, 1, 0, Helvetica-Oblique, 0 56 | Helvetica, 1, 1, Helvetica-BoldOblique, 0 57 | Helvetica-Narrow, 0, 0, Helvetica-Narrow, 0 58 | Helvetica-Narrow, 0, 1, Helvetica-Narrow-Bold, 0 59 | Helvetica-Narrow, 1, 0, Helvetica-Narrow-Oblique, 0 60 | Helvetica-Narrow, 1, 1, Helvetica-Narrow-BoldOblique, 0 61 | ITC Bookman, 0, 0, Bookman-Light, 0 62 | ITC Bookman, 0, 1, Bookman-Demi, 0 63 | ITC Bookman, 1, 0, Bookman-LightItalic, 0 64 | ITC Bookman, 1, 1, Bookman-DemiItalic, 0 65 | Lucida Console, 0, 0, Courier, 0 66 | Lucida Console, 0, 1, Courier-Bold, 0 67 | Lucida Console, 1, 0, Courier-Oblique, 0 68 | Lucida Console, 1, 1, Courier-BoldOblique, 0 69 | Lucida Sans Typewriter, 0, 0, Courier, 0 70 | Lucida Sans Typewriter, 0, 1, Courier-Bold, 0 71 | Lucida Sans Typewriter, 1, 0, Courier-Oblique, 0 72 | Lucida Sans Typewriter, 1, 1, Courier-BoldOblique, 0 73 | Monotype.com, 0, 0, Courier, 0 74 | Monotype.com, 0, 1, Courier-Bold, 0 75 | Monotype.com, 1, 0, Courier-Oblique, 0 76 | Monotype.com, 1, 1, Courier-BoldOblique, 0 77 | MS Sans Serif, 0, 0, Helvetica, 0 78 | MS Sans Serif, 0, 1, Helvetica-Bold, 0 79 | MS Sans Serif, 1, 0, Helvetica-Oblique, 0 80 | MS Sans Serif, 1, 1, Helvetica-BoldOblique, 0 81 | New Century Schlbk, 0, 0, NewCenturySchlbk-Roman, 0 82 | New Century Schlbk, 0, 1, NewCenturySchlbk-Bold, 0 83 | New Century Schlbk, 1, 0, NewCenturySchlbk-Italic, 0 84 | New Century Schlbk, 1, 1, NewCenturySchlbk-BoldItalic, 0 85 | NewCenturySchlbk, 0, 0, NewCenturySchlbk-Roman, 0 86 | NewCenturySchlbk, 0, 1, NewCenturySchlbk-Bold, 0 87 | NewCenturySchlbk, 1, 0, NewCenturySchlbk-Italic, 0 88 | NewCenturySchlbk, 1, 1, NewCenturySchlbk-BoldItalic, 0 89 | Palatino, 0, 0, Palatino-Roman, 0 90 | Palatino, 0, 1, Palatino-Bold, 0 91 | Palatino, 1, 0, Palatino-Italic, 0 92 | Palatino, 1, 1, Palatino-BoldItalic, 0 93 | Swiss, 0, 0, Helvetica, 0 94 | Swiss, 0, 1, Helvetica-Bold, 0 95 | Swiss, 1, 0, Helvetica-Oblique, 0 96 | Swiss, 1, 1, Helvetica-BoldOblique, 0 97 | Tahoma, 0, 0, Helvetica, 0 98 | Tahoma, 0, 1, Helvetica-Bold, 0 99 | Tahoma, 1, 0, Helvetica-Oblique, 0 100 | Tahoma, 1, 1, Helvetica-BoldOblique, 0 101 | Trebuchet MS, 0, 0, Helvetica, 0 102 | Trebuchet MS, 0, 1, Helvetica-Bold, 0 103 | Trebuchet MS, 1, 0, Helvetica-Oblique, 0 104 | Trebuchet MS, 1, 1, Helvetica-BoldOblique, 0 105 | Univers, 0, 0, Helvetica, 0 106 | Univers, 0, 1, Helvetica-Bold, 0 107 | Univers, 1, 0, Helvetica-Oblique, 0 108 | Univers, 1, 1, Helvetica-BoldOblique, 0 109 | Verdana, 0, 0, Helvetica, 0 110 | Verdana, 0, 1, Helvetica-Bold, 0 111 | Verdana, 1, 0, Helvetica-Oblique, 0 112 | Verdana, 1, 1, Helvetica-BoldOblique, 0 113 | # All the other fonts 114 | *, 0, 0, Times-Roman, 0 115 | *, 0, 1, Times-Bold, 0 116 | *, 1, 0, Times-Italic, 0 117 | *, 1, 1, Times-BoldItalic, 0 118 | 
-------------------------------------------------------------------------------- /magic_doc/model/sub_modules/layoutlmv3/model_init.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .visualizer import Visualizer 3 | from .rcnn_vl import * 4 | from .backbone import * 5 | 6 | from detectron2.config import get_cfg 7 | from detectron2.config import CfgNode as CN 8 | from detectron2.data import MetadataCatalog, DatasetCatalog 9 | from detectron2.data.datasets import register_coco_instances 10 | from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, launch, DefaultPredictor 11 | from magic_doc.utils import get_repo_directory 12 | 13 | def add_vit_config(cfg): 14 | """ 15 | Add config for VIT. 16 | """ 17 | _C = cfg 18 | 19 | _C.MODEL.VIT = CN() 20 | 21 | # CoaT model name. 22 | _C.MODEL.VIT.NAME = "" 23 | 24 | # Output features from CoaT backbone. 25 | _C.MODEL.VIT.OUT_FEATURES = ["layer3", "layer5", "layer7", "layer11"] 26 | 27 | _C.MODEL.VIT.IMG_SIZE = [224, 224] 28 | 29 | _C.MODEL.VIT.POS_TYPE = "shared_rel" 30 | 31 | _C.MODEL.VIT.DROP_PATH = 0. 32 | 33 | _C.MODEL.VIT.MODEL_KWARGS = "{}" 34 | 35 | _C.SOLVER.OPTIMIZER = "ADAMW" 36 | 37 | _C.SOLVER.BACKBONE_MULTIPLIER = 1.0 38 | 39 | _C.AUG = CN() 40 | 41 | _C.AUG.DETR = False 42 | 43 | _C.MODEL.IMAGE_ONLY = True 44 | _C.PUBLAYNET_DATA_DIR_TRAIN = "" 45 | _C.PUBLAYNET_DATA_DIR_TEST = "" 46 | _C.FOOTNOTE_DATA_DIR_TRAIN = "" 47 | _C.FOOTNOTE_DATA_DIR_VAL = "" 48 | _C.SCIHUB_DATA_DIR_TRAIN = "" 49 | _C.SCIHUB_DATA_DIR_TEST = "" 50 | _C.JIAOCAI_DATA_DIR_TRAIN = "" 51 | _C.JIAOCAI_DATA_DIR_TEST = "" 52 | _C.ICDAR_DATA_DIR_TRAIN = "" 53 | _C.ICDAR_DATA_DIR_TEST = "" 54 | _C.M6DOC_DATA_DIR_TEST = "" 55 | _C.DOCSTRUCTBENCH_DATA_DIR_TEST = "" 56 | _C.DOCSTRUCTBENCHv2_DATA_DIR_TEST = "" 57 | _C.CACHE_DIR = "" 58 | _C.MODEL.CONFIG_PATH = "" 59 | 60 | # effective update steps would be MAX_ITER/GRADIENT_ACCUMULATION_STEPS 61 | # maybe need to set MAX_ITER *= GRADIENT_ACCUMULATION_STEPS 62 | _C.SOLVER.GRADIENT_ACCUMULATION_STEPS = 1 63 | 64 | 65 | def setup(args): 66 | """ 67 | Create configs and perform basic setups. 68 | """ 69 | cfg = get_cfg() 70 | # add_coat_config(cfg) 71 | add_vit_config(cfg) 72 | cfg.merge_from_file(args.config_file) 73 | cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.2 # set threshold for this model 74 | cfg.merge_from_list(args.opts) 75 | cfg.freeze() 76 | default_setup(cfg, args) 77 | 78 | """ 79 | #TODO: 可以去掉? 
80 | register_coco_instances( 81 | "scihub_train", 82 | {}, 83 | cfg.SCIHUB_DATA_DIR_TRAIN + ".json", 84 | cfg.SCIHUB_DATA_DIR_TRAIN 85 | ) 86 | """ 87 | 88 | return cfg 89 | 90 | 91 | class DotDict(dict): 92 | def __init__(self, *args, **kwargs): 93 | super(DotDict, self).__init__(*args, **kwargs) 94 | 95 | def __getattr__(self, key): 96 | if key not in self.keys(): 97 | return None 98 | value = self[key] 99 | if isinstance(value, dict): 100 | value = DotDict(value) 101 | return value 102 | 103 | def __setattr__(self, key, value): 104 | self[key] = value 105 | 106 | class Layoutlmv3_Predictor(object): 107 | def __init__(self, weights): 108 | layout_args = { 109 | "config_file": os.path.join(get_repo_directory(), "resources/model/layoutlmv3/layoutlmv3_base_inference.yaml"), # TODO 修改配置路径 110 | "resume": False, 111 | "eval_only": False, 112 | "num_gpus": 1, 113 | "num_machines": 1, 114 | "machine_rank": 0, 115 | "dist_url": "tcp://127.0.0.1:57823", 116 | "opts": ["MODEL.WEIGHTS", weights], 117 | } 118 | layout_args = DotDict(layout_args) 119 | 120 | cfg = setup(layout_args) 121 | self.mapping = ["title", "plain text", "abandon", "figure", "figure_caption", "table", "table_caption", "table_footnote", "isolate_formula", "formula_caption"] 122 | MetadataCatalog.get(cfg.DATASETS.TRAIN[0]).thing_classes = self.mapping 123 | self.predictor = DefaultPredictor(cfg) 124 | 125 | def __call__(self, image, ignore_catids=[]): 126 | page_layout_result = { 127 | "layout_dets": [] 128 | } 129 | outputs = self.predictor(image) 130 | boxes = outputs["instances"].to("cpu")._fields["pred_boxes"].tensor.tolist() 131 | labels = outputs["instances"].to("cpu")._fields["pred_classes"].tolist() 132 | scores = outputs["instances"].to("cpu")._fields["scores"].tolist() 133 | for bbox_idx in range(len(boxes)): 134 | if labels[bbox_idx] in ignore_catids: 135 | continue 136 | page_layout_result["layout_dets"].append({ 137 | "category_id": labels[bbox_idx], 138 | "poly": [ 139 | boxes[bbox_idx][0], boxes[bbox_idx][1], 140 | boxes[bbox_idx][2], boxes[bbox_idx][1], 141 | boxes[bbox_idx][2], boxes[bbox_idx][3], 142 | boxes[bbox_idx][0], boxes[bbox_idx][3], 143 | ], 144 | "score": scores[bbox_idx] 145 | }) 146 | return page_layout_result -------------------------------------------------------------------------------- /magic_doc/contrib/office/pptx_extract.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | 4 | from pathlib import Path 5 | from typing import List 6 | 7 | from loguru import logger 8 | from pptx import Presentation 9 | from pptx.enum.shapes import MSO_SHAPE_TYPE 10 | from pptx.parts.image import Image 11 | from pptx.presentation import Presentation as ppt 12 | from pptx.shapes.autoshape import Shape 13 | from pptx.shapes.picture import Picture 14 | from pptx.shapes.graphfrm import GraphicFrame 15 | from pptx.table import Table, _Row, _Cell 16 | from pptx.slide import Slide 17 | from pptx.shapes.group import GroupShape 18 | from werkzeug.datastructures import FileStorage 19 | 20 | from magic_doc.contrib.office import OfficeExtractor 21 | from magic_doc.contrib.model import ExtractResponse, Page, Content 22 | 23 | 24 | class PptxExtractor(OfficeExtractor): 25 | def __init__(self) -> None: 26 | super().__init__() 27 | 28 | def setup(self): 29 | pass 30 | 31 | def handle_shape( 32 | self, 33 | shape: Shape, 34 | content_list: List[Content], 35 | media_dir: Path, 36 | img_map: dict[Path, str], 37 | id: str, 38 | skip_image: bool, 39 | ): 40 | if 
shape.has_text_frame: 41 | for paragraph in shape.text_frame.paragraphs: 42 | content_list.append( 43 | Content( 44 | type="text", 45 | data=paragraph.text + "\n", 46 | ) 47 | ) 48 | elif shape.shape_type == MSO_SHAPE_TYPE.PICTURE and not skip_image: 49 | shape: Picture 50 | image: Image = shape.image 51 | image_bytes = image.blob 52 | img_path = media_dir.joinpath(f"pic-{len(img_map)}.{image.ext}") 53 | img_s3_path = self.generate_img_path(id, img_path.name) 54 | img_map[img_path] = img_s3_path 55 | content_list.append(Content(type="image", data=img_s3_path)) 56 | with open(img_path, "wb") as file: 57 | file.write(image_bytes) 58 | elif shape.shape_type == MSO_SHAPE_TYPE.TABLE: 59 | shape: GraphicFrame 60 | table: Table = shape.table 61 | md = "\n" 62 | for row_no, row in enumerate(table.rows): 63 | row: _Row 64 | md += "|" 65 | if row_no == 1: 66 | for col in row.cells: 67 | md += "---|" 68 | md += "\n|" 69 | for col in row.cells: 70 | cell: _Cell = col 71 | md += " " + cell.text.replace("\r", " ").replace("\n", " ") + " |" 72 | md += "\n" 73 | md += "\n" 74 | content_list.append(Content(type="md", data=md)) 75 | elif shape.shape_type == MSO_SHAPE_TYPE.GROUP: 76 | shape: GroupShape 77 | for sub_shape in shape.shapes: 78 | self.handle_shape(sub_shape, content_list, media_dir, img_map, id, skip_image) 79 | else: 80 | # print(shape.shape_type, type(shape), file=sys.stderr) 81 | pass 82 | 83 | def extract( 84 | self, 85 | r: FileStorage | Path, 86 | id: str, 87 | dir: Path, 88 | media_dir: Path, 89 | skip_image: bool, 90 | ) -> ExtractResponse: 91 | pages = [] 92 | img_map = {} 93 | 94 | presentation: ppt = Presentation(r) 95 | for page_no, slide in enumerate(presentation.slides): 96 | slide: Slide 97 | page = Page(page_no=page_no, content_list=[]) 98 | for shape in slide.shapes: 99 | self.handle_shape( 100 | shape, 101 | page["content_list"], 102 | media_dir, 103 | img_map, 104 | id, 105 | skip_image, 106 | ) 107 | 108 | pages.append(page) 109 | 110 | # self.upload_background(id, img_map) 111 | 112 | return pages 113 | 114 | 115 | if __name__ == "__main__": 116 | e = PptxExtractor() 117 | # from pedia_document_parser.s3.client import get_s3_client 118 | 119 | # cli = get_s3_client() 120 | 121 | # data = cli.read_object( 122 | # "s3://pedia-document-parser/office-doucments/【英文-模板】Professional Pack Standard.pptx" 123 | # ) 124 | # with open("1.pptx", "wb") as f: 125 | # f.write(data.read()) 126 | 127 | x = e.run( 128 | "ghi", 129 | Path("test_data/doc/商业项目市场分析与产品定位报告.pptx"), 130 | ) 131 | content = "" 132 | for p in x: 133 | content += f"\n====== page {p['page_no']} ======\n" 134 | for pp in p["content_list"]: 135 | content += pp["data"] 136 | 137 | print(content) 138 | 139 | # cli.read_object("s3://pedia-document-parser/office-doucments/【英文-课件】MIT15_082JF10_av.pptx") 140 | 141 | # print( 142 | # json.dumps( 143 | # e.run( 144 | # "ghi", 145 | # Path( 146 | # "/home/SENSETIME/wuziming/doc/doc/【英文-模板】Professional Pack Standard.pptx", 147 | # ), 148 | # ), 149 | # ensure_ascii=False, 150 | # indent=4, 151 | # ) 152 | # ) 153 | e.wait_all() 154 | -------------------------------------------------------------------------------- /tools/markdown_calculate.py: -------------------------------------------------------------------------------- 1 | import os 2 | from Levenshtein import distance 3 | from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction, corpus_bleu 4 | from nltk.tokenize import word_tokenize 5 | import json 6 | import re 7 | import scoring 8 | import argparse 9 | 10 | # 
初始化列表来存储编辑距离和BLEU分数 11 | class Scoring: 12 | def __init__(self, result_path): 13 | self.edit_distances = [] 14 | self.bleu_scores = [] 15 | self.sim_scores = [] 16 | self.filenames = [] 17 | self.score_dict = {} 18 | self.anntion_cnt = 0 19 | self.fw = open(result_path, "w+") 20 | def simple_bleu_score(self, candidate, reference): 21 | candidate_tokens = word_tokenize(candidate) 22 | reference_tokens = word_tokenize(reference) 23 | return sentence_bleu([reference_tokens], candidate_tokens, smoothing_function=SmoothingFunction().method1) 24 | 25 | 26 | def preprocess_string(self, s): 27 | sub_enter = re.sub(r'\n+', '\n', s) 28 | return re.sub(r' ', ' ', sub_enter) 29 | 30 | def calculate_similarity(self, annotion, actual, tool_type): 31 | class_dict = {} 32 | edit_distances = [] 33 | bleu_scores = [] 34 | sim_scores = list() 35 | total_file = 0 36 | for filename in os.listdir(annotion): 37 | if filename.endswith('.md') and not filename.startswith('.'): # 忽略隐藏文件 38 | total_file = total_file + 1 39 | # 读取A目录中的文件 40 | with open(os.path.join(annotion, filename), 'r', encoding='utf-8') as file_a: 41 | content_a = file_a.read() 42 | self.anntion_cnt = self.anntion_cnt + 1 43 | filepath_b = os.path.join(actual, filename) 44 | if os.path.exists(filepath_b): 45 | with open(filepath_b, 'r', encoding='utf-8') as file_b: 46 | content_b = file_b.read() 47 | self.filenames.append(filename) 48 | # 计算编辑距离 49 | edit_dist = distance(self.preprocess_string(content_b),self.preprocess_string(content_a)) / max(len(content_a), len(content_b)) 50 | self.edit_distances.append(edit_dist) 51 | edit_distances.append(edit_dist) 52 | #计算BLUE分数 53 | bleu_score = self.simple_bleu_score(content_b, content_a) 54 | bleu_scores.append(bleu_score) 55 | self.bleu_scores.append(bleu_score) 56 | #计算marker分数 57 | score = scoring.score_text(content_b, content_a) 58 | sim_scores.append(score) 59 | self.sim_scores.append(score) 60 | class_dict[filename] = {"edit_dist": edit_dist, "bleu_score": bleu_score, "sim_score": score} 61 | self.score_dict[filename] = {"edit_dist": edit_dist, "bleu_score": bleu_score, "sim_score": score} 62 | else: 63 | print(f"File {filename} not found in actual directory.") 64 | # 计算每类平均值 65 | class_average_edit_distance = sum(edit_distances) / len(edit_distances) if edit_distances else 0 66 | class_average_bleu_score = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0 67 | class_average_sim_score = sum(sim_scores) / len(sim_scores) if sim_scores else 0 68 | self.fw.write(json.dumps(class_dict, ensure_ascii=False) + "\n") 69 | ratio = len(class_dict)/total_file 70 | self.fw.write(f"{tool_type} extract ratio: {ratio}" + "\n") 71 | self.fw.write(f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}" + "\n") 72 | self.fw.write(f"{tool_type} Average BLEU Score: {class_average_bleu_score}" + "\n") 73 | self.fw.write(f"{tool_type} Average Sim Score: {class_average_sim_score}" + "\n") 74 | 75 | print (f"{tool_type} extract ratio: {ratio}") 76 | print (f"{tool_type} Average Levenshtein Distance: {class_average_edit_distance}") 77 | print (f"{tool_type} Average BLEU Score: {class_average_bleu_score}") 78 | print (f"{tool_type} Average Sim Score: {class_average_sim_score}") 79 | return self.score_dict 80 | 81 | def summary_scores(self): 82 | # 计算整体平均值 83 | over_all_dict = dict() 84 | average_edit_distance = sum(self.edit_distances) / len(self.edit_distances) if self.edit_distances else 0 85 | average_bleu_score = sum(self.bleu_scores) / len(self.bleu_scores) if self.bleu_scores else 0 86 | 
average_sim_score = sum(self.sim_scores) / len(self.sim_scores) if self.sim_scores else 0 87 | over_all_dict["average_edit_distance"] = average_edit_distance 88 | over_all_dict["average_bleu_score"] = average_bleu_score 89 | over_all_dict["average_sim_score"] = average_sim_score 90 | self.fw.write(json.dumps(over_all_dict, ensure_ascii=False) + "\n") 91 | return over_all_dict 92 | 93 | def calculate_similarity_total(self, tool_type, file_types, download_dir): 94 | for file_type in file_types: 95 | annotion = os.path.join(download_dir, file_type, "annotations", "cleaned") 96 | actual = os.path.join(download_dir, file_type, tool_type, "cleaned") 97 | self.calculate_similarity(annotion, actual, file_type) 98 | 99 | -------------------------------------------------------------------------------- /magic_doc/model/sub_modules/layoutlmv3/layoutlmft/data/funsd.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | ''' 3 | Reference: https://huggingface.co/datasets/nielsr/funsd/blob/main/funsd.py 4 | ''' 5 | import json 6 | import os 7 | 8 | import datasets 9 | 10 | from .image_utils import load_image, normalize_bbox 11 | 12 | 13 | logger = datasets.logging.get_logger(__name__) 14 | 15 | 16 | _CITATION = """\ 17 | @article{Jaume2019FUNSDAD, 18 | title={FUNSD: A Dataset for Form Understanding in Noisy Scanned Documents}, 19 | author={Guillaume Jaume and H. K. Ekenel and J. Thiran}, 20 | journal={2019 International Conference on Document Analysis and Recognition Workshops (ICDARW)}, 21 | year={2019}, 22 | volume={2}, 23 | pages={1-6} 24 | } 25 | """ 26 | 27 | _DESCRIPTION = """\ 28 | https://guillaumejaume.github.io/FUNSD/ 29 | """ 30 | 31 | 32 | class FunsdConfig(datasets.BuilderConfig): 33 | """BuilderConfig for FUNSD""" 34 | 35 | def __init__(self, **kwargs): 36 | """BuilderConfig for FUNSD. 37 | 38 | Args: 39 | **kwargs: keyword arguments forwarded to super. 
40 | """ 41 | super(FunsdConfig, self).__init__(**kwargs) 42 | 43 | 44 | class Funsd(datasets.GeneratorBasedBuilder): 45 | """Conll2003 dataset.""" 46 | 47 | BUILDER_CONFIGS = [ 48 | FunsdConfig(name="funsd", version=datasets.Version("1.0.0"), description="FUNSD dataset"), 49 | ] 50 | 51 | def _info(self): 52 | return datasets.DatasetInfo( 53 | description=_DESCRIPTION, 54 | features=datasets.Features( 55 | { 56 | "id": datasets.Value("string"), 57 | "tokens": datasets.Sequence(datasets.Value("string")), 58 | "bboxes": datasets.Sequence(datasets.Sequence(datasets.Value("int64"))), 59 | "ner_tags": datasets.Sequence( 60 | datasets.features.ClassLabel( 61 | names=["O", "B-HEADER", "I-HEADER", "B-QUESTION", "I-QUESTION", "B-ANSWER", "I-ANSWER"] 62 | ) 63 | ), 64 | "image": datasets.Array3D(shape=(3, 224, 224), dtype="uint8"), 65 | "image_path": datasets.Value("string"), 66 | } 67 | ), 68 | supervised_keys=None, 69 | homepage="https://guillaumejaume.github.io/FUNSD/", 70 | citation=_CITATION, 71 | ) 72 | 73 | def _split_generators(self, dl_manager): 74 | """Returns SplitGenerators.""" 75 | downloaded_file = dl_manager.download_and_extract("https://guillaumejaume.github.io/FUNSD/dataset.zip") 76 | return [ 77 | datasets.SplitGenerator( 78 | name=datasets.Split.TRAIN, gen_kwargs={"filepath": f"{downloaded_file}/dataset/training_data/"} 79 | ), 80 | datasets.SplitGenerator( 81 | name=datasets.Split.TEST, gen_kwargs={"filepath": f"{downloaded_file}/dataset/testing_data/"} 82 | ), 83 | ] 84 | 85 | def get_line_bbox(self, bboxs): 86 | x = [bboxs[i][j] for i in range(len(bboxs)) for j in range(0, len(bboxs[i]), 2)] 87 | y = [bboxs[i][j] for i in range(len(bboxs)) for j in range(1, len(bboxs[i]), 2)] 88 | 89 | x0, y0, x1, y1 = min(x), min(y), max(x), max(y) 90 | 91 | assert x1 >= x0 and y1 >= y0 92 | bbox = [[x0, y0, x1, y1] for _ in range(len(bboxs))] 93 | return bbox 94 | 95 | def _generate_examples(self, filepath): 96 | logger.info("⏳ Generating examples from = %s", filepath) 97 | ann_dir = os.path.join(filepath, "annotations") 98 | img_dir = os.path.join(filepath, "images") 99 | for guid, file in enumerate(sorted(os.listdir(ann_dir))): 100 | tokens = [] 101 | bboxes = [] 102 | ner_tags = [] 103 | 104 | file_path = os.path.join(ann_dir, file) 105 | with open(file_path, "r", encoding="utf8") as f: 106 | data = json.load(f) 107 | image_path = os.path.join(img_dir, file) 108 | image_path = image_path.replace("json", "png") 109 | image, size = load_image(image_path) 110 | for item in data["form"]: 111 | cur_line_bboxes = [] 112 | words, label = item["words"], item["label"] 113 | words = [w for w in words if w["text"].strip() != ""] 114 | if len(words) == 0: 115 | continue 116 | if label == "other": 117 | for w in words: 118 | tokens.append(w["text"]) 119 | ner_tags.append("O") 120 | cur_line_bboxes.append(normalize_bbox(w["box"], size)) 121 | else: 122 | tokens.append(words[0]["text"]) 123 | ner_tags.append("B-" + label.upper()) 124 | cur_line_bboxes.append(normalize_bbox(words[0]["box"], size)) 125 | for w in words[1:]: 126 | tokens.append(w["text"]) 127 | ner_tags.append("I-" + label.upper()) 128 | cur_line_bboxes.append(normalize_bbox(w["box"], size)) 129 | # by default: --segment_level_layout 1 130 | # if do not want to use segment_level_layout, comment the following line 131 | cur_line_bboxes = self.get_line_bbox(cur_line_bboxes) 132 | # box = normalize_bbox(item["box"], size) 133 | # cur_line_bboxes = [box for _ in range(len(words))] 134 | bboxes.extend(cur_line_bboxes) 135 | yield guid, {"id": 
str(guid), "tokens": tokens, "bboxes": bboxes, "ner_tags": ner_tags,
136 | "image": image, "image_path": image_path}
--------------------------------------------------------------------------------
/magic_doc/contrib/office/docx_extract.py:
--------------------------------------------------------------------------------
1 | import xml.etree.ElementTree as ET
2 | import zipfile
3 | 
4 | from pathlib import Path
5 | from magic_doc.contrib.model import ExtractResponse, Content, Page
6 | from magic_doc.contrib.office import OfficeExtractor
7 | from typing import IO
8 | from io import BytesIO
9 | from werkzeug.datastructures import FileStorage
10 | 
11 | from magic_doc.contrib.office.formula.omml import omml2tex
12 | 
13 | 
14 | class DocxExtractor(OfficeExtractor):
15 | def __init__(self) -> None:
16 | super().__init__()
17 | 
18 | def setup(self):
19 | pass
20 | 
21 | def __word2markdown(
22 | self,
23 | id: str,
24 | docx_file_stream: IO[bytes],
25 | save_fig_dir,
26 | ):
27 | tag_w = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
28 | tag_body = f"{tag_w}body"
29 | 
30 | content_list = []
31 | text_count = 0
32 | with zipfile.ZipFile(docx_file_stream, "r") as docx:
33 | xml_content = docx.read("word/document.xml")
34 | tree = ET.XML(xml_content)
35 | body = tree.find(tag_body)
36 | 
37 | for child in body:
38 | tag = child.tag.split("}")[-1]
39 | if text_count >= self.max_text_count:
40 | break
41 | 
42 | match tag:
43 | case "p":
44 | text = ""
45 | for ele in child.iter():
46 | if "math" in ele.tag:
47 | if ele.tag.endswith("oMath"):
48 | math_xml = BytesIO()
49 | ET.ElementTree(ele).write(
50 | math_xml,
51 | encoding="utf-8",
52 | xml_declaration=True,
53 | )
54 | math_xml = math_xml.getvalue().decode("utf-8")
55 | math_xml = "\n".join(math_xml.split("\n")[1:])  # drop the XML declaration line emitted by ElementTree
56 | math_formula = "\n" + omml2tex(math_xml) + "\n"
57 | 
58 | text = text.strip()
59 | if len(text) > 0:
60 | text_count += len(text) + 1
61 | content_list.append(
62 | Content(type="text", data=text)
63 | )
64 | text = ""
65 | 
66 | text_count += len(math_formula)
67 | content_list.append(
68 | Content(type="md", data=math_formula)
69 | )
70 | continue
71 | if t := ele.text:
72 | if len(t) > 0:
73 | text += t
74 | text = text.strip()
75 | if len(text) > 0:
76 | text_count += len(text) + 1
77 | content_list.append(Content(type="text", data=text))
78 | text = ""
79 | case "tbl":
80 | col_size = len(list(child.find(f"{tag_w}tblGrid")))
81 | md = "\n"
82 | for idx, row in enumerate(child.iter(f"{tag_w}tr")):
83 | if idx == 1:
84 | md += "|"
85 | for _ in range(col_size):
86 | md += "---|"
87 | md += "\n"
88 | md += "|"
89 | # print(row)
90 | for cell in row.iter(f"{tag_w}tc"):
91 | t = ""
92 | for cell_ele in cell.itertext():
93 | t += (
94 | cell_ele.strip()
95 | .replace("\r", "")
96 | .replace("\n", "")
97 | )
98 | md += f" {t} |"
99 | md += "\n"
100 | md += "\n"
101 | text_count += len(md)
102 | content_list.append(Content(type="md", data=md))
103 | case "sectPr":
104 | # docx section properties (page layout); no body content to extract
105 | pass
106 | case unknown:
107 | pass
108 | # print(unknown)
109 | return content_list
110 | 
111 | def extract(
112 | self,
113 | r: FileStorage | Path,
114 | id: str,
115 | dir: Path,
116 | media_dir: Path,
117 | skip_image: bool,
118 | ) -> ExtractResponse:
119 | if type(r) is FileStorage:
120 | page = Page(
121 | page_no=0,
122 | content_list=self.__word2markdown(id, r.stream, media_dir),
123 | )
124 | else:
125 | page = Page(
126 | page_no=0,
127 | content_list=self.__word2markdown(id, open(r, "rb"),
media_dir),
128 | )
129 | # self.clean_up(id)
130 | return [page]
131 | 
132 | 
133 | if __name__ == "__main__":
134 | e = DocxExtractor()
135 | 
136 | res = e.run(
137 | "def",
138 | Path(
139 | "test_data/doc/【中简】模电自测第四版.docx",
140 | ),
141 | )
142 | 
143 | print(res)
144 | 
145 | e.wait_all()
146 | 
--------------------------------------------------------------------------------
/magic_doc/model/sub_modules/layoutlmv3/layoutlmft/data/data_collator.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from dataclasses import dataclass
3 | from typing import Any, Dict, List, Optional, Tuple, Union
4 | 
5 | from transformers import BatchEncoding, PreTrainedTokenizerBase
6 | from transformers.data.data_collator import (
7 | DataCollatorMixin,
8 | _torch_collate_batch,
9 | )
10 | from transformers.file_utils import PaddingStrategy
11 | 
12 | from typing import NewType
13 | InputDataClass = NewType("InputDataClass", Any)
14 | 
15 | def pre_calc_rel_mat(segment_ids):
16 | valid_span = torch.zeros((segment_ids.shape[0], segment_ids.shape[1], segment_ids.shape[1]),
17 | device=segment_ids.device, dtype=torch.bool)
18 | for i in range(segment_ids.shape[0]):
19 | for j in range(segment_ids.shape[1]):
20 | valid_span[i, j, :] = segment_ids[i, :] == segment_ids[i, j]
21 | 
22 | return valid_span
23 | 
24 | @dataclass
25 | class DataCollatorForKeyValueExtraction(DataCollatorMixin):
26 | """
27 | Data collator that will dynamically pad the inputs received, as well as the labels.
28 | Args:
29 | tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
30 | The tokenizer used for encoding the data.
31 | padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
32 | Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
33 | among:
34 | * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
35 | sequence is provided).
36 | * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
37 | maximum acceptable input length for the model if that argument is not provided.
38 | * :obj:`False` or :obj:`'do_not_pad'`: No padding (i.e., can output a batch with sequences of
39 | different lengths).
40 | max_length (:obj:`int`, `optional`):
41 | Maximum length of the returned list and optionally padding length (see above).
42 | pad_to_multiple_of (:obj:`int`, `optional`):
43 | If set will pad the sequence to a multiple of the provided value.
44 | This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
45 | 7.5 (Volta).
46 | label_pad_token_id (:obj:`int`, `optional`, defaults to -100):
47 | The id to use when padding the labels (-100 will be automatically ignored by PyTorch loss functions).
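    Example (hypothetical usage; ``tokenizer`` and ``features`` are assumed to
    come from a LayoutLMv3 preprocessing pipeline and are not defined here):

        collator = DataCollatorForKeyValueExtraction(
            tokenizer=tokenizer,
            padding="max_length",
            max_length=512,
            pad_to_multiple_of=8,
        )
        batch = collator(features)  # dict of padded torch tensors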
48 | """
49 | 
50 | tokenizer: PreTrainedTokenizerBase
51 | padding: Union[bool, str, PaddingStrategy] = True
52 | max_length: Optional[int] = None
53 | pad_to_multiple_of: Optional[int] = None
54 | label_pad_token_id: int = -100
55 | 
56 | def __call__(self, features):
57 | label_name = "label" if "label" in features[0].keys() else "labels"
58 | labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None
59 | 
60 | images = None
61 | if "images" in features[0]:
62 | images = torch.stack([torch.tensor(d.pop("images")) for d in features])
63 | IMAGE_LEN = int(images.shape[-1] / 16) * int(images.shape[-1] / 16) + 1  # (side/16)**2 visual patches + 1 CLS token, assuming square images
64 | 
65 | batch = self.tokenizer.pad(
66 | features,
67 | padding=self.padding,
68 | max_length=self.max_length,
69 | pad_to_multiple_of=self.pad_to_multiple_of,
70 | # Conversion to tensors will fail if we have labels as they are not of the same length yet.
71 | return_tensors="pt" if labels is None else None,
72 | )
73 | 
74 | if images is not None:
75 | batch["images"] = images
76 | batch = {k: torch.tensor(v, dtype=torch.int64) if isinstance(v[0], list) and k == 'attention_mask' else v
77 | for k, v in batch.items()}
78 | visual_attention_mask = torch.ones((len(batch['input_ids']), IMAGE_LEN), dtype=torch.long)
79 | batch["attention_mask"] = torch.cat([batch['attention_mask'], visual_attention_mask], dim=1)
80 | 
81 | if labels is None:
82 | return batch
83 | 
84 | # tokenizer.pad only pads input_ids/attention_mask, so labels, bboxes and
85 | # position ids are padded manually below, respecting the padding side.
86 | has_bbox_input = "bbox" in features[0]
87 | has_position_input = "position_ids" in features[0]
88 | padding_idx = self.tokenizer.pad_token_id
89 | sequence_length = torch.tensor(batch["input_ids"]).shape[1]
90 | padding_side = self.tokenizer.padding_side
91 | if padding_side == "right":
92 | batch["labels"] = [label + [self.label_pad_token_id] * (sequence_length - len(label)) for label in labels]
93 | if has_bbox_input:
94 | batch["bbox"] = [bbox + [[0, 0, 0, 0]] * (sequence_length - len(bbox)) for bbox in batch["bbox"]]
95 | if has_position_input:
96 | batch["position_ids"] = [position_id + [padding_idx] * (sequence_length - len(position_id))
97 | for position_id in batch["position_ids"]]
98 | 
99 | else:
100 | batch["labels"] = [[self.label_pad_token_id] * (sequence_length - len(label)) + label for label in labels]
101 | if has_bbox_input:
102 | batch["bbox"] = [[[0, 0, 0, 0]] * (sequence_length - len(bbox)) + bbox for bbox in batch["bbox"]]
103 | if has_position_input:
104 | batch["position_ids"] = [[padding_idx] * (sequence_length - len(position_id))
105 | + position_id for position_id in batch["position_ids"]]
106 | 
107 | if 'segment_ids' in batch:
108 | assert 'position_ids' in batch
109 | for i in range(len(batch['segment_ids'])):
110 | batch['segment_ids'][i] = batch['segment_ids'][i] + [batch['segment_ids'][i][-1] + 1] * (sequence_length - len(batch['segment_ids'][i])) + [
111 | batch['segment_ids'][i][-1] + 2] * IMAGE_LEN
112 | 
113 | batch = {k: torch.tensor(v, dtype=torch.int64) if isinstance(v[0], list) else v for k, v in batch.items()}
114 | 
115 | if 'segment_ids' in batch:
116 | valid_span = pre_calc_rel_mat(
117 | segment_ids=batch['segment_ids']
118 | )
119 | batch['valid_span'] = valid_span
120 | del batch['segment_ids']
121 | 
122 | if images is not None:
123 | visual_labels = torch.ones((len(batch['input_ids']), IMAGE_LEN), dtype=torch.long) * -100
124 | batch["labels"] = torch.cat([batch['labels'], visual_labels], dim=1)
125 | 
126 | return batch
127 | 
--------------------------------------------------------------------------------
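A minimal, self-contained sketch (not part of the repo) of what pre_calc_rel_mat above computes: for each row in the batch, a boolean matrix marking the token pairs that share a segment id.

import torch

# one batch row with two segments: tokens 0-1 in segment 0, tokens 2-3 in segment 1
segment_ids = torch.tensor([[0, 0, 1, 1]])
valid_span = torch.zeros((1, 4, 4), dtype=torch.bool)
for j in range(4):
    valid_span[0, j, :] = segment_ids[0, :] == segment_ids[0, j]
# valid_span[0] is block-diagonal: True within each segment, False across segments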
--------------------------------------------------------------------------------
/magic_doc/contrib/magic_html/mmltex/glayout.xsl:
--------------------------------------------------------------------------------
[glayout.xsl: XSLT templates mapping MathML layout schemata to LaTeX. The stylesheet markup is not recoverable; only the LaTeX fragments it emits survive: \genfrac{}{}{...} and \frac{...}{...} with \hfill alignment and ex-based line thicknesses (fractions), \sqrt[...]{...} and \sqrt{...} with an "exception 25: \text{exception 25:}" fallback (roots), \left( ... \right) with "," separators (fenced expressions), \phantom{...}, \overline{...\hspace{.2em}|} and \overline{)...} (enclosures), and \colorbox[rgb]{...}{$...$} / \textcolor[rgb]{...}{...} (colour).]
--------------------------------------------------------------------------------
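A minimal sketch (not part of the repo) of how the mmltex stylesheets in this directory could be driven; the entry point mmltex.xsl, the companion-sheet relationship, and the sample MathML input are assumptions, and lxml is used purely for illustration.

from lxml import etree

# compile the stylesheet; mmltex.xsl is assumed to pull in its companion
# sheets (tables.xsl, glayout.xsl, ...) via XSLT includes
transform = etree.XSLT(etree.parse("magic_doc/contrib/magic_html/mmltex/mmltex.xsl"))

mathml = etree.fromstring(
    '<math xmlns="http://www.w3.org/1998/Math/MathML">'
    "<mfrac><mi>a</mi><mi>b</mi></mfrac>"
    "</math>"
)
print(str(transform(mathml)))  # expected LaTeX along the lines of $\frac{a}{b}$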