├── .cursorignore ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.yaml │ └── feature_request.yaml ├── PULL_REQUEST_TEMPLATE │ └── pr_form.yml ├── dependabot.yml ├── labels.yml ├── release-drafter.yml └── workflows │ ├── codeql.yml │ ├── docs.yml │ ├── labeler.yml │ ├── lint.yml │ ├── pr-lint.yml │ ├── publish-to-pypi.yml │ └── test.yml ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── babeldoc ├── __init__.py ├── assets │ ├── assets.py │ └── embedding_assets_metadata.py ├── asynchronize │ └── __init__.py ├── const.py ├── converter.py ├── document_il │ ├── __init__.py │ ├── babeldoc_exception │ │ └── BabelDOCException.py │ ├── backend │ │ ├── __init__.py │ │ └── pdf_creater.py │ ├── frontend │ │ ├── __init__.py │ │ └── il_creater.py │ ├── il_version_1.py │ ├── il_version_1.rnc │ ├── il_version_1.rng │ ├── il_version_1.xsd │ ├── midend │ │ ├── __init__.py │ │ ├── add_debug_information.py │ │ ├── detect_scanned_file.py │ │ ├── il_translator.py │ │ ├── il_translator_llm_only.py │ │ ├── layout_parser.py │ │ ├── paragraph_finder.py │ │ ├── remove_descent.py │ │ ├── styles_and_formulas.py │ │ ├── table_parser.py │ │ └── typesetting.py │ ├── translator │ │ ├── __init__.py │ │ ├── cache.py │ │ └── translator.py │ ├── utils │ │ ├── atomic_integer.py │ │ ├── fontmap.py │ │ ├── layout_helper.py │ │ ├── priority_thread_pool_executor.py │ │ └── style_helper.py │ └── xml_converter.py ├── docvision │ ├── README.md │ ├── __init__.py │ ├── doclayout.py │ ├── rpc_doclayout.py │ └── table_detection │ │ └── rapidocr.py ├── format │ └── office │ │ └── __init__.py ├── high_level.py ├── main.py ├── pdfinterp.py ├── progress_monitor.py ├── result_merger.py ├── split_manager.py ├── tools │ ├── generate_font_metadata.py │ ├── italic_assistance.py │ └── italic_recognize_tool.py └── translation_config.py ├── docs ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── CONTRIBUTOR_REWARD.md ├── ImplementationDetails │ ├── AsyncTranslate │ │ └── AsyncTranslate.md │ ├── ILTranslator 
│ │ └── ILTranslator.md │ ├── PDFCreation │ │ └── PDFCreation.md │ ├── PDFParsing │ │ └── PDFParsing.md │ ├── ParagraphFinding │ │ └── ParagraphFinding.md │ ├── README.md │ ├── StylesAndFormulas │ │ └── StylesAndFormulas.md │ └── Typesetting │ │ └── Typesetting.md ├── README.md ├── deploy.sh ├── images │ ├── babeldoc-banner.png │ ├── babeldoc-big-logo-darkmode-with-transparent-background.png │ ├── babeldoc-big-logo-darkmode-with-transparent-background.svg │ ├── babeldoc-big-logo-with-transparent-background.png │ ├── babeldoc-big-logo-with-transparent-background.svg │ ├── babeldoc-big-logo.png │ ├── babeldoc-contributor_reward_example.png │ ├── babeldoc-preview.gif │ ├── babeldoc-small-logo-with-transparent-background.png │ ├── babeldoc-small-logo-with-transparent-background.svg │ └── babeldoc-small-logo.png ├── index.md ├── intro-to-pdf-object.md └── requirements.txt ├── examples ├── basic.xml ├── ci │ └── test.pdf ├── code-figure.xml ├── complex.xml ├── formular.xml └── table.xml ├── mkdocs.yml ├── pyproject.toml ├── tests └── test_translation_config.py └── uv.lock /.cursorignore: -------------------------------------------------------------------------------- 1 | # Project notes and templates 2 | xnotes/ 3 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.yaml: -------------------------------------------------------------------------------- 1 | name: "🐞 Bug Report" 2 | description: Create a report to help us improve 3 | labels: ['bug'] 4 | body: 5 | - type: checkboxes 6 | id: checks 7 | attributes: 8 | label: Before you submit 9 | options: 10 | - label: I have searched existing issues 11 | required: true 12 | - label: I spent at least 5 minutes investigating and preparing this report 13 | required: true 14 | - label: I confirmed this is not caused by a network issue 15 | required: true 16 | 17 | - type: markdown 18 | attributes: 19 | value: | 20 | Thank you for using **BabelDOC** and helping us 
improve it! 🙏 21 | 22 | - type: textarea 23 | id: environment 24 | attributes: 25 | label: Environment 26 | description: Provide your system details (required) 27 | value: | 28 | - OS: 29 | - Python: 30 | - BabelDOC: 31 | render: markdown 32 | validations: 33 | required: true 34 | 35 | - type: textarea 36 | id: describe 37 | attributes: 38 | label: Describe the bug 39 | description: A clear and concise description of what the bug is. 40 | validations: 41 | required: true 42 | 43 | - type: textarea 44 | id: reproduce 45 | attributes: 46 | label: Steps to Reproduce 47 | description: Help us reproduce the issue 48 | value: | 49 | 1. Go to '...' 50 | 2. Click on '...' 51 | 3. See error 52 | validations: 53 | required: false 54 | 55 | - type: textarea 56 | id: expected 57 | attributes: 58 | label: Expected Behavior 59 | description: What did you expect to happen? 60 | validations: 61 | required: false 62 | 63 | - type: textarea 64 | id: logs 65 | attributes: 66 | label: Relevant Log Output or Screenshots 67 | description: Copy and paste any logs or attach screenshots. This will be formatted automatically. 68 | render: text 69 | validations: 70 | required: false 71 | 72 | - type: textarea 73 | id: pdf 74 | attributes: 75 | label: Original PDF File 76 | description: Upload the input PDF if applicable. 77 | validations: 78 | required: false 79 | 80 | - type: textarea 81 | id: others 82 | attributes: 83 | label: Additional Context 84 | description: Anything else we should know? 85 | validations: 86 | required: false 87 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.yaml: -------------------------------------------------------------------------------- 1 | name: "✨ Feature Request" 2 | description: Suggest a new idea or improvement for BabelDOC 3 | labels: ['enhancement'] 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: | 8 | Thank you for helping improve **BabelDOC**! 
Please fill out the form below to suggest a feature. 9 | 10 | - type: textarea 11 | id: describe 12 | attributes: 13 | label: Is your feature request related to a problem? 14 | description: If applicable, describe what problem this feature would solve. 15 | placeholder: Ex. I'm always frustrated when ... 16 | validations: 17 | required: false 18 | 19 | - type: textarea 20 | id: solution 21 | attributes: 22 | label: Describe the solution you'd like 23 | description: What would you like to see happen? 24 | validations: 25 | required: true 26 | 27 | - type: textarea 28 | id: alternatives 29 | attributes: 30 | label: Describe alternatives you've considered 31 | description: Have you thought of other ways to solve this? 32 | validations: 33 | required: false 34 | 35 | - type: textarea 36 | id: additional 37 | attributes: 38 | label: Additional context 39 | description: Any other context, examples, or screenshots? 40 | validations: 41 | required: false 42 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE/pr_form.yml: -------------------------------------------------------------------------------- 1 | name: Pull Request 2 | description: Submit a pull request to contribute to BabelDOC 3 | title: "[PR] " 4 | labels: 5 | - needs triage 6 | body: 7 | - type: markdown 8 | attributes: 9 | value: | 10 | ## 👋 Thanks for contributing to **BabelDOC**! 11 | 12 | Please fill out this form to help us review your pull request effectively. 13 | 14 | - type: input 15 | id: issue 16 | attributes: 17 | label: Related Issue(s) 18 | description: If this pull request closes or is related to one or more issues, list them here (e.g., #37) 19 | placeholder: "#37" 20 | validations: 21 | required: false 22 | 23 | - type: textarea 24 | id: summary 25 | attributes: 26 | label: Description 27 | description: Describe the purpose of this pull request and what was changed. 28 | placeholder: | 29 | - What does this PR introduce or fix? 
30 | - What is the motivation behind it? 31 | validations: 32 | required: true 33 | 34 | - type: dropdown 35 | id: pr_type 36 | attributes: 37 | label: PR Type 38 | description: What kind of change is this? 39 | multiple: true 40 | options: 41 | - enhancement 42 | - bug 43 | - documentation 44 | - refactor 45 | - test 46 | - chore 47 | validations: 48 | required: true 49 | 50 | - type: checkboxes 51 | id: checklist 52 | attributes: 53 | label: Contributor Checklist 54 | options: 55 | - label: I’ve read the **CONTRIBUTING.md** guide 56 | required: true 57 | - label: My changes follow the project’s code style and guidelines 58 | required: true 59 | - label: I’ve linked the related issue(s) in the description above 60 | - label: I’ve updated relevant documentation (if applicable) 61 | - label: I’ve added necessary tests (if applicable) 62 | - label: All new and existing tests passed locally 63 | - label: I understand that due to limited maintainer resources, only small pull requests are accepted. Suggestions with proof-of-concept patches are appreciated, and my patch may be rewritten if necessary. 64 | 65 | - type: textarea 66 | id: testing 67 | attributes: 68 | label: Testing Instructions 69 | description: Provide step-by-step instructions on how to test your changes 70 | placeholder: | 71 | 1. Run `...` 72 | 2. Visit `...` 73 | 3. Click `...` 74 | 4. Verify `...` 75 | validations: 76 | required: false 77 | 78 | - type: textarea 79 | id: screenshots 80 | attributes: 81 | label: Screenshots (if applicable) 82 | description: If UI changes were made, please attach before/after screenshots. 83 | validations: 84 | required: false 85 | 86 | - type: textarea 87 | id: notes 88 | attributes: 89 | label: Additional Notes 90 | description: Anything else the reviewer should know? 
91 | validations: 92 | required: false 93 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: github-actions 4 | directory: "/" 5 | schedule: 6 | interval: weekly 7 | # - package-ecosystem: pip 8 | # directory: "/.github/workflows" 9 | # schedule: 10 | # interval: weekly 11 | # - package-ecosystem: pip 12 | # directory: "/docs" 13 | # schedule: 14 | # interval: weekly 15 | - package-ecosystem: pip 16 | directory: "/" 17 | schedule: 18 | interval: weekly 19 | versioning-strategy: lockfile-only 20 | allow: 21 | - dependency-type: "all" -------------------------------------------------------------------------------- /.github/labels.yml: -------------------------------------------------------------------------------- 1 | --- 2 | # Label names are important: Release Drafter uses them to decide 3 | # where to record changes in the changelog, or whether to skip them. 4 | # 5 | # The repository labels will be automatically configured using this file and 6 | # the GitHub Action https://github.com/marketplace/actions/github-labeler. 
7 | - name: breaking 8 | description: Breaking Changes 9 | color: "bfd4f2" 10 | - name: bug 11 | description: Something isn't working 12 | color: "d73a4a" 13 | - name: build 14 | description: Build System and Dependencies 15 | color: "bfdadc" 16 | - name: ci 17 | description: Continuous Integration 18 | color: "4a97d6" 19 | - name: dependencies 20 | description: Pull requests that update a dependency file 21 | color: "0366d6" 22 | - name: documentation 23 | description: Improvements or additions to documentation 24 | color: "0075ca" 25 | - name: duplicate 26 | description: This issue or pull request already exists 27 | color: "cfd3d7" 28 | - name: enhancement 29 | description: New feature or request 30 | color: "a2eeef" 31 | - name: github_actions 32 | description: Pull requests that update Github_actions code 33 | color: "000000" 34 | - name: good first issue 35 | description: Good for newcomers 36 | color: "7057ff" 37 | - name: help wanted 38 | description: Extra attention is needed 39 | color: "008672" 40 | - name: invalid 41 | description: This doesn't seem right 42 | color: "e4e669" 43 | - name: performance 44 | description: Performance 45 | color: "016175" 46 | - name: python 47 | description: Pull requests that update Python code 48 | color: "2b67c6" 49 | - name: question 50 | description: Further information is requested 51 | color: "d876e3" 52 | - name: refactoring 53 | description: Refactoring 54 | color: "ef67c4" 55 | - name: removal 56 | description: Removals and Deprecations 57 | color: "9ae7ea" 58 | - name: style 59 | description: Style 60 | color: "c120e5" 61 | - name: testing 62 | description: Testing 63 | color: "b1fc6f" 64 | - name: wontfix 65 | description: This will not be worked on 66 | color: "ffffff" -------------------------------------------------------------------------------- /.github/release-drafter.yml: -------------------------------------------------------------------------------- 1 | name-template: 'v$RESOLVED_VERSION' 2 | 
tag-template: 'v$RESOLVED_VERSION' 3 | categories: 4 | - title: '🚀 Features' 5 | labels: 6 | - 'feature' 7 | - 'enhancement' 8 | - title: '🐛 Bug Fixes' 9 | labels: 10 | - 'fix' 11 | - 'bugfix' 12 | - 'bug' 13 | - title: '🧰 Maintenance' 14 | labels: 15 | - 'chore' 16 | - 'maintenance' 17 | - 'refactor' 18 | - title: '📝 Documentation' 19 | labels: 20 | - 'docs' 21 | - 'documentation' 22 | change-template: '- $TITLE @$AUTHOR (#$NUMBER)' 23 | change-title-escapes: '\<*_&' # You can add # and @ to disable mentions 24 | version-resolver: 25 | major: 26 | labels: 27 | - 'major' 28 | minor: 29 | labels: 30 | - 'minor' 31 | patch: 32 | labels: 33 | - 'patch' 34 | default: patch 35 | template: | 36 | ## Changes 37 | 38 | $CHANGES 39 | 40 | ## Contributors 41 | 42 | $CONTRIBUTORS 43 | -------------------------------------------------------------------------------- /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 11 | # 12 | name: "CodeQL Advanced" 13 | 14 | on: 15 | push: 16 | pull_request: 17 | branches: [ "main" ] 18 | schedule: 19 | - cron: '36 14 * * 1' 20 | 21 | jobs: 22 | analyze: 23 | name: Analyze (${{ matrix.language }}) 24 | # Runner size impacts CodeQL analysis time. 
To learn more, please see: 25 | # - https://gh.io/recommended-hardware-resources-for-running-codeql 26 | # - https://gh.io/supported-runners-and-hardware-resources 27 | # - https://gh.io/using-larger-runners (GitHub.com only) 28 | # Consider using larger runners or machines with greater resources for possible analysis time improvements. 29 | runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }} 30 | permissions: 31 | # required for all workflows 32 | security-events: write 33 | 34 | # required to fetch internal or private CodeQL packs 35 | packages: read 36 | 37 | # only required for workflows in private repositories 38 | actions: read 39 | contents: read 40 | 41 | strategy: 42 | fail-fast: false 43 | matrix: 44 | include: 45 | - language: python 46 | build-mode: none 47 | - language: actions 48 | # CodeQL supports the following keyword values for 'language': 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift' 49 | # Use `c-cpp` to analyze code written in C, C++ or both 50 | # Use 'java-kotlin' to analyze code written in Java, Kotlin or both 51 | # Use 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both 52 | # To learn more about changing the languages that are analyzed or customizing the build mode for your analysis, 53 | # see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning. 54 | # If you are analyzing a compiled language, you can modify the 'build-mode' for that language to customize how 55 | # your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages 56 | steps: 57 | - name: Checkout repository 58 | uses: actions/checkout@v4 59 | 60 | # Initializes the CodeQL tools for scanning. 
61 | - name: Initialize CodeQL 62 | uses: github/codeql-action/init@v3 63 | with: 64 | languages: ${{ matrix.language }} 65 | build-mode: ${{ matrix.build-mode }} 66 | # If you wish to specify custom queries, you can do so here or in a config file. 67 | # By default, queries listed here will override any specified in a config file. 68 | # Prefix the list here with "+" to use these queries and those in the config file. 69 | 70 | # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs 71 | # queries: security-extended,security-and-quality 72 | 73 | # If the analyze step fails for one of the languages you are analyzing with 74 | # "We were unable to automatically build your code", modify the matrix above 75 | # to set the build mode to "manual" for that language. Then modify this step 76 | # to build your code. 77 | # ℹ️ Command-line programs to run using the OS shell. 
78 | # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun 79 | - if: matrix.build-mode == 'manual' 80 | shell: bash 81 | run: | 82 | echo 'If you are using a "manual" build mode for one or more of the' \ 83 | 'languages you are analyzing, replace this with the commands to build' \ 84 | 'your code, for example:' 85 | echo ' make bootstrap' 86 | echo ' make release' 87 | exit 1 88 | 89 | - name: Perform CodeQL Analysis 90 | uses: github/codeql-action/analyze@v3 91 | with: 92 | category: "/language:${{matrix.language}}" 93 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: docs 2 | on: 3 | push: 4 | branches: 5 | - main 6 | permissions: 7 | contents: write 8 | jobs: 9 | deploy: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v4 13 | with: 14 | fetch-depth: 0 15 | - name: Configure Git Credentials 16 | run: | 17 | git config user.name github-actions[bot] 18 | git config user.email 41898282+github-actions[bot]@users.noreply.github.com 19 | - name: Setup uv with Python 3.12 20 | uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5.4.2 21 | with: 22 | python-version: "3.12" 23 | enable-cache: true 24 | cache-dependency-glob: "uv.lock" 25 | - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV 26 | - uses: actions/cache@v4 27 | with: 28 | key: mkdocs-material-${{ env.cache_id }} 29 | path: .cache 30 | restore-keys: | 31 | mkdocs-material- 32 | - run: uv sync 33 | - run: uv run mkdocs gh-deploy --force -------------------------------------------------------------------------------- /.github/workflows/labeler.yml: -------------------------------------------------------------------------------- 1 | name: Labeler 2 | 3 | on: 4 | push: 5 | branches: 6 | - 'main' 7 | paths: 8 | - '.github/labels.yml' 9 | - 
'.github/workflows/labels.yml' 10 | pull_request: 11 | paths: 12 | - '.github/labels.yml' 13 | - '.github/workflows/labels.yml' 14 | 15 | permissions: 16 | contents: read 17 | issues: write 18 | pull-requests: write 19 | 20 | jobs: 21 | labeler: 22 | runs-on: ubuntu-latest 23 | steps: 24 | - name: Check out the repository 25 | uses: actions/checkout@v4 26 | 27 | - name: Run Labeler 28 | uses: crazy-max/ghaction-github-labeler@24d110aa46a59976b8a7f35518cb7f14f434c916 # v5.3.0 29 | with: 30 | skip-delete: true 31 | dry-run: ${{ github.event_name == 'pull_request' }} 32 | github-token: ${{ secrets.GITHUB_TOKEN }} 33 | yaml-file: .github/labels.yml 34 | exclude: | 35 | help* 36 | *issue -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: Lint Code 2 | permissions: 3 | contents: read 4 | pull-requests: write 5 | on: [push] 6 | 7 | jobs: 8 | lint: 9 | strategy: 10 | fail-fast: false 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v4 14 | - name: Ruff 15 | uses: astral-sh/ruff-action@v3 16 | - name: AutoCorrect 17 | uses: huacnlee/autocorrect-action@main 18 | -------------------------------------------------------------------------------- /.github/workflows/pr-lint.yml: -------------------------------------------------------------------------------- 1 | name: Lint Code and Review Dog Report 2 | 3 | on: [pull_request] 4 | permissions: 5 | contents: read 6 | pull-requests: write 7 | jobs: 8 | ruff: 9 | name: runner / ruff 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v4 13 | 14 | - name: Install Python 15 | uses: actions/setup-python@v5 16 | with: 17 | python-version: '3.11' 18 | 19 | - name: Install ruff 20 | run: pip install ruff 21 | 22 | - name: Install reviewdog 23 | uses: reviewdog/action-setup@e04ffabe3898a0af8d0fb1af00c188831c4b5893 # v1.3.2 24 | with: 25 | 
reviewdog_version: latest 26 | 27 | - name: Run ruff with reviewdog 28 | env: 29 | REVIEWDOG_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }} 30 | run: | 31 | ruff check . --output-format=rdjson | reviewdog -f=rdjson -reporter=github-pr-review -fail-on-error 32 | 33 | autocorrect: 34 | name: runner / autocorrect 35 | runs-on: ubuntu-latest 36 | steps: 37 | - uses: actions/checkout@v4 38 | - name: AutoCorrect 39 | uses: huacnlee/autocorrect-action@bf91ab3904c2908dd8e71312a8a83ed1eb632997 # v2.13.3 40 | - name: Report ReviewDog 41 | if: failure() 42 | uses: huacnlee/autocorrect-action@bf91ab3904c2908dd8e71312a8a83ed1eb632997 # v2.13.3 43 | env: 44 | REVIEWDOG_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }} 45 | with: 46 | reviewdog: true -------------------------------------------------------------------------------- /.github/workflows/publish-to-pypi.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - master 8 | 9 | permissions: 10 | id-token: write 11 | contents: write 12 | pull-requests: write 13 | 14 | jobs: 15 | check-repository: 16 | name: Check if running in main repository 17 | runs-on: ubuntu-latest 18 | outputs: 19 | is_main_repo: ${{ github.repository == 'funstory-ai/BabelDOC' }} 20 | steps: 21 | - run: echo "Running repository check" 22 | 23 | build: 24 | name: Build distribution 📦 25 | needs: check-repository 26 | if: needs.check-repository.outputs.is_main_repo == 'true' 27 | runs-on: ubuntu-latest 28 | outputs: 29 | is_release: ${{ steps.check-version.outputs.tag }} 30 | steps: 31 | - uses: actions/checkout@v4 32 | with: 33 | persist-credentials: true 34 | fetch-depth: 2 35 | token: ${{ secrets.GITHUB_TOKEN }} 36 | 37 | - name: Setup uv with Python 3.12 38 | uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5.4.2 39 | with: 40 | python-version: "3.12" 41 | enable-cache: true 42 | cache-dependency-glob: "uv.lock" 43 | 44 | - name: Check 
if there is a parent commit 45 | id: check-parent-commit 46 | run: | 47 | echo "sha=$(git rev-parse --verify --quiet HEAD^)" >> $GITHUB_OUTPUT 48 | 49 | - name: Detect and tag new version 50 | id: check-version 51 | if: steps.check-parent-commit.outputs.sha 52 | uses: salsify/action-detect-and-tag-new-version@b1778166f13188a9d478e2d1198f993011ba9864 # v2.0.3 53 | with: 54 | version-command: | 55 | cat pyproject.toml | grep "version = " | head -n 1 | awk -F'"' '{print $2}' 56 | 57 | - name: Install Dependencies 58 | run: | 59 | uv sync 60 | 61 | - name: Bump version for developmental release 62 | if: "! steps.check-version.outputs.tag" 63 | run: | 64 | version=$(bumpver update --patch --tag=final --dry 2>&1 | grep "New Version" | awk '{print $NF}') && 65 | bumpver update --set-version $version.dev$(date +%s) 66 | 67 | - name: Build package 68 | run: "uv build" 69 | 70 | - name: Store the distribution packages 71 | uses: actions/upload-artifact@v4.6.2 72 | with: 73 | name: python-package-distributions 74 | path: dist/ 75 | 76 | publish-to-pypi: 77 | name: Publish Python 🐍 distribution 📦 to PyPI 78 | if: needs.build.outputs.is_release != '' 79 | needs: 80 | - check-repository 81 | - build 82 | runs-on: ubuntu-latest 83 | environment: 84 | name: pypi 85 | url: https://pypi.org/p/BabelDOC 86 | 87 | permissions: 88 | id-token: write 89 | 90 | steps: 91 | - name: Download all the dists 92 | uses: actions/download-artifact@95815c38cf2ff2164869cbab79da8d1f422bc89e # v4.2.1 93 | with: 94 | name: python-package-distributions 95 | path: dist/ 96 | 97 | - name: Publish distribution 📦 to PyPI 98 | uses: pypa/gh-action-pypi-publish@76f52bc884231f62b9a034ebfe128415bbaabdfc # v1.12.4 99 | 100 | publish-to-testpypi: 101 | name: Publish Python 🐍 distribution 📦 to TestPyPI 102 | if: needs.build.outputs.is_release == '' 103 | needs: 104 | - check-repository 105 | - build 106 | runs-on: ubuntu-latest 107 | environment: 108 | name: testpypi 109 | url: https://test.pypi.org/p/BabelDOC 110 
| 111 | permissions: 112 | id-token: write 113 | 114 | steps: 115 | - name: Download all the dists 116 | uses: actions/download-artifact@95815c38cf2ff2164869cbab79da8d1f422bc89e # v4.2.1 117 | with: 118 | name: python-package-distributions 119 | path: dist/ 120 | 121 | - name: Publish distribution 📦 to TestPyPI 122 | uses: pypa/gh-action-pypi-publish@76f52bc884231f62b9a034ebfe128415bbaabdfc # v1.12.4 123 | with: 124 | repository-url: https://test.pypi.org/legacy/ 125 | 126 | post-release: 127 | name: Post Release Tasks 128 | needs: 129 | - check-repository 130 | - build 131 | - publish-to-pypi 132 | - publish-to-testpypi 133 | if: | 134 | always() && needs.check-repository.outputs.is_main_repo == 'true' && 135 | (needs.publish-to-pypi.result == 'success' || needs.publish-to-testpypi.result == 'success') 136 | runs-on: ubuntu-latest 137 | permissions: 138 | contents: write 139 | pull-requests: write 140 | steps: 141 | - uses: actions/checkout@v4 142 | with: 143 | persist-credentials: true 144 | fetch-depth: 2 145 | token: ${{ secrets.GITHUB_TOKEN }} 146 | 147 | - name: Publish the release notes 148 | uses: release-drafter/release-drafter@b1476f6e6eb133afa41ed8589daba6dc69b4d3f5 # v6.1.0 149 | with: 150 | publish: ${{ needs.build.outputs.is_release != '' }} 151 | tag: ${{ needs.build.outputs.is_release }} 152 | env: 153 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Run Tests 🧪 2 | 3 | on: 4 | push: 5 | pull_request: 6 | branches: ["main"] 7 | 8 | permissions: 9 | contents: read 10 | pull-requests: read 11 | 12 | jobs: 13 | test: 14 | name: Run Python Tests 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | python-version: ["3.10", "3.11", "3.12"] 19 | 20 | steps: 21 | - uses: actions/checkout@v4 22 | with: 23 | persist-credentials: false 24 | - name: Cached 
Assets 25 | id: cache-assets 26 | uses: actions/cache@v4.2.0 27 | with: 28 | path: ~/.cache/babeldoc 29 | key: babeldoc-assets-${{ hashFiles('babeldoc/assets/embedding_assets_metadata.py') }} 30 | - name: Setup uv with Python ${{ matrix.python-version }} 31 | uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5.4.2 32 | with: 33 | python-version: ${{ matrix.python-version }} 34 | enable-cache: true 35 | cache-dependency-glob: "uv.lock" 36 | - name: Warm up cache 37 | run: | 38 | uv run babeldoc --warmup 39 | - name: Run tests 40 | env: 41 | OPENAI_API_KEY: ${{ secrets.OPENAIAPIKEY }} 42 | OPENAI_BASE_URL: ${{ secrets.OPENAIAPIURL }} 43 | OPENAI_MODEL: ${{ secrets.OPENAIMODEL }} 44 | run: | 45 | uv run babeldoc --help 46 | uv run babeldoc --openai --files examples/ci/test.pdf --openai-api-key ${{ env.OPENAI_API_KEY }} --openai-base-url ${{ env.OPENAI_BASE_URL }} --openai-model ${{ env.OPENAI_MODEL }} 47 | - name: Generate offline assets package 48 | run: | 49 | uv run babeldoc --generate-offline-assets /tmp/offline_assets 50 | - name: Restore offline assets package 51 | run: | 52 | rm -rf ~/.cache/babeldoc 53 | uv run babeldoc --restore-offline-assets /tmp/offline_assets 54 | - name: Clean up 55 | run: | 56 | rm -rf /tmp/offline_assets 57 | rm -rf ~/.cache/babeldoc/cache.v1.db 58 | rm -rf ~/.cache/babeldoc/working 59 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | web/logs 3 | web/*.log 4 | web/npm-debug.log* 5 | web/yarn-debug.log* 6 | web/yarn-error.log* 7 | web/pnpm-debug.log* 8 | web/lerna-debug.log* 9 | 10 | web/node_modules 11 | web/dist 12 | web/dist-ssr 13 | web/*.local 14 | 15 | memray* 16 | **/*.so 17 | *.pdf 18 | *.docx 19 | *.json 20 | **/*.pyc 21 | .venv 22 | .idea 23 | *.egg-info 24 | .DS_Store 25 | .vscode 26 | __pycache__ 27 | .ruff_cache 28 | yadt.toml 29 | examples/ 30 | /make_gif.py 31 | 
/dist 32 | .cache 33 | .cursor/rules/_*.mdc 34 | /.cursor 35 | /xnotes 36 | /docs/workflow-rules.md -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | files: '^.*\.py$' 2 | repos: 3 | - repo: https://github.com/astral-sh/ruff-pre-commit 4 | # Ruff version. 5 | rev: v0.9.5 6 | hooks: 7 | # Run the linter. 8 | - id: ruff 9 | args: [ "--fix", 10 | "--ignore=E203,E261,E501,E741,F841" ] 11 | # Run the formatter. 12 | - id: ruff-format 13 | -------------------------------------------------------------------------------- /babeldoc/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.3.21" 2 | -------------------------------------------------------------------------------- /babeldoc/asynchronize/__init__.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import time 3 | 4 | 5 | class Args: 6 | def __init__(self, args, kwargs): 7 | self.args = args 8 | self.kwargs = kwargs 9 | 10 | 11 | class AsyncCallback: 12 | def __init__(self): 13 | self.queue = asyncio.Queue() 14 | self.finished = False 15 | self.loop = asyncio.get_event_loop() 16 | 17 | def step_callback(self, *args, **kwargs): 18 | # Whenever a step is called, add to the queue but don't set finished to True, so __anext__ will continue 19 | args = Args(args, kwargs) 20 | 21 | # We have to use the threadsafe call so that it wakes up the event loop, in case it's sleeping: 22 | # https://stackoverflow.com/a/49912853/2148718 23 | self.loop.call_soon_threadsafe(self.queue.put_nowait, args) 24 | 25 | # Add a small delay to release the GIL, ensuring the event loop has time to process messages 26 | time.sleep(0.01) 27 | 28 | def finished_callback(self, *args, **kwargs): 29 | # Whenever a finished is called, add to the queue as with step, but also set finished to True, so 
__anext__ 30 | # will terminate after processing the remaining items 31 | if self.finished: 32 | return 33 | self.step_callback(*args, **kwargs) 34 | self.finished = True 35 | 36 | def __await__(self): 37 | # Awaiting this object directly waits for the next queued item 38 | return self.queue.get().__await__() 39 | 40 | def __aiter__(self): 41 | # Since this implements __anext__, this can return itself 42 | return self 43 | 44 | async def __anext__(self): 45 | # Keep waiting for the queue if a) we haven't finished, or b) the queue still has items. This lets us finish 46 | # processing the remaining items even after we've finished 47 | if self.finished and self.queue.empty(): 48 | raise StopAsyncIteration 49 | 50 | result = await self.queue.get() 51 | return result 52 | -------------------------------------------------------------------------------- /babeldoc/const.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import subprocess 4 | from pathlib import Path 5 | 6 | __version__ = "0.3.21" 7 | 8 | CACHE_FOLDER = Path.home() / ".cache" / "babeldoc" 9 | 10 | 11 | def get_cache_file_path(filename: str, sub_folder: str | None = None) -> Path: 12 | if sub_folder is not None: 13 | sub_folder = sub_folder.strip("/") 14 | sub_folder_path = CACHE_FOLDER / sub_folder 15 | sub_folder_path.mkdir(parents=True, exist_ok=True) 16 | return sub_folder_path / filename 17 | return CACHE_FOLDER / filename 18 | 19 | 20 | try: 21 | git_path = shutil.which("git") 22 | if git_path is None: 23 | raise FileNotFoundError("git executable not found") 24 | two_parent = Path(__file__).resolve().parent.parent 25 | md_ = two_parent / "docs" / "README.md" 26 | if two_parent.name == "site-packages" or not md_.exists(): 27 | raise FileNotFoundError("not in git repo") 28 | WATERMARK_VERSION = ( 29 | subprocess.check_output( # noqa: S603 30 | [git_path, "describe", "--always"], 31 | cwd=Path(__file__).resolve().parent, 32 | ) 33 | 
.strip() 34 | .decode() 35 | ) 36 | except (OSError, FileNotFoundError, subprocess.CalledProcessError): 37 | WATERMARK_VERSION = f"v{__version__}" 38 | 39 | TIKTOKEN_CACHE_FOLDER = CACHE_FOLDER / "tiktoken" 40 | TIKTOKEN_CACHE_FOLDER.mkdir(parents=True, exist_ok=True) 41 | os.environ["TIKTOKEN_CACHE_DIR"] = str(TIKTOKEN_CACHE_FOLDER) 42 | -------------------------------------------------------------------------------- /babeldoc/document_il/__init__.py: -------------------------------------------------------------------------------- 1 | from babeldoc.document_il.il_version_1 import BaseOperations 2 | from babeldoc.document_il.il_version_1 import Box 3 | from babeldoc.document_il.il_version_1 import Cropbox 4 | from babeldoc.document_il.il_version_1 import Document 5 | from babeldoc.document_il.il_version_1 import GraphicState 6 | from babeldoc.document_il.il_version_1 import Mediabox 7 | from babeldoc.document_il.il_version_1 import Page 8 | from babeldoc.document_il.il_version_1 import PageLayout 9 | from babeldoc.document_il.il_version_1 import PdfCharacter 10 | from babeldoc.document_il.il_version_1 import PdfFigure 11 | from babeldoc.document_il.il_version_1 import PdfFont 12 | from babeldoc.document_il.il_version_1 import PdfFontCharBoundingBox 13 | from babeldoc.document_il.il_version_1 import PdfFormula 14 | from babeldoc.document_il.il_version_1 import PdfLine 15 | from babeldoc.document_il.il_version_1 import PdfParagraph 16 | from babeldoc.document_il.il_version_1 import PdfParagraphComposition 17 | from babeldoc.document_il.il_version_1 import PdfRectangle 18 | from babeldoc.document_il.il_version_1 import PdfSameStyleCharacters 19 | from babeldoc.document_il.il_version_1 import PdfSameStyleUnicodeCharacters 20 | from babeldoc.document_il.il_version_1 import PdfStyle 21 | from babeldoc.document_il.il_version_1 import PdfXobject 22 | from babeldoc.document_il.il_version_1 import VisualBbox 23 | 24 | __all__ = [ 25 | "BaseOperations", 26 | "Box", 27 | 
"Cropbox", 28 | "Document", 29 | "GraphicState", 30 | "Mediabox", 31 | "Page", 32 | "PageLayout", 33 | "PdfCharacter", 34 | "PdfFigure", 35 | "PdfFont", 36 | "PdfFontCharBoundingBox", 37 | "PdfFormula", 38 | "PdfLine", 39 | "PdfParagraph", 40 | "PdfParagraphComposition", 41 | "PdfRectangle", 42 | "PdfSameStyleCharacters", 43 | "PdfSameStyleUnicodeCharacters", 44 | "PdfStyle", 45 | "PdfXobject", 46 | "VisualBbox", 47 | ] 48 | -------------------------------------------------------------------------------- /babeldoc/document_il/babeldoc_exception/BabelDOCException.py: -------------------------------------------------------------------------------- 1 | class ScannedPDFError(Exception): 2 | def __init__(self, message): 3 | super().__init__(message) 4 | -------------------------------------------------------------------------------- /babeldoc/document_il/backend/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funstory-ai/BabelDOC/1c4c25415984777b615eb88f299e88d006ffd16c/babeldoc/document_il/backend/__init__.py -------------------------------------------------------------------------------- /babeldoc/document_il/frontend/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funstory-ai/BabelDOC/1c4c25415984777b615eb88f299e88d006ffd16c/babeldoc/document_il/frontend/__init__.py -------------------------------------------------------------------------------- /babeldoc/document_il/il_version_1.rnc: -------------------------------------------------------------------------------- 1 | start = Document 2 | Document = 3 | element document { 4 | Page+, 5 | attribute totalPages { xsd:int } 6 | } 7 | Page = 8 | element page { 9 | element mediabox { Box }, 10 | element cropbox { Box }, 11 | PDFXobject*, 12 | PageLayout*, 13 | PDFRectangle*, 14 | PDFFont*, 15 | PDFParagraph*, 16 | PDFFigure*, 17 | PDFCharacter*, 18 | attribute 
pageNumber { xsd:int }, 19 | attribute Unit { xsd:string }, 20 | element baseOperations { xsd:string } 21 | } 22 | Box = 23 | element box { 24 | # from (x,y) to (x2,y2) 25 | attribute x { xsd:float }, 26 | attribute y { xsd:float }, 27 | attribute x2 { xsd:float }, 28 | attribute y2 { xsd:float } 29 | } 30 | PDFXrefId = xsd:int 31 | PDFFont = 32 | element pdfFont { 33 | attribute name { xsd:string }, 34 | attribute fontId { xsd:string }, 35 | attribute xrefId { PDFXrefId }, 36 | attribute encodingLength { xsd:int }, 37 | attribute bold { xsd:boolean }?, 38 | attribute italic { xsd:boolean }?, 39 | attribute monospace { xsd:boolean }?, 40 | attribute serif { xsd:boolean }?, 41 | attribute ascent { xsd:float }?, 42 | attribute descent { xsd:float }?, 43 | PDFFontCharBoundingBox* 44 | } 45 | PDFFontCharBoundingBox = 46 | element pdfFontCharBoundingBox { 47 | attribute x { xsd:float }, 48 | attribute y { xsd:float }, 49 | attribute x2 { xsd:float }, 50 | attribute y2 { xsd:float }, 51 | attribute char_id { xsd:int } 52 | } 53 | PDFXobject = 54 | element pdfXobject { 55 | attribute xobjId { xsd:int }, 56 | attribute xrefId { PDFXrefId }, 57 | Box, 58 | PDFFont*, 59 | element baseOperations { xsd:string } 60 | } 61 | PDFCharacter = 62 | element pdfCharacter { 63 | attribute vertical { xsd:boolean }?, 64 | attribute scale { xsd:float }?, 65 | attribute pdfCharacterId { xsd:int }?, 66 | attribute char_unicode { xsd:string }, 67 | attribute advance { xsd:float }?, 68 | # xobject nesting depth 69 | attribute xobjId { xsd:int }?, 70 | attribute debug_info { xsd:boolean }?, 71 | PDFStyle, 72 | Box, 73 | element visual_bbox { Box }? 
74 | } 75 | PageLayout = 76 | element pageLayout { 77 | attribute id { xsd:int }, 78 | attribute conf { xsd:float }, 79 | attribute class_name { xsd:string }, 80 | Box 81 | } 82 | GraphicState = 83 | element graphicState { 84 | attribute linewidth { xsd:float }?, 85 | attribute dash { 86 | list { xsd:float+ } 87 | }?, 88 | attribute flatness { xsd:float }?, 89 | attribute intent { xsd:string }?, 90 | attribute linecap { xsd:int }?, 91 | attribute linejoin { xsd:int }?, 92 | attribute miterlimit { xsd:float }?, 93 | attribute ncolor { 94 | list { xsd:float+ } 95 | }?, 96 | attribute scolor { 97 | list { xsd:float+ } 98 | }?, 99 | attribute strokingColorSpaceName { xsd:string }?, 100 | attribute nonStrokingColorSpaceName { xsd:string }?, 101 | attribute passthroughPerCharInstruction { xsd:string }? 102 | } 103 | PDFStyle = 104 | element pdfStyle { 105 | attribute font_id { xsd:string }, 106 | attribute font_size { xsd:float }, 107 | GraphicState 108 | } 109 | PDFParagraph = 110 | element pdfParagraph { 111 | attribute xobjId { xsd:int }?, 112 | attribute unicode { xsd:string }, 113 | attribute scale { xsd:float }?, 114 | attribute vertical { xsd:boolean }?, 115 | attribute FirstLineIndent { xsd:boolean }?, 116 | attribute debug_id { xsd:string }?, 117 | attribute layout_label { xsd:string }?, 118 | attribute layout_id { xsd:int }?, 119 | Box, 120 | PDFStyle, 121 | PDFParagraphComposition* 122 | } 123 | PDFParagraphComposition = 124 | element pdfParagraphComposition { 125 | PDFLine 126 | | PDFFormula 127 | | PDFSameStyleCharacters 128 | | PDFCharacter 129 | | PDFSameStyleUnicodeCharacters 130 | } 131 | PDFLine = element pdfLine { Box, PDFCharacter+ } 132 | PDFSameStyleCharacters = 133 | element pdfSameStyleCharacters { Box, PDFStyle, PDFCharacter+ } 134 | PDFSameStyleUnicodeCharacters = 135 | element pdfSameStyleUnicodeCharacters { 136 | PDFStyle?, 137 | attribute unicode { xsd:string }, 138 | attribute debug_info { xsd:boolean }? 
139 | } 140 | PDFFormula = 141 | element pdfFormula { 142 | Box, 143 | PDFCharacter+, 144 | attribute x_offset { xsd:float }, 145 | attribute y_offset { xsd:float } 146 | } 147 | PDFFigure = element pdfFigure { Box } 148 | PDFRectangle = 149 | element pdfRectangle { 150 | Box, 151 | GraphicState, 152 | attribute debug_info { xsd:boolean }?, 153 | attribute fill_background { xsd:boolean }?, 154 | attribute xobjId { xsd:int }? 155 | } 156 | -------------------------------------------------------------------------------- /babeldoc/document_il/il_version_1.xsd: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 
def _create_text(self, text: str, color: GraphicState, box: il_version_1.Box):
    """Build a small debug label paragraph placed just above *box*.

    Args:
        text: Label text; used both as the paragraph's ``unicode`` and as
            the content of its single composition.
        color: Graphic state applied to the label's style.
        box: Box being labelled. The label spans its width and occupies
            the 5 units directly above its ``y2`` edge.

    Returns:
        A ``PdfParagraph`` whose only composition is flagged
        ``debug_info=True``.
    """
    label_style = il_version_1.PdfStyle(
        font_id="china-ss",
        font_size=4,
        graphic_state=color,
    )
    # PDF coordinates start at the bottom-left, so y2 is the top edge.
    label_box = il_version_1.Box(
        x=box.x,
        y=box.y2,
        x2=box.x2,
        y2=box.y2 + 5,
    )
    label_run = il_version_1.PdfSameStyleUnicodeCharacters(
        unicode=text,
        pdf_style=label_style,
        debug_info=True,
    )
    composition = il_version_1.PdfParagraphComposition(
        pdf_same_style_unicode_characters=label_run,
    )
    return il_version_1.PdfParagraph(
        first_line_indent=False,
        box=label_box,
        vertical=False,
        pdf_style=label_style,
        unicode=text,
        pdf_paragraph_composition=[composition],
        xobj_id=-1,
    )
109 | new_paragraphs.append( 110 | self._create_text( 111 | "formula", 112 | ORANGE, 113 | composition.pdf_formula.box, 114 | ), 115 | ) 116 | page.pdf_rectangle.append( 117 | self._create_rectangle( 118 | composition.pdf_formula.box, 119 | ORANGE, 120 | ), 121 | ) 122 | 123 | for xobj in page.pdf_xobject: 124 | new_paragraphs.append( 125 | self._create_text( 126 | "xobj", 127 | YELLOW, 128 | xobj.box, 129 | ), 130 | ) 131 | page.pdf_rectangle.append( 132 | self._create_rectangle( 133 | xobj.box, 134 | YELLOW, 135 | ), 136 | ) 137 | 138 | page.pdf_paragraph.extend(new_paragraphs) 139 | -------------------------------------------------------------------------------- /babeldoc/document_il/midend/detect_scanned_file.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import cv2 4 | import numpy as np 5 | import pymupdf 6 | from skimage.metrics import structural_similarity 7 | 8 | from babeldoc.document_il import il_version_1 9 | from babeldoc.document_il.babeldoc_exception.BabelDOCException import ScannedPDFError 10 | from babeldoc.document_il.utils.style_helper import GREEN 11 | from babeldoc.translation_config import TranslationConfig 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | class DetectScannedFile: 17 | stage_name = "DetectScannedFile" 18 | 19 | def __init__(self, translation_config: TranslationConfig): 20 | self.translation_config = translation_config 21 | 22 | def _save_debug_box_to_page(self, page: il_version_1.Page, similarity: float): 23 | """Save debug boxes and text labels to the PDF page.""" 24 | if not self.translation_config.debug: 25 | return 26 | 27 | color = GREEN 28 | 29 | # Create text label at top-left corner 30 | # Note: PDF coordinates are from bottom-left, 31 | # so we use y2 for top position 32 | style = il_version_1.PdfStyle( 33 | font_id="china-ss", 34 | font_size=4, 35 | graphic_state=color, 36 | ) 37 | page_width = page.cropbox.box.x2 - page.cropbox.box.x 38 | 
def process(self, docs: il_version_1.Document):
    """Raise ``ScannedPDFError`` when most pages to translate look scanned.

    Each candidate page is checked with ``detect_page_is_scanned`` until
    the outcome is decided: either more than 80% of the pages are scanned,
    or enough non-scanned pages were seen that this can no longer happen.

    Args:
        docs: The document whose pages are inspected.

    Raises:
        ScannedPDFError: If more than 80% of the pages selected for
            translation are detected as scanned.
    """
    pages_to_translate = [
        page
        for page in docs.page
        if self.translation_config.should_translate_page(page.page_number + 1)
    ]
    total = len(pages_to_translate)
    # NOTE(review): with a single page the floor of 1 makes
    # non_scanned_threshold 0, so detection never runs for one-page
    # documents - confirm this is intended.
    threshold = max(0.8 * total, 1)
    non_scanned_threshold = total - threshold
    scanned = 0
    non_scanned = 0

    mupdf = pymupdf.open(self.translation_config.get_working_file_path("input.pdf"))
    try:
        with self.translation_config.progress_monitor.stage_start(
            self.stage_name,
            total,
        ) as progress:
            for page in pages_to_translate:
                # Keep detecting until ``scanned`` has actually exceeded
                # the threshold. The previous guard (``scanned <
                # threshold``) stopped at ``scanned == threshold`` whenever
                # 0.8 * total was an integer, which made the
                # ``scanned > threshold`` check below unreachable.
                if scanned <= threshold and non_scanned < non_scanned_threshold:
                    if self.detect_page_is_scanned(page, mupdf):
                        scanned += 1
                    else:
                        non_scanned += 1
                else:
                    # We have enough information to determine document type
                    non_scanned += 1
                progress.advance(1)
    finally:
        # The document is only needed for detection; release it even if
        # detection fails.
        mupdf.close()

    if scanned > threshold:
        logger.warning(
            f"Detected {scanned} scanned pages, which is more than 80% of the total pages. "
            "Please check the input PDF file.",
        )
        raise ScannedPDFError("Scanned PDF detected.")
def _save_debug_image(self, image: np.ndarray, layout, page_number: int):
    """Write *image* with the detected boxes drawn on it (debug mode only).

    Does nothing unless ``translation_config.debug`` is set. Otherwise each
    box in ``layout.boxes`` is outlined in green, labelled with its class
    name, and the result is saved as ``<page_number>.jpg`` under the
    ``ocr-box-image`` working directory.
    """
    if not self.translation_config.debug:
        return

    target_dir = Path(self.translation_config.get_working_file_path("ocr-box-image"))
    target_dir.mkdir(parents=True, exist_ok=True)

    green = (0, 255, 0)
    # Draw on a copy so the caller's image is left untouched.
    canvas = image.copy()
    for detection in layout.boxes:
        left, top, right, bottom = detection.xyxy
        top_left = (int(left), int(top))
        bottom_right = (int(right), int(bottom))
        cv2.rectangle(canvas, top_left, bottom_right, green, 2)
        # Class label sits slightly above the box's top-left corner.
        cv2.putText(
            canvas,
            layout.names[detection.cls],
            (top_left[0], top_left[1] - 5),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.5,
            green,
            1,
        )

    cv2.imwrite(str(target_dir / f"{page_number}.jpg"), canvas)
def process(self, docs: il_version_1.Document, mupdf_doc: Document):
    """Run the layout model on every page and store the detected regions.

    For each ``(page, layouts)`` pair yielded by the model, the detected
    boxes are converted to IL coordinates and assigned to
    ``page.page_layout``, replacing any previous value.

    Args:
        docs: The IL document whose pages are annotated in place.
        mupdf_doc: The PyMuPDF document, used by the model and to obtain
            the rendered page size.

    Returns:
        The same ``docs`` object.
    """
    total = len(docs.page)
    with self.translation_config.progress_monitor.stage_start(
        self.stage_name,
        total,
    ) as progress:
        # Process predictions for each page
        for page, layouts in self.model.handle_document(
            docs.page, mupdf_doc, self.translation_config, self._save_debug_image
        ):
            page_layouts = []
            # The rendered size is the same for every box on a page, so
            # render at most once per page (and not at all when the page
            # has no boxes) instead of once per box.
            pix_size = None
            for layout in layouts.boxes:
                if pix_size is None:
                    pix = mupdf_doc[page.page_number].get_pixmap()
                    pix_size = (pix.height, pix.width)
                h, w = pix_size
                # Convert from image coordinates (origin at top-left) to
                # IL coordinates (origin at bottom-left), growing the box
                # by one pixel on each side and clamping it to the page.
                x0, y0, x1, y1 = layout.xyxy
                x0, y0, x1, y1 = (
                    np.clip(int(x0 - 1), 0, w - 1),
                    np.clip(int(h - y1 - 1), 0, h - 1),
                    np.clip(int(x1 + 1), 0, w - 1),
                    np.clip(int(h - y0 + 1), 0, h - 1),
                )
                page_layout = il_version_1.PageLayout(
                    id=len(page_layouts) + 1,
                    box=il_version_1.Box(
                        x0.item(),
                        y0.item(),
                        x1.item(),
                        y1.item(),
                    ),
                    conf=layout.conf.item(),
                    class_name=layouts.names[layout.cls],
                )
                page_layouts.append(page_layout)

            page.page_layout = page_layouts
            self._save_debug_box_to_page(page)
            progress.advance(1)

    return docs
-------------------------------------------------------------------------------- 1 | import logging 2 | from collections import Counter 3 | from functools import cache 4 | 5 | from babeldoc.document_il import il_version_1 6 | from babeldoc.translation_config import TranslationConfig 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | 11 | class RemoveDescent: 12 | stage_name = "Remove Char Descent" 13 | 14 | def __init__(self, translation_config: TranslationConfig): 15 | self.translation_config = translation_config 16 | 17 | def _remove_char_descent( 18 | self, 19 | char: il_version_1.PdfCharacter, 20 | font: il_version_1.PdfFont, 21 | ) -> float | None: 22 | """Remove descent from a single character and return the descent value. 23 | 24 | Args: 25 | char: The character to process 26 | font: The font used by this character 27 | 28 | Returns: 29 | The descent value if it was removed, None otherwise 30 | """ 31 | if ( 32 | char.box 33 | and char.box.y is not None 34 | and char.box.y2 is not None 35 | and font 36 | and hasattr(font, "descent") 37 | ): 38 | descent = font.descent * char.pdf_style.font_size / 1000 39 | if char.vertical: 40 | # For vertical text, remove descent from x coordinates 41 | char.box.x += descent 42 | char.box.x2 += descent 43 | else: 44 | # For horizontal text, remove descent from y coordinates 45 | char.box.y -= descent 46 | char.box.y2 -= descent 47 | return descent 48 | return None 49 | 50 | def process(self, document: il_version_1.Document): 51 | """Process the document to remove descent adjustments from character boxes. 
def process_page(self, page: il_version_1.Page):
    """Remove descent from every character on *page* and shift paragraph boxes.

    Fonts are resolved per xobject first (page fonts overlaid with the
    xobject's own fonts) and fall back to the page-level fonts. After the
    characters of a paragraph are adjusted, the paragraph box is moved by
    the most frequent descent value seen among them.

    Args:
        page: The page to process
    """
    page_level_fonts = {font.font_id: font for font in page.pdf_font}

    # String keys map a font id to a page font; integer keys map an
    # xobject id to that xobject's complete font table.
    font_index: dict[
        str | int,
        il_version_1.PdfFont | dict[str, il_version_1.PdfFont],
    ] = dict(page_level_fonts)
    for xobj in page.pdf_xobject:
        scoped = page_level_fonts.copy()
        scoped.update((font.font_id, font) for font in xobj.pdf_font)
        font_index[xobj.xobj_id] = scoped

    @cache
    def get_font(
        font_id: str,
        xobj_id: int | None = None,
    ) -> il_version_1.PdfFont | None:
        if xobj_id is not None:
            scoped_fonts = font_index.get(xobj_id)
            if isinstance(scoped_fonts, dict) and font_id in scoped_fonts:
                return scoped_fonts[font_id]
        page_font = font_index.get(font_id)
        if isinstance(page_font, il_version_1.PdfFont):
            return page_font
        return None

    def characters_of(comp):
        # Only the first populated member of a composition is processed,
        # in this order of precedence.
        if comp.pdf_character:
            return [comp.pdf_character]
        if comp.pdf_line:
            return comp.pdf_line.pdf_character
        if comp.pdf_formula:
            return comp.pdf_formula.pdf_character
        if comp.pdf_same_style_characters:
            return comp.pdf_same_style_characters.pdf_character
        return []

    # Standalone characters that do not belong to any paragraph.
    for char in page.pdf_character:
        font = get_font(char.pdf_style.font_id, char.xobj_id)
        if font:
            self._remove_char_descent(char, font)

    for paragraph in page.pdf_paragraph:
        # (descent, vertical) for each character that was actually adjusted.
        adjusted = []
        for comp in paragraph.pdf_paragraph_composition:
            for char in characters_of(comp):
                font = get_font(char.pdf_style.font_id, char.xobj_id)
                if not font:
                    continue
                descent = self._remove_char_descent(char, font)
                if descent is None:
                    continue
                adjusted.append((descent, char.vertical))

        if not adjusted or not paragraph.box:
            continue

        # Mode of the descent values; ties go to the value seen first.
        dominant_descent = Counter(
            descent for descent, _ in adjusted
        ).most_common(1)[0][0]
        # The paragraph counts as vertical only if every adjusted
        # character is vertical.
        paragraph_is_vertical = all(vertical for _, vertical in adjusted)

        if paragraph.box.y is None or paragraph.box.y2 is None:
            continue
        if paragraph_is_vertical:
            # For vertical paragraphs, adjust x coordinates
            paragraph.box.x += dominant_descent
            paragraph.box.x2 += dominant_descent
        else:
            # For horizontal paragraphs, adjust y coordinates
            paragraph.box.y -= dominant_descent
            paragraph.box.y2 -= dominant_descent
def process(self, docs: il_version_1.Document, mupdf_doc: Document):
    """Run the table model on pages that contain a ``table`` layout.

    For each ``(page, layouts)`` pair yielded by the model, the detected
    boxes are converted to IL coordinates and appended to the page's
    existing ``page_layout`` list.

    Args:
        docs: The IL document whose pages are annotated in place.
        mupdf_doc: The PyMuPDF document, used by the model and to obtain
            the rendered page size.

    Returns:
        The same ``docs`` object.
    """
    # Collect the pages that have at least one layout classified as a table.
    have_table_pages = {}
    for page in docs.page:
        for layout in page.page_layout:
            if layout.class_name == "table":
                have_table_pages[page.page_number] = page
    with self.translation_config.progress_monitor.stage_start(
        self.stage_name,
        len(have_table_pages),
    ) as progress:
        # Process predictions for each page
        for page, layouts in self.model.handle_document(
            have_table_pages.values(),
            mupdf_doc,
            self.translation_config,
            self._save_debug_image,
        ):
            page_layouts = []
            # The rendered size is the same for every box on a page, so
            # render at most once per page (and not at all when the page
            # has no boxes) instead of once per box.
            pix_size = None
            for layout in layouts.boxes:
                if pix_size is None:
                    pix = mupdf_doc[page.page_number].get_pixmap()
                    pix_size = (pix.height, pix.width)
                h, w = pix_size
                # Convert from image coordinates (origin at top-left) to
                # IL coordinates (origin at bottom-left), growing the box
                # by one pixel on each side and clamping it to the page.
                x0, y0, x1, y1 = layout.xyxy
                x0, y0, x1, y1 = (
                    np.clip(int(x0 - 1), 0, w - 1),
                    np.clip(int(h - y1 - 1), 0, h - 1),
                    np.clip(int(x1 + 1), 0, w - 1),
                    np.clip(int(h - y0 + 1), 0, h - 1),
                )
                page_layout = il_version_1.PageLayout(
                    id=len(page_layouts) + 1,
                    box=il_version_1.Box(
                        x0.item(),
                        y0.item(),
                        x1.item(),
                        y1.item(),
                    ),
                    conf=layout.conf.item(),
                    class_name=layouts.names[layout.cls],
                )
                page_layouts.append(page_layout)

            page.page_layout.extend(page_layouts)
            self._save_debug_box_to_page(page)
            progress.advance(1)

    return docs
translate_engine = CharField(max_length=20) 20 | translate_engine_params = TextField() 21 | original_text = TextField() 22 | translation = TextField() 23 | 24 | class Meta: 25 | database = db 26 | constraints = [ 27 | SQL( 28 | """ 29 | UNIQUE ( 30 | translate_engine, 31 | translate_engine_params, 32 | original_text 33 | ) 34 | ON CONFLICT REPLACE 35 | """, 36 | ), 37 | ] 38 | 39 | 40 | class TranslationCache: 41 | @staticmethod 42 | def _sort_dict_recursively(obj): 43 | if isinstance(obj, dict): 44 | return { 45 | k: TranslationCache._sort_dict_recursively(v) 46 | for k in sorted(obj.keys()) 47 | for v in [obj[k]] 48 | } 49 | elif isinstance(obj, list): 50 | return [TranslationCache._sort_dict_recursively(item) for item in obj] 51 | return obj 52 | 53 | def __init__(self, translate_engine: str, translate_engine_params: dict = None): 54 | self.translate_engine = translate_engine 55 | self.replace_params(translate_engine_params) 56 | 57 | # The program typically starts multi-threaded translation 58 | # only after cache parameters are fully configured, 59 | # so thread safety doesn't need to be considered here. 60 | def replace_params(self, params: dict = None): 61 | if params is None: 62 | params = {} 63 | self.params = params 64 | params = self._sort_dict_recursively(params) 65 | self.translate_engine_params = json.dumps(params) 66 | 67 | def update_params(self, params: dict = None): 68 | if params is None: 69 | params = {} 70 | self.params.update(params) 71 | self.replace_params(self.params) 72 | 73 | def add_params(self, k: str, v): 74 | self.params[k] = v 75 | self.replace_params(self.params) 76 | 77 | # Since peewee and the underlying sqlite are thread-safe, 78 | # get and set operations don't need locks. 
79 | def get(self, original_text: str) -> str | None: 80 | result = _TranslationCache.get_or_none( 81 | translate_engine=self.translate_engine, 82 | translate_engine_params=self.translate_engine_params, 83 | original_text=original_text, 84 | ) 85 | return result.translation if result else None 86 | 87 | def set(self, original_text: str, translation: str): 88 | _TranslationCache.create( 89 | translate_engine=self.translate_engine, 90 | translate_engine_params=self.translate_engine_params, 91 | original_text=original_text, 92 | translation=translation, 93 | ) 94 | 95 | 96 | def init_db(remove_exists=False): 97 | CACHE_FOLDER.mkdir(parents=True, exist_ok=True) 98 | # The current version does not support database migration, so add the version number to the file name. 99 | cache_db_path = CACHE_FOLDER / "cache.v1.db" 100 | if remove_exists and cache_db_path.exists(): 101 | cache_db_path.unlink() 102 | db.init( 103 | cache_db_path, 104 | pragmas={ 105 | "journal_mode": "wal", 106 | "busy_timeout": 1000, 107 | }, 108 | ) 109 | db.create_tables([_TranslationCache], safe=True) 110 | 111 | 112 | def init_test_db(): 113 | import tempfile 114 | 115 | temp_file = tempfile.NamedTemporaryFile(suffix=".db", delete=False) 116 | cache_db_path = temp_file.name 117 | temp_file.close() 118 | 119 | test_db = SqliteDatabase( 120 | cache_db_path, 121 | pragmas={ 122 | "journal_mode": "wal", 123 | "busy_timeout": 1000, 124 | }, 125 | ) 126 | test_db.bind([_TranslationCache], bind_refs=False, bind_backrefs=False) 127 | test_db.connect() 128 | test_db.create_tables([_TranslationCache], safe=True) 129 | return test_db 130 | 131 | 132 | def clean_test_db(test_db): 133 | test_db.drop_tables([_TranslationCache]) 134 | test_db.close() 135 | db_path = Path(test_db.database) 136 | if db_path.exists(): 137 | db_path.unlink() 138 | wal_path = Path(str(db_path) + "-wal") 139 | if wal_path.exists(): 140 | wal_path.unlink() 141 | shm_path = Path(str(db_path) + "-shm") 142 | if shm_path.exists(): 143 | 
class AtomicInteger:
    """Integer counter whose reads and updates are guarded by a lock."""

    def __init__(self, value=0):
        """Start the counter at ``int(value)``."""
        self._value = int(value)
        self._lock = threading.Lock()

    def inc(self, d=1):
        """Add ``int(d)`` to the counter and return the new value."""
        with self._lock:
            self._value += int(d)
            return self._value

    def dec(self, d=1):
        """Subtract ``d`` from the counter and return the new value."""
        return self.inc(-d)

    @property
    def value(self):
        """Current value of the counter."""
        with self._lock:
            return self._value

    @value.setter
    def value(self, v):
        # A property setter's return value is discarded by Python, so
        # nothing is returned here.
        with self._lock:
            self._value = int(v)
class PriorityQueue(queue.Queue):
    """Variant of Queue that retrieves open entries in priority order (lowest first).

    Entries are typically tuples of the form: (priority number, data).
    ``get`` returns the internal ``[priority, insertion count, data]`` list.
    Putting data that is already queued replaces its earlier entry.
    """

    REMOVED = ""
    DEFAULT_PRIORITY = 100

    def _init(self, maxsize):
        # Heap of [priority, count, data] entries (may contain tombstones).
        self.queue = []
        # Maps each live piece of data to its heap entry.
        self.entry_finder = {}
        # Tie-breaker so equal priorities come out in insertion order and
        # the data itself is never compared.
        self.counter = itertools.count()

    def _qsize(self):
        # Count live entries only. Tombstones left behind by ``remove`` stay
        # in the heap until ``_get`` skips them; counting them would let
        # ``get`` run against a queue with nothing retrievable and hand
        # ``None`` to the caller.
        return len(self.entry_finder)

    def _put(self, item):
        try:
            if item[1] in self.entry_finder:
                self.remove(item[1])
            count = next(self.counter)
            entry = [item[0], count, item[1]]
            self.entry_finder[item[1]] = entry
            heappush(self.queue, entry)
        except TypeError:  # handle item==None
            self._put((self.DEFAULT_PRIORITY, None))

    def remove(self, task):
        """
        This simply replaces the data with the REMOVED value,
        which will get cleared out once _get reaches it.

        Raises ``KeyError`` when ``task`` is not queued. The base class's
        unfinished-task counter is not adjusted here.
        """
        entry = self.entry_finder.pop(task)
        entry[-1] = self.REMOVED

    def _get(self):
        # Pop until a live entry turns up, discarding tombstones on the way.
        while self.queue:
            entry = heappop(self.queue)
            if entry[2] is not self.REMOVED:
                del self.entry_finder[entry[2]]
                return entry
        return None
class PriorityThreadPoolExecutor(ThreadPoolExecutor):
    """
    Thread pool executor with priority queue (priorities must be different, lowest first)
    """

    def __init__(self, *args, **kwargs):
        """Create the pool; arguments are forwarded to ``ThreadPoolExecutor``."""
        super().__init__(*args, **kwargs)

        # change work queue type to queue.PriorityQueue
        self._work_queue: PriorityQueue = PriorityQueue()

    def submit(self, fn, *args, **kwargs):
        """

        Sending the function to the execution queue

        :param fn: function being executed
        :type fn: callable
        :param args: function's positional arguments
        :param kwargs: function's keywords arguments
        :return: future instance
        :rtype: _base.Future

        Added keyword:

        - priority (integer later sys.maxsize); lower values run first.
          When omitted (or ``None``) a random priority is used.

        """
        with self._shutdown_lock:
            if self._broken:
                raise BrokenThreadPool(self._broken)

            if self._shutdown:
                raise RuntimeError("cannot schedule new futures after shutdown")
            if _shutdown:
                raise RuntimeError(
                    "cannot schedule new futures after interpreter shutdown"
                )

            # ``priority`` is consumed here and never forwarded to ``fn``.
            # ``None`` is treated like "not given": it cannot be ordered
            # against the integer priorities in the heap.
            priority = kwargs.pop("priority", None)
            if priority is None:
                priority = random.randint(0, sys.maxsize - 1)  # noqa: S311

            f = _base.Future()
            w = _WorkItem(f, fn, args, kwargs)

            self._work_queue.put((priority, w))
            self._adjust_thread_count()
            return f

    def _adjust_thread_count(self):
        """Start one more worker thread unless an idle one is available."""
        # if idle threads are available, don't spin new threads
        if self._idle_semaphore.acquire(timeout=0):
            return

        # When the executor gets lost, the weakref callback will wake up
        # the worker threads.
        def weakref_cb(_, q=self._work_queue):
            q.put(None)

        num_threads = len(self._threads)
        if num_threads < self._max_workers:
            thread_name = f"{self._thread_name_prefix or self}_{num_threads:d}"
            t = threading.Thread(
                name=thread_name,
                target=_worker,
                args=(
                    weakref.ref(self, weakref_cb),
                    self._work_queue,
                    self._initializer,
                    self._initargs,
                ),
            )
            t.start()
            self._threads.add(t)
            _threads_queues[t] = self._work_queue

    def shutdown(self, wait=True, *, cancel_futures=False):
        """Shut the executor down.

        :param wait: when true, first block until every queued task has been
            processed, then join all worker threads
        :param cancel_futures: when true, cancel the futures of the work
            items still queued at the time the shutdown flag is set
        """
        logger.debug("Shutting down executor %s", self._thread_name_prefix or self)
        if wait:
            logger.debug(
                "Waiting for all tasks done %s", self._thread_name_prefix or self
            )
            self._work_queue.join()
            logger.debug("All tasks done %s", self._thread_name_prefix or self)

        with self._shutdown_lock:
            self._shutdown = True
            if cancel_futures:
                # Drain all work items from the queue, and then cancel their
                # associated futures.
                while True:
                    try:
                        entry = self._work_queue.get_nowait()
                    except queue.Empty:
                        break
                    # The queue hands back [priority, count, work_item]
                    # entries; the work item is None for wake-up markers and
                    # the exit sentinel carries no future.
                    work_item = entry[2] if entry is not None else None
                    if work_item is not None and work_item.future is not None:
                        work_item.future.cancel()

            # Send a wake-up to prevent threads calling
            # _work_queue.get(block=True) from permanently blocking.
            self._work_queue.put(None)
        if wait:
            logger.debug(
                "Waiting for all thread done %s", self._thread_name_prefix or self
            )
            for t in self._threads:
                self._work_queue.put(None)
                t.join()
            logger.debug("shutdown finish %s", self._thread_name_prefix or self)
il_version_1.GraphicState( 48 | passthrough_per_char_instruction="0.2039215686 0.7803921569 0.3490196078 rg " 49 | "0.2039215686 0.7803921569 0.3490196078 RG", 50 | ) 51 | 52 | MINT = il_version_1.GraphicState( 53 | passthrough_per_char_instruction="0.0000000000 0.7803921569 0.7450980392 rg " 54 | "0.0000000000 0.7803921569 0.7450980392 RG", 55 | ) 56 | 57 | TEAL = il_version_1.GraphicState( 58 | passthrough_per_char_instruction="0.1882352941 0.6901960784 0.7803921569 rg " 59 | "0.1882352941 0.6901960784 0.7803921569 RG", 60 | ) 61 | 62 | CYAN = il_version_1.GraphicState( 63 | passthrough_per_char_instruction="0.1960784314 0.6784313725 0.9019607843 rg " 64 | "0.1960784314 0.6784313725 0.9019607843 RG", 65 | ) 66 | 67 | BLUE = il_version_1.GraphicState( 68 | passthrough_per_char_instruction="0.0000000000 0.4784313725 1.0000000000 rg " 69 | "0.0000000000 0.4784313725 1.0000000000 RG", 70 | ) 71 | 72 | INDIGO = il_version_1.GraphicState( 73 | passthrough_per_char_instruction="0.3450980392 0.3372549020 0.8392156863 rg " 74 | "0.3450980392 0.3372549020 0.8392156863 RG", 75 | ) 76 | 77 | PURPLE = il_version_1.GraphicState( 78 | passthrough_per_char_instruction="0.6862745098 0.3215686275 0.8705882353 rg " 79 | "0.6862745098 0.3215686275 0.8705882353 RG", 80 | ) 81 | 82 | PINK = il_version_1.GraphicState( 83 | passthrough_per_char_instruction="1.0000000000 0.1764705882 0.3333333333 rg " 84 | "1.0000000000 0.1764705882 0.3333333333 RG", 85 | ) 86 | 87 | BROWN = il_version_1.GraphicState( 88 | passthrough_per_char_instruction="0.6352941176 0.5176470588 0.3686274510 rg " 89 | "0.6352941176 0.5176470588 0.3686274510 RG", 90 | ) 91 | -------------------------------------------------------------------------------- /babeldoc/document_il/xml_converter.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from pathlib import Path 3 | 4 | import orjson 5 | from xsdata.formats.dataclass.context import XmlContext 6 | from 
class XMLConverter:
    """Convert IL documents to and from XML, and dump them as JSON."""

    def __init__(self):
        self.parser = XmlParser()
        self.serializer = XmlSerializer(
            context=XmlContext(),
            config=SerializerConfig(indent=" "),
        )

    def write_xml(self, document: il_version_1.Document, path: str):
        """Serialise ``document`` to XML and write it to ``path`` as UTF-8."""
        with Path(path).open("w", encoding="utf-8") as out:
            xml_text = self.to_xml(document)
            out.write(xml_text)

    def read_xml(self, path: str) -> il_version_1.Document:
        """Parse the UTF-8 XML file at ``path`` into an IL document."""
        with Path(path).open(encoding="utf-8") as src:
            xml_text = src.read()
            return self.from_xml(xml_text)

    def to_xml(self, document: il_version_1.Document) -> str:
        """Render ``document`` as an XML string."""
        return self.serializer.render(document)

    def from_xml(self, xml: str) -> il_version_1.Document:
        """Parse an XML string into an IL document."""
        return self.parser.from_string(xml, il_version_1.Document)

    def deepcopy(self, document: il_version_1.Document) -> il_version_1.Document:
        """Return an independent deep copy of ``document``."""
        return copy.deepcopy(document)

    def to_json(self, document: il_version_1.Document) -> str:
        """Dump ``document`` as indented, key-sorted JSON ending in a newline."""
        options = (
            orjson.OPT_APPEND_NEWLINE | orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS
        )
        payload = orjson.dumps(document, option=options)
        return payload.decode()

    def write_json(self, document: il_version_1.Document, path: str):
        """Dump ``document`` as JSON and write it to ``path`` as UTF-8."""
        with Path(path).open("w", encoding="utf-8") as out:
            json_text = self.to_json(document)
            out.write(json_text)
class YoloResult:
    """Helper class to store detection results from ONNX model.

    Attributes are ``boxes`` (ordered by descending ``conf``) and ``names``
    (the class-name lookup passed in).
    """

    def __init__(self, names, boxes=None, boxes_data=None):
        """Build a result from ready-made boxes or from raw box rows.

        Args:
            names: Lookup from class id to class name.
            boxes: Box objects exposing ``conf``. The list itself is not
                modified; a sorted copy is stored.
            boxes_data: Raw rows, each turned into a ``YoloBox``. Used only
                when ``boxes`` is ``None``.

        Raises:
            ValueError: If neither ``boxes`` nor ``boxes_data`` is given.
        """
        if boxes is None:
            if boxes_data is None:
                raise ValueError("either boxes or boxes_data must be provided")
            boxes = [YoloBox(data=d) for d in boxes_data]
        # sorted() keeps the caller's list untouched (list.sort would reorder
        # it in place) and is stable, like list.sort.
        self.boxes = sorted(boxes, key=lambda box: box.conf, reverse=True)
        self.names = names
class OnnxModel(DocLayoutModel):
    """Document layout detector backed by an ONNX Runtime session."""

    def __init__(self, model_path: str):
        """Load the ONNX model and create an inference session.

        The model metadata must contain ``stride`` and ``names`` entries
        holding Python literals. Every available execution provider whose
        name starts with ``dml``, ``cuda`` or ``cpu`` (case-insensitive) is
        passed to the session.
        """
        self.model_path = model_path

        model = onnx.load(model_path)
        metadata = {d.key: d.value for d in model.metadata_props}
        self._stride = ast.literal_eval(metadata["stride"])
        self._names = ast.literal_eval(metadata["names"])
        providers = []

        available_providers = onnxruntime.get_available_providers()
        for provider in available_providers:
            if re.match(r"dml|cuda|cpu", provider, re.IGNORECASE):
                logger.info(f"Available Provider: {provider}")
                providers.append(provider)
        self.model = onnxruntime.InferenceSession(
            model.SerializeToString(),
            providers=providers,
        )
        # Held around page rendering and inference in handle_document.
        self.lock = threading.Lock()

    @staticmethod
    def from_pretrained():
        """Create a model from the bundled doclayout ONNX file."""
        pth = get_doclayout_onnx_model_path()
        return OnnxModel(pth)

    @property
    def stride(self):
        """Stride of the model input, as read from the model metadata."""
        return self._stride

    def resize_and_pad_image(self, image, new_shape):
        """
        Resize and pad the image to the specified size, ensuring dimensions are multiples of stride.

        The aspect ratio is kept; the padding added on each axis is the
        remaining gap modulo ``self.stride``, split between both sides.

        Parameters:
        - image: Input image
        - new_shape: Target size (integer or (height, width) tuple)

        Returns:
        - Processed image
        """
        if isinstance(new_shape, int):
            new_shape = (new_shape, new_shape)

        h, w = image.shape[:2]
        new_h, new_w = new_shape

        # Calculate scaling ratio
        r = min(new_h / h, new_w / w)
        resized_h, resized_w = int(round(h * r)), int(round(w * r))

        # Resize image
        image = cv2.resize(
            image,
            (resized_w, resized_h),
            interpolation=cv2.INTER_LINEAR,
        )

        # Calculate padding size and align to stride multiple
        pad_w = (new_w - resized_w) % self.stride
        pad_h = (new_h - resized_h) % self.stride
        top, bottom = pad_h // 2, pad_h - pad_h // 2
        left, right = pad_w // 2, pad_w - pad_w // 2

        # Add padding
        image = cv2.copyMakeBorder(
            image,
            top,
            bottom,
            left,
            right,
            cv2.BORDER_CONSTANT,
            value=(114, 114, 114),
        )

        return image

    def scale_boxes(self, img1_shape, boxes, img0_shape):
        """
        Rescales bounding boxes (in the format of xyxy by default) from the shape of the image they were originally
        specified in (img1_shape) to the shape of a different image (img0_shape).

        Args:
            img1_shape (tuple): The shape of the image that the bounding boxes are for,
                in the format of (height, width).
            boxes: the bounding boxes of the objects in the image, in the format of (x1, y1, x2, y2);
                the first four columns are updated in place
            img0_shape (tuple): the shape of the target image, in the format of (height, width).

        Returns:
            boxes: The scaled bounding boxes, in the format of (x1, y1, x2, y2)
        """

        # Calculate scaling ratio
        gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])

        # Calculate padding size
        pad_x = round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1)
        pad_y = round((img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1)

        # Remove padding and scale boxes
        boxes[..., :4] = (boxes[..., :4] - [pad_x, pad_y, pad_x, pad_y]) / gain
        return boxes

    def predict(self, image, imgsz=800, batch_size=16, **kwargs):
        """
        Predict the layout of document pages.

        Args:
            image: A single image or a list of images of document pages.
            imgsz: Accepted for interface compatibility; the target size is
                currently fixed at 1024.
            batch_size: Accepted for interface compatibility; images are
                currently processed one at a time.
            **kwargs: Additional arguments (unused).

        Returns:
            A list of YoloResult objects, one for each input image.
        """
        # Handle single image input
        if isinstance(image, np.ndarray) and len(image.shape) == 3:
            image = [image]

        total_images = len(image)
        results = []
        # NOTE: the requested batch size is deliberately overridden.
        batch_size = 1

        # Process images in batches
        for i in range(0, total_images, batch_size):
            batch_images = image[i : i + batch_size]
            batch_size_actual = len(batch_images)

            # NOTE: fixed target size; ``imgsz`` is not consulted.
            target_imgsz = 1024

            # Preprocess batch
            processed_batch = []
            orig_shapes = []
            for img in batch_images:
                orig_h, orig_w = img.shape[:2]
                orig_shapes.append((orig_h, orig_w))

                pix = self.resize_and_pad_image(img, new_shape=target_imgsz)
                pix = np.transpose(pix, (2, 0, 1))  # CHW
                pix = pix.astype(np.float32) / 255.0  # Normalize to [0, 1]
                processed_batch.append(pix)

            # Stack batch
            batch_input = np.stack(processed_batch, axis=0)  # BCHW
            new_h, new_w = batch_input.shape[2:]

            # Run inference
            batch_preds = self.model.run(None, {"images": batch_input})[0]

            # Process each prediction in the batch
            for j in range(batch_size_actual):
                preds = batch_preds[j]
                # Keep detections whose confidence column exceeds 0.25.
                preds = preds[preds[..., 4] > 0.25]
                if len(preds) > 0:
                    preds[..., :4] = self.scale_boxes(
                        (new_h, new_w),
                        preds[..., :4],
                        orig_shapes[j],
                    )
                results.append(YoloResult(boxes_data=preds, names=self._names))

        return results

    def handle_document(
        self,
        pages: list[babeldoc.document_il.il_version_1.Page],
        mupdf_doc: pymupdf.Document,
        translate_config,
        save_debug_image,
    ) -> Generator[
        tuple[babeldoc.document_il.il_version_1.Page, YoloResult], None, None
    ]:
        """Render each page at 72 dpi, run detection and yield the result.

        Cancellation is checked before every page. ``save_debug_image`` is
        called with the rendered image, the prediction and the 1-based page
        number. Yields ``(page, prediction)`` pairs in input order.
        """
        for page in pages:
            translate_config.raise_if_cancelled()
            with self.lock:
                pix = mupdf_doc[page.page_number].get_pixmap(dpi=72)
                # np.fromstring's binary mode is deprecated; frombuffer reads
                # the same bytes without copying. The channel order is then
                # reversed with a view.
                image = np.frombuffer(pix.samples, np.uint8).reshape(
                    pix.height,
                    pix.width,
                    3,
                )[:, :, ::-1]
                predict_result = self.predict(image)[0]
                save_debug_image(
                    image,
                    predict_result,
                    page.page_number + 1,
                )
                yield page, predict_result
class ResultMerger:
    """Handles merging of split translation results"""

    def __init__(self, translation_config: TranslationConfig):
        """Keep the configuration used for file naming and saving."""
        self.config = translation_config

    def merge_results(self, results: dict[int, TranslateResult]) -> TranslateResult:
        """Merge multiple translation results into one.

        Args:
            results: Per-part results keyed by part index; parts are merged
                in ascending key order.

        Returns:
            A ``TranslateResult`` whose paths point at the merged PDFs. A
            missing watermarked or no-watermark path falls back to its
            counterpart. ``total_seconds`` is the sum over all parts that
            have one.

        Raises:
            ValueError: If ``results`` is empty.
        """
        if not results:
            raise ValueError("No results to merge")

        basename = Path(self.config.input_file).stem
        debug_suffix = ".debug" if self.config.debug else ""
        lang_out = self.config.lang_out

        mono_file_name = f"{basename}{debug_suffix}.{lang_out}.mono.pdf"
        dual_file_name = f"{basename}{debug_suffix}.{lang_out}.dual.pdf"

        debug_suffix += ".no_watermark"

        mono_file_name_no_watermark = f"{basename}{debug_suffix}.{lang_out}.mono.pdf"
        dual_file_name_no_watermark = f"{basename}{debug_suffix}.{lang_out}.dual.pdf"

        # Sort results by part index
        ordered = [results[index] for index in sorted(results)]

        def merge_attr(attr: str, output_name: str, tag: str):
            """Merge the PDFs named by ``attr`` across all parts, if any."""
            paths = [getattr(result, attr) for result in ordered]
            return self._merge_pdfs(
                [path for path in paths if path], output_name, tag=tag
            )

        # Merge monolingual / dual-language PDFs if they exist
        merged_mono_path = merge_attr("mono_pdf_path", mono_file_name, "merged_mono")
        merged_dual_path = merge_attr("dual_pdf_path", dual_file_name, "merged_dual")

        merged_no_watermark_mono_path = None
        merged_no_watermark_dual_path = None

        # The no-watermark variants are merged separately only when at least
        # one part has them as files distinct from the watermarked ones.
        if any(
            r.dual_pdf_path != r.no_watermark_dual_pdf_path
            or r.mono_pdf_path != r.no_watermark_mono_pdf_path
            for r in ordered
        ):
            merged_no_watermark_mono_path = merge_attr(
                "no_watermark_mono_pdf_path",
                mono_file_name_no_watermark,
                "merged_no_watermark_mono",
            )
            # Named like the mono variant (previously a fixed
            # "merged_no_watermark_dual.pdf" was used here).
            merged_no_watermark_dual_path = merge_attr(
                "no_watermark_dual_pdf_path",
                dual_file_name_no_watermark,
                "merged_no_watermark_dual",
            )

        # Create merged result
        merged_result = TranslateResult(
            mono_pdf_path=merged_mono_path,
            dual_pdf_path=merged_dual_path,
        )
        merged_result.no_watermark_mono_pdf_path = merged_no_watermark_mono_path
        merged_result.no_watermark_dual_pdf_path = merged_no_watermark_dual_path

        if merged_result.no_watermark_mono_pdf_path is None:
            merged_result.no_watermark_mono_pdf_path = merged_mono_path
        elif merged_result.mono_pdf_path is None:
            merged_result.mono_pdf_path = merged_no_watermark_mono_path

        if merged_result.no_watermark_dual_pdf_path is None:
            merged_result.no_watermark_dual_pdf_path = merged_dual_path
        elif merged_result.dual_pdf_path is None:
            merged_result.dual_pdf_path = merged_no_watermark_dual_path

        # Calculate total time
        total_time = sum(
            r.total_seconds for r in ordered if hasattr(r, "total_seconds")
        )
        merged_result.total_seconds = total_time

        return merged_result

    def _merge_pdfs(
        self, pdf_paths: list[str | Path], output_name: str, tag: str
    ) -> Path | None:
        """Merge multiple PDFs into one.

        Args:
            pdf_paths: Source PDFs, concatenated in the given order.
            output_name: File name of the merged PDF inside the output folder.
            tag: Passed on to the font-subsetting step.

        Returns:
            Path of the merged PDF, or ``None`` when ``pdf_paths`` is empty.
        """
        if not pdf_paths:
            return None

        output_path = self.config.get_output_file_path(output_name)
        merged_doc = Document()

        for pdf_path in pdf_paths:
            doc = Document(str(pdf_path))
            try:
                merged_doc.insert_pdf(doc)
            finally:
                # Release the source file as soon as its pages are copied.
                doc.close()

        merged_doc = PDFCreater.subset_fonts_in_subprocess(
            merged_doc, self.config, tag=tag
        )
        PDFCreater.save_pdf_with_timeout(
            merged_doc, str(output_path), translation_config=self.config
        )

        return output_path
point where the document should be split""" 10 | 11 | start_page: int 12 | end_page: int 13 | estimated_complexity: float = 1.0 14 | chapter_title: str | None = None 15 | 16 | 17 | class BaseSplitStrategy: 18 | """Base class for split strategies""" 19 | 20 | def determine_split_points(self, config) -> list[SplitPoint]: 21 | raise NotImplementedError 22 | 23 | 24 | class PageCountStrategy(BaseSplitStrategy): 25 | """Split document based on page count""" 26 | 27 | def __init__(self, max_pages_per_part: int = 20): 28 | self.max_pages_per_part = max_pages_per_part 29 | 30 | def determine_split_points(self, config) -> list[SplitPoint]: 31 | from pymupdf import Document 32 | 33 | doc = Document(str(config.input_file)) 34 | total_pages = doc.page_count 35 | 36 | split_points = [] 37 | current_page = 0 38 | 39 | while current_page < total_pages: 40 | end_page = min(current_page + self.max_pages_per_part, total_pages) 41 | split_points.append( 42 | SplitPoint( 43 | start_page=current_page, 44 | end_page=end_page - 1, # end_page is inclusive 45 | ) 46 | ) 47 | current_page = end_page 48 | 49 | return split_points 50 | 51 | 52 | class SplitManager: 53 | """Manages document splitting process""" 54 | 55 | def __init__(self, config=None): 56 | self.strategy = config.split_strategy 57 | 58 | def determine_split_points(self, config) -> list[SplitPoint]: 59 | """Determine where to split the document""" 60 | return self.strategy.determine_split_points(config) 61 | 62 | def estimate_part_complexity(self, split_point: SplitPoint) -> float: 63 | """Estimate the complexity of a document part""" 64 | # Simple estimation based on page count for now 65 | return ( 66 | split_point.end_page - split_point.start_page + 1 67 | ) * split_point.estimated_complexity 68 | -------------------------------------------------------------------------------- /babeldoc/tools/generate_font_metadata.py: -------------------------------------------------------------------------------- 1 | # This script is used 
def get_font_metadata(font_path) -> PdfFont:
    """Return the ``PdfFont`` record BabelDOC produces for one font file.

    A blank one-page PDF is created in memory, the font is inserted into it
    under the name ``test_font``, and the document is run through
    ``start_parse_il``. The first font of the first IL page is returned.

    Args:
        font_path: Path of the font file to inspect.
    """
    doc = pymupdf.open()
    page = doc.new_page(width=1000, height=1000)
    page.insert_font("test_font", font_path)
    # NOTE(review): the four positional None values and doc_layout_model=1 look
    # like placeholders for settings this tool does not need - confirm against
    # TranslationConfig's signature.
    translation_config = babeldoc.translation_config.TranslationConfig(
        *[None for _ in range(4)], doc_layout_model=1
    )
    translation_config.progress_monitor = babeldoc.high_level.ProgressMonitor(
        babeldoc.high_level.TRANSLATE_STAGES
    )
    translation_config.font = font_path
    il_creater = babeldoc.high_level.ILCreater(translation_config)
    il_creater.mupdf = doc
    buffer = io.BytesIO()
    doc.save(buffer)
    babeldoc.high_level.start_parse_il(
        buffer,
        doc_zh=doc,
        resfont="test_font",
        il_creater=il_creater,
        translation_config=translation_config,
    )

    il = il_creater.create_il()
    il_page = il.page[0]
    font_metadata = il_page.pdf_font[0]
    return font_metadata


def _sha3_256_of_file(path: Path) -> str:
    """Return the hex SHA3-256 digest of a file, read in 1 MiB chunks."""
    hash_ = hashlib.sha3_256()
    with path.open("rb") as f:
        while chunk := f.read(1024 * 1024):
            hash_.update(chunk)
    return hash_.hexdigest()


def main():
    """Generate ``font_metadata.json`` for every ``*.ttf`` in an assets repo.

    The repository path is taken from the command line. It must exist and
    contain a ``README.md`` file and a ``fonts`` folder; otherwise the program
    exits with a usage error.
    """
    logging.basicConfig(level=logging.INFO, handlers=[RichHandler()])
    parser = argparse.ArgumentParser(description="Get font metadata.")
    parser.add_argument(
        "assets_repo_path",
        type=str,
        help="Path to the assets repository (contains README.md and fonts/).",
    )
    args = parser.parse_args()
    repo_path = Path(args.assets_repo_path)

    # Validate with parser.error rather than assert: asserts disappear under
    # `python -O`, which would let a wrong path through silently.
    if not repo_path.exists():
        parser.error(f"Assets repo path {repo_path} does not exist.")
    if not (repo_path / "README.md").exists():
        parser.error(
            f"Assets repo path {repo_path} does not contain a README.md file."
        )
    if not (repo_path / "fonts").exists():
        parser.error(f"Assets repo path {repo_path} does not contain a fonts folder.")
    logger.info(f"Getting font metadata for {repo_path}")

    metadatas = {}
    for font_path in (repo_path / "fonts").glob("**/*.ttf"):
        logger.info(f"Getting font metadata for {font_path}")
        extracted_metadata = get_font_metadata(font_path)
        metadatas[font_path.name] = {
            "file_name": font_path.name,
            "font_name": extracted_metadata.name,
            "encoding_length": extracted_metadata.encoding_length,
            "bold": extracted_metadata.bold,
            "italic": extracted_metadata.italic,
            "monospace": extracted_metadata.monospace,
            "serif": extracted_metadata.serif,
            "ascent": extracted_metadata.ascent,
            "descent": extracted_metadata.descent,
            "sha3_256": _sha3_256_of_file(font_path),
            "size": font_path.stat().st_size,
        }

    serialized = orjson.dumps(
        metadatas,
        option=orjson.OPT_APPEND_NEWLINE | orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS,
    ).decode()
    print(f"FONT METADATA: {serialized}")
    # orjson emits UTF-8; write it back as UTF-8 regardless of the locale.
    with (repo_path / "font_metadata.json").open("w", encoding="utf-8") as f:
        f.write(serialized)


if __name__ == "__main__":
    main()
il_translated.json file in ~/.cache/babeldoc/ subdirectories. 15 | 16 | Returns: 17 | Path to the most recently modified il_translated.json file, or None if not found. 18 | """ 19 | base_dir = Path(WORKING_FOLDER) 20 | json_files = list(base_dir.glob("*/il_translated.json")) 21 | 22 | if not json_files: 23 | return None 24 | 25 | # Sort by modification time (newest first) 26 | json_files.sort(key=lambda p: p.stat().st_mtime, reverse=True) 27 | return json_files[0] 28 | 29 | 30 | def extract_fonts_from_paragraph( 31 | paragraph: dict, page_font_map: dict[str, tuple[str, str]] 32 | ) -> set[tuple[str, str]]: 33 | """ 34 | Extract all font_ids and names used in a paragraph. 35 | 36 | Args: 37 | paragraph: The paragraph dictionary 38 | page_font_map: Dictionary mapping font_id to (font_id, name) tuples 39 | 40 | Returns: 41 | Set of (font_id, name) tuples 42 | """ 43 | fonts = set() 44 | 45 | # Check if paragraph has a pdfStyle with font_id 46 | if ( 47 | "pdf_style" in paragraph 48 | and paragraph["pdf_style"] 49 | and "font_id" in paragraph["pdf_style"] 50 | ): 51 | font_id = paragraph["pdf_style"]["font_id"] 52 | if font_id in page_font_map: 53 | fonts.add(page_font_map[font_id]) 54 | 55 | # Process paragraph compositions if present 56 | if "pdf_paragraph_composition" in paragraph: 57 | for comp in paragraph["pdf_paragraph_composition"]: 58 | # Check different composition types that might contain font information 59 | 60 | # Direct pdfCharacter in composition 61 | if "pdf_character" in comp and comp["pdf_character"]: 62 | char = comp["pdf_character"] 63 | if "pdf_style" in char and "font_id" in char["pdf_style"]: 64 | font_id = char["pdf_style"]["font_id"] 65 | if font_id in page_font_map: 66 | fonts.add(page_font_map[font_id]) 67 | 68 | # PdfLine in composition 69 | elif "pdf_line" in comp and comp["pdf_line"]: 70 | line = comp["pdf_line"] 71 | if "pdf_character" in line: 72 | for char in line["pdf_character"]: 73 | if "pdf_style" in char and "font_id" in 
char["pdf_style"]: 74 | font_id = char["pdf_style"]["font_id"] 75 | if font_id in page_font_map: 76 | fonts.add(page_font_map[font_id]) 77 | 78 | # PdfFormula in composition 79 | elif "pdf_formula" in comp and comp["pdf_formula"]: 80 | formula = comp["pdf_formula"] 81 | if "pdf_character" in formula: 82 | for char in formula["pdf_character"]: 83 | if "pdf_style" in char and "font_id" in char["pdf_style"]: 84 | font_id = char["pdf_style"]["font_id"] 85 | if font_id in page_font_map: 86 | fonts.add(page_font_map[font_id]) 87 | 88 | # PdfSameStyleCharacters in composition 89 | elif ( 90 | "pdf_same_style_characters" in comp 91 | and comp["pdf_same_style_characters"] 92 | ): 93 | same_style = comp["pdf_same_style_characters"] 94 | if "pdf_style" in same_style and "font_id" in same_style["pdf_style"]: 95 | font_id = same_style["pdf_style"]["font_id"] 96 | if font_id in page_font_map: 97 | fonts.add(page_font_map[font_id]) 98 | 99 | # PdfSameStyleUnicodeCharacters in composition 100 | elif ( 101 | "pdf_same_style_unicode_characters" in comp 102 | and comp["pdf_same_style_unicode_characters"] 103 | ): 104 | same_style_unicode = comp["pdf_same_style_unicode_characters"] 105 | if ( 106 | "pdf_style" in same_style_unicode 107 | and same_style_unicode["pdf_style"] is not None 108 | and "font_id" in same_style_unicode["pdf_style"] 109 | ): 110 | font_id = same_style_unicode["pdf_style"]["font_id"] 111 | if font_id in page_font_map: 112 | fonts.add(page_font_map[font_id]) 113 | 114 | return fonts 115 | 116 | 117 | def find_fonts_by_debug_id(json_path: Path, debug_id_regex: str) -> dict[str, str]: 118 | """ 119 | Find all fonts used in paragraphs with matching debug_id. 
120 | 121 | Args: 122 | json_path: Path to the il_translated.json file 123 | debug_id_regex: Regular expression to match debug_id values 124 | 125 | Returns: 126 | Dictionary mapping font_ids to font names 127 | """ 128 | # Load and parse JSON 129 | with json_path.open("rb") as f: 130 | doc_data = orjson.loads(f.read()) 131 | 132 | # Compile regex pattern (case insensitive) 133 | pattern = re.compile(debug_id_regex.strip(" \"'"), re.IGNORECASE) 134 | 135 | # Set to collect all found font information 136 | found_fonts = set() 137 | 138 | # Process each page 139 | for page in doc_data.get("page", []): 140 | # Create a mapping of font_id to (font_id, name) tuples for this page 141 | page_font_map = {} 142 | for font in page.get("pdf_font", []): 143 | if "font_id" in font and "name" in font: 144 | page_font_map[font["font_id"]] = (font["font_id"], font["name"]) 145 | 146 | # Check each paragraph 147 | for paragraph in page.get("pdf_paragraph", []): 148 | # Check if paragraph has debug_id and if it matches the pattern 149 | debug_id = paragraph.get("debug_id") 150 | if debug_id and pattern.search(debug_id): 151 | # Get all fonts used in this paragraph 152 | paragraph_fonts = extract_fonts_from_paragraph(paragraph, page_font_map) 153 | found_fonts.update(paragraph_fonts) 154 | 155 | # Convert set of tuples to dictionary 156 | return dict(found_fonts) 157 | 158 | 159 | def main(): 160 | parser = argparse.ArgumentParser( 161 | description="Extract fonts from paragraphs with matching debug_id" 162 | ) 163 | parser.add_argument( 164 | "debug_id_regex", nargs="+", help="Regular expression to match debug_id values" 165 | ) 166 | parser.add_argument( 167 | "--json-path", 168 | help="Path to il_translated.json (if not provided, will use the latest file)", 169 | ) 170 | 171 | args = parser.parse_args() 172 | 173 | # Determine JSON file path 174 | json_path = None 175 | if args.json_path: 176 | json_path = Path(args.json_path) 177 | if not json_path.exists(): 178 | print(f"Error: 
File not found: {json_path}") 179 | return 1 180 | else: 181 | json_path = find_latest_il_json() 182 | if not json_path: 183 | print("Error: Could not find any il_translated.json file") 184 | return 1 185 | 186 | print(f"Using JSON file: {json_path}") 187 | 188 | # Find fonts matching the debug_id pattern 189 | fonts = find_fonts_by_debug_id(json_path, "|".join(args.debug_id_regex)) 190 | 191 | # Output the results 192 | if fonts: 193 | print( 194 | f"Found {len(fonts)} fonts in paragraphs matching debug_id pattern: {args.debug_id_regex}" 195 | ) 196 | print(json.dumps(fonts, indent=2, ensure_ascii=False)) 197 | else: 198 | print( 199 | f"No fonts found for paragraphs matching debug_id pattern: {args.debug_id_regex}" 200 | ) 201 | 202 | return 0 203 | 204 | 205 | if __name__ == "__main__": 206 | exit(main()) 207 | -------------------------------------------------------------------------------- /babeldoc/tools/italic_recognize_tool.py: -------------------------------------------------------------------------------- 1 | # Identify non-formula italic fonts that were incorrectly classified as formulas in BableDOC translation results (intermediate) 2 | 3 | import json 4 | 5 | import babeldoc.tools.italic_assistance as italic_assistance 6 | from babeldoc.document_il.midend.styles_and_formulas import StylesAndFormulas 7 | from babeldoc.translation_config import TranslationConfig 8 | from rich.console import Console 9 | from rich.table import Table 10 | 11 | console = Console() 12 | 13 | json_path = italic_assistance.find_latest_il_json() 14 | 15 | fonts = [] 16 | 17 | # Read intermediate representation 18 | with json_path.open(encoding="utf-8") as f: 19 | pdf_data = json.load(f) 20 | 21 | for page_index, page in enumerate(pdf_data["page"]): 22 | for paragraph_index, paragraph_content in enumerate(page["pdf_paragraph"]): 23 | font_debug_id = paragraph_content["debug_id"] 24 | if font_debug_id: 25 | # Create page font mapping 26 | page_font_map = {} 27 | for font in 
# Identify non-formula italic fonts that were incorrectly classified as formulas
# in BabelDOC translation results (intermediate representation).
#
# The script loads the most recent il_translated.json, collects the first
# occurrence of every font name used in a paragraph that has a debug_id, and
# prints a table showing whether StylesAndFormulas treats it as a formula font.

import json

import babeldoc.tools.italic_assistance as italic_assistance
from babeldoc.document_il.midend.styles_and_formulas import StylesAndFormulas
from babeldoc.translation_config import TranslationConfig
from rich.console import Console
from rich.table import Table

console = Console()

json_path = italic_assistance.find_latest_il_json()
if json_path is None:
    # find_latest_il_json returns None when no intermediate file exists yet.
    raise SystemExit("Error: Could not find any il_translated.json file")

# (page_index, font_name, paragraph_index, debug_id) of each first occurrence
fonts = []
seen_font_names = set()

# Read intermediate representation
with json_path.open(encoding="utf-8") as f:
    pdf_data = json.load(f)

for page_index, page in enumerate(pdf_data.get("page") or []):
    # The font map only depends on the page, so build it once per page.
    page_font_map = {
        font["font_id"]: (font["font_id"], font["name"])
        for font in page.get("pdf_font") or []
        if "font_id" in font and "name" in font
    }

    for paragraph_index, paragraph_content in enumerate(
        page.get("pdf_paragraph") or []
    ):
        font_debug_id = paragraph_content.get("debug_id")
        if not font_debug_id:
            continue

        paragraph_fonts = italic_assistance.extract_fonts_from_paragraph(
            paragraph_content, page_font_map
        )
        # Sorted so the table order does not depend on set iteration order.
        for _font_id, font_name in sorted(paragraph_fonts):
            if font_name in seen_font_names:
                continue
            seen_font_names.add(font_name)
            fonts.append((page_index, font_name, paragraph_index, font_debug_id))

# Initialize checker
# NOTE(review): the three positional None values presumably stand in for
# settings the checker does not use - confirm against TranslationConfig.
translation_config = TranslationConfig(
    *[None for _ in range(3)], lang_out="zh_cn", doc_layout_model=1
)
checker = StylesAndFormulas(translation_config)

# Create table
table = Table(title="Font Recognition Results")
table.add_column("Page #", justify="center", style="cyan")
table.add_column("Paragraph #", justify="center", style="cyan")
table.add_column("DEBUG_ID", justify="center", style="cyan")
table.add_column("Font Name", style="magenta")
table.add_column("Recognition Result", justify="center")

# Output results
for page_index, font_name, paragraph_index, font_debug_id in fonts:
    if checker.is_formulas_font(font_name):
        verdict = "[bold red]Formula Font[/bold red]"
    else:
        verdict = "[bold blue]Non-Formula Font[/bold blue]"
    table.add_row(
        str(page_index),
        str(paragraph_index),
        str(font_debug_id),
        font_name,
        verdict,
    )

# Print table
console.print(table)
Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 
45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | aw@funstory.ai . 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. 
No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder](https://github.com/mozilla/diversity). 123 | 124 | [homepage]: https://www.contributor-covenant.org 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | https://www.contributor-covenant.org/faq. Translations are available at 128 | https://www.contributor-covenant.org/translations. 
129 | -------------------------------------------------------------------------------- /docs/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to BabelDOC 2 | 3 | ## How to contribute to BabelDOC 4 | 5 | ### **About Language** 6 | 7 | - Issues can be in Chinese or English 8 | - PRs are limited to English 9 | - All documents are provided in English only 10 | 11 | ### **Did you find a bug?** 12 | 13 | - **Ensure the bug was not already reported** by searching on GitHub under [Issues](https://github.com/funstory-ai/BabelDOC/issues). 14 | 15 | Please pay special attention to: 16 | 17 | 1. Known compatibility issues with pdf2zh - see [#20](https://github.com/funstory-ai/BabelDOC/issues/20) for details 18 | 2. Reported edge cases and limitations from downstream applications - see [#23](https://github.com/funstory-ai/BabelDOC/issues/23) for discussion 19 | 20 | - If you're unable to find an open issue addressing the problem, [open a new one](https://github.com/funstory-ai/BabelDOC/issues/new?template=bug_report.md). Be sure to include a **title and clear description**, as much relevant information as possible. 21 | 22 | ### **If you wish to request changes or new features** 23 | 24 | - Suggest your change in the [Issues](https://github.com/funstory-ai/BabelDOC/issues/new?template=feature_request.md) section. 25 | 26 | ### **If you wish to add more translators** 27 | 28 | - This project is not intended for direct end-user use, and the supported translators are mainly for debugging purposes. Unless it clearly helps with development and debugging, PRs for directly adding translators will not be accepted. 29 | - You can directly use [PDFMathTranslate](https://github.com/Byaidu/PDFMathTranslate) to get support for more translators. 
30 | 31 | ### **If you wish to contribute to BabelDOC** 32 | 33 | > [!TIP] 34 | > 35 | > If you have any questions about the source code or related matters, please contact the maintainer at aw@funstory.ai . 36 | > 37 | > You can also raise questions in [Issues](https://github.com/funstory-ai/BabelDOC/issues). 38 | > 39 | > You can contact the maintainers in the pdf2zh discussion group. 40 | > 41 | > Due to the current high rate of code changes, this project only accepts small PRs. If you would like to suggest a change and you include a patch as a proof-of-concept, that would be great. However, please do not be offended if we rewrite your patch from scratch. 42 | 43 | [//]: # (> We welcome pull requests and will review your contributions.) 44 | 45 | 46 | 1. Fork this repository and clone it locally. 47 | 2. Use `doc/deploy.sh` to set up the development environment. 48 | 3. Create a new branch and make code changes on that branch. `git checkout -b feature/` 49 | 4. Perform development and ensure the code meets the requirements. 50 | 51 | 5. Commit your changes to your new branch. 52 | 53 | ``` 54 | git add . 55 | 56 | git commit -m "" 57 | ``` 58 | 59 | 6. Push to your repository: `git push origin feature/`. 60 | 61 | 7. Create a PR on GitHub and provide a detailed description. 62 | 63 | 8. Ensure all automated checks pass. 64 | 65 | #### Basic Requirements 66 | 67 | ##### Workflow 68 | 69 | 1. Please create a fork on the main branch and develop on the forked branch. 70 | 71 | - When submitting a Pull Request (PR), please provide detailed descriptions of the changes. 72 | 73 | - If the PR fails automated checks (showing checks failed and red cross marks), please review the corresponding details and modify the submission to ensure the new PR passes automated checks. 74 | 75 | 2. Development and Testing 76 | 77 | - Use the `uv run BabelDOC` command for development and testing. 78 | 79 | - When you need to print logs, please use `log.debug()` to print info. 
**DO NOT USE `print()`** 80 | 81 | - Code formatting 82 | 83 | 3. Dependency Updates 84 | 85 | - If new dependencies are introduced, please update the dependency list in pyproject.toml accordingly. 86 | 87 | - It is recommended to use the `uv add` command for adding dependencies. 88 | 89 | 4. Documentation Updates 90 | 91 | - If new command-line options are added, please update the command-line options list in README.md accordingly. 92 | 93 | 5. Commit Messages 94 | 95 | - Use [Conventional Commits](https://www.conventionalcommits.org/en/v1.0.0/), for example: feat(translator): add openai. 96 | 97 | 6. Coding Style 98 | 99 | - Please ensure submitted code follows basic coding style guidelines. 100 | - Use pep8-naming. 101 | - Comments should be in English. 102 | - Follow these specific Python coding style guidelines: 103 | 104 | a. Naming Conventions: 105 | 106 | - Class names should use CapWords (PascalCase): `class TranslatorConfig` 107 | - Function and variable names should use snake_case: `def process_text()`, `word_count = 0` 108 | - Constants should be UPPER_CASE: `MAX_RETRY_COUNT = 3` 109 | - Private attributes should start with underscore: `_internal_state` 110 | 111 | b. Code Layout: 112 | 113 | - Use 4 spaces for indentation (no tabs) 114 | - Maximum line length is 88 characters (compatible with black formatter) 115 | - Add 2 blank lines before top-level classes and functions 116 | - Add 1 blank line before class methods 117 | - No trailing whitespace 118 | 119 | c. Imports: 120 | 121 | - Imports should be on separate lines: `import os\nimport sys` 122 | - Imports should be grouped in the following order: 123 | 1. Standard library imports 124 | 2. Related third party imports 125 | 3. Local application/library specific imports 126 | - Use absolute imports over relative imports 127 | 128 | d. String Formatting: 129 | 130 | - Prefer f-strings for string formatting: `f"Count: {count}"` 131 | - Use double quotes for docstrings 132 | 133 | e. 
Type Hints: 134 | 135 | - Use type hints for function arguments and return values 136 | - Example: `def translate_text(text: str) -> str:` 137 | 138 | f. Documentation: 139 | 140 | - All public functions and classes must have docstrings 141 | - Use Google style for docstrings 142 | - Example: 143 | 144 | ```python 145 | def function_name(arg1: str, arg2: int) -> bool: 146 | """Short description of function. 147 | 148 | Args: 149 | arg1: Description of arg1 150 | arg2: Description of arg2 151 | 152 | Returns: 153 | Description of return value 154 | 155 | Raises: 156 | ValueError: Description of when this error occurs 157 | """ 158 | ``` 159 | 160 | The existing codebase does not comply with the above specifications in some aspects. Contributions for modifications are welcome. 161 | 162 | #### How to modify the intermediate representation 163 | 164 | The intermediate representation is described by [il_version_1.rnc](https://github.com/funstory-ai/BabelDOC/blob/main/BabelDOC/document_il/il_version_1.rnc). Corresponding Python data classes are generated using [xsdata](https://xsdata.readthedocs.io/en/latest/). The files `il_version_1.rng`, `il_version_1.xsd`, and `il_version_1.py` are auto-generated and must not be manually modified. 
165 | 166 | ##### Format RNC file 167 | 168 | ```bash 169 | trang babeldoc/document_il/il_version_1.rnc babeldoc/document_il/il_version_1.rnc 170 | ``` 171 | 172 | ##### Generate RNG, XSD and Python classes 173 | 174 | ```bash 175 | # Generate RNG from RNC 176 | trang babeldoc/document_il/il_version_1.rnc babeldoc/document_il/il_version_1.rng 177 | 178 | # Generate XSD from RNC 179 | trang babeldoc/document_il/il_version_1.rnc babeldoc/document_il/il_version_1.xsd 180 | 181 | # Generate Python classes from XSD 182 | xsdata generate babeldoc/document_il/il_version_1.xsd --package babeldoc.document_il 183 | ``` 184 | 185 | ##### Profile memory usage 186 | 187 | ```bash 188 | uv run memray run --native --aggregate babeldoc/main.py -c yadt.toml 189 | ``` -------------------------------------------------------------------------------- /docs/CONTRIBUTOR_REWARD.md: -------------------------------------------------------------------------------- 1 | # BabelDOC/PDFMathTranslate 贡献者奖励规则 2 | 3 | ## 月度活跃贡献者奖励规则 4 | 5 | ### 一、资格标准 6 | #### **贡献类型要求** 7 | - 需提交 **至少 1 个有效 PR**(Pull Request),或进行 **PR 审核、文档编写** 等贡献。 8 | - 有效贡献定义: 9 | - 非简单的文档错别字修复 10 | - 非简单的代码格式化调整(如仅调整缩进、空格等) 11 | - 需做出实质性贡献(如功能开发、Bug 修复、性能优化、架构调整、技术文档编写、PR 审核等) 12 | - 示例合格贡献:新增功能模块、修复逻辑错误、优化算法效率、编写技术文档等 13 | 14 | #### **时间范围** 15 | - 每月 1 日至月末最后一天合并的 PR 计入当月统计 16 | 17 | ### 二、申请流程 18 | #### **申请条件** 19 | - PR 需被成功合并至[funstory-ai/BabelDOC](https://github.com/funstory-ai/BabelDOC/pulls) 仓库或 [Byaidu/PDFMathTranslate](https://github.com/Byaidu/PDFMathTranslate/pulls)的主分支。 20 | - 若目标为 [funstory-ai/BabelDOC](https://github.com/funstory-ai/BabelDOC/pulls) 的 PR 未被合并,但被维护者认定为有价值的概念验证,同样符合条件。 21 | - 审核 PR、撰写 wiki 等贡献也必须是以上两个仓库。 22 | - 同一贡献者每月仅可申请一次(无论提交 PR 数量) 23 | - 同一贡献者每月最多可以获得 1 个兑换码 24 | - 对于 PR,只有发起者可以申请兑换码 25 | - 仅可使用当月的贡献申请兑换码(特殊情况请联系 aw@funstory.ai 说明) 26 | 27 | #### **申请方式** 28 | - 发送邮件至 **aw@funstory.ai** 29 | - 邮件标题格式:`[贡献者会员兑换码申请] GitHub用户名-月份`(例:`[贡献者会员兑换码申请] awwaawwa-2024-07`) 30 | - 邮件正文需包含: 31 | - GitHub 
用户名 32 | - 合并 PR 的完整链接 33 | - 附件要求: 34 | - PR 页面完整截图(需包含合并状态、仓库名称及点击头像后弹出来的侧边栏,如下图所示) 35 | 36 | ![附件示例](https://s.immersivetranslate.com/assets/r2-uploads/images/babeldoc-contributor_reward_example.png) 37 | 38 | #### **奖励说明** 39 | - 奖励内容:[沉浸式翻译(Immersive Translate)](https://immersivetranslate.com/zh-Hans/pricing/)月度会员兑换码 40 | - 兑换码使用:在[沉浸式翻译官网兑换页](https://immersivetranslate.com/zh-Hans/exchange)输入即可激活 41 | - 会员权益:沉浸式翻译 Pro 会员一个月(详见[官网价格页](https://immersivetranslate.com/zh-Hans/pricing/)说明) 42 | - 兑换码为专属福利,不可转让 43 | 44 | ### 三、审核与发放 45 | #### **审核周期** 46 | - 我们会尽力在收到申请邮件后 1 个工作日内完成审核 47 | - 审核时间可能因申请数量、审核复杂度等因素有所延长 48 | - 审核通过后,兑换码将通过邮件方式发送 49 | - 若审核未通过,我们会通过邮件说明原因 50 | 51 | #### **兑换码规则** 52 | - 使用方式:[官网兑换页](https://immersivetranslate.com/zh-Hans/exchange)输入兑换码激活 53 | - 权益内容:月度会员(具体权益见[官网价格页](https://immersivetranslate.com/zh-Hans/pricing/)说明) 54 | - 不可转让 55 | 56 | ### 四、注意事项 57 | #### **禁止行为** 58 | - 将完整功能拆分为多个无关 PR 提交 59 | - 提交质量不合格或具有潜在危害的代码 60 | - 提供虚假或误导性的申请材料 61 | 62 | #### **特别说明** 63 | - funstory.ai 保留对贡献价值的评估权、规则的最终解释权等所有必要权利 64 | - 规则如有实质性更新(格式调整等除外),将提前 1 天在 [BabelDOC GitHub PR](https://github.com/funstory-ai/BabelDOC/pulls) 公告 65 | - 过期未使用的兑换码不予补发 66 | - 自 2025 年 2 月 1 日起的贡献可以申请兑换码 67 | - 为了确认您是 Pull Request (PR) 的发起者,防止他人冒领,我们可能会要求您使用发起者账号在 PR 下方留言指定的随机数字。 68 | 69 | ## 常见问题解答(FAQ) 70 | 71 | **Q:如何判断文档翻译贡献是否有效?** 72 | 73 | A:系统性的人工翻译(如完整章节的翻译并经过人工校对)视为有效贡献。零散段落翻译或仅依赖机器翻译的内容不计入有效贡献。 74 | 75 | **Q:兑换码过期了可以补发吗?** 76 | 77 | A:为确保公平性,过期的兑换码将不予补发,请在有效期内及时使用。 78 | 79 | **Q:为什么这个文档是中文的?** 80 | 81 | A:因为目前应该是中文贡献者多吧,所以就先写中文的。后面再撰写英文版的。 82 | 83 | --- 84 | **规则公示**:本规则文档存放于 BabelDOC 仓库 [CONTRIBUTOR_REWARD.md](https://github.com/funstory-ai/BabelDOC/blob/main/docs/CONTRIBUTOR_REWARD.md),并在 [Contributor Reward - BabelDOC](https://funstory-ai.github.io/BabelDOC/CONTRIBUTOR_REWARD/) 展示。 85 | -------------------------------------------------------------------------------- /docs/ImplementationDetails/AsyncTranslate/AsyncTranslate.md: 
-------------------------------------------------------------------------------- 1 | # Async Translation API 2 | 3 | > [!NOTE] 4 | > This documentation may contain AI-generated content. While we strive for accuracy, there might be inaccuracies. Please report any issues via: 5 | > 6 | > - [GitHub Issues](https://github.com/funstory-ai/yadt/issues) 7 | > - Community contribution (PRs welcome!) 8 | 9 | ## Overview 10 | 11 | The `babeldoc.high_level.async_translate` function provides an asynchronous interface for translating PDF files with real-time progress reporting. This function yields progress events that can be used to update progress bars or other UI elements. 12 | 13 | ## Usage 14 | 15 | ```python linenums="1" 16 | async def translate_with_progress(): 17 | config = TranslationConfig( 18 | input_file="example.pdf", 19 | translator=your_translator, 20 | # ... other configuration options 21 | ) 22 | 23 | try: 24 | async for event in async_translate(config): 25 | if event["type"] == "progress_update": 26 | print(f"Progress: {event['overall_progress']}%") 27 | elif event["type"] == "finish": 28 | result = event["translate_result"] 29 | print(f"Translation completed: {result.original_pdf_path}") 30 | elif event["type"] == "error": 31 | print(f"Error occurred: {event['error']}") 32 | break 33 | except asyncio.CancelledError: 34 | print("Translation was cancelled") 35 | except KeyboardInterrupt: 36 | print("Translation was interrupted") 37 | ``` 38 | 39 | ## Event Types 40 | 41 | The function yields different types of events during the translation process: 42 | 43 | ### 1. Progress Start Event 44 | 45 | Emitted when a translation stage begins: 46 | 47 | ```python 48 | { 49 | "type": "progress_start", 50 | "stage": str, # Name of the current stage 51 | "stage_progress": float, # Always 0.0 52 | "stage_current": int, # Current progress count (0) 53 | "stage_total": int # Total items to process in this stage 54 | } 55 | ``` 56 | 57 | ### 2. 
Progress Update Event 58 | 59 | Emitted periodically during translation (controlled by report_interval, default 0.1s): 60 | 61 | ```python 62 | { 63 | "type": "progress_update", 64 | "stage": str, # Name of the current stage 65 | "stage_progress": float, # Progress percentage of current stage (0-100) 66 | "stage_current": int, # Current items processed in this stage 67 | "stage_total": int, # Total items to process in this stage 68 | "overall_progress": float # Overall translation progress (0-100) 69 | } 70 | ``` 71 | 72 | ### 3. Progress End Event 73 | 74 | Emitted when a stage completes: 75 | 76 | ```python 77 | { 78 | "type": "progress_end", 79 | "stage": str, # Name of the completed stage 80 | "stage_progress": float, # Always 100.0 81 | "stage_current": int, # Equal to stage_total 82 | "stage_total": int, # Total items processed in this stage 83 | "overall_progress": float # Overall translation progress (0-100) 84 | } 85 | ``` 86 | 87 | ### 4. Finish Event 88 | 89 | Emitted when translation completes successfully: 90 | 91 | ```python 92 | { 93 | "type": "finish", 94 | "translate_result": TranslateResult # Contains paths to translated files and timing info 95 | } 96 | ``` 97 | 98 | ### 5. Error Event 99 | 100 | Emitted if an error occurs during translation: 101 | 102 | ```python 103 | { 104 | "type": "error", 105 | "error": str # Error message 106 | } 107 | ``` 108 | 109 | ## Translation Stages 110 | 111 | The translation process goes through the following stages in order: 112 | 113 | 1. ILCreater 114 | 2. LayoutParser 115 | 3. ParagraphFinder 116 | 4. StylesAndFormulas 117 | 5. ILTranslator 118 | 6. Typesetting 119 | 7. FontMapper 120 | 8. PDFCreater 121 | 122 | Each stage will emit its own set of progress events. 123 | 124 | ## Cancellation 125 | 126 | The translation process can be cancelled in several ways: 127 | 128 | 1. By raising a `CancelledError` (e.g., when using `asyncio.Task.cancel()`) 129 | 2. 
Through `KeyboardInterrupt` (e.g., when the user presses Ctrl+C) 130 | 3. By calling the `translation_config.cancel_translation()` method 131 | 132 | Example of programmatic cancellation: 133 | 134 | ```python linenums="1" 135 | async def translate_with_cancellation(): 136 | config = TranslationConfig( 137 | input_file="example.pdf", 138 | translator=your_translator, 139 | # ... other configuration options 140 | ) 141 | 142 | try: 143 | # Start translation in another task 144 | translation_task = asyncio.create_task(process_translation(config)) 145 | 146 | # Simulate some condition that requires cancellation 147 | await asyncio.sleep(5) 148 | config.cancel_translation() # This will trigger cancellation 149 | 150 | await translation_task # Wait for the task to finish 151 | except asyncio.CancelledError: 152 | print("Translation was cancelled") 153 | 154 | async def process_translation(config): 155 | async for event in async_translate(config): 156 | if event["type"] == "error": 157 | if isinstance(event["error"], asyncio.CancelledError): 158 | print("Translation was cancelled") 159 | break 160 | print(f"Error occurred: {event['error']}") 161 | break 162 | # ... handle other events ... 163 | ``` 164 | 165 | When cancelled: 166 | - The function will log the cancellation reason 167 | - All resources will be cleaned up properly 168 | - Any ongoing translation tasks will be stopped 169 | - A final error event with `CancelledError` will be emitted 170 | - The function will exit gracefully 171 | 172 | ## Error Handling 173 | 174 | When an error occurs during translation: 175 | 1. It is logged with full traceback (if debug mode is enabled) 176 | 2. It is reported through an error event 177 | 3. The event stream stops after the error event 178 | 4. Resources are cleaned up properly before exiting 179 | 180 | It's recommended to handle these events appropriately in your application to provide feedback to users. The example in the Usage section shows a basic error handling pattern.
-------------------------------------------------------------------------------- /docs/ImplementationDetails/ILTranslator/ILTranslator.md: -------------------------------------------------------------------------------- 1 | # Intermediate Layer Translator 2 | 3 | > [!NOTE] 4 | > This documentation may contain AI-generated content. While we strive for accuracy, there might be inaccuracies. Please report any issues via: 5 | > 6 | > - [GitHub Issues](https://github.com/funstory-ai/yadt/issues) 7 | > - Community contribution (PRs welcome!) 8 | 9 | ## Background 10 | 11 | After formula and style processing, we need to translate the document while preserving all formatting, formulas, and styles. The intermediate layer translator handles this complex task by using placeholders and style preservation techniques. 12 | 13 | ## Goal 14 | 15 | 1. Translate text while preserving document structure 16 | 2. Maintain formulas and special formatting 17 | 3. Handle rich text with different styles 18 | 4. Support concurrent translation for better performance 19 | 20 | ## Specific Implementation 21 | 22 | The translation process consists of several key steps: 23 | 24 | ### Step 1: Translation Preparation 25 | 26 | 1. Process paragraphs: 27 | - Skip vertical text 28 | - Handle single-component paragraphs directly 29 | - Process multi-component paragraphs with placeholders 30 | 31 | 2. Create placeholders: 32 | - Formula placeholders for mathematical expressions 33 | - Rich text placeholders for styled text 34 | - Ensure placeholder uniqueness within each paragraph 35 | 36 | ### Step 2: Translation Input Creation 37 | 38 | 1. Analyze paragraph components: 39 | - Regular text components 40 | - Formula components 41 | - Styled text components 42 | 43 | 2. Handle special cases: 44 | - Skip pure formula paragraphs 45 | - Preserve original text when style matches base style 46 | - Handle font mapping cases 47 | 48 | ### Step 3: Translation Execution 49 | 50 | 1. 
Concurrent translation: 51 | - Use thread pool for parallel processing 52 | - Control QPS (Queries Per Second) 53 | - Track translation progress 54 | 55 | 2. Translation tracking: 56 | - Record original text 57 | - Record translated text 58 | - Save tracking information for debugging 59 | 60 | ### Step 4: Translation Output Processing 61 | 62 | 1. Parse translated text: 63 | - Extract text between placeholders 64 | - Restore formulas at placeholder positions 65 | - Restore rich text with original styles 66 | 67 | 2. Create new paragraph components: 68 | - Maintain style information 69 | - Preserve formula positioning 70 | - Handle empty text segments 71 | 72 | ## Additional Features 73 | 74 | 1. Style preservation: 75 | - Maintains original text styles 76 | - Handles font size variations 77 | - Preserves formatting attributes 78 | 79 | 2. Formula handling: 80 | - Preserves formula integrity 81 | - Maintains formula positioning 82 | - Supports complex mathematical expressions 83 | 84 | 3. Debug support: 85 | - Translation tracking 86 | - JSON output for debugging 87 | - Detailed logging 88 | 89 | ## Limitations 90 | 91 | 1. Vertical text is not supported 92 | 93 | 2. Complex nested styles might not be perfectly preserved 94 | 95 | 3. Placeholder conflicts could occur in rare cases 96 | 97 | 4. Translation quality depends on external translation engine 98 | 99 | ## Configuration Options 100 | 101 | The translation process can be customized through `TranslationConfig`: 102 | 103 | 1. `qps`: Maximum queries per second for translation 104 | 2. `debug`: Enable/disable debug mode and tracking 105 | 3. Translation engine specific settings -------------------------------------------------------------------------------- /docs/ImplementationDetails/PDFCreation/PDFCreation.md: -------------------------------------------------------------------------------- 1 | # PDF Creation 2 | 3 | > [!NOTE] 4 | > This documentation may contain AI-generated content. 
While we strive for accuracy, there might be inaccuracies. Please report any issues via: 5 | > 6 | > - [GitHub Issues](https://github.com/funstory-ai/yadt/issues) 7 | > - Community contribution (PRs welcome!) 8 | 9 | ## Background 10 | 11 | After translation and typesetting, we need to create the final PDF document that preserves all the formatting, styles, and layout of the original document while containing the translated text. The PDF creation process handles this final step. 12 | 13 | ## Goal 14 | 15 | 1. Create a new PDF document with translated content 16 | 2. Preserve all original formatting and styles 17 | 3. Support both monolingual and dual-language output 18 | 4. Maintain font consistency and character encoding 19 | 5. Optimize the output file size and performance 20 | 21 | ## Specific Implementation 22 | 23 | The PDF creation process consists of several key steps: 24 | 25 | ### Step 1: Font Management 26 | 27 | 1. Font initialization: 28 | - Add required fonts to the document 29 | - Map font identifiers 30 | - Handle font encoding lengths 31 | 32 | 2. Font availability checking: 33 | - Check available fonts for each page 34 | - Handle XObject font requirements 35 | - Manage font resources 36 | 37 | 3. Font subsetting: 38 | - Optimize font usage 39 | - Reduce file size 40 | - Maintain character support 41 | 42 | ### Step 2: Content Rendering 43 | 44 | 1. Character processing: 45 | - Handle individual characters 46 | - Process character encodings 47 | - Manage character positioning 48 | 49 | 2. Graphics state handling: 50 | - Process color spaces 51 | - Handle transparency 52 | - Manage graphic state instructions 53 | 54 | 3. XObject management: 55 | - Process form XObjects 56 | - Handle drawing operations 57 | - Maintain XObject hierarchy 58 | 59 | ### Step 3: Document Assembly 60 | 61 | 1. Page construction: 62 | - Build page content 63 | - Process page resources 64 | - Handle page boundaries 65 | 66 | 2. 
Content stream creation: 67 | - Generate drawing operations 68 | - Handle text positioning 69 | - Manage content streams 70 | 71 | 3. Resource management: 72 | - Handle font resources 73 | - Manage XObject resources 74 | - Process graphic states 75 | 76 | ### Step 4: Output Generation 77 | 78 | 1. Monolingual output: 79 | - Create translated-only PDF 80 | - Optimize file size 81 | - Apply compression 82 | 83 | 2. Dual-language output: 84 | - Combine original and translated pages 85 | - Handle page ordering 86 | - Maintain document structure 87 | 88 | 3. File optimization: 89 | - Apply garbage collection 90 | - Enable compression 91 | - Optimize for linear reading 92 | 93 | ## Additional Features 94 | 95 | 1. Font handling: 96 | - Support for CID fonts 97 | - Font subsetting 98 | - Font resource management 99 | 100 | 2. Document optimization: 101 | - File size reduction 102 | - Performance optimization 103 | - Resource cleanup 104 | 105 | 3. Debug support: 106 | - Decompressed output 107 | - Debug information 108 | - Progress tracking 109 | 110 | ## Limitations 111 | 112 | 1. Font support: 113 | - Limited to available font formats 114 | - Font subsetting restrictions 115 | - Character encoding constraints 116 | 117 | 2. File size: 118 | - Dual-language output increases size 119 | - Font embedding impact 120 | - Resource duplication 121 | 122 | 3. Performance considerations: 123 | - Processing time for large documents 124 | - Memory usage during creation 125 | - Optimization overhead 126 | 127 | ## Configuration Options 128 | 129 | The PDF creation process can be customized through `TranslationConfig`: 130 | 131 | 1. Output options: 132 | - `no_mono`: Disable monolingual output 133 | - `no_dual`: Disable dual-language output 134 | - Output file naming patterns 135 | 136 | 2. Optimization settings: 137 | - Compression options 138 | - Garbage collection 139 | - Font subsetting 140 | 141 | 3. 
Debug options: 142 | - Debug mode 143 | - Decompressed output 144 | - Progress tracking -------------------------------------------------------------------------------- /docs/ImplementationDetails/PDFParsing/PDFParsing.md: -------------------------------------------------------------------------------- 1 | # PDF Parsing and Intermediate Layer Creation 2 | 3 | > [!NOTE] 4 | > This documentation may contain AI-generated content. While we strive for accuracy, there might be inaccuracies. Please report any issues via: 5 | > 6 | > - [GitHub Issues](https://github.com/funstory-ai/yadt/issues) 7 | > - Community contribution (PRs welcome!) 8 | 9 | ## Background 10 | 11 | The first step in the translation process is to parse the PDF document and create an intermediate layer (IL) representation. This step involves extracting text, styles, formulas, and layout information from the PDF while maintaining their relationships and properties. 12 | 13 | ## Goal 14 | 15 | 1. Extract text content while preserving character-level information 16 | 2. Maintain font and style information 17 | 3. Preserve document structure and layout 18 | 4. Handle special elements like XObjects and graphics 19 | 5. Create a structured intermediate representation for later processing 20 | 21 | ## Specific Implementation 22 | 23 | The parsing process consists of several key components working together: 24 | 25 | ### Step 1: PDF Interpreter (PDFPageInterpreterEx) 26 | 27 | 1. Page content processing: 28 | - Parse PDF operators and their parameters 29 | - Handle graphics state operations 30 | - Process text and font operations 31 | - Manage XObject rendering 32 | 33 | 2. Graphics filtering: 34 | - Filter non-formula lines 35 | - Handle color space operations 36 | - Process stroke and fill operations 37 | 38 | 3. XObject handling: 39 | - Process form XObjects 40 | - Handle image XObjects 41 | - Maintain XObject hierarchy 42 | 43 | ### Step 2: PDF Converter (PDFConverterEx) 44 | 45 | 1. 
Character processing: 46 | - Extract character information 47 | - Maintain character positions 48 | - Preserve style attributes 49 | 50 | 2. Layout management: 51 | - Handle page boundaries 52 | - Process figure elements 53 | - Manage coordinate systems 54 | 55 | 3. Font handling: 56 | - Map font identifiers 57 | - Process font metadata 58 | - Handle CID fonts 59 | 60 | ### Step 3: Intermediate Layer Creator (ILCreater) 61 | 62 | 1. Document structure creation: 63 | - Build page hierarchy 64 | - Create character objects 65 | - Maintain font registry 66 | 67 | 2. Resource management: 68 | - Process font resources 69 | - Handle color spaces 70 | - Manage graphic states 71 | 72 | 3. XObject tracking: 73 | - Track XObject hierarchy 74 | - Maintain XObject states 75 | - Process form content 76 | 77 | ### Step 4: High-level Coordination 78 | 79 | 1. Process management: 80 | - Initialize resources 81 | - Coordinate component interactions 82 | - Handle progress tracking 83 | 84 | 2. Resource initialization: 85 | - Set up font management 86 | - Initialize graphics resources 87 | - Prepare document structure 88 | 89 | 3. Error handling: 90 | - Handle malformed content 91 | - Manage resource errors 92 | - Provide debug information 93 | 94 | ## Additional Features 95 | 96 | 1. Font management: 97 | - Support for CID fonts 98 | - Font metadata extraction 99 | - Font mapping capabilities 100 | 101 | 2. Graphics state tracking: 102 | - Color space management 103 | - Line style preservation 104 | - Transparency handling 105 | 106 | 3. Coordinate system handling: 107 | - Support for transformations 108 | - Boundary box calculations 109 | - Position normalization 110 | 111 | 4. Debug support: 112 | - Detailed logging 113 | - Intermediate file generation 114 | - Progress tracking 115 | 116 | ## Limitations 117 | 118 | 1. Complex PDF features: 119 | - Limited support for some PDF extensions 120 | - Simplified graphics model 121 | - Basic transparency support 122 | 123 | 2. 
Font handling: 124 | - Limited support for some font formats 125 | - Simplified font metrics 126 | - Basic font feature support 127 | 128 | 3. Performance considerations: 129 | - Memory usage for large documents 130 | - Processing time for complex layouts 131 | - Resource management overhead 132 | 133 | ## Configuration Options 134 | 135 | The parsing process can be customized through `TranslationConfig`: 136 | 137 | 1. `debug`: Enable/disable debug mode and intermediate file generation 138 | 2. Font-related settings: 139 | - Font mapping configurations 140 | - CID font handling options 141 | 3. Layout processing options: 142 | - Page selection 143 | - Content filtering rules -------------------------------------------------------------------------------- /docs/ImplementationDetails/ParagraphFinding/ParagraphFinding.md: -------------------------------------------------------------------------------- 1 | # Paragraph Finding 2 | 3 | > [!NOTE] 4 | > This documentation may contain AI-generated content. While we strive for accuracy, there might be inaccuracies. Please report any issues via: 5 | > 6 | > - [GitHub Issues](https://github.com/funstory-ai/yadt/issues) 7 | > - Community contribution (PRs welcome!) 8 | 9 | ## Background 10 | 11 | After PDF analysis, we need to identify paragraphs from individual characters. This is a crucial step before translation and typesetting, as it helps maintain the logical structure of the document. 12 | 13 | ## Goal 14 | 15 | 1. Group characters into meaningful paragraphs while preserving the document's logical structure 16 | 2. Handle special cases like table of contents, short lines, and multi-line paragraphs 17 | 3. Maintain layout information for later typesetting 18 | 19 | ## Specific Implementation 20 | 21 | The paragraph finding process consists of four main steps: 22 | 23 | ### Step 1: Create Initial Paragraphs 24 | 25 | 1. Group characters into lines based on their spatial relationships 26 | 2. 
Create paragraphs based on layout information and XObject IDs 27 | 3. Characters that don't belong to text layouts are skipped 28 | 29 | ### Step 2: Process Paragraph Spacing 30 | 31 | 1. Remove completely empty lines 32 | 2. Handle trailing spaces within lines 33 | 3. Update paragraph boundary boxes and metadata 34 | 35 | ### Step 3: Calculate Line Width Statistics 36 | 37 | 1. Calculate the median width of all lines 38 | 2. This information is used for identifying potential paragraph breaks 39 | 40 | ### Step 4: Process Independent Paragraphs 41 | 42 | 1. Analyze paragraphs with multiple lines 43 | 2. Split paragraphs in two cases: 44 | - When encountering table of contents entries (identified by consecutive dots) 45 | - When finding lines significantly shorter than the median width (configurable via `short_line_split_factor`) 46 | 47 | ## Additional Features 48 | 49 | 1. Layout-aware processing: 50 | - Respects different layout types (plain text, title, figure caption, etc.) 51 | - Maintains layout priority order for overlapping regions 52 | 53 | 2. First line indent detection: 54 | - Automatically detects and marks paragraphs with first line indentation 55 | 56 | 3. Flexible character position detection: 57 | - Uses multiple position detection modes (middle, topleft, bottomright) 58 | - Special handling for characters with unreliable height information 59 | 60 | ## Limitations 61 | 62 | 1. The current implementation assumes left-to-right text direction 63 | 64 | 2. May not perfectly handle complex layouts with overlapping regions 65 | 66 | 3. Table of contents detection relies on consecutive dots pattern 67 | 68 | 4. Short line splitting might occasionally create incorrect paragraph breaks 69 | 70 | ## Configuration Options 71 | 72 | The paragraph finding behavior can be customized through `TranslationConfig`: 73 | 74 | 1. `split_short_lines`: Enable/disable splitting paragraphs at short lines 75 | 2. 
`short_line_split_factor`: Threshold factor for short line detection (relative to median width) -------------------------------------------------------------------------------- /docs/ImplementationDetails/README.md: -------------------------------------------------------------------------------- 1 | # Implementation Details 2 | 3 | > [!NOTE] 4 | > This documentation may contain AI-generated content. While we strive for accuracy, there might be inaccuracies. Please report any issues via: 5 | > 6 | > - [GitHub Issues](https://github.com/funstory-ai/yadt/issues) 7 | > - Community contribution (PRs welcome!) 8 | 9 | ## Core Processing Flow 10 | 11 | Main processing stages in order of actual execution and corresponding documentation: 12 | 13 | 1. [PDFParsing.md](PDFParsing/PDFParsing.md): **PDF Parsing and Intermediate Layer Creation** 14 | 15 | 2. [LayoutParser](https://github.com/funstory-ai/BabelDOC/blob/main/babeldoc/document_il/midend/layout_parser.py): **Layout OCR** 16 | 17 | 3. [ParagraphFinding.md](ParagraphFinding/ParagraphFinding.md): **Paragraph Recognition** 18 | 19 | 4. [StylesAndFormulas.md](StylesAndFormulas/StylesAndFormulas.md): **Style and Formula Processing** 20 | 21 | 5. [ILTranslator.md](ILTranslator/ILTranslator.md): **Intermediate Layer Translation** 22 | 23 | 6. [Typesetting.md](Typesetting/Typesetting.md): **Typesetting Processing** 24 | 25 | 7. [FontMapper](https://github.com/funstory-ai/BabelDOC/blob/main/babeldoc/document_il/utils/fontmap.py): **Font Mapping** 26 | 27 | 8. [PDFCreation.md](PDFCreation/PDFCreation.md): **PDF Generation** 28 | 29 | ## API 30 | 31 | 1. 
[Async Translation API](AsyncTranslate/AsyncTranslate.md): **Async Translation API** 32 | 33 | > [!TIP] 34 | > 35 | > Click on document links to view detailed implementation principles and configuration options 36 | -------------------------------------------------------------------------------- /docs/ImplementationDetails/StylesAndFormulas/StylesAndFormulas.md: -------------------------------------------------------------------------------- 1 | # Styles and Formulas Processing 2 | 3 | > [!NOTE] 4 | > This documentation may contain AI-generated content. While we strive for accuracy, there might be inaccuracies. Please report any issues via: 5 | > 6 | > - [GitHub Issues](https://github.com/funstory-ai/yadt/issues) 7 | > - Community contribution (PRs welcome!) 8 | 9 | ## Background 10 | 11 | After paragraph finding, we need to identify formulas and text styles within each paragraph. This step is crucial for maintaining mathematical expressions and text formatting during translation. 12 | 13 | ## Goal 14 | 15 | 1. Identify and preserve mathematical formulas 16 | 2. Detect and maintain consistent text styles 17 | 3. Handle special cases like subscripts and superscripts 18 | 4. Calculate proper offsets for formula positioning 19 | 20 | ## Specific Implementation 21 | 22 | The processing consists of several main steps: 23 | 24 | ### Step 1: Formula Detection 25 | 26 | 1. Identify formula characters based on: 27 | - Formula-specific fonts 28 | - Special Unicode characters 29 | - Vertical text 30 | - Corner marks (subscripts/superscripts) 31 | 32 | 2. Group consecutive formula characters into formula units 33 | 34 | ### Step 2: Formula Processing 35 | 36 | 1. Process comma-containing formulas: 37 | - Split complex formulas at commas when appropriate 38 | - Preserve brackets and their contents 39 | - Convert simple number-only formulas to regular text 40 | 41 | 2. 
Merge overlapping formulas: 42 | - Handle cases where subscripts/superscripts are detected as separate formulas 43 | - Maintain proper character ordering 44 | 45 | ### Step 3: Style Analysis 46 | 47 | 1. Calculate base style for each paragraph: 48 | - Find common style attributes across all text 49 | - Handle font variations 50 | - Process graphic states 51 | 52 | 2. Group characters with identical styles: 53 | - Font properties 54 | - Size properties 55 | - Graphic state properties 56 | 57 | ### Step 4: Position Calculation 58 | 59 | 1. Calculate formula offsets: 60 | - Compute x-offset relative to surrounding text 61 | - Compute y-offset for proper vertical alignment 62 | - Handle line spacing variations 63 | 64 | ## Additional Features 65 | 66 | 1. Font mapping: 67 | - Maps different fonts to standard ones 68 | - Special handling for formula fonts 69 | 70 | 2. Style inheritance: 71 | - Maintains style hierarchy 72 | - Handles partial style overrides 73 | 74 | 3. Formula classification: 75 | - Distinguishes between translatable and non-translatable formulas 76 | - Special handling for numeric formulas with commas 77 | 78 | ## Limitations 79 | 80 | 1. Formula detection relies on font and character patterns 81 | 82 | 2. May not handle all types of mathematical notations 83 | 84 | 3. Complex subscript/superscript combinations might be misidentified 85 | 86 | 4. Limited support for vertical formulas 87 | 88 | ## Configuration Options 89 | 90 | The formula and style processing can be customized through `TranslationConfig`: 91 | 92 | 1. `formular_font_pattern`: Regex pattern for identifying formula fonts 93 | 2. 
`formular_char_pattern`: Regex pattern for identifying formula characters -------------------------------------------------------------------------------- /docs/ImplementationDetails/Typesetting/Typesetting.md: -------------------------------------------------------------------------------- 1 | # Typography 2 | 3 | > [!NOTE] 4 | > This documentation may contain AI-generated content. While we strive for accuracy, there might be inaccuracies. Please report any issues via: 5 | > 6 | > - [GitHub Issues](https://github.com/funstory-ai/yadt/issues) 7 | > - Community contribution (PRs welcome!) 8 | 9 | ## Background 10 | 11 | After translation, text needs to be typeset before placing into PDF. 12 | 13 | Translated paragraphs can contain any combination of the following types: 14 | 15 | 1. PDF formulas 16 | 17 | 2. Single PDF original character 18 | 19 | 3. PDF original string with same style 20 | 21 | 4. Translated Unicode string with same style 22 | 23 | Let's discuss different cases: 24 | 25 | For the following 3 types, they can be directly transmitted transparently to new positions: 26 | 27 | 1. PDF formulas 28 | 29 | 2. Single PDF original character 30 | 31 | 3. PDF original string with same style 32 | 33 | Only "translated Unicode string with same style" needs typesetting operation, as this step loses original layout information. However, since paragraphs may contain other components that need transparent transmission, their positions may also change and need to participate in typesetting. 34 | 35 | ## Goal 36 | 37 | Try to fit all components within the original paragraph bounding box. If impossible, try to expand the bounding box in writing direction. 38 | 39 | ## Specific Implementation 40 | 41 | First perform reflow judgment to determine if the paragraph needs reflow. If all elements can be transmitted transparently, no reflow is needed. Then, if reflow is needed, execute Algorithm 1: 42 | 43 | 1. 
Convert all elements to typesetting unit type, which records length and width information. 44 | 45 | 2. Start from top-left of original paragraph bounding box, place elements sequentially. 46 | 47 | 3. If current line cannot fit next element, wrap to next line. 48 | 49 | 4. Repeat 2-3 until all elements are placed or exceed original bounding box. 50 | 51 | Algorithm 1 works normally when translated text is shorter than original. When translated text is longer, Algorithm 2 needs to be added: 52 | 53 | 1. Initialize element scaling factor as 1.0. 54 | 55 | 2. Initialize line spacing as 1.5. 56 | 57 | 3. Try typesetting using Algorithm 1. 58 | 59 | 4. If it cannot fit all elements: 60 | 61 | - First try to reduce line spacing by 0.1 step until reaching minimum line spacing (1.4) 62 | - If still cannot fit: 63 | - When scale > 0.6, reduce element scaling by 0.05 64 | - When scale <= 0.6, reduce element scaling by 0.1 65 | - Reset line spacing to 1.5 66 | - When scale becomes less than 0.7, adjust minimum line spacing to 1.1 67 | 68 | 5. Report error if element scaling is less than 0.1. 69 | 70 | Algorithm 2 can fit translations of almost all languages in original position. 71 | 72 | However, for special cases like "图 1" translated to "Figure 1", even with the above algorithms some text may still overflow. So Algorithm 3: 73 | 74 | 1. Before reducing scale, first try to expand the bounding box in writing direction. 75 | 76 | 2. Calculate paragraph's right whitespace by: 77 | 78 | - Using 90% of page crop box width as maximum limit 79 | - Checking for overlapping paragraphs on the right 80 | - Checking for overlapping figures on the right 81 | 82 | 3. Expand paragraph bounding box based on available whitespace. 83 | 84 | 4. If still cannot fit all elements, continue with scale reduction as in Algorithm 2. 85 | 86 | ## Additional Features 87 | 88 | 1. 
Mixed Chinese-English text handling: 89 | - Adds 0.5 character width spacing between Chinese and English text transitions 90 | - Excludes certain punctuation marks from this spacing rule 91 | 2. First line indent: 92 | 93 | - Adds an indent two Chinese characters wide to the first line when specified 94 | 95 | 3. Hanging punctuation: 96 | - Allows certain punctuation marks to extend beyond the right margin 97 | - Helps maintain better visual alignment 98 | 99 | ## Limitations 100 | 101 | 1. Currently, we use PDFPlumber for PDF analysis; typesetting is implemented only for paragraphs and handles only left-to-right writing. 102 | 103 | 2. Cannot handle table of contents alignment by dots. 104 | 105 | 3. Poor performance, needs optimization. 106 | 107 | 4. No global page information consideration, inconsistent text sizes. 108 | 109 | 5. No advanced typography features, poor reading experience. 110 | 111 | ## Related Resources 112 | 113 | [UTR #59: East Asian Spacing](https://www.unicode.org/reports/tr59/) specifies which characters need spacing between them. 114 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | YADT Spec 2 | === 3 | 4 | ## YADT Document Intermediate Language 5 | 6 | [il_version_1.rnc](https://github.com/funstory-ai/yadt/blob/main/yadt/document_il/il_version_1.rnc): The definition of the intermediate language used between PDF parsing and rendering stages. 7 | 8 | For other implementation details, please refer to [Implementation Details](ImplementationDetails/README.md). -------------------------------------------------------------------------------- /docs/deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | command_exists() { 5 | command -v "$1" >/dev/null 2>&1 6 | } 7 | 8 | echo "check uv installed ……" 9 | if command_exists uv; then 10 | echo "uv installed !"
11 | exit 0 12 | fi 13 | 14 | echo "uv not install, start installing ……" 15 | 16 | OS=$(uname -s) 17 | case "$OS" in 18 | Linux) 19 | if command_exists curl; then 20 | curl -LsSf https://astral.sh/uv/install.sh | sh 21 | elif command_exists wget; then 22 | wget -qO- https://astral.sh/uv/install.sh | sh 23 | else 24 | echo "curl or wget not found. uv installed failed." 25 | exit 1 26 | fi 27 | ;; 28 | Darwin) 29 | if command_exists brew; then 30 | brew install uv 31 | else 32 | echo "Homebrew not installed, please installed uv munally. " 33 | exit 1 34 | fi 35 | ;; 36 | *) 37 | echo "not support OS: $OS" 38 | exit 1 39 | ;; 40 | esac 41 | 42 | if command_exists uv; then 43 | uv run babeldoc --version 44 | pre-commit install 45 | else 46 | exit 1 47 | fi 48 | -------------------------------------------------------------------------------- /docs/images/babeldoc-banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funstory-ai/BabelDOC/1c4c25415984777b615eb88f299e88d006ffd16c/docs/images/babeldoc-banner.png -------------------------------------------------------------------------------- /docs/images/babeldoc-big-logo-darkmode-with-transparent-background.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funstory-ai/BabelDOC/1c4c25415984777b615eb88f299e88d006ffd16c/docs/images/babeldoc-big-logo-darkmode-with-transparent-background.png -------------------------------------------------------------------------------- /docs/images/babeldoc-big-logo-darkmode-with-transparent-background.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /docs/images/babeldoc-big-logo-with-transparent-background.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/funstory-ai/BabelDOC/1c4c25415984777b615eb88f299e88d006ffd16c/docs/images/babeldoc-big-logo-with-transparent-background.png -------------------------------------------------------------------------------- /docs/images/babeldoc-big-logo-with-transparent-background.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /docs/images/babeldoc-big-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funstory-ai/BabelDOC/1c4c25415984777b615eb88f299e88d006ffd16c/docs/images/babeldoc-big-logo.png -------------------------------------------------------------------------------- /docs/images/babeldoc-contributor_reward_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funstory-ai/BabelDOC/1c4c25415984777b615eb88f299e88d006ffd16c/docs/images/babeldoc-contributor_reward_example.png -------------------------------------------------------------------------------- /docs/images/babeldoc-preview.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funstory-ai/BabelDOC/1c4c25415984777b615eb88f299e88d006ffd16c/docs/images/babeldoc-preview.gif -------------------------------------------------------------------------------- /docs/images/babeldoc-small-logo-with-transparent-background.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funstory-ai/BabelDOC/1c4c25415984777b615eb88f299e88d006ffd16c/docs/images/babeldoc-small-logo-with-transparent-background.png 
-------------------------------------------------------------------------------- /docs/images/babeldoc-small-logo-with-transparent-background.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /docs/images/babeldoc-small-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funstory-ai/BabelDOC/1c4c25415984777b615eb88f299e88d006ffd16c/docs/images/babeldoc-small-logo.png -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | 2 | {!README.md!} 3 | -------------------------------------------------------------------------------- /docs/intro-to-pdf-object.md: -------------------------------------------------------------------------------- 1 | An Introduction to PDF Object Definitions in dpml 2 | === 3 | 4 | ## 1. Understanding PDF Structure 5 | A PDF file is fundamentally an indexed collection of objects, where each object represents a structured data unit. The file structure consists of four main components: 6 | 7 | 1. A header 8 | 2. Object definitions 9 | 3. A cross-reference table 10 | 4. A trailer 11 | 12 | The cross-reference table serves as a lookup directory, mapping each numbered object to its byte offset location within the file. The trailer contains critical metadata, including the location of the root object (document catalog), which serves as the entry point for PDF interpretation. The file concludes with a byte offset pointing to the cross-reference table. 
13 | 14 | Here's an illustrative example of a PDF file structure: 15 | 16 | ```pdf 17 | %PDF-2.0 18 | 1 0 obj 19 | << 20 | /Pages 2 0 R 21 | /Type /Catalog 22 | >> 23 | endobj 24 | 2 0 obj 25 | << 26 | /Count 1 27 | /Kids [ 28 | 3 0 R 29 | ] 30 | /Type /Pages 31 | >> 32 | endobj 33 | 3 0 obj 34 | << 35 | /Contents 4 0 R 36 | /MediaBox [ 0 0 612 792 ] 37 | /Parent 2 0 R 38 | /Resources << 39 | /Font << /F1 5 0 R >> 40 | >> 41 | /Type /Page 42 | >> 43 | endobj 44 | 4 0 obj 45 | << 46 | /Length 44 47 | >> 48 | stream 49 | BT 50 | /F1 24 Tf 51 | 72 720 Td 52 | (Potato) Tj 53 | ET 54 | endstream 55 | endobj 56 | 5 0 obj 57 | << 58 | /BaseFont /Helvetica 59 | /Encoding /WinAnsiEncoding 60 | /Subtype /Type1 61 | /Type /Font 62 | >> 63 | endobj 64 | 65 | xref 66 | 0 6 67 | 0000000000 65535 f 68 | 0000000009 00000 n 69 | 0000000062 00000 n 70 | 0000000133 00000 n 71 | 0000000277 00000 n 72 | 0000000372 00000 n 73 | trailer << 74 | /Root 1 0 R 75 | /Size 6 76 | /ID [<42841c13bbf709d79a200fa1691836f8>] 77 | >> 78 | startxref 79 | 478 80 | %%EOF 81 | ``` 82 | 83 | ### PDF File Interpretation 84 | When a PDF viewer processes a file, it follows these steps: 85 | 86 | 1. Starts at the file's end to locate the cross-reference table offset 87 | 2. Accesses the cross-reference table to find object locations 88 | 3. Reads the trailer dictionary to identify the document catalog 89 | 4. Uses the document catalog to access various document components: 90 | - Pages 91 | - Outlines 92 | - Thumbnails 93 | - Annotations 94 | - Other PDF elements 95 | 96 | The pages tree root is particularly crucial as it enables navigation to specific pages within the document. 97 | 98 | ### Example Interpretation Flow 99 | Let's trace through our example: 100 | 101 | 1. The cross-reference table begins at byte offset 478 (indicated after `startxref`) 102 | 2. The trailer identifies object 1 as the document catalog (`/Root 1 0 R`) 103 | 3. Object 1 is located at byte offset 9 104 | 4. 
The document catalog points to object 2 as the pages tree root 105 | 5. Object 2 is found at byte offset 62 106 | 6. The pages tree identifies object 3 as the first page 107 | 7. Object 3 is positioned at byte offset 133 108 | 8. Object 3 defines the page properties and links to object 4 for content 109 | 9. Object 4, at byte offset 277, contains the drawing instructions for rendering "Potato" 110 | 111 | This structure enables efficient random access to any part of the PDF document. 112 | 113 | ## 2. PDF Objects 114 | 115 | Earlier, we discussed PDF objects and introduced the concept of dictionaries. At the top level of a PDF file, objects are identified by two numbers followed by the keyword "obj". The first number serves as the object number, while the second—known as the generation number—is typically 0. Everything between these identifiers and the "endobj" keyword constitutes the object's body. 116 | 117 | The PDF specification provides a mechanism for modifying files by appending object updates and cross-reference table entries. When an object's contents are completely replaced (rather than modified), its generation number can be incremented. This allows object numbers to be reused while preventing old indirect references from resolving to new objects. However, such files are rare in practice, and generation numbers can generally be disregarded. Modern PDF specifications using object streams have even eliminated generation numbers entirely. 118 | 119 | PDF objects share similarities with data structures found in JSON, YAML, and modern programming languages, though PDF includes some unique object types. Here are the available PDF object types: 120 | 121 | - String: A text sequence enclosed in parentheses, e.g., (potato). Note that PDF strings typically don't support full Unicode encoding, though there are specific cases where this is possible. (A detailed discussion of character encoding is beyond our current scope.)
122 | 123 | - Number: Both integers and floating-point numbers (e.g., 12, 3.14159). While the PDF specification distinguishes between integers and real numbers, they're often interchangeable in practice—integers can be used where real numbers are expected, and viewers typically handle real numbers appropriately when integers are required. 124 | 125 | - Boolean: Simple true/false values 126 | 127 | - Null: Represented by the keyword "null" 128 | 129 | - Name: A keyword or dictionary key identifier starting with a forward slash (/), e.g., /Type 130 | 131 | - Array: An ordered collection of objects enclosed in square brackets, with no separators between items. Arrays support nested structures, including other arrays and dictionaries. Example: `[1 (two) 3.14 false]` 132 | 133 | - Dictionary: A collection of key-value pairs where keys are Names and values can be any object type. Dictionaries are enclosed in << and >> with no separators between entries. Example: `<< /A 1 /B [2 3 << /Four 4 >>] >>` 134 | 135 | - Indirect object reference: A reference to a numbered object in the file, consisting of two numbers (object and generation) followed by 'R', e.g., 1 0 R. While some objects must be direct per the PDF specification, most can be defined at the top level and referenced indirectly. 136 | 137 | - Stream: A container for binary data, structured as a dictionary (containing at least a /Length key and other format-specific entries) followed by the specified number of bytes between "stream" and "endstream" keywords. 🔍 The stream length can be specified as an indirect object, enabling single-pass PDF generation where the stream length isn't known in advance—a common practice in PDF creation. 138 | 139 | ## 3. PDF Object Definitions In dpml 140 | 141 | ### Coordinate system definition 142 | 143 | The positive x-axis extends horizontally to the right, while the positive y-axis extends vertically upward, following 144 | standard mathematical conventions.
The unit length along both the x and y axes is defined as 1/72 inch (or 1 point). 145 | 146 | ## 4. Useful Information 147 | 148 | - [PDF32000_2008](https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf) page 111: Table 51 - Operator Categories -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx>=8.2.0 2 | sphinx-click>=5.1.0 3 | furo>=2024.1.29 4 | myst-parser[linkify,html_meta,html_admonition]>=2.0.0 -------------------------------------------------------------------------------- /examples/basic.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | This is a simple paragraph with some text. 7 | 8 | And this is a new line. 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /examples/ci/test.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/funstory-ai/BabelDOC/1c4c25415984777b615eb88f299e88d006ffd16c/examples/ci/test.pdf -------------------------------------------------------------------------------- /examples/code-figure.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | def hello_world(): 7 | print("Hello, World!") 8 | return None 9 | 10 | 11 | 12 | 18 | 19 | 20 | -------------------------------------------------------------------------------- /examples/formular.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Here's a mathematical formula: 6 | \frac{-b \pm \sqrt{b^2-4ac}}{2a} 7 | And here's a special symbol: 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /examples/table.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | Header 1 14 | 15 | 16 | 17 | 18 | Header 2 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | Cell 1 28 | 29 | 30 | 31 | 32 | Cell 2 33 | 34 | 35 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2016-2025 Martin Donath 2 | 3 | # Permission is hereby granted, free of charge, to any person obtaining a copy 4 | # of this software and associated documentation files (the "Software"), to 5 | # deal in the Software without restriction, including without limitation the 6 | # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 7 | # sell copies of the Software, and to permit persons to whom the Software is 8 | # furnished to do so, subject to the following conditions: 9 | 10 | # The above copyright notice and this permission notice shall be included in 11 | # all copies or substantial portions of the Software. 12 | 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE 16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 18 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 19 | # IN THE SOFTWARE. 
20 | 21 | # Project information 22 | site_name: BabelDOC 23 | site_url: https://squidfunk.github.io/mkdocs-material/ 24 | site_author: funstory.ai 25 | site_description: >- 26 | Write your documentation in Markdown and create a professional static site in 27 | minutes – searchable, customizable, in 60+ languages, for all devices 28 | 29 | # Repository 30 | repo_name: funstory-ai/BabelDOC 31 | repo_url: https://github.com/funstory-ai/BabelDOC 32 | edit_uri: edit/main/docs/ 33 | 34 | # Copyright 35 | copyright: Copyright © 2025 funstory.ai 36 | 37 | # Configuration 38 | theme: 39 | name: material 40 | # custom_dir: material/overrides 41 | features: 42 | - announce.dismiss 43 | - content.action.edit 44 | - content.action.view 45 | - content.code.annotate 46 | - content.code.copy 47 | - content.code.select 48 | # - content.footnote.tooltips 49 | # - content.tabs.link 50 | - content.tooltips 51 | # - header.autohide 52 | # - navigation.expand 53 | - navigation.footer 54 | - navigation.indexes 55 | # - navigation.instant 56 | # - navigation.instant.prefetch 57 | # - navigation.instant.progress 58 | # - navigation.prune 59 | - navigation.sections 60 | - navigation.tabs 61 | # - navigation.tabs.sticky 62 | - navigation.top 63 | - navigation.tracking 64 | - search.highlight 65 | - search.share 66 | - search.suggest 67 | - toc.follow 68 | # - toc.integrate 69 | palette: 70 | - media: "(prefers-color-scheme)" 71 | toggle: 72 | icon: material/brightness-auto 73 | name: Switch to light mode 74 | - media: "(prefers-color-scheme: light)" 75 | scheme: default 76 | primary: white 77 | accent: indigo 78 | toggle: 79 | icon: material/brightness-7 80 | name: Switch to dark mode 81 | - media: "(prefers-color-scheme: dark)" 82 | scheme: slate 83 | primary: black 84 | accent: indigo 85 | toggle: 86 | icon: material/brightness-4 87 | name: Switch to system preference 88 | font: 89 | text: Roboto 90 | code: Roboto Mono 91 | # favicon: assets/favicon.png 92 | favicon: 
images/babeldoc-small-logo-with-transparent-background.svg 93 | logo: images/babeldoc-small-logo-with-transparent-background.svg 94 | 95 | # Plugins 96 | plugins: 97 | - search: 98 | separator: '[\s\u200b\-_,:!=\[\]()"`/]+|\.(?!\d)|&[lg]t;|(?!\b)(?=[A-Z][a-z])' 99 | - minify: 100 | minify_html: true 101 | - git-authors 102 | - git-revision-date-localized: 103 | enable_creation_date: true 104 | # Additional configuration 105 | extra: 106 | status: 107 | new: Recently added 108 | deprecated: Deprecated 109 | social: 110 | - icon: fontawesome/brands/github 111 | link: https://github.com/funstory-ai/BabelDOC 112 | - icon: fontawesome/brands/python 113 | link: https://pypi.org/project/BabelDOC/ 114 | 115 | # Extensions 116 | markdown_extensions: 117 | - github-callouts 118 | - markdown_include.include 119 | - pymdownx.highlight: 120 | anchor_linenums: true 121 | line_spans: __span 122 | pygments_lang_class: true 123 | - pymdownx.inlinehilite 124 | - pymdownx.snippets 125 | - pymdownx.superfences 126 | - def_list 127 | - pymdownx.tasklist: 128 | custom_checkbox: true 129 | not_in_nav: | 130 | /tutorials/**/*.md 131 | 132 | # Page tree 133 | nav: 134 | - Home: index.md 135 | - API: 136 | - Async Translation API: ImplementationDetails/AsyncTranslate/AsyncTranslate.md 137 | - Implementation Details: 138 | - ImplementationDetails/README.md 139 | - PDF Parsing: ImplementationDetails/PDFParsing/PDFParsing.md 140 | - Layout Parser(.py): https://github.com/funstory-ai/BabelDOC/blob/main/babeldoc/document_il/midend/layout_parser.py 141 | - Paragraph Finding: ImplementationDetails/ParagraphFinding/ParagraphFinding.md 142 | - Styles and Formulas: ImplementationDetails/StylesAndFormulas/StylesAndFormulas.md 143 | - IL Translator: ImplementationDetails/ILTranslator/ILTranslator.md 144 | - Typesetting: ImplementationDetails/Typesetting/Typesetting.md 145 | - Font Mapper(.py): https://github.com/funstory-ai/BabelDOC/blob/main/babeldoc/document_il/utils/fontmap.py 146 | - PDF Creation: 
ImplementationDetails/PDFCreation/PDFCreation.md 147 | - Intro To PDF Object: intro-to-pdf-object.md 148 | - Community: 149 | - Code of Conduct: CODE_OF_CONDUCT.md 150 | - Contributing: 151 | - Contributing: CONTRIBUTING.md 152 | - Contributor Reward: CONTRIBUTOR_REWARD.md -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "BabelDOC" 3 | version = "0.3.21" 4 | description = "Yet Another Document Translator" 5 | license = "AGPL-3.0" 6 | readme = "README.md" 7 | requires-python = ">=3.10,<3.13" 8 | authors = [ 9 | { name = "awwaawwa", email = "aw@funstory.ai" } 10 | ] 11 | maintainers = [ 12 | { name = "awwaawwa", email = "aw@funstory.ai" } 13 | ] 14 | classifiers = [ 15 | "Programming Language :: Python :: 3", 16 | "Operating System :: OS Independent", 17 | ] 18 | keywords = ["PDF"] 19 | dependencies = [ 20 | "bitstring>=4.3.0", 21 | "configargparse>=1.7", 22 | "httpx[socks]>=0.27.0", 23 | "huggingface-hub>=0.27.0", 24 | "numpy>=2.0.2", 25 | "onnx>=1.17.0", 26 | "onnxruntime>=1.16.1", 27 | "openai>=1.59.3", 28 | "orjson>=3.10.14", 29 | "pdfminer-six>=20240706", 30 | "peewee>=3.17.8", 31 | "psutil>=7.0.0", 32 | "pymupdf>=1.25.1", 33 | "rich>=13.9.4", 34 | "toml>=0.10.2", 35 | "tqdm>=4.67.1", 36 | "xsdata[cli,lxml,soap]>=24.12", 37 | "msgpack>=1.1.0", 38 | "pydantic>=2.10.6", 39 | "tenacity>=9.0.0", 40 | "scikit-image>=0.25.2", 41 | "freetype-py>=2.5.1", 42 | "tiktoken>=0.9.0", 43 | "python-levenshtein>=0.27.1", 44 | "opencv-python-headless>=4.10.0.84", 45 | "rapidocr-onnxruntime>=1.4.4", 46 | ] 47 | 48 | [project.optional-dependencies] 49 | directml = ["onnxruntime-directml>=1.16.1"] 50 | cuda = ["onnxruntime-gpu>=1.16.1"] 51 | memray = ["memray>=1.17.1"] 52 | 53 | [project.urls] 54 | Homepage = "https://github.com/funstory-ai/BabelDOC" 55 | Issues = "https://github.com/funstory-ai/BabelDOC/issues" 56 | 57 | 
[project.scripts] 58 | babeldoc = "babeldoc.main:cli" 59 | 60 | [build-system] 61 | requires = ["hatchling"] 62 | build-backend = "hatchling.build" 63 | 64 | [tool.flake8] 65 | ignore = ["E203", "E261", "E501", "W503", "E741", "E501"] 66 | max-line-length = 88 67 | 68 | [tool.ruff] 69 | src = ["babeldoc"] 70 | target-version = "py310" 71 | show-fixes = true 72 | 73 | [tool.ruff.format] 74 | # Enable reformatting of code snippets in docstrings. 75 | docstring-code-format = true 76 | 77 | [tool.ruff.lint] 78 | ignore = [ 79 | "E203", # 冒号前的空格 80 | "E261", # 注释前至少两个空格 81 | "E501", # 行太长 82 | "E741", # 变量名歧义 83 | "F841", # 未使用的变量 84 | "C901", # 太复杂的函数 85 | "S101", # use assert 86 | "SIM", # flake8-simplify 87 | "ARG002", # unused argument 88 | "S110", # `try`-`except`-`pass` detected, consider logging the exception 89 | "B024", # abstract class without abstract methods 90 | "S112", # `try`-`except`-`continue` detected, consider logging the exception 91 | "COM812", # missing-trailing-comma 92 | 93 | ] 94 | select = [ 95 | "E", # pycodestyle 错误 96 | "F", # Pyflakes 97 | "N", # PEP8 命名 98 | "B", # flake8-bugbear 99 | "I", # isort 100 | "C", # mccabe 101 | "UP", # pyupgrade 102 | "S", # flake8-bandit 103 | "A", # flake8-builtins 104 | "COM", # flake8-commas 105 | "ARG", # flake8-unused-arguments 106 | "PTH", # 使用 pathlib 107 | ] 108 | 109 | [tool.ruff.lint.flake8-quotes] 110 | docstring-quotes = "double" 111 | 112 | [tool.ruff.lint.flake8-annotations] 113 | suppress-none-returning = true 114 | 115 | [tool.ruff.lint.isort] 116 | force-single-line = true 117 | 118 | [tool.ruff.lint.pydocstyle] 119 | convention = "google" 120 | 121 | # 设置一些规则的特定配置 122 | [tool.ruff.lint.mccabe] 123 | max-complexity = 10 # 函数圈复杂度阈值 124 | 125 | [tool.ruff.lint.per-file-ignores] 126 | "babeldoc/pdfinterp.py" = ["N"] # 忽略命名规范 127 | "tests/*" = ["S101"] # 在测试文件中允许 assert 128 | "**/__init__.py" = ["F401"] # 允许未使用的导入 129 | # 忽略 S311 警告,因为这是有意的 130 | "babeldoc/document_il/midend/paragraph_finder.py" = 
["S311"] 131 | "docs/*" = ["A001"] 132 | [dependency-groups] 133 | dev = [ 134 | "bumpver>=2024.1130", 135 | "markdown-callouts>=0.4.0", 136 | "markdown-include>=0.8.1", 137 | "mkdocs-git-authors-plugin>=0.9.2", 138 | "mkdocs-git-committers-plugin-2>=2.5.0", 139 | "mkdocs-git-revision-date-localized-plugin>=1.3.0", 140 | "mkdocs-material[recommended]>=9.6.4", 141 | "pre-commit>=4.1.0", 142 | "pygments>=2.19.1", 143 | "ruff>=0.9.2", 144 | "pytest>=8.3.4", 145 | ] 146 | 147 | [tool.pytest.ini_options] 148 | pythonpath = [".", "src"] 149 | testpaths = ["tests"] 150 | 151 | [bumpver] 152 | current_version = "0.3.21" 153 | version_pattern = "MAJOR.MINOR.PATCH[.PYTAGNUM]" 154 | 155 | [bumpver.file_patterns] 156 | "pyproject.toml" = [ 157 | 'current_version = "{version}"', 158 | 'version = "{version}"' 159 | ] 160 | "babeldoc/__init__.py" = [ 161 | '__version__ = "{version}"' 162 | ] 163 | "babeldoc/main.py" = [ 164 | '__version__ = "{version}"' 165 | ] 166 | "babeldoc/const.py" = [ 167 | '__version__ = "{version}"' 168 | ] 169 | -------------------------------------------------------------------------------- /tests/test_translation_config.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from babeldoc.translation_config import ConfigModel 4 | 5 | # Since it is necessary to test whether the functionality meets the expected requirements, 6 | # private functions and private methods are allowed to be called. 7 | # pyright: reportPrivateUsage=false 8 | 9 | 10 | class TestConfigArgs: 11 | def test_page_range_regex(self): 12 | test_strings = [ 13 | "1,3,5,7,9", 14 | "1-3,5,7-9", 15 | "1,2-4,5,6-8,9", 16 | "10-12,14,16-18", 17 | "1-,5", 18 | "-5,10", 19 | "1-, 5, -3, 10-12", 20 | ] 21 | pattern = ConfigModel._page_range_pattern() 22 | for string in test_strings: 23 | assert re.match(pattern, string) 24 | --------------------------------------------------------------------------------