├── .cursorignore
├── .github
    ├── ISSUE_TEMPLATE
    │   ├── bug_report.yaml
    │   └── feature_request.yaml
    ├── PULL_REQUEST_TEMPLATE
    │   └── pr_form.yml
    ├── dependabot.yml
    ├── labels.yml
    ├── release-drafter.yml
    └── workflows
    │   ├── codeql.yml
    │   ├── docs.yml
    │   ├── labeler.yml
    │   ├── lint.yml
    │   ├── pr-lint.yml
    │   ├── publish-to-pypi.yml
    │   └── test.yml
├── .gitignore
├── .pre-commit-config.yaml
├── LICENSE
├── README.md
├── babeldoc
    ├── __init__.py
    ├── assets
    │   ├── assets.py
    │   └── embedding_assets_metadata.py
    ├── asynchronize
    │   └── __init__.py
    ├── const.py
    ├── converter.py
    ├── document_il
    │   ├── __init__.py
    │   ├── babeldoc_exception
    │   │   └── BabelDOCException.py
    │   ├── backend
    │   │   ├── __init__.py
    │   │   └── pdf_creater.py
    │   ├── frontend
    │   │   ├── __init__.py
    │   │   └── il_creater.py
    │   ├── il_version_1.py
    │   ├── il_version_1.rnc
    │   ├── il_version_1.rng
    │   ├── il_version_1.xsd
    │   ├── midend
    │   │   ├── __init__.py
    │   │   ├── add_debug_information.py
    │   │   ├── detect_scanned_file.py
    │   │   ├── il_translator.py
    │   │   ├── il_translator_llm_only.py
    │   │   ├── layout_parser.py
    │   │   ├── paragraph_finder.py
    │   │   ├── remove_descent.py
    │   │   ├── styles_and_formulas.py
    │   │   ├── table_parser.py
    │   │   └── typesetting.py
    │   ├── translator
    │   │   ├── __init__.py
    │   │   ├── cache.py
    │   │   └── translator.py
    │   ├── utils
    │   │   ├── atomic_integer.py
    │   │   ├── fontmap.py
    │   │   ├── layout_helper.py
    │   │   ├── priority_thread_pool_executor.py
    │   │   └── style_helper.py
    │   └── xml_converter.py
    ├── docvision
    │   ├── README.md
    │   ├── __init__.py
    │   ├── doclayout.py
    │   ├── rpc_doclayout.py
    │   └── table_detection
    │   │   └── rapidocr.py
    ├── format
    │   └── office
    │   │   └── __init__.py
    ├── high_level.py
    ├── main.py
    ├── pdfinterp.py
    ├── progress_monitor.py
    ├── result_merger.py
    ├── split_manager.py
    ├── tools
    │   ├── generate_font_metadata.py
    │   ├── italic_assistance.py
    │   └── italic_recognize_tool.py
    └── translation_config.py
├── docs
    ├── CODE_OF_CONDUCT.md
    ├── CONTRIBUTING.md
    ├── CONTRIBUTOR_REWARD.md
    ├── ImplementationDetails
    │   ├── AsyncTranslate
    │   │   └── AsyncTranslate.md
    │   ├── ILTranslator
    │   │   └── ILTranslator.md
    │   ├── PDFCreation
    │   │   └── PDFCreation.md
    │   ├── PDFParsing
    │   │   └── PDFParsing.md
    │   ├── ParagraphFinding
    │   │   └── ParagraphFinding.md
    │   ├── README.md
    │   ├── StylesAndFormulas
    │   │   └── StylesAndFormulas.md
    │   └── Typesetting
    │   │   └── Typesetting.md
    ├── README.md
    ├── deploy.sh
    ├── images
    │   ├── babeldoc-banner.png
    │   ├── babeldoc-big-logo-darkmode-with-transparent-background.png
    │   ├── babeldoc-big-logo-darkmode-with-transparent-background.svg
    │   ├── babeldoc-big-logo-with-transparent-background.png
    │   ├── babeldoc-big-logo-with-transparent-background.svg
    │   ├── babeldoc-big-logo.png
    │   ├── babeldoc-contributor_reward_example.png
    │   ├── babeldoc-preview.gif
    │   ├── babeldoc-small-logo-with-transparent-background.png
    │   ├── babeldoc-small-logo-with-transparent-background.svg
    │   └── babeldoc-small-logo.png
    ├── index.md
    ├── intro-to-pdf-object.md
    └── requirements.txt
├── examples
    ├── basic.xml
    ├── ci
    │   └── test.pdf
    ├── code-figure.xml
    ├── complex.xml
    ├── formular.xml
    └── table.xml
├── mkdocs.yml
├── pyproject.toml
├── tests
    └── test_translation_config.py
└── uv.lock


/.cursorignore:
--------------------------------------------------------------------------------
1 | # Project notes and templates
2 | xnotes/
3 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.yaml:
--------------------------------------------------------------------------------
 1 | name: "🐞 Bug Report"
 2 | description: Create a report to help us improve
 3 | labels: ['bug']
 4 | body:
 5 |   - type: checkboxes
 6 |     id: checks
 7 |     attributes:
 8 |       label: Before you submit
 9 |       options:
10 |         - label: I have searched existing issues
11 |           required: true
12 |         - label: I spent at least 5 minutes investigating and preparing this report
13 |           required: true
14 |         - label: I confirmed this is not caused by a network issue
15 |           required: true
16 | 
17 |   - type: markdown
18 |     attributes:
19 |       value: |
20 |         Thank you for using **BabelDOC** and helping us improve it! 🙏
21 | 
22 |   - type: textarea
23 |     id: environment
24 |     attributes:
25 |       label: Environment
26 |       description: Provide your system details (required)
27 |       value: |
28 |         - OS:
29 |         - Python:
30 |         - BabelDOC:
31 |       render: markdown
32 |     validations:
33 |       required: true
34 | 
35 |   - type: textarea
36 |     id: describe
37 |     attributes:
38 |       label: Describe the bug
39 |       description: A clear and concise description of what the bug is.
40 |     validations:
41 |       required: true
42 | 
43 |   - type: textarea
44 |     id: reproduce
45 |     attributes:
46 |       label: Steps to Reproduce
47 |       description: Help us reproduce the issue
48 |       value: |
49 |         1. Go to '...'
50 |         2. Click on '...'
51 |         3. See error
52 |     validations:
53 |       required: false
54 | 
55 |   - type: textarea
56 |     id: expected
57 |     attributes:
58 |       label: Expected Behavior
59 |       description: What did you expect to happen?
60 |     validations:
61 |       required: false
62 | 
63 |   - type: textarea
64 |     id: logs
65 |     attributes:
66 |       label: Relevant Log Output or Screenshots
67 |       description: Copy and paste any logs or attach screenshots. This will be formatted automatically.
68 |       render: text
69 |     validations:
70 |       required: false
71 | 
72 |   - type: textarea
73 |     id: pdf
74 |     attributes:
75 |       label: Original PDF File
76 |       description: Upload the input PDF if applicable.
77 |     validations:
78 |       required: false
79 | 
80 |   - type: textarea
81 |     id: others
82 |     attributes:
83 |       label: Additional Context
84 |       description: Anything else we should know?
85 |     validations:
86 |       required: false
87 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.yaml:
--------------------------------------------------------------------------------
 1 | name: "✨ Feature Request"
 2 | description: Suggest a new idea or improvement for BabelDOC
 3 | labels: ['enhancement']
 4 | body:
 5 |   - type: markdown
 6 |     attributes:
 7 |       value: |
 8 |         Thank you for helping improve **BabelDOC**! Please fill out the form below to suggest a feature.
 9 | 
10 |   - type: textarea
11 |     id: describe
12 |     attributes:
13 |       label: Is your feature request related to a problem?
14 |       description: If applicable, describe what problem this feature would solve.
15 |       placeholder: Ex. I'm always frustrated when ...
16 |     validations:
17 |       required: false
18 | 
19 |   - type: textarea
20 |     id: solution
21 |     attributes:
22 |       label: Describe the solution you'd like
23 |       description: What would you like to see happen?
24 |     validations:
25 |       required: true
26 | 
27 |   - type: textarea
28 |     id: alternatives
29 |     attributes:
30 |       label: Describe alternatives you've considered
31 |       description: Have you thought of other ways to solve this?
32 |     validations:
33 |       required: false
34 | 
35 |   - type: textarea
36 |     id: additional
37 |     attributes:
38 |       label: Additional context
39 |       description: Any other context, examples, or screenshots?
40 |     validations:
41 |       required: false
42 | 


--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE/pr_form.yml:
--------------------------------------------------------------------------------
 1 | name: Pull Request
 2 | description: Submit a pull request to contribute to BabelDOC
 3 | title: "[PR] <Your concise title here>"
 4 | labels:
 5 |   - needs triage
 6 | body:
 7 |   - type: markdown
 8 |     attributes:
 9 |       value: |
10 |         ## 👋 Thanks for contributing to **BabelDOC**!
11 | 
12 |         Please fill out this form to help us review your pull request effectively.
13 | 
14 |   - type: input
15 |     id: issue
16 |     attributes:
17 |       label: Related Issue(s)
18 |       description: If this pull request closes or is related to one or more issues, list them here (e.g., #37)
19 |       placeholder: "#37"
20 |     validations:
21 |       required: false
22 | 
23 |   - type: textarea
24 |     id: summary
25 |     attributes:
26 |       label: Description
27 |       description: Describe the purpose of this pull request and what was changed.
28 |       placeholder: |
29 |         - What does this PR introduce or fix?
30 |         - What is the motivation behind it?
31 |     validations:
32 |       required: true
33 | 
34 |   - type: dropdown
35 |     id: pr_type
36 |     attributes:
37 |       label: PR Type
38 |       description: What kind of change is this?
39 |       multiple: true
40 |       options:
41 |         - enhancement
42 |         - bug
43 |         - documentation
44 |         - refactor
45 |         - test
46 |         - chore
47 |     validations:
48 |       required: true
49 | 
50 |   - type: checkboxes
51 |     id: checklist
52 |     attributes:
53 |       label: Contributor Checklist
54 |       options:
55 |         - label: I’ve read the **CONTRIBUTING.md** guide
56 |           required: true
57 |         - label: My changes follow the project’s code style and guidelines
58 |           required: true
59 |         - label: I’ve linked the related issue(s) in the description above
60 |         - label: I’ve updated relevant documentation (if applicable)
61 |         - label: I’ve added necessary tests (if applicable)
62 |         - label: All new and existing tests passed locally
63 |         - label: I understand that due to limited maintainer resources, only small pull requests are accepted. Suggestions with proof-of-concept patches are appreciated, and my patch may be rewritten if necessary.
64 | 
65 |   - type: textarea
66 |     id: testing
67 |     attributes:
68 |       label: Testing Instructions
69 |       description: Provide step-by-step instructions on how to test your changes
70 |       placeholder: |
71 |         1. Run `...`
72 |         2. Visit `...`
73 |         3. Click `...`
74 |         4. Verify `...`
75 |     validations:
76 |       required: false
77 | 
78 |   - type: textarea
79 |     id: screenshots
80 |     attributes:
81 |       label: Screenshots (if applicable)
82 |       description: If UI changes were made, please attach before/after screenshots.
83 |     validations:
84 |       required: false
85 | 
86 |   - type: textarea
87 |     id: notes
88 |     attributes:
89 |       label: Additional Notes
90 |       description: Anything else the reviewer should know?
91 |     validations:
92 |       required: false
93 | 


--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | updates:
 3 |   - package-ecosystem: github-actions
 4 |     directory: "/"
 5 |     schedule:
 6 |       interval: weekly
 7 |   # - package-ecosystem: pip
 8 |   #   directory: "/.github/workflows"
 9 |   #   schedule:
10 |   #     interval: weekly
11 |   # - package-ecosystem: pip
12 |   #   directory: "/docs"
13 |   #   schedule:
14 |   #     interval: weekly
15 |   - package-ecosystem: pip
16 |     directory: "/"
17 |     schedule:
18 |       interval: weekly
19 |     versioning-strategy: lockfile-only
20 |     allow:
21 |       - dependency-type: "all"


--------------------------------------------------------------------------------
/.github/labels.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | # Label names are important because Release Drafter uses them to decide
 3 | # where to record changes in the changelog or whether to skip them.
 4 | #
 5 | # The repository labels will be automatically configured using this file and
 6 | # the GitHub Action https://github.com/marketplace/actions/github-labeler.
 7 | - name: breaking
 8 |   description: Breaking Changes
 9 |   color: "bfd4f2"
10 | - name: bug
11 |   description: Something isn't working
12 |   color: "d73a4a"
13 | - name: build
14 |   description: Build System and Dependencies
15 |   color: "bfdadc"
16 | - name: ci
17 |   description: Continuous Integration
18 |   color: "4a97d6"
19 | - name: dependencies
20 |   description: Pull requests that update a dependency file
21 |   color: "0366d6"
22 | - name: documentation
23 |   description: Improvements or additions to documentation
24 |   color: "0075ca"
25 | - name: duplicate
26 |   description: This issue or pull request already exists
27 |   color: "cfd3d7"
28 | - name: enhancement
29 |   description: New feature or request
30 |   color: "a2eeef"
31 | - name: github_actions
32 |   description: Pull requests that update GitHub Actions code
33 |   color: "000000"
34 | - name: good first issue
35 |   description: Good for newcomers
36 |   color: "7057ff"
37 | - name: help wanted
38 |   description: Extra attention is needed
39 |   color: "008672"
40 | - name: invalid
41 |   description: This doesn't seem right
42 |   color: "e4e669"
43 | - name: performance
44 |   description: Performance
45 |   color: "016175"
46 | - name: python
47 |   description: Pull requests that update Python code
48 |   color: "2b67c6"
49 | - name: question
50 |   description: Further information is requested
51 |   color: "d876e3"
52 | - name: refactoring
53 |   description: Refactoring
54 |   color: "ef67c4"
55 | - name: removal
56 |   description: Removals and Deprecations
57 |   color: "9ae7ea"
58 | - name: style
59 |   description: Style
60 |   color: "c120e5"
61 | - name: testing
62 |   description: Testing
63 |   color: "b1fc6f"
64 | - name: wontfix
65 |   description: This will not be worked on
66 |   color: "ffffff"


--------------------------------------------------------------------------------
/.github/release-drafter.yml:
--------------------------------------------------------------------------------
 1 | name-template: 'v$RESOLVED_VERSION'
 2 | tag-template: 'v$RESOLVED_VERSION'
 3 | categories:
 4 |   - title: '🚀 Features'
 5 |     labels:
 6 |       - 'feature'
 7 |       - 'enhancement'
 8 |   - title: '🐛 Bug Fixes'
 9 |     labels:
10 |       - 'fix'
11 |       - 'bugfix'
12 |       - 'bug'
13 |   - title: '🧰 Maintenance'
14 |     labels:
15 |       - 'chore'
16 |       - 'maintenance'
17 |       - 'refactor'
18 |   - title: '📝 Documentation'
19 |     labels:
20 |       - 'docs'
21 |       - 'documentation'
22 | change-template: '- $TITLE @$AUTHOR (#$NUMBER)'
23 | change-title-escapes: '\<*_&' # You can add # and @ to disable mentions
24 | version-resolver:
25 |   major:
26 |     labels:
27 |       - 'major'
28 |   minor:
29 |     labels:
30 |       - 'minor'
31 |   patch:
32 |     labels:
33 |       - 'patch'
34 |   default: patch
35 | template: |
36 |   ## Changes
37 | 
38 |   $CHANGES
39 | 
40 |   ## Contributors
41 |   
42 |   $CONTRIBUTORS
43 | 


--------------------------------------------------------------------------------
/.github/workflows/codeql.yml:
--------------------------------------------------------------------------------
 1 | # For most projects, this workflow file will not need changing; you simply need
 2 | # to commit it to your repository.
 3 | #
 4 | # You may wish to alter this file to override the set of languages analyzed,
 5 | # or to provide custom queries or build logic.
 6 | #
 7 | # ******** NOTE ********
 8 | # We have attempted to detect the languages in your repository. Please check
 9 | # the `language` matrix defined below to confirm you have the correct set of
10 | # supported CodeQL languages.
11 | #
12 | name: "CodeQL Advanced"
13 | 
14 | on:
15 |   push:
16 |   pull_request:
17 |     branches: [ "main" ]
18 |   schedule:
19 |     - cron: '36 14 * * 1'
20 | 
21 | jobs:
22 |   analyze:
23 |     name: Analyze (${{ matrix.language }})
24 |     # Runner size impacts CodeQL analysis time. To learn more, please see:
25 |     #   - https://gh.io/recommended-hardware-resources-for-running-codeql
26 |     #   - https://gh.io/supported-runners-and-hardware-resources
27 |     #   - https://gh.io/using-larger-runners (GitHub.com only)
28 |     # Consider using larger runners or machines with greater resources for possible analysis time improvements.
29 |     runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }}
30 |     permissions:
31 |       # required for all workflows
32 |       security-events: write
33 | 
34 |       # required to fetch internal or private CodeQL packs
35 |       packages: read
36 | 
37 |       # only required for workflows in private repositories
38 |       actions: read
39 |       contents: read
40 | 
41 |     strategy:
42 |       fail-fast: false
43 |       matrix:
44 |         include:
45 |         - language: python
46 |           build-mode: none
47 |         - language: actions
48 |         # CodeQL supports the following keywords for 'language': 'actions', 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift'
49 |         # Use `c-cpp` to analyze code written in C, C++ or both
50 |         # Use 'java-kotlin' to analyze code written in Java, Kotlin or both
51 |         # Use 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both
52 |         # To learn more about changing the languages that are analyzed or customizing the build mode for your analysis,
53 |         # see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning.
54 |         # If you are analyzing a compiled language, you can modify the 'build-mode' for that language to customize how
55 |         # your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages
56 |     steps:
57 |     - name: Checkout repository
58 |       uses: actions/checkout@v4
59 | 
60 |     # Initializes the CodeQL tools for scanning.
61 |     - name: Initialize CodeQL
62 |       uses: github/codeql-action/init@v3
63 |       with:
64 |         languages: ${{ matrix.language }}
65 |         build-mode: ${{ matrix.build-mode }}
66 |         # If you wish to specify custom queries, you can do so here or in a config file.
67 |         # By default, queries listed here will override any specified in a config file.
68 |         # Prefix the list here with "+" to use these queries and those in the config file.
69 | 
70 |         # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
71 |         # queries: security-extended,security-and-quality
72 | 
73 |     # If the analyze step fails for one of the languages you are analyzing with
74 |     # "We were unable to automatically build your code", modify the matrix above
75 |     # to set the build mode to "manual" for that language. Then modify this step
76 |     # to build your code.
77 |     # ℹ️ Command-line programs to run using the OS shell.
78 |     # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
79 |     - if: matrix.build-mode == 'manual'
80 |       shell: bash
81 |       run: |
82 |         echo 'If you are using a "manual" build mode for one or more of the' \
83 |           'languages you are analyzing, replace this with the commands to build' \
84 |           'your code, for example:'
85 |         echo '  make bootstrap'
86 |         echo '  make release'
87 |         exit 1
88 | 
89 |     - name: Perform CodeQL Analysis
90 |       uses: github/codeql-action/analyze@v3
91 |       with:
92 |         category: "/language:${{matrix.language}}"
93 | 


--------------------------------------------------------------------------------
/.github/workflows/docs.yml:
--------------------------------------------------------------------------------
 1 | name: docs
 2 | on:
 3 |   push:
 4 |     branches:
 5 |       - main
 6 | permissions:
 7 |   contents: write
 8 | jobs:
 9 |   deploy:
10 |     runs-on: ubuntu-latest
11 |     steps:
12 |       - uses: actions/checkout@v4
13 |         with:
14 |           fetch-depth: 0
15 |       - name: Configure Git Credentials
16 |         run: |
17 |           git config user.name github-actions[bot]
18 |           git config user.email 41898282+github-actions[bot]@users.noreply.github.com
19 |       - name: Setup uv with Python 3.12
20 |         uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5.4.2
21 |         with:
22 |           python-version: "3.12"
23 |           enable-cache: true
24 |           cache-dependency-glob: "uv.lock"
25 |       - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV 
26 |       - uses: actions/cache@v4
27 |         with:
28 |           key: mkdocs-material-${{ env.cache_id }}
29 |           path: .cache
30 |           restore-keys: |
31 |             mkdocs-material-
32 |       - run: uv sync
33 |       - run: uv run mkdocs gh-deploy --force


--------------------------------------------------------------------------------
/.github/workflows/labeler.yml:
--------------------------------------------------------------------------------
 1 | name: Labeler
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - 'main'
 7 |     paths:
 8 |       - '.github/labels.yml'
 9 |       - '.github/workflows/labeler.yml'
10 |   pull_request:
11 |     paths:
12 |       - '.github/labels.yml'
13 |       - '.github/workflows/labeler.yml'
14 | 
15 | permissions:
16 |   contents: read
17 |   issues: write
18 |   pull-requests: write
19 | 
20 | jobs:
21 |   labeler:
22 |     runs-on: ubuntu-latest
23 |     steps:
24 |       - name: Check out the repository
25 |         uses: actions/checkout@v4
26 | 
27 |       - name: Run Labeler
28 |         uses: crazy-max/ghaction-github-labeler@24d110aa46a59976b8a7f35518cb7f14f434c916 # v5.3.0
29 |         with:
30 |           skip-delete: true
31 |           dry-run: ${{ github.event_name == 'pull_request' }}
32 |           github-token: ${{ secrets.GITHUB_TOKEN }}
33 |           yaml-file: .github/labels.yml
34 |           exclude: |
35 |             help*
36 |             *issue


--------------------------------------------------------------------------------
/.github/workflows/lint.yml:
--------------------------------------------------------------------------------
 1 | name: Lint Code
 2 | permissions:
 3 |   contents: read
 4 |   pull-requests: write
 5 | on: [push]
 6 | 
 7 | jobs:
 8 |   lint:
 9 |     strategy:
10 |       fail-fast: false
11 |     runs-on: ubuntu-latest
12 |     steps:
13 |       - uses: actions/checkout@v4
14 |       - name: Ruff
15 |         uses: astral-sh/ruff-action@v3
16 |       - name: AutoCorrect
17 |         uses: huacnlee/autocorrect-action@bf91ab3904c2908dd8e71312a8a83ed1eb632997 # v2.13.3
18 | 


--------------------------------------------------------------------------------
/.github/workflows/pr-lint.yml:
--------------------------------------------------------------------------------
 1 | name: Lint Code and Review Dog Report
 2 | 
 3 | on: [pull_request]
 4 | permissions:
 5 |   contents: read
 6 |   pull-requests: write
 7 | jobs:
 8 |   ruff:
 9 |     name: runner / ruff
10 |     runs-on: ubuntu-latest
11 |     steps:
12 |       - uses: actions/checkout@v4
13 |       
14 |       - name: Install Python
15 |         uses: actions/setup-python@v5
16 |         with:
17 |           python-version: '3.11'
18 |           
19 |       - name: Install ruff
20 |         run: pip install ruff
21 |         
22 |       - name: Install reviewdog
23 |         uses: reviewdog/action-setup@e04ffabe3898a0af8d0fb1af00c188831c4b5893 # v1.3.2
24 |         with:
25 |           reviewdog_version: latest
26 |           
27 |       - name: Run ruff with reviewdog
28 |         env:
29 |           REVIEWDOG_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }}
30 |         run: |
31 |           ruff check . --output-format=rdjson | reviewdog -f=rdjson -reporter=github-pr-review -fail-on-error
32 |           
33 |   autocorrect:
34 |     name: runner / autocorrect
35 |     runs-on: ubuntu-latest
36 |     steps:
37 |       - uses: actions/checkout@v4
38 |       - name: AutoCorrect
39 |         uses: huacnlee/autocorrect-action@bf91ab3904c2908dd8e71312a8a83ed1eb632997 # v2.13.3
40 |       - name: Report ReviewDog
41 |         if: failure()
42 |         uses: huacnlee/autocorrect-action@bf91ab3904c2908dd8e71312a8a83ed1eb632997 # v2.13.3
43 |         env:
44 |           REVIEWDOG_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }}
45 |         with:
46 |           reviewdog: true


--------------------------------------------------------------------------------
/.github/workflows/publish-to-pypi.yml:
--------------------------------------------------------------------------------
  1 | name: Release
  2 | 
  3 | on:
  4 |   push:
  5 |     branches:
  6 |       - main
  7 |       - master
  8 | 
  9 | permissions:
 10 |   id-token: write
 11 |   contents: write
 12 |   pull-requests: write
 13 | 
 14 | jobs:
 15 |   check-repository:
 16 |     name: Check if running in main repository
 17 |     runs-on: ubuntu-latest
 18 |     outputs:
 19 |       is_main_repo: ${{ github.repository == 'funstory-ai/BabelDOC' }}
 20 |     steps:
 21 |       - run: echo "Running repository check"
 22 | 
 23 |   build:
 24 |     name: Build distribution 📦
 25 |     needs: check-repository
 26 |     if: needs.check-repository.outputs.is_main_repo == 'true'
 27 |     runs-on: ubuntu-latest
 28 |     outputs:
 29 |       is_release: ${{ steps.check-version.outputs.tag }}
 30 |     steps:
 31 |       - uses: actions/checkout@v4
 32 |         with:
 33 |           persist-credentials: true
 34 |           fetch-depth: 2
 35 |           token: ${{ secrets.GITHUB_TOKEN }}
 36 |           
 37 |       - name: Setup uv with Python 3.12
 38 |         uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5.4.2
 39 |         with:
 40 |           python-version: "3.12"
 41 |           enable-cache: true
 42 |           cache-dependency-glob: "uv.lock"
 43 | 
 44 |       - name: Check if there is a parent commit
 45 |         id: check-parent-commit
 46 |         run: |
 47 |           echo "sha=$(git rev-parse --verify --quiet HEAD^)" >> $GITHUB_OUTPUT
 48 | 
 49 |       - name: Detect and tag new version
 50 |         id: check-version
 51 |         if: steps.check-parent-commit.outputs.sha
 52 |         uses: salsify/action-detect-and-tag-new-version@b1778166f13188a9d478e2d1198f993011ba9864 # v2.0.3
 53 |         with:
 54 |           version-command: |
 55 |             cat pyproject.toml | grep "version = " | head -n 1 | awk -F'"' '{print $2}'
 56 | 
 57 |       - name: Install Dependencies
 58 |         run: |
 59 |           uv sync
 60 | 
 61 |       - name: Bump version for developmental release
 62 |         if: "! steps.check-version.outputs.tag"
 63 |         run: |
 64 |           version=$(bumpver update --patch --tag=final --dry 2>&1 | grep "New Version" | awk '{print $NF}') &&
 65 |           bumpver update --set-version $version.dev$(date +%s)
 66 | 
 67 |       - name: Build package
 68 |         run: "uv build"
 69 | 
 70 |       - name: Store the distribution packages
 71 |         uses: actions/upload-artifact@v4.6.2
 72 |         with:
 73 |           name: python-package-distributions
 74 |           path: dist/
 75 | 
 76 |   publish-to-pypi:
 77 |     name: Publish Python 🐍 distribution 📦 to PyPI
 78 |     if: needs.build.outputs.is_release != ''
 79 |     needs:
 80 |       - check-repository
 81 |       - build
 82 |     runs-on: ubuntu-latest
 83 |     environment:
 84 |       name: pypi
 85 |       url: https://pypi.org/p/BabelDOC
 86 | 
 87 |     permissions:
 88 |       id-token: write
 89 | 
 90 |     steps:
 91 |       - name: Download all the dists
 92 |         uses: actions/download-artifact@95815c38cf2ff2164869cbab79da8d1f422bc89e # v4.2.1
 93 |         with:
 94 |           name: python-package-distributions
 95 |           path: dist/
 96 | 
 97 |       - name: Publish distribution 📦 to PyPI
 98 |         uses: pypa/gh-action-pypi-publish@76f52bc884231f62b9a034ebfe128415bbaabdfc # v1.12.4
 99 | 
100 |   publish-to-testpypi:
101 |     name: Publish Python 🐍 distribution 📦 to TestPyPI
102 |     if: needs.build.outputs.is_release == ''
103 |     needs:
104 |       - check-repository
105 |       - build
106 |     runs-on: ubuntu-latest
107 |     environment:
108 |       name: testpypi
109 |       url: https://test.pypi.org/p/BabelDOC
110 | 
111 |     permissions:
112 |       id-token: write
113 | 
114 |     steps:
115 |       - name: Download all the dists
116 |         uses: actions/download-artifact@95815c38cf2ff2164869cbab79da8d1f422bc89e # v4.2.1
117 |         with:
118 |           name: python-package-distributions
119 |           path: dist/
120 | 
121 |       - name: Publish distribution 📦 to TestPyPI
122 |         uses: pypa/gh-action-pypi-publish@76f52bc884231f62b9a034ebfe128415bbaabdfc # v1.12.4
123 |         with:
124 |           repository-url: https://test.pypi.org/legacy/
125 | 
126 |   post-release:
127 |     name: Post Release Tasks
128 |     needs:
129 |       - check-repository
130 |       - build
131 |       - publish-to-pypi
132 |       - publish-to-testpypi
133 |     if: |
134 |       always() && needs.check-repository.outputs.is_main_repo == 'true' && 
135 |       (needs.publish-to-pypi.result == 'success' || needs.publish-to-testpypi.result == 'success')
136 |     runs-on: ubuntu-latest
137 |     permissions:
138 |       contents: write
139 |       pull-requests: write
140 |     steps:
141 |       - uses: actions/checkout@v4
142 |         with:
143 |           persist-credentials: true
144 |           fetch-depth: 2
145 |           token: ${{ secrets.GITHUB_TOKEN }}
146 | 
147 |       - name: Publish the release notes
148 |         uses: release-drafter/release-drafter@b1476f6e6eb133afa41ed8589daba6dc69b4d3f5 # v6.1.0
149 |         with:
150 |           publish: ${{ needs.build.outputs.is_release != '' }}
151 |           tag: ${{ needs.build.outputs.is_release }}
152 |         env:
153 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}


--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
 1 | name: Run Tests 🧪
 2 | 
 3 | on:
 4 |   push:
 5 |   pull_request:
 6 |     branches: ["main"]
 7 | 
 8 | permissions:
 9 |   contents: read
10 |   pull-requests: read
11 | 
12 | jobs:
13 |   test:
14 |     name: Run Python Tests
15 |     runs-on: ubuntu-latest
16 |     strategy:
17 |       matrix:
18 |         python-version: ["3.10", "3.11", "3.12"]
19 | 
20 |     steps:
21 |       - uses: actions/checkout@v4
22 |         with:
23 |           persist-credentials: false
24 |       - name: Cached Assets
25 |         id: cache-assets
26 |         uses: actions/cache@v4.2.0
27 |         with:
28 |           path: ~/.cache/babeldoc
29 |           key: babeldoc-assets-${{ hashFiles('babeldoc/assets/embedding_assets_metadata.py') }}
30 |       - name: Setup uv with Python ${{ matrix.python-version }}
31 |         uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5.4.2
32 |         with:
33 |           python-version: ${{ matrix.python-version }}
34 |           enable-cache: true
35 |           cache-dependency-glob: "uv.lock"
36 |       - name: Warm up cache
37 |         run: |
38 |           uv run babeldoc --warmup
39 |       - name: Run tests
40 |         env:  # Secrets are exported as environment variables and expanded by the shell, never templated into the script text
41 |           OPENAI_API_KEY: ${{ secrets.OPENAIAPIKEY }}
42 |           OPENAI_BASE_URL: ${{ secrets.OPENAIAPIURL }}
43 |           OPENAI_MODEL: ${{ secrets.OPENAIMODEL }}
44 |         run: |
45 |           uv run babeldoc --help
46 |           uv run babeldoc --openai --files examples/ci/test.pdf --openai-api-key "$OPENAI_API_KEY" --openai-base-url "$OPENAI_BASE_URL" --openai-model "$OPENAI_MODEL"
47 |       - name: Generate offline assets package
48 |         run: |
49 |           uv run babeldoc --generate-offline-assets /tmp/offline_assets
50 |       - name: Restore offline assets package
51 |         run: |
52 |           rm -rf ~/.cache/babeldoc
53 |           uv run babeldoc --restore-offline-assets /tmp/offline_assets
54 |       - name: Clean up
55 |         run: |
56 |           rm -rf /tmp/offline_assets
57 |           rm -rf ~/.cache/babeldoc/cache.v1.db
58 |           rm -rf ~/.cache/babeldoc/working
59 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Logs
 2 | web/logs
 3 | web/*.log
 4 | web/npm-debug.log*
 5 | web/yarn-debug.log*
 6 | web/yarn-error.log*
 7 | web/pnpm-debug.log*
 8 | web/lerna-debug.log*
 9 | 
10 | web/node_modules
11 | web/dist
12 | web/dist-ssr
13 | web/*.local
14 | 
15 | memray*
16 | **/*.so
17 | *.pdf
18 | *.docx
19 | *.json
20 | **/*.pyc
21 | .venv
22 | .idea
23 | *.egg-info
24 | .DS_Store
25 | .vscode
26 | __pycache__
27 | .ruff_cache
28 | yadt.toml
29 | examples/
30 | /make_gif.py
31 | /dist
32 | .cache
33 | .cursor/rules/_*.mdc
34 | /.cursor
35 | /xnotes
36 | /docs/workflow-rules.md


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | files: '^.*\.py$'
 2 | repos:
 3 |   - repo: https://github.com/astral-sh/ruff-pre-commit
 4 |     # Ruff version.
 5 |     rev: v0.9.5
 6 |     hooks:
 7 |       # Run the linter.
 8 |       - id: ruff
 9 |         args: [ "--fix",
10 |                 "--ignore=E203,E261,E501,E741,F841" ]
11 |       # Run the formatter.
12 |       - id: ruff-format
13 | 


--------------------------------------------------------------------------------
/babeldoc/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.3.21"
2 | 


--------------------------------------------------------------------------------
/babeldoc/asynchronize/__init__.py:
--------------------------------------------------------------------------------
 1 | import asyncio
 2 | import time
 3 | 
 4 | 
 5 | class Args:  # Plain holder pairing the positional and keyword arguments of one callback invocation
 6 |     def __init__(self, args, kwargs):
 7 |         self.args = args  # positional arguments, stored exactly as passed in
 8 |         self.kwargs = kwargs  # keyword arguments, stored exactly as passed in
 9 | 
10 | 
11 | class AsyncCallback:  # Queues callback invocations and exposes them as an awaitable / async iterator
12 |     def __init__(self):
13 |         self.queue = asyncio.Queue()  # receives one Args object per callback invocation
14 |         self.finished = False  # set by finished_callback; lets __anext__ stop once the queue drains
15 |         self.loop = asyncio.get_event_loop()  # loop on which step_callback schedules the queue puts
16 | 
17 |     def step_callback(self, *args, **kwargs):
18 |         # Whenever a step is called, add to the queue but don't set finished to True, so __anext__ will continue
19 |         args = Args(args, kwargs)
20 | 
21 |         # We have to use the threadsafe call so that it wakes up the event loop, in case it's sleeping:
22 |         # https://stackoverflow.com/a/49912853/2148718
23 |         self.loop.call_soon_threadsafe(self.queue.put_nowait, args)
24 | 
25 |         # Add a small delay to release the GIL, ensuring the event loop has time to process messages
26 |         time.sleep(0.01)  # NOTE(review): blocks the calling thread; presumably never called on the loop thread
27 | 
28 |     def finished_callback(self, *args, **kwargs):
29 |         # Whenever a finished is called, add to the queue as with step, but also set finished to True, so __anext__
30 |         # will terminate after processing the remaining items
31 |         if self.finished:  # repeated finished calls are ignored
32 |             return
33 |         self.step_callback(*args, **kwargs)
34 |         self.finished = True
35 | 
36 |     def __await__(self):
37 |         # Awaiting the instance waits for the next queued item and returns it
38 |         return self.queue.get().__await__()
39 | 
40 |     def __aiter__(self):
41 |         # Since this implements __anext__, this can return itself
42 |         return self
43 | 
44 |     async def __anext__(self):
45 |         # Keep waiting for the queue if a) we haven't finished, or b) the queue still holds items. This lets us
46 |         # drain the remaining items even after finished_callback has been called
47 |         if self.finished and self.queue.empty():
48 |             raise StopAsyncIteration
49 | 
50 |         result = await self.queue.get()
51 |         return result
52 | 


--------------------------------------------------------------------------------
/babeldoc/const.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import shutil
 3 | import subprocess
 4 | from pathlib import Path
 5 | 
 6 | __version__ = "0.3.21"
 7 | 
 8 | CACHE_FOLDER = Path.home() / ".cache" / "babeldoc"
 9 | 
10 | 
11 | def get_cache_file_path(filename: str, sub_folder: str | None = None) -> Path:  # Path of `filename` under CACHE_FOLDER
12 |     if sub_folder is not None:
13 |         sub_folder = sub_folder.strip("/")  # a leading "/" would otherwise make the joined path absolute
14 |         sub_folder_path = CACHE_FOLDER / sub_folder
15 |         sub_folder_path.mkdir(parents=True, exist_ok=True)  # the sub-folder is created on demand
16 |         return sub_folder_path / filename
17 |     return CACHE_FOLDER / filename  # no sub-folder given: no directory is created on this path
18 | 
19 | 
20 | try:  # Derive WATERMARK_VERSION from `git describe` when the package appears to run from a source checkout
21 |     git_path = shutil.which("git")
22 |     if git_path is None:
23 |         raise FileNotFoundError("git executable not found")
24 |     two_parent = Path(__file__).resolve().parent.parent  # directory that contains the package directory
25 |     md_ = two_parent / "docs" / "README.md"
26 |     if two_parent.name == "site-packages" or not md_.exists():  # installed copy, or no docs/README.md beside the package
27 |         raise FileNotFoundError("not in git repo")
28 |     WATERMARK_VERSION = (
29 |         subprocess.check_output(  # noqa: S603
30 |             [git_path, "describe", "--always"],
31 |             cwd=Path(__file__).resolve().parent,
32 |         )
33 |         .strip()
34 |         .decode()
35 |     )
36 | except (OSError, FileNotFoundError, subprocess.CalledProcessError):  # FileNotFoundError is already an OSError subclass
37 |     WATERMARK_VERSION = f"v{__version__}"  # fallback: the packaged version string prefixed with "v"
38 | 
39 | TIKTOKEN_CACHE_FOLDER = CACHE_FOLDER / "tiktoken"
40 | TIKTOKEN_CACHE_FOLDER.mkdir(parents=True, exist_ok=True)
41 | os.environ["TIKTOKEN_CACHE_DIR"] = str(TIKTOKEN_CACHE_FOLDER)
42 | 


--------------------------------------------------------------------------------
/babeldoc/document_il/__init__.py:
--------------------------------------------------------------------------------
 1 | from babeldoc.document_il.il_version_1 import BaseOperations
 2 | from babeldoc.document_il.il_version_1 import Box
 3 | from babeldoc.document_il.il_version_1 import Cropbox
 4 | from babeldoc.document_il.il_version_1 import Document
 5 | from babeldoc.document_il.il_version_1 import GraphicState
 6 | from babeldoc.document_il.il_version_1 import Mediabox
 7 | from babeldoc.document_il.il_version_1 import Page
 8 | from babeldoc.document_il.il_version_1 import PageLayout
 9 | from babeldoc.document_il.il_version_1 import PdfCharacter
10 | from babeldoc.document_il.il_version_1 import PdfFigure
11 | from babeldoc.document_il.il_version_1 import PdfFont
12 | from babeldoc.document_il.il_version_1 import PdfFontCharBoundingBox
13 | from babeldoc.document_il.il_version_1 import PdfFormula
14 | from babeldoc.document_il.il_version_1 import PdfLine
15 | from babeldoc.document_il.il_version_1 import PdfParagraph
16 | from babeldoc.document_il.il_version_1 import PdfParagraphComposition
17 | from babeldoc.document_il.il_version_1 import PdfRectangle
18 | from babeldoc.document_il.il_version_1 import PdfSameStyleCharacters
19 | from babeldoc.document_il.il_version_1 import PdfSameStyleUnicodeCharacters
20 | from babeldoc.document_il.il_version_1 import PdfStyle
21 | from babeldoc.document_il.il_version_1 import PdfXobject
22 | from babeldoc.document_il.il_version_1 import VisualBbox
23 | 
24 | __all__ = [
25 |     "BaseOperations",
26 |     "Box",
27 |     "Cropbox",
28 |     "Document",
29 |     "GraphicState",
30 |     "Mediabox",
31 |     "Page",
32 |     "PageLayout",
33 |     "PdfCharacter",
34 |     "PdfFigure",
35 |     "PdfFont",
36 |     "PdfFontCharBoundingBox",
37 |     "PdfFormula",
38 |     "PdfLine",
39 |     "PdfParagraph",
40 |     "PdfParagraphComposition",
41 |     "PdfRectangle",
42 |     "PdfSameStyleCharacters",
43 |     "PdfSameStyleUnicodeCharacters",
44 |     "PdfStyle",
45 |     "PdfXobject",
46 |     "VisualBbox",
47 | ]
48 | 


--------------------------------------------------------------------------------
/babeldoc/document_il/babeldoc_exception/BabelDOCException.py:
--------------------------------------------------------------------------------
1 | class ScannedPDFError(Exception):  # Carries only a message; presumably raised when a PDF is detected as scanned (raise site not shown here)
2 |     def __init__(self, message):
3 |         super().__init__(message)  # forwards the message unchanged to Exception
4 | 


--------------------------------------------------------------------------------
/babeldoc/document_il/backend/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/funstory-ai/BabelDOC/1c4c25415984777b615eb88f299e88d006ffd16c/babeldoc/document_il/backend/__init__.py


--------------------------------------------------------------------------------
/babeldoc/document_il/frontend/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/funstory-ai/BabelDOC/1c4c25415984777b615eb88f299e88d006ffd16c/babeldoc/document_il/frontend/__init__.py


--------------------------------------------------------------------------------
/babeldoc/document_il/il_version_1.rnc:
--------------------------------------------------------------------------------
  1 | start = Document
  2 | Document =
  3 |   element document {
  4 |     Page+,
  5 |     attribute totalPages { xsd:int }
  6 |   }
  7 | Page =
  8 |   element page {
  9 |     element mediabox { Box },
 10 |     element cropbox { Box },
 11 |     PDFXobject*,
 12 |     PageLayout*,
 13 |     PDFRectangle*,
 14 |     PDFFont*,
 15 |     PDFParagraph*,
 16 |     PDFFigure*,
 17 |     PDFCharacter*,
 18 |     attribute pageNumber { xsd:int },
 19 |     attribute Unit { xsd:string },
 20 |     element baseOperations { xsd:string }
 21 |   }
 22 | Box =
 23 |   element box {
 24 |     # from (x,y) to (x2,y2)
 25 |     attribute x { xsd:float },
 26 |     attribute y { xsd:float },
 27 |     attribute x2 { xsd:float },
 28 |     attribute y2 { xsd:float }
 29 |   }
 30 | PDFXrefId = xsd:int
 31 | PDFFont =
 32 |   element pdfFont {
 33 |     attribute name { xsd:string },
 34 |     attribute fontId { xsd:string },
 35 |     attribute xrefId { PDFXrefId },
 36 |     attribute encodingLength { xsd:int },
 37 |     attribute bold { xsd:boolean }?,
 38 |     attribute italic { xsd:boolean }?,
 39 |     attribute monospace { xsd:boolean }?,
 40 |     attribute serif { xsd:boolean }?,
 41 |     attribute ascent { xsd:float }?,
 42 |     attribute descent { xsd:float }?,
 43 |     PDFFontCharBoundingBox*
 44 |   }
 45 | PDFFontCharBoundingBox =
 46 |   element pdfFontCharBoundingBox {
 47 |     attribute x { xsd:float },
 48 |     attribute y { xsd:float },
 49 |     attribute x2 { xsd:float },
 50 |     attribute y2 { xsd:float },
 51 |     attribute char_id { xsd:int }
 52 |   }
 53 | PDFXobject =
 54 |   element pdfXobject {
 55 |     attribute xobjId { xsd:int },
 56 |     attribute xrefId { PDFXrefId },
 57 |     Box,
 58 |     PDFFont*,
 59 |     element baseOperations { xsd:string }
 60 |   }
 61 | PDFCharacter =
 62 |   element pdfCharacter {
 63 |     attribute vertical { xsd:boolean }?,
 64 |     attribute scale { xsd:float }?,
 65 |     attribute pdfCharacterId { xsd:int }?,
 66 |     attribute char_unicode { xsd:string },
 67 |     attribute advance { xsd:float }?,
 68 |     # xobject nesting depth
 69 |     attribute xobjId { xsd:int }?,
 70 |     attribute debug_info { xsd:boolean }?,
 71 |     PDFStyle,
 72 |     Box,
 73 |     element visual_bbox { Box }?
 74 |   }
 75 | PageLayout =
 76 |   element pageLayout {
 77 |     attribute id { xsd:int },
 78 |     attribute conf { xsd:float },
 79 |     attribute class_name { xsd:string },
 80 |     Box
 81 |   }
 82 | GraphicState =
 83 |   element graphicState {
 84 |     attribute linewidth { xsd:float }?,
 85 |     attribute dash {
 86 |       list { xsd:float+ }
 87 |     }?,
 88 |     attribute flatness { xsd:float }?,
 89 |     attribute intent { xsd:string }?,
 90 |     attribute linecap { xsd:int }?,
 91 |     attribute linejoin { xsd:int }?,
 92 |     attribute miterlimit { xsd:float }?,
 93 |     attribute ncolor {
 94 |       list { xsd:float+ }
 95 |     }?,
 96 |     attribute scolor {
 97 |       list { xsd:float+ }
 98 |     }?,
 99 |     attribute strokingColorSpaceName { xsd:string }?,
100 |     attribute nonStrokingColorSpaceName { xsd:string }?,
101 |     attribute passthroughPerCharInstruction { xsd:string }?
102 |   }
103 | PDFStyle =
104 |   element pdfStyle {
105 |     attribute font_id { xsd:string },
106 |     attribute font_size { xsd:float },
107 |     GraphicState
108 |   }
109 | PDFParagraph =
110 |   element pdfParagraph {
111 |     attribute xobjId { xsd:int }?,
112 |     attribute unicode { xsd:string },
113 |     attribute scale { xsd:float }?,
114 |     attribute vertical { xsd:boolean }?,
115 |     attribute FirstLineIndent { xsd:boolean }?,
116 |     attribute debug_id { xsd:string }?,
117 |     attribute layout_label { xsd:string }?,
118 |     attribute layout_id { xsd:int }?,
119 |     Box,
120 |     PDFStyle,
121 |     PDFParagraphComposition*
122 |   }
123 | PDFParagraphComposition =
124 |   element pdfParagraphComposition {
125 |     PDFLine
126 |     | PDFFormula
127 |     | PDFSameStyleCharacters
128 |     | PDFCharacter
129 |     | PDFSameStyleUnicodeCharacters
130 |   }
131 | PDFLine = element pdfLine { Box, PDFCharacter+ }
132 | PDFSameStyleCharacters =
133 |   element pdfSameStyleCharacters { Box, PDFStyle, PDFCharacter+ }
134 | PDFSameStyleUnicodeCharacters =
135 |   element pdfSameStyleUnicodeCharacters {
136 |     PDFStyle?,
137 |     attribute unicode { xsd:string },
138 |     attribute debug_info { xsd:boolean }?
139 |   }
140 | PDFFormula =
141 |   element pdfFormula {
142 |     Box,
143 |     PDFCharacter+,
144 |     attribute x_offset { xsd:float },
145 |     attribute y_offset { xsd:float }
146 |   }
147 | PDFFigure = element pdfFigure { Box }
148 | PDFRectangle =
149 |   element pdfRectangle {
150 |     Box,
151 |     GraphicState,
152 |     attribute debug_info { xsd:boolean }?,
153 |     attribute fill_background { xsd:boolean }?,
154 |     attribute xobjId { xsd:int }?
155 |   }
156 | 


--------------------------------------------------------------------------------
/babeldoc/document_il/il_version_1.xsd:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8"?>
  2 | <xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema" elementFormDefault="qualified">
  3 |   <xs:element name="document">
  4 |     <xs:complexType>
  5 |       <xs:sequence>
  6 |         <xs:element maxOccurs="unbounded" ref="page"/>
  7 |       </xs:sequence>
  8 |       <xs:attribute name="totalPages" use="required" type="xs:int"/>
  9 |     </xs:complexType>
 10 |   </xs:element>
 11 |   <xs:element name="page">
 12 |     <xs:complexType>
 13 |       <xs:sequence>
 14 |         <xs:element ref="mediabox"/>
 15 |         <xs:element ref="cropbox"/>
 16 |         <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfXobject"/>
 17 |         <xs:element minOccurs="0" maxOccurs="unbounded" ref="pageLayout"/>
 18 |         <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfRectangle"/>
 19 |         <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfFont"/>
 20 |         <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfParagraph"/>
 21 |         <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfFigure"/>
 22 |         <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfCharacter"/>
 23 |         <xs:element ref="baseOperations"/>
 24 |       </xs:sequence>
 25 |       <xs:attribute name="pageNumber" use="required" type="xs:int"/>
 26 |       <xs:attribute name="Unit" use="required" type="xs:string"/>
 27 |     </xs:complexType>
 28 |   </xs:element>
 29 |   <xs:element name="mediabox">
 30 |     <xs:complexType>
 31 |       <xs:sequence>
 32 |         <xs:element ref="box"/>
 33 |       </xs:sequence>
 34 |     </xs:complexType>
 35 |   </xs:element>
 36 |   <xs:element name="cropbox">
 37 |     <xs:complexType>
 38 |       <xs:sequence>
 39 |         <xs:element ref="box"/>
 40 |       </xs:sequence>
 41 |     </xs:complexType>
 42 |   </xs:element>
 43 |   <xs:element name="baseOperations" type="xs:string"/>
 44 |   <xs:element name="box">
 45 |     <xs:complexType>
 46 |       <xs:attribute name="x" use="required" type="xs:float"/>
 47 |       <xs:attribute name="y" use="required" type="xs:float"/>
 48 |       <xs:attribute name="x2" use="required" type="xs:float"/>
 49 |       <xs:attribute name="y2" use="required" type="xs:float"/>
 50 |     </xs:complexType>
 51 |   </xs:element>
 52 |   <xs:simpleType name="PDFXrefId">
 53 |     <xs:restriction base="xs:int"/>
 54 |   </xs:simpleType>
 55 |   <xs:element name="pdfFont">
 56 |     <xs:complexType>
 57 |       <xs:sequence>
 58 |         <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfFontCharBoundingBox"/>
 59 |       </xs:sequence>
 60 |       <xs:attribute name="name" use="required" type="xs:string"/>
 61 |       <xs:attribute name="fontId" use="required" type="xs:string"/>
 62 |       <xs:attribute name="xrefId" use="required" type="PDFXrefId"/>
 63 |       <xs:attribute name="encodingLength" use="required" type="xs:int"/>
 64 |       <xs:attribute name="bold" type="xs:boolean"/>
 65 |       <xs:attribute name="italic" type="xs:boolean"/>
 66 |       <xs:attribute name="monospace" type="xs:boolean"/>
 67 |       <xs:attribute name="serif" type="xs:boolean"/>
 68 |       <xs:attribute name="ascent" type="xs:float"/>
 69 |       <xs:attribute name="descent" type="xs:float"/>
 70 |     </xs:complexType>
 71 |   </xs:element>
 72 |   <xs:element name="pdfFontCharBoundingBox">
 73 |     <xs:complexType>
 74 |       <xs:attribute name="x" use="required" type="xs:float"/>
 75 |       <xs:attribute name="y" use="required" type="xs:float"/>
 76 |       <xs:attribute name="x2" use="required" type="xs:float"/>
 77 |       <xs:attribute name="y2" use="required" type="xs:float"/>
 78 |       <xs:attribute name="char_id" use="required" type="xs:int"/>
 79 |     </xs:complexType>
 80 |   </xs:element>
 81 |   <xs:element name="pdfXobject">
 82 |     <xs:complexType>
 83 |       <xs:sequence>
 84 |         <xs:element ref="box"/>
 85 |         <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfFont"/>
 86 |         <xs:element ref="baseOperations"/>
 87 |       </xs:sequence>
 88 |       <xs:attribute name="xobjId" use="required" type="xs:int"/>
 89 |       <xs:attribute name="xrefId" use="required" type="PDFXrefId"/>
 90 |     </xs:complexType>
 91 |   </xs:element>
 92 |   <xs:element name="pdfCharacter">
 93 |     <xs:complexType>
 94 |       <xs:sequence>
 95 |         <xs:element ref="pdfStyle"/>
 96 |         <xs:element ref="box"/>
 97 |         <xs:element minOccurs="0" ref="visual_bbox"/>
 98 |       </xs:sequence>
 99 |       <xs:attribute name="vertical" type="xs:boolean"/>
100 |       <xs:attribute name="scale" type="xs:float"/>
101 |       <xs:attribute name="pdfCharacterId" type="xs:int"/>
102 |       <xs:attribute name="char_unicode" use="required" type="xs:string"/>
103 |       <xs:attribute name="advance" type="xs:float"/>
104 |       <xs:attribute name="xobjId" type="xs:int"/>
105 |       <xs:attribute name="debug_info" type="xs:boolean"/>
106 |     </xs:complexType>
107 |   </xs:element>
108 |   <xs:element name="visual_bbox">
109 |     <xs:complexType>
110 |       <xs:sequence>
111 |         <xs:element ref="box"/>
112 |       </xs:sequence>
113 |     </xs:complexType>
114 |   </xs:element>
115 |   <xs:element name="pageLayout">
116 |     <xs:complexType>
117 |       <xs:sequence>
118 |         <xs:element ref="box"/>
119 |       </xs:sequence>
120 |       <xs:attribute name="id" use="required" type="xs:int"/>
121 |       <xs:attribute name="conf" use="required" type="xs:float"/>
122 |       <xs:attribute name="class_name" use="required" type="xs:string"/>
123 |     </xs:complexType>
124 |   </xs:element>
125 |   <xs:element name="graphicState">
126 |     <xs:complexType>
127 |       <xs:attribute name="linewidth" type="xs:float"/>
128 |       <xs:attribute name="dash">
129 |         <xs:simpleType>
130 |           <xs:restriction>
131 |             <xs:simpleType>
132 |               <xs:list itemType="xs:float"/>
133 |             </xs:simpleType>
134 |             <xs:minLength value="1"/>
135 |           </xs:restriction>
136 |         </xs:simpleType>
137 |       </xs:attribute>
138 |       <xs:attribute name="flatness" type="xs:float"/>
139 |       <xs:attribute name="intent" type="xs:string"/>
140 |       <xs:attribute name="linecap" type="xs:int"/>
141 |       <xs:attribute name="linejoin" type="xs:int"/>
142 |       <xs:attribute name="miterlimit" type="xs:float"/>
143 |       <xs:attribute name="ncolor">
144 |         <xs:simpleType>
145 |           <xs:restriction>
146 |             <xs:simpleType>
147 |               <xs:list itemType="xs:float"/>
148 |             </xs:simpleType>
149 |             <xs:minLength value="1"/>
150 |           </xs:restriction>
151 |         </xs:simpleType>
152 |       </xs:attribute>
153 |       <xs:attribute name="scolor">
154 |         <xs:simpleType>
155 |           <xs:restriction>
156 |             <xs:simpleType>
157 |               <xs:list itemType="xs:float"/>
158 |             </xs:simpleType>
159 |             <xs:minLength value="1"/>
160 |           </xs:restriction>
161 |         </xs:simpleType>
162 |       </xs:attribute>
163 |       <xs:attribute name="strokingColorSpaceName" type="xs:string"/>
164 |       <xs:attribute name="nonStrokingColorSpaceName" type="xs:string"/>
165 |       <xs:attribute name="passthroughPerCharInstruction" type="xs:string"/>
166 |     </xs:complexType>
167 |   </xs:element>
168 |   <xs:element name="pdfStyle">
169 |     <xs:complexType>
170 |       <xs:sequence>
171 |         <xs:element ref="graphicState"/>
172 |       </xs:sequence>
173 |       <xs:attribute name="font_id" use="required" type="xs:string"/>
174 |       <xs:attribute name="font_size" use="required" type="xs:float"/>
175 |     </xs:complexType>
176 |   </xs:element>
177 |   <xs:element name="pdfParagraph">
178 |     <xs:complexType>
179 |       <xs:sequence>
180 |         <xs:element ref="box"/>
181 |         <xs:element ref="pdfStyle"/>
182 |         <xs:element minOccurs="0" maxOccurs="unbounded" ref="pdfParagraphComposition"/>
183 |       </xs:sequence>
184 |       <!-- Attribute list mirrors PDFParagraph in il_version_1.rnc; only "unicode" is required -->
185 |       <xs:attribute name="xobjId" type="xs:int"/>
186 |       <xs:attribute name="unicode" use="required" type="xs:string"/>
187 |       <xs:attribute name="scale" type="xs:float"/>
188 |       <xs:attribute name="vertical" type="xs:boolean"/>
189 |       <xs:attribute name="FirstLineIndent" type="xs:boolean"/>
190 |       <xs:attribute name="debug_id" type="xs:string"/>
191 |       <xs:attribute name="layout_label" type="xs:string"/>
192 |       <xs:attribute name="layout_id" type="xs:int"/>
193 |     </xs:complexType>
194 |   </xs:element>
193 |   <xs:element name="pdfParagraphComposition">
194 |     <xs:complexType>
195 |       <xs:choice>
196 |         <xs:element ref="pdfLine"/>
197 |         <xs:element ref="pdfFormula"/>
198 |         <xs:element ref="pdfSameStyleCharacters"/>
199 |         <xs:element ref="pdfCharacter"/>
200 |         <xs:element ref="pdfSameStyleUnicodeCharacters"/>
201 |       </xs:choice>
202 |     </xs:complexType>
203 |   </xs:element>
204 |   <xs:element name="pdfLine">
205 |     <xs:complexType>
206 |       <xs:sequence>
207 |         <xs:element ref="box"/>
208 |         <xs:element maxOccurs="unbounded" ref="pdfCharacter"/>
209 |       </xs:sequence>
210 |     </xs:complexType>
211 |   </xs:element>
212 |   <xs:element name="pdfSameStyleCharacters">
213 |     <xs:complexType>
214 |       <xs:sequence>
215 |         <xs:element ref="box"/>
216 |         <xs:element ref="pdfStyle"/>
217 |         <xs:element maxOccurs="unbounded" ref="pdfCharacter"/>
218 |       </xs:sequence>
219 |     </xs:complexType>
220 |   </xs:element>
221 |   <xs:element name="pdfSameStyleUnicodeCharacters">
222 |     <xs:complexType>
223 |       <xs:sequence>
224 |         <xs:element minOccurs="0" ref="pdfStyle"/>
225 |       </xs:sequence>
226 |       <xs:attribute name="unicode" use="required" type="xs:string"/>
227 |       <xs:attribute name="debug_info" type="xs:boolean"/>
228 |     </xs:complexType>
229 |   </xs:element>
230 |   <xs:element name="pdfFormula">
231 |     <xs:complexType>
232 |       <xs:sequence>
233 |         <xs:element ref="box"/>
234 |         <xs:element maxOccurs="unbounded" ref="pdfCharacter"/>
235 |       </xs:sequence>
236 |       <xs:attribute name="x_offset" use="required" type="xs:float"/>
237 |       <xs:attribute name="y_offset" use="required" type="xs:float"/>
238 |     </xs:complexType>
239 |   </xs:element>
240 |   <xs:element name="pdfFigure">
241 |     <xs:complexType>
242 |       <xs:sequence>
243 |         <xs:element ref="box"/>
244 |       </xs:sequence>
245 |     </xs:complexType>
246 |   </xs:element>
247 |   <xs:element name="pdfRectangle">
248 |     <xs:complexType>
249 |       <xs:sequence>
250 |         <xs:element ref="box"/>
251 |         <xs:element ref="graphicState"/>
252 |       </xs:sequence>
253 |       <!-- Optional attributes mirror PDFRectangle in il_version_1.rnc -->
254 |       <xs:attribute name="debug_info" type="xs:boolean"/>
255 |       <xs:attribute name="fill_background" type="xs:boolean"/>
256 |       <xs:attribute name="xobjId" type="xs:int"/>
257 |     </xs:complexType>
258 |   </xs:element>
256 | </xs:schema>
257 | 


--------------------------------------------------------------------------------
/babeldoc/document_il/midend/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/funstory-ai/BabelDOC/1c4c25415984777b615eb88f299e88d006ffd16c/babeldoc/document_il/midend/__init__.py


--------------------------------------------------------------------------------
/babeldoc/document_il/midend/add_debug_information.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | 
  3 | import babeldoc.document_il.il_version_1 as il_version_1
  4 | from babeldoc.document_il import GraphicState
  5 | from babeldoc.document_il.utils.style_helper import BLUE
  6 | from babeldoc.document_il.utils.style_helper import ORANGE
  7 | from babeldoc.document_il.utils.style_helper import YELLOW
  8 | from babeldoc.translation_config import TranslationConfig
  9 | 
 10 | logger = logging.getLogger(__name__)
 11 | 
 12 | 
 13 | class AddDebugInformation:
 14 |     stage_name = "Add Debug Information"
 15 | 
 16 |     def __init__(self, translation_config: TranslationConfig):
 17 |         self.translation_config = translation_config
 18 |         self.model = translation_config.doc_layout_model
 19 | 
 20 |     def process(self, docs: il_version_1.Document):
 21 |         if not self.translation_config.debug:
 22 |             return
 23 | 
 24 |         for page in docs.page:
 25 |             self.process_page(page)
 26 | 
 27 |     def _create_rectangle(self, box: il_version_1.Box, color: GraphicState):
 28 |         rect = il_version_1.PdfRectangle(
 29 |             box=box,
 30 |             graphic_state=color,
 31 |             debug_info=True,
 32 |         )
 33 |         return rect
 34 | 
 35 |     def _create_text(self, text: str, color: GraphicState, box: il_version_1.Box):
 36 |         style = il_version_1.PdfStyle(
 37 |             font_id="china-ss",
 38 |             font_size=4,
 39 |             graphic_state=color,
 40 |         )
 41 |         return il_version_1.PdfParagraph(
 42 |             first_line_indent=False,
 43 |             box=il_version_1.Box(
 44 |                 x=box.x,
 45 |                 y=box.y2,
 46 |                 x2=box.x2,
 47 |                 y2=box.y2 + 5,
 48 |             ),
 49 |             vertical=False,
 50 |             pdf_style=style,
 51 |             unicode=text,
 52 |             pdf_paragraph_composition=[
 53 |                 il_version_1.PdfParagraphComposition(
 54 |                     pdf_same_style_unicode_characters=il_version_1.PdfSameStyleUnicodeCharacters(
 55 |                         unicode=text,
 56 |                         pdf_style=style,
 57 |                         debug_info=True,
 58 |                     ),
 59 |                 ),
 60 |             ],
 61 |             xobj_id=-1,
 62 |         )
 63 | 
 64 |     def process_page(self, page: il_version_1.Page):
 65 |         # Add page number text at top-left corner
 66 |         page_width = page.cropbox.box.x2 - page.cropbox.box.x
 67 |         page_height = page.cropbox.box.y2 - page.cropbox.box.y
 68 |         page_number_text = f"pagenumber: {page.page_number}"
 69 |         page_number_box = il_version_1.Box(
 70 |             x=page.cropbox.box.x + page_width * 0.02,
 71 |             y=page.cropbox.box.y,
 72 |             x2=page.cropbox.box.x2,
 73 |             y2=page.cropbox.box.y2 - page_height * 0.02,
 74 |         )
 75 |         page_number_paragraph = self._create_text(
 76 |             page_number_text,
 77 |             BLUE,
 78 |             page_number_box,
 79 |         )
 80 |         page.pdf_paragraph.append(page_number_paragraph)
 81 | 
 82 |         new_paragraphs = []
 83 | 
 84 |         for paragraph in page.pdf_paragraph:
 85 |             if not paragraph.pdf_paragraph_composition:
 86 |                 continue
 87 |             if any(
 88 |                 x.pdf_same_style_unicode_characters.debug_info
 89 |                 for x in paragraph.pdf_paragraph_composition
 90 |                 if x.pdf_same_style_unicode_characters
 91 |             ):
 92 |                 continue
 93 |             # Create a rectangle box
 94 |             rect = self._create_rectangle(paragraph.box, BLUE)
 95 | 
 96 |             page.pdf_rectangle.append(rect)
 97 | 
 98 |             # Create text label at top-left corner
 99 |             # Note: PDF coordinates are from bottom-left,
100 |             # so we use y2 for top position
101 | 
102 |             debug_text = "paragraph"
103 |             if hasattr(paragraph, "debug_id") and paragraph.debug_id:
104 |                 debug_text = f"paragraph[{paragraph.debug_id}]"
105 |             new_paragraphs.append(self._create_text(debug_text, BLUE, paragraph.box))
106 | 
107 |             for composition in paragraph.pdf_paragraph_composition:
108 |                 if composition.pdf_formula:
109 |                     new_paragraphs.append(
110 |                         self._create_text(
111 |                             "formula",
112 |                             ORANGE,
113 |                             composition.pdf_formula.box,
114 |                         ),
115 |                     )
116 |                     page.pdf_rectangle.append(
117 |                         self._create_rectangle(
118 |                             composition.pdf_formula.box,
119 |                             ORANGE,
120 |                         ),
121 |                     )
122 | 
123 |             for xobj in page.pdf_xobject:
124 |                 new_paragraphs.append(
125 |                     self._create_text(
126 |                         "xobj",
127 |                         YELLOW,
128 |                         xobj.box,
129 |                     ),
130 |                 )
131 |                 page.pdf_rectangle.append(
132 |                     self._create_rectangle(
133 |                         xobj.box,
134 |                         YELLOW,
135 |                     ),
136 |                 )
137 | 
138 |         page.pdf_paragraph.extend(new_paragraphs)
139 | 


--------------------------------------------------------------------------------
/babeldoc/document_il/midend/detect_scanned_file.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | 
  3 | import cv2
  4 | import numpy as np
  5 | import pymupdf
  6 | from skimage.metrics import structural_similarity
  7 | 
  8 | from babeldoc.document_il import il_version_1
  9 | from babeldoc.document_il.babeldoc_exception.BabelDOCException import ScannedPDFError
 10 | from babeldoc.document_il.utils.style_helper import GREEN
 11 | from babeldoc.translation_config import TranslationConfig
 12 | 
 13 | logger = logging.getLogger(__name__)
 14 | 
 15 | 
 16 | class DetectScannedFile:
 17 |     stage_name = "DetectScannedFile"
 18 | 
 19 |     def __init__(self, translation_config: TranslationConfig):
 20 |         self.translation_config = translation_config
 21 | 
 22 |     def _save_debug_box_to_page(self, page: il_version_1.Page, similarity: float):
 23 |         """Save debug boxes and text labels to the PDF page."""
 24 |         if not self.translation_config.debug:
 25 |             return
 26 | 
 27 |         color = GREEN
 28 | 
 29 |         # Create text label at top-left corner
 30 |         # Note: PDF coordinates are from bottom-left,
 31 |         # so we use y2 for top position
 32 |         style = il_version_1.PdfStyle(
 33 |             font_id="china-ss",
 34 |             font_size=4,
 35 |             graphic_state=color,
 36 |         )
 37 |         page_width = page.cropbox.box.x2 - page.cropbox.box.x
 38 |         page_height = page.cropbox.box.y2 - page.cropbox.box.y
 39 |         unicode = f"scanned score: {similarity * 100:.2f} %"
 40 |         page.pdf_paragraph.append(
 41 |             il_version_1.PdfParagraph(
 42 |                 first_line_indent=False,
 43 |                 box=il_version_1.Box(
 44 |                     x=page.cropbox.box.x + page_width * 0.03,
 45 |                     y=page.cropbox.box.y,
 46 |                     x2=page.cropbox.box.x2,
 47 |                     y2=page.cropbox.box.y2 - page_height * 0.03,
 48 |                 ),
 49 |                 vertical=False,
 50 |                 pdf_style=style,
 51 |                 unicode=unicode,
 52 |                 pdf_paragraph_composition=[
 53 |                     il_version_1.PdfParagraphComposition(
 54 |                         pdf_same_style_unicode_characters=il_version_1.PdfSameStyleUnicodeCharacters(
 55 |                             unicode=unicode,
 56 |                             pdf_style=style,
 57 |                             debug_info=True,
 58 |                         ),
 59 |                     ),
 60 |                 ],
 61 |                 xobj_id=-1,
 62 |             ),
 63 |         )
 64 | 
 65 |     def process(self, docs: il_version_1.Document):
 66 |         """Generate layouts for all pages that need to be translated."""
 67 |         # Get pages that need to be translated
 68 |         pages_to_translate = [
 69 |             page
 70 |             for page in docs.page
 71 |             if self.translation_config.should_translate_page(page.page_number + 1)
 72 |         ]
 73 |         mupdf = pymupdf.open(self.translation_config.get_working_file_path("input.pdf"))
 74 |         total = len(pages_to_translate)
 75 |         threshold = 0.8 * total
 76 |         threshold = max(threshold, 1)
 77 |         scanned = 0
 78 |         non_scanned = 0
 79 |         non_scanned_threshold = total - threshold
 80 |         with self.translation_config.progress_monitor.stage_start(
 81 |             self.stage_name,
 82 |             total,
 83 |         ) as progress:
 84 |             for page in pages_to_translate:
 85 |                 if scanned < threshold and non_scanned < non_scanned_threshold:
 86 |                     # Only continue detection if both counts are below thresholds
 87 |                     is_scanned = self.detect_page_is_scanned(page, mupdf)
 88 |                     if is_scanned:
 89 |                         scanned += 1
 90 |                     else:
 91 |                         non_scanned += 1
 92 |                 else:
 93 |                     # We have enough information to determine document type
 94 |                     non_scanned += 1
 95 |                 progress.advance(1)
 96 | 
 97 |         if scanned > threshold:
 98 |             logger.warning(
 99 |                 f"Detected {scanned} scanned pages, which is more than 80% of the total pages. "
100 |                 "Please check the input PDF file.",
101 |             )
102 |             raise ScannedPDFError("Scanned PDF detected.")
103 | 
104 |     @staticmethod
105 |     def detect_page_is_scanned(page: il_version_1.Page, pdf: pymupdf.Document) -> bool:
106 |         before_page_image = pdf[page.page_number].get_pixmap()
107 |         before_page_image = np.frombuffer(before_page_image.samples, np.uint8).reshape(
108 |             before_page_image.height,
109 |             before_page_image.width,
110 |             3,
111 |         )[:, :, ::-1]
112 |         new_xref = pdf.get_new_xref()
113 |         pdf.update_object(new_xref, "<<>>")
114 |         pdf.update_stream(new_xref, page.base_operations.value.encode("utf-8"))
115 |         pdf[page.page_number].set_contents(new_xref)
116 | 
117 |         for xobj in page.pdf_xobject:
118 |             pdf.update_stream(xobj.xref_id, xobj.base_operations.value.encode("utf-8"))
119 | 
120 |         after_page_image = pdf[page.page_number].get_pixmap()
121 |         after_page_image = np.frombuffer(after_page_image.samples, np.uint8).reshape(
122 |             after_page_image.height,
123 |             after_page_image.width,
124 |             3,
125 |         )[:, :, ::-1]
126 |         before_page_image = cv2.cvtColor(before_page_image, cv2.COLOR_RGB2GRAY)
127 |         after_page_image = cv2.cvtColor(after_page_image, cv2.COLOR_RGB2GRAY)
128 |         return structural_similarity(before_page_image, after_page_image) > 0.9
129 | 


--------------------------------------------------------------------------------
/babeldoc/document_il/midend/layout_parser.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | from pathlib import Path
  3 | 
  4 | import cv2
  5 | import numpy as np
  6 | from pymupdf import Document
  7 | 
  8 | from babeldoc.document_il import il_version_1
  9 | from babeldoc.document_il.utils.style_helper import GREEN
 10 | from babeldoc.translation_config import TranslationConfig
 11 | 
 12 | logger = logging.getLogger(__name__)
 13 | 
 14 | 
 15 | class LayoutParser:
 16 |     stage_name = "Parse Page Layout"
 17 | 
 18 |     def __init__(self, translation_config: TranslationConfig):
 19 |         self.translation_config = translation_config
 20 |         self.model = translation_config.doc_layout_model
 21 | 
 22 |     def _save_debug_image(self, image: np.ndarray, layout, page_number: int):
 23 |         """Save debug image with drawn boxes if debug mode is enabled."""
 24 |         if not self.translation_config.debug:
 25 |             return
 26 | 
 27 |         debug_dir = Path(self.translation_config.get_working_file_path("ocr-box-image"))
 28 |         debug_dir.mkdir(parents=True, exist_ok=True)
 29 | 
 30 |         # Draw boxes on the image
 31 |         debug_image = image.copy()
 32 |         for box in layout.boxes:
 33 |             x0, y0, x1, y1 = box.xyxy
 34 |             cv2.rectangle(
 35 |                 debug_image,
 36 |                 (int(x0), int(y0)),
 37 |                 (int(x1), int(y1)),
 38 |                 (0, 255, 0),
 39 |                 2,
 40 |             )
 41 |             # Add text label
 42 |             cv2.putText(
 43 |                 debug_image,
 44 |                 layout.names[box.cls],
 45 |                 (int(x0), int(y0) - 5),
 46 |                 cv2.FONT_HERSHEY_SIMPLEX,
 47 |                 0.5,
 48 |                 (0, 255, 0),
 49 |                 1,
 50 |             )
 51 | 
 52 |         # Save the image
 53 |         output_path = debug_dir / f"{page_number}.jpg"
 54 |         cv2.imwrite(str(output_path), debug_image)
 55 | 
 56 |     def _save_debug_box_to_page(self, page: il_version_1.Page):
 57 |         """Save debug boxes and text labels to the PDF page."""
 58 |         if not self.translation_config.debug:
 59 |             return
 60 | 
 61 |         color = GREEN
 62 | 
 63 |         for layout in page.page_layout:
 64 |             # Create a rectangle box
 65 |             rect = il_version_1.PdfRectangle(
 66 |                 box=il_version_1.Box(
 67 |                     x=layout.box.x,
 68 |                     y=layout.box.y,
 69 |                     x2=layout.box.x2,
 70 |                     y2=layout.box.y2,
 71 |                 ),
 72 |                 graphic_state=color,
 73 |                 debug_info=True,
 74 |             )
 75 |             page.pdf_rectangle.append(rect)
 76 | 
 77 |             # Create text label at top-left corner
 78 |             # Note: PDF coordinates are from bottom-left,
 79 |             # so we use y2 for top position
 80 |             style = il_version_1.PdfStyle(
 81 |                 font_id="china-ss",
 82 |                 font_size=4,
 83 |                 graphic_state=color,
 84 |             )
 85 |             page.pdf_paragraph.append(
 86 |                 il_version_1.PdfParagraph(
 87 |                     first_line_indent=False,
 88 |                     box=il_version_1.Box(
 89 |                         x=layout.box.x,
 90 |                         y=layout.box.y2,
 91 |                         x2=layout.box.x2,
 92 |                         y2=layout.box.y2 + 5,
 93 |                     ),
 94 |                     vertical=False,
 95 |                     pdf_style=style,
 96 |                     unicode=layout.class_name,
 97 |                     pdf_paragraph_composition=[
 98 |                         il_version_1.PdfParagraphComposition(
 99 |                             pdf_same_style_unicode_characters=il_version_1.PdfSameStyleUnicodeCharacters(
100 |                                 unicode=layout.class_name,
101 |                                 pdf_style=style,
102 |                                 debug_info=True,
103 |                             ),
104 |                         ),
105 |                     ],
106 |                     xobj_id=-1,
107 |                 ),
108 |             )
109 | 
110 |     def process(self, docs: il_version_1.Document, mupdf_doc: Document):
111 |         """Generate layouts for all pages that need to be translated."""
112 |         # Get pages that need to be translated
113 |         total = len(docs.page)
114 |         with self.translation_config.progress_monitor.stage_start(
115 |             self.stage_name,
116 |             total,
117 |         ) as progress:
118 |             # Process predictions for each page
119 |             for page, layouts in self.model.handle_document(
120 |                 docs.page, mupdf_doc, self.translation_config, self._save_debug_image
121 |             ):
122 |                 page_layouts = []
123 |                 for layout in layouts.boxes:
124 |                     # Convert coordinate system from picture to il
125 |                     # system to the il coordinate system
126 |                     x0, y0, x1, y1 = layout.xyxy
127 |                     pix = mupdf_doc[page.page_number].get_pixmap()
128 |                     h, w = pix.height, pix.width
129 |                     x0, y0, x1, y1 = (
130 |                         np.clip(int(x0 - 1), 0, w - 1),
131 |                         np.clip(int(h - y1 - 1), 0, h - 1),
132 |                         np.clip(int(x1 + 1), 0, w - 1),
133 |                         np.clip(int(h - y0 + 1), 0, h - 1),
134 |                     )
135 |                     page_layout = il_version_1.PageLayout(
136 |                         id=len(page_layouts) + 1,
137 |                         box=il_version_1.Box(
138 |                             x0.item(),
139 |                             y0.item(),
140 |                             x1.item(),
141 |                             y1.item(),
142 |                         ),
143 |                         conf=layout.conf.item(),
144 |                         class_name=layouts.names[layout.cls],
145 |                     )
146 |                     page_layouts.append(page_layout)
147 | 
148 |                 page.page_layout = page_layouts
149 |                 self._save_debug_box_to_page(page)
150 |                 progress.advance(1)
151 | 
152 |         return docs
153 | 


--------------------------------------------------------------------------------
/babeldoc/document_il/midend/remove_descent.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | from collections import Counter
  3 | from functools import cache
  4 | 
  5 | from babeldoc.document_il import il_version_1
  6 | from babeldoc.translation_config import TranslationConfig
  7 | 
  8 | logger = logging.getLogger(__name__)
  9 | 
 10 | 
 11 | class RemoveDescent:
 12 |     stage_name = "Remove Char Descent"
 13 | 
 14 |     def __init__(self, translation_config: TranslationConfig):
 15 |         self.translation_config = translation_config
 16 | 
 17 |     def _remove_char_descent(
 18 |         self,
 19 |         char: il_version_1.PdfCharacter,
 20 |         font: il_version_1.PdfFont,
 21 |     ) -> float | None:
 22 |         """Remove descent from a single character and return the descent value.
 23 | 
 24 |         Args:
 25 |             char: The character to process
 26 |             font: The font used by this character
 27 | 
 28 |         Returns:
 29 |             The descent value if it was removed, None otherwise
 30 |         """
 31 |         if (
 32 |             char.box
 33 |             and char.box.y is not None
 34 |             and char.box.y2 is not None
 35 |             and font
 36 |             and hasattr(font, "descent")
 37 |         ):
 38 |             descent = font.descent * char.pdf_style.font_size / 1000
 39 |             if char.vertical:
 40 |                 # For vertical text, remove descent from x coordinates
 41 |                 char.box.x += descent
 42 |                 char.box.x2 += descent
 43 |             else:
 44 |                 # For horizontal text, remove descent from y coordinates
 45 |                 char.box.y -= descent
 46 |                 char.box.y2 -= descent
 47 |             return descent
 48 |         return None
 49 | 
 50 |     def process(self, document: il_version_1.Document):
 51 |         """Process the document to remove descent adjustments from character boxes.
 52 | 
 53 |         Args:
 54 |             document: The document to process
 55 |         """
 56 |         with self.translation_config.progress_monitor.stage_start(
 57 |             self.stage_name,
 58 |             len(document.page),
 59 |         ) as pbar:
 60 |             for page in document.page:
 61 |                 self.translation_config.raise_if_cancelled()
 62 |                 self.process_page(page)
 63 |                 pbar.advance()
 64 | 
 65 |     def process_page(self, page: il_version_1.Page):
 66 |         """Process a single page to remove descent adjustments.
 67 | 
 68 |         Args:
 69 |             page: The page to process
 70 |         """
 71 |         # Build font map including xobjects
 72 |         fonts: dict[
 73 |             str | int,
 74 |             il_version_1.PdfFont | dict[str, il_version_1.PdfFont],
 75 |         ] = {f.font_id: f for f in page.pdf_font}
 76 |         page_fonts = {f.font_id: f for f in page.pdf_font}
 77 | 
 78 |         # Add xobject fonts
 79 |         for xobj in page.pdf_xobject:
 80 |             fonts[xobj.xobj_id] = page_fonts.copy()
 81 |             for font in xobj.pdf_font:
 82 |                 fonts[xobj.xobj_id][font.font_id] = font
 83 | 
 84 |         @cache
 85 |         def get_font(
 86 |             font_id: str,
 87 |             xobj_id: int | None = None,
 88 |         ) -> il_version_1.PdfFont | None:
 89 |             if xobj_id is not None and xobj_id in fonts:
 90 |                 font_map = fonts[xobj_id]
 91 |                 if isinstance(font_map, dict) and font_id in font_map:
 92 |                     return font_map[font_id]
 93 |             return (
 94 |                 fonts.get(font_id)
 95 |                 if isinstance(fonts.get(font_id), il_version_1.PdfFont)
 96 |                 else None
 97 |             )
 98 | 
 99 |         # Process all standalone characters in the page
100 |         for char in page.pdf_character:
101 |             if font := get_font(char.pdf_style.font_id, char.xobj_id):
102 |                 self._remove_char_descent(char, font)
103 | 
104 |         # Process all paragraphs
105 |         for paragraph in page.pdf_paragraph:
106 |             descent_values = []
107 |             vertical_chars = []
108 | 
109 |             # Process all characters in paragraph compositions
110 |             for comp in paragraph.pdf_paragraph_composition:
111 |                 # Handle direct characters
112 |                 if comp.pdf_character:
113 |                     font = get_font(
114 |                         comp.pdf_character.pdf_style.font_id,
115 |                         comp.pdf_character.xobj_id,
116 |                     )
117 |                     if font:
118 |                         descent = self._remove_char_descent(comp.pdf_character, font)
119 |                         if descent is not None:
120 |                             descent_values.append(descent)
121 |                             vertical_chars.append(comp.pdf_character.vertical)
122 | 
123 |                 # Handle characters in PdfLine
124 |                 elif comp.pdf_line:
125 |                     for char in comp.pdf_line.pdf_character:
126 |                         if font := get_font(char.pdf_style.font_id, char.xobj_id):
127 |                             descent = self._remove_char_descent(char, font)
128 |                             if descent is not None:
129 |                                 descent_values.append(descent)
130 |                                 vertical_chars.append(char.vertical)
131 | 
132 |                 # Handle characters in PdfFormula
133 |                 elif comp.pdf_formula:
134 |                     for char in comp.pdf_formula.pdf_character:
135 |                         if font := get_font(char.pdf_style.font_id, char.xobj_id):
136 |                             descent = self._remove_char_descent(char, font)
137 |                             if descent is not None:
138 |                                 descent_values.append(descent)
139 |                                 vertical_chars.append(char.vertical)
140 | 
141 |                 # Handle characters in PdfSameStyleCharacters
142 |                 elif comp.pdf_same_style_characters:
143 |                     for char in comp.pdf_same_style_characters.pdf_character:
144 |                         if font := get_font(char.pdf_style.font_id, char.xobj_id):
145 |                             descent = self._remove_char_descent(char, font)
146 |                             if descent is not None:
147 |                                 descent_values.append(descent)
148 |                                 vertical_chars.append(char.vertical)
149 | 
150 |             # Adjust paragraph box based on most common descent value
151 |             if descent_values and paragraph.box:
152 |                 # Calculate mode of descent values
153 |                 descent_counter = Counter(descent_values)
154 |                 most_common_descent = descent_counter.most_common(1)[0][0]
155 | 
156 |                 # Check if paragraph is vertical (all characters are vertical)
157 |                 is_vertical = all(vertical_chars) if vertical_chars else False
158 | 
159 |                 # Adjust paragraph box
160 |                 if paragraph.box.y is not None and paragraph.box.y2 is not None:
161 |                     if is_vertical:
162 |                         # For vertical paragraphs, adjust x coordinates
163 |                         paragraph.box.x += most_common_descent
164 |                         paragraph.box.x2 += most_common_descent
165 |                     else:
166 |                         # For horizontal paragraphs, adjust y coordinates
167 |                         paragraph.box.y -= most_common_descent
168 |                         paragraph.box.y2 -= most_common_descent
169 | 


--------------------------------------------------------------------------------
/babeldoc/document_il/midend/table_parser.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | from pathlib import Path
  3 | 
  4 | import cv2
  5 | import numpy as np
  6 | from pymupdf import Document
  7 | 
  8 | from babeldoc.document_il import il_version_1
  9 | from babeldoc.document_il.utils.style_helper import GREEN
 10 | from babeldoc.translation_config import TranslationConfig
 11 | 
 12 | logger = logging.getLogger(__name__)
 13 | 
 14 | 
 15 | class TableParser:
 16 |     stage_name = "Parse Table"
 17 | 
 18 |     def __init__(self, translation_config: TranslationConfig):
 19 |         self.translation_config = translation_config
 20 |         self.model = translation_config.table_model
 21 | 
 22 |     def _save_debug_image(self, image: np.ndarray, layouts, page_number: int):
 23 |         """Save debug image with drawn boxes if debug mode is enabled."""
 24 |         if not self.translation_config.debug:
 25 |             return
 26 | 
 27 |         if not isinstance(layouts, list):
 28 |             layouts = [layouts]
 29 |         debug_dir = Path(
 30 |             self.translation_config.get_working_file_path("table-ocr-box-image")
 31 |         )
 32 |         debug_dir.mkdir(parents=True, exist_ok=True)
 33 | 
 34 |         # Draw boxes on the image
 35 |         debug_image = image.copy()
 36 |         for layout in layouts:
 37 |             for box in layout.boxes:
 38 |                 x0, y0, x1, y1 = box.xyxy
 39 |                 cv2.rectangle(
 40 |                     debug_image,
 41 |                     (int(x0), int(y0)),
 42 |                     (int(x1), int(y1)),
 43 |                     (0, 255, 0),
 44 |                     2,
 45 |                 )
 46 |                 # Add text label
 47 |                 cv2.putText(
 48 |                     debug_image,
 49 |                     layout.names[box.cls],
 50 |                     (int(x0), int(y0) - 5),
 51 |                     cv2.FONT_HERSHEY_SIMPLEX,
 52 |                     0.5,
 53 |                     (0, 255, 0),
 54 |                     1,
 55 |                 )
 56 | 
 57 |         # Save the image
 58 |         output_path = debug_dir / f"{page_number}.jpg"
 59 |         cv2.imwrite(str(output_path), debug_image)
 60 | 
 61 |     def _save_debug_box_to_page(self, page: il_version_1.Page):
 62 |         """Save debug boxes and text labels to the PDF page."""
 63 |         if not self.translation_config.debug:
 64 |             return
 65 | 
 66 |         color = GREEN
 67 | 
 68 |         for layout in page.page_layout:
 69 |             # Create a rectangle box
 70 |             rect = il_version_1.PdfRectangle(
 71 |                 box=il_version_1.Box(
 72 |                     x=layout.box.x,
 73 |                     y=layout.box.y,
 74 |                     x2=layout.box.x2,
 75 |                     y2=layout.box.y2,
 76 |                 ),
 77 |                 graphic_state=color,
 78 |                 debug_info=True,
 79 |             )
 80 |             page.pdf_rectangle.append(rect)
 81 | 
 82 |             # Create text label at top-left corner
 83 |             # Note: PDF coordinates are from bottom-left,
 84 |             # so we use y2 for top position
 85 |             style = il_version_1.PdfStyle(
 86 |                 font_id="china-ss",
 87 |                 font_size=4,
 88 |                 graphic_state=color,
 89 |             )
 90 |             page.pdf_paragraph.append(
 91 |                 il_version_1.PdfParagraph(
 92 |                     first_line_indent=False,
 93 |                     box=il_version_1.Box(
 94 |                         x=layout.box.x,
 95 |                         y=layout.box.y2,
 96 |                         x2=layout.box.x2,
 97 |                         y2=layout.box.y2 + 5,
 98 |                     ),
 99 |                     vertical=False,
100 |                     pdf_style=style,
101 |                     unicode=layout.class_name,
102 |                     pdf_paragraph_composition=[
103 |                         il_version_1.PdfParagraphComposition(
104 |                             pdf_same_style_unicode_characters=il_version_1.PdfSameStyleUnicodeCharacters(
105 |                                 unicode=layout.class_name,
106 |                                 pdf_style=style,
107 |                                 debug_info=True,
108 |                             ),
109 |                         ),
110 |                     ],
111 |                     xobj_id=-1,
112 |                 ),
113 |             )
114 | 
115 |     def process(self, docs: il_version_1.Document, mupdf_doc: Document):
116 |         """Generate layouts for all pages that need to be translated."""
117 |         # Get pages that need to be translated
118 |         have_table_pages = {}
119 |         for page in docs.page:
120 |             for layout in page.page_layout:
121 |                 if layout.class_name == "table":
122 |                     have_table_pages[page.page_number] = page
123 |         with self.translation_config.progress_monitor.stage_start(
124 |             self.stage_name,
125 |             len(have_table_pages),
126 |         ) as progress:
127 |             # Process predictions for each page
128 |             for page, layouts in self.model.handle_document(
129 |                 have_table_pages.values(),
130 |                 mupdf_doc,
131 |                 self.translation_config,
132 |                 self._save_debug_image,
133 |             ):
134 |                 page_layouts = []
135 |                 for layout in layouts.boxes:
136 |                     # Convert coordinate system from picture to il
137 |                     # system to the il coordinate system
138 |                     x0, y0, x1, y1 = layout.xyxy
139 |                     pix = mupdf_doc[page.page_number].get_pixmap()
140 |                     h, w = pix.height, pix.width
141 |                     x0, y0, x1, y1 = (
142 |                         np.clip(int(x0 - 1), 0, w - 1),
143 |                         np.clip(int(h - y1 - 1), 0, h - 1),
144 |                         np.clip(int(x1 + 1), 0, w - 1),
145 |                         np.clip(int(h - y0 + 1), 0, h - 1),
146 |                     )
147 |                     page_layout = il_version_1.PageLayout(
148 |                         id=len(page_layouts) + 1,
149 |                         box=il_version_1.Box(
150 |                             x0.item(),
151 |                             y0.item(),
152 |                             x1.item(),
153 |                             y1.item(),
154 |                         ),
155 |                         conf=layout.conf.item(),
156 |                         class_name=layouts.names[layout.cls],
157 |                     )
158 |                     page_layouts.append(page_layout)
159 | 
160 |                 page.page_layout.extend(page_layouts)
161 |                 self._save_debug_box_to_page(page)
162 |                 progress.advance(1)
163 | 
164 |         return docs
165 | 


--------------------------------------------------------------------------------
/babeldoc/document_il/translator/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/funstory-ai/BabelDOC/1c4c25415984777b615eb88f299e88d006ffd16c/babeldoc/document_il/translator/__init__.py


--------------------------------------------------------------------------------
/babeldoc/document_il/translator/cache.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | from pathlib import Path
  3 | 
  4 | from peewee import SQL
  5 | from peewee import AutoField
  6 | from peewee import CharField
  7 | from peewee import Model
  8 | from peewee import SqliteDatabase
  9 | from peewee import TextField
 10 | 
 11 | from babeldoc.const import CACHE_FOLDER
 12 | 
# Deferred initialization: the database object is created without a file
# here and is bound to a concrete path later through db.init() in init_db().
db = SqliteDatabase(None)
 15 | 
 16 | 
class _TranslationCache(Model):
    """Peewee model holding one cached translation per row.

    A row is identified by the combination of engine name, engine parameters
    and original text (see the UNIQUE constraint in ``Meta``).
    """

    id = AutoField()
    # Name of the translation engine (at most 20 characters).
    translate_engine = CharField(max_length=20)
    # Engine parameters as text; TranslationCache passes a JSON string here.
    translate_engine_params = TextField()
    # Text that was sent to the engine.
    original_text = TextField()
    # Translation stored for that text.
    translation = TextField()

    class Meta:
        database = db
        # ON CONFLICT REPLACE: inserting a row whose (engine, params, text)
        # combination already exists replaces the stored row instead of
        # failing, which is what lets TranslationCache.set() use create().
        constraints = [
            SQL(
                """
            UNIQUE (
                translate_engine,
                translate_engine_params,
                original_text
                )
            ON CONFLICT REPLACE
            """,
            ),
        ]
 38 | 
 39 | 
 40 | class TranslationCache:
 41 |     @staticmethod
 42 |     def _sort_dict_recursively(obj):
 43 |         if isinstance(obj, dict):
 44 |             return {
 45 |                 k: TranslationCache._sort_dict_recursively(v)
 46 |                 for k in sorted(obj.keys())
 47 |                 for v in [obj[k]]
 48 |             }
 49 |         elif isinstance(obj, list):
 50 |             return [TranslationCache._sort_dict_recursively(item) for item in obj]
 51 |         return obj
 52 | 
 53 |     def __init__(self, translate_engine: str, translate_engine_params: dict = None):
 54 |         self.translate_engine = translate_engine
 55 |         self.replace_params(translate_engine_params)
 56 | 
 57 |     # The program typically starts multi-threaded translation
 58 |     # only after cache parameters are fully configured,
 59 |     # so thread safety doesn't need to be considered here.
 60 |     def replace_params(self, params: dict = None):
 61 |         if params is None:
 62 |             params = {}
 63 |         self.params = params
 64 |         params = self._sort_dict_recursively(params)
 65 |         self.translate_engine_params = json.dumps(params)
 66 | 
 67 |     def update_params(self, params: dict = None):
 68 |         if params is None:
 69 |             params = {}
 70 |         self.params.update(params)
 71 |         self.replace_params(self.params)
 72 | 
 73 |     def add_params(self, k: str, v):
 74 |         self.params[k] = v
 75 |         self.replace_params(self.params)
 76 | 
 77 |     # Since peewee and the underlying sqlite are thread-safe,
 78 |     # get and set operations don't need locks.
 79 |     def get(self, original_text: str) -> str | None:
 80 |         result = _TranslationCache.get_or_none(
 81 |             translate_engine=self.translate_engine,
 82 |             translate_engine_params=self.translate_engine_params,
 83 |             original_text=original_text,
 84 |         )
 85 |         return result.translation if result else None
 86 | 
 87 |     def set(self, original_text: str, translation: str):
 88 |         _TranslationCache.create(
 89 |             translate_engine=self.translate_engine,
 90 |             translate_engine_params=self.translate_engine_params,
 91 |             original_text=original_text,
 92 |             translation=translation,
 93 |         )
 94 | 
 95 | 
 96 | def init_db(remove_exists=False):
 97 |     CACHE_FOLDER.mkdir(parents=True, exist_ok=True)
 98 |     # The current version does not support database migration, so add the version number to the file name.
 99 |     cache_db_path = CACHE_FOLDER / "cache.v1.db"
100 |     if remove_exists and cache_db_path.exists():
101 |         cache_db_path.unlink()
102 |     db.init(
103 |         cache_db_path,
104 |         pragmas={
105 |             "journal_mode": "wal",
106 |             "busy_timeout": 1000,
107 |         },
108 |     )
109 |     db.create_tables([_TranslationCache], safe=True)
110 | 
111 | 
def init_test_db():
    """Create a throw-away SQLite cache database and bind the model to it.

    Returns:
        The connected test database; pass it to clean_test_db() when done.
    """
    import tempfile

    # delete=False keeps the (empty) file on disk after the handle is closed,
    # so SQLite can open it by name.
    with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as handle:
        cache_db_path = handle.name

    sqlite_pragmas = {
        "journal_mode": "wal",
        "busy_timeout": 1000,
    }
    test_db = SqliteDatabase(cache_db_path, pragmas=sqlite_pragmas)
    models = [_TranslationCache]
    test_db.bind(models, bind_refs=False, bind_backrefs=False)
    test_db.connect()
    test_db.create_tables(models, safe=True)
    return test_db
130 | 
131 | 
def clean_test_db(test_db):
    """Drop the cache table, close *test_db* and delete its files.

    Besides the database file itself, the "-wal" and "-shm" side files are
    removed when they exist.
    """
    test_db.drop_tables([_TranslationCache])
    test_db.close()
    db_path = Path(test_db.database)
    for suffix in ("", "-wal", "-shm"):
        candidate = Path(str(db_path) + suffix)
        if candidate.exists():
            candidate.unlink()
144 | 
145 | 
146 | init_db()
147 | 


--------------------------------------------------------------------------------
/babeldoc/document_il/utils/atomic_integer.py:
--------------------------------------------------------------------------------
 1 | import threading
 2 | 
 3 | 
 4 | class AtomicInteger:
 5 |     def __init__(self, value=0):
 6 |         self._value = int(value)
 7 |         self._lock = threading.Lock()
 8 | 
 9 |     def inc(self, d=1):
10 |         with self._lock:
11 |             self._value += int(d)
12 |             return self._value
13 | 
14 |     def dec(self, d=1):
15 |         return self.inc(-d)
16 | 
17 |     @property
18 |     def value(self):
19 |         with self._lock:
20 |             return self._value
21 | 
22 |     @value.setter
23 |     def value(self, v):
24 |         with self._lock:
25 |             self._value = int(v)
26 |             return self._value
27 | 


--------------------------------------------------------------------------------
/babeldoc/document_il/utils/priority_thread_pool_executor.py:
--------------------------------------------------------------------------------
  1 | # thanks to:
  2 | # https://github.com/oleglpts/PriorityThreadPoolExecutor/blob/master/PriorityThreadPoolExecutor/__init__.py
  3 | # https://github.com/oleglpts/PriorityThreadPoolExecutor/issues/4
  4 | 
  5 | import atexit
  6 | import itertools
  7 | import logging
  8 | import queue
  9 | import random
 10 | import sys
 11 | import threading
 12 | import weakref
 13 | from concurrent.futures import _base
 14 | from concurrent.futures.thread import BrokenThreadPool
 15 | from concurrent.futures.thread import ThreadPoolExecutor
 16 | from concurrent.futures.thread import _python_exit
 17 | from concurrent.futures.thread import _threads_queues
 18 | from concurrent.futures.thread import _WorkItem
 19 | from heapq import heappop
 20 | from heapq import heappush
 21 | 
 22 | logger = logging.getLogger(__name__)
 23 | 
 24 | ########################################################################################################################
 25 | #                                                Global variables                                                      #
 26 | ########################################################################################################################
 27 | 
# Sentinel queued by python_exit() for every worker thread: the largest
# priority number (sys.maxsize) paired with an empty work item.
NULL_ENTRY = (sys.maxsize, _WorkItem(None, None, (), {}))
# Set to True by python_exit(); read by _worker() and by
# PriorityThreadPoolExecutor.submit().
_shutdown = False
 30 | 
 31 | ########################################################################################################################
 32 | #                                           Before system exit procedure                                               #
 33 | ########################################################################################################################
 34 | 
 35 | 
 36 | def python_exit():
 37 |     """
 38 | 
 39 |     Cleanup before system exit
 40 | 
 41 |     """
 42 |     global _shutdown
 43 |     _shutdown = True
 44 |     items = list(_threads_queues.items())
 45 |     for _t, q in items:
 46 |         q.put(NULL_ENTRY)
 47 |     for t, _q in items:
 48 |         t.join()
 49 | 
 50 | 
 51 | # change default cleanup
 52 | 
 53 | 
# Swap the standard library's exit hook for the priority-queue aware one above.
# NOTE(review): newer CPython versions appear to register _python_exit through
# threading._register_atexit rather than atexit, in which case this
# unregister call would have no effect — confirm for the supported versions.
atexit.unregister(_python_exit)
atexit.register(python_exit)
 56 | 
 57 | 
 58 | class PriorityQueue(queue.Queue):
 59 |     """Variant of Queue that retrieves open entries in priority order (lowest first).
 60 | 
 61 |     Entries are typically tuples of the form:  (priority number, data).
 62 |     """
 63 | 
 64 |     REMOVED = "<removed-task>"
 65 |     DEFAULT_PRIORITY = 100
 66 | 
 67 |     def _init(self, maxsize):
 68 |         self.queue = []
 69 |         self.entry_finder = {}
 70 |         self.counter = itertools.count()
 71 | 
 72 |     def _qsize(self):
 73 |         return len(self.queue)
 74 | 
 75 |     def _put(self, item):
 76 |         # heappush(self.queue, item)
 77 |         try:
 78 |             if item[1] in self.entry_finder:
 79 |                 self.remove(item[1])
 80 |             count = next(self.counter)
 81 |             entry = [item[0], count, item[1]]
 82 |             self.entry_finder[item[1]] = entry
 83 |             heappush(self.queue, entry)
 84 |         except TypeError:  # handle item==None
 85 |             self._put((self.DEFAULT_PRIORITY, None))
 86 | 
 87 |     def remove(self, task):
 88 |         """
 89 |         This simply replaces the data with the REMOVED value,
 90 |         which will get cleared out once _get reaches it.
 91 |         """
 92 |         entry = self.entry_finder.pop(task)
 93 |         entry[-1] = self.REMOVED
 94 | 
 95 |     def _get(self):
 96 |         while self.queue:
 97 |             entry = heappop(self.queue)
 98 |             if entry[2] is not self.REMOVED:
 99 |                 del self.entry_finder[entry[2]]
100 |                 return entry
101 |         return None
102 | 
103 | 
def _worker(executor_reference, work_queue, initializer, initargs):
    """Thread target that runs queued work items until told to stop.

    Args:
        executor_reference: Callable returning the owning executor, or None
            once it is gone (a weak reference in _adjust_thread_count).
        work_queue: Queue whose ``get`` yields entries indexed as
            ``entry[2]`` for the work item; an entry whose item is None is
            treated as a wake-up signal.
        initializer: Optional callable run once before the loop starts.
        initargs: Positional arguments for *initializer*.
    """
    if initializer is not None:
        try:
            initializer(*initargs)
        except BaseException:
            _base.LOGGER.critical("Exception in initializer:", exc_info=True)
            executor = executor_reference()
            if executor is not None:
                executor._initializer_failed()
            return
    try:
        while True:
            work_item = work_queue.get(block=True)
            try:
                # work_item is a queue entry; index 2 holds the actual item.
                if work_item[2] is not None:
                    work_item[2].run()
                    # Delete references to object. See issue16284
                    del work_item

                    # attempt to increment idle count
                    executor = executor_reference()
                    if executor is not None:
                        executor._idle_semaphore.release()
                    del executor
                    continue

                # A None item is a wake-up: decide whether this worker exits.
                executor = executor_reference()
                # Exit if:
                #   - The interpreter is shutting down OR
                #   - The executor that owns the worker has been collected OR
                #   - The executor that owns the worker has been shutdown.
                if _shutdown or executor is None or executor._shutdown:
                    # Flag the executor as shutting down as early as possible if it
                    # is not gc-ed yet.
                    if executor is not None:
                        executor._shutdown = True
                    # Notice other workers
                    work_queue.put(None)
                    return
                del executor
            finally:
                # Runs for every entry fetched above, including on return and
                # continue, so that work_queue.join() can complete.
                work_queue.task_done()
    except BaseException:
        _base.LOGGER.critical("Exception in worker", exc_info=True)
148 | 
149 | 
class PriorityThreadPoolExecutor(ThreadPoolExecutor):
    """
    Thread pool executor with priority queue (priorities must be different, lowest first)
    """

    def __init__(self, *args, **kwargs):
        """Accept the same arguments as ThreadPoolExecutor."""
        super().__init__(*args, **kwargs)

        # change work queue type to queue.PriorityQueue
        self._work_queue: PriorityQueue = PriorityQueue()

    def submit(self, fn, *args, **kwargs):
        """

        Sending the function to the execution queue

        :param fn: function being executed
        :type fn: callable
        :param args: function's positional arguments
        :param kwargs: function's keywords arguments
        :return: future instance
        :rtype: _base.Future

        Added keyword:

        - priority (integer below sys.maxsize, lowest runs first); it is
          removed from kwargs before fn is called and defaults to a random
          value when omitted

        :raises BrokenThreadPool: if the pool is marked as broken
        :raises RuntimeError: if the executor or the interpreter is shutting down
        """
        with self._shutdown_lock:
            if self._broken:
                raise BrokenThreadPool(self._broken)

            if self._shutdown:
                raise RuntimeError("cannot schedule new futures after shutdown")
            if _shutdown:
                raise RuntimeError(
                    "cannot schedule new futures after interpreter shutdown"
                )

            priority = kwargs.pop("priority", random.randint(0, sys.maxsize - 1))  # noqa: S311

            f = _base.Future()
            w = _WorkItem(f, fn, args, kwargs)

            self._work_queue.put((priority, w))
            self._adjust_thread_count()
            return f

    def _adjust_thread_count(self):
        """Start one more worker thread unless an idle one exists or the pool is full."""
        # if idle threads are available, don't spin new threads
        if self._idle_semaphore.acquire(timeout=0):
            return

        # When the executor gets lost, the weakref callback will wake up
        # the worker threads.
        def weakref_cb(_, q=self._work_queue):
            q.put(None)

        num_threads = len(self._threads)
        if num_threads < self._max_workers:
            thread_name = f"{self._thread_name_prefix or self}_{num_threads:d}"
            t = threading.Thread(
                name=thread_name,
                target=_worker,
                args=(
                    weakref.ref(self, weakref_cb),
                    self._work_queue,
                    self._initializer,
                    self._initargs,
                ),
            )
            t.start()
            self._threads.add(t)
            _threads_queues[t] = self._work_queue

    def shutdown(self, wait=True, *, cancel_futures=False):
        """Stop the executor.

        :param wait: when true, first block until every queued task has been
            processed, and finally join all worker threads
        :param cancel_futures: when true, cancel the futures of work items
            that are still queued at the time the executor is flagged as
            shut down
        """
        logger.debug("Shutting down executor %s", self._thread_name_prefix or self)
        if wait:
            logger.debug(
                "Waiting for all tasks done %s", self._thread_name_prefix or self
            )
            self._work_queue.join()
            logger.debug("All tasks done %s", self._thread_name_prefix or self)

        with self._shutdown_lock:
            self._shutdown = True
            if cancel_futures:
                # Drain all work items from the queue, and then cancel their
                # associated futures.
                while True:
                    try:
                        entry = self._work_queue.get_nowait()
                    except queue.Empty:
                        break
                    # The queue yields [priority, sequence, item] entries (or
                    # None when only removed placeholders were left); the
                    # item itself is None for wake-up signals.
                    work_item = entry[2] if entry is not None else None
                    if work_item is not None and work_item.future is not None:
                        work_item.future.cancel()
                    # Account for the drained entry so that a later join()
                    # on the queue cannot block on it forever.
                    self._work_queue.task_done()

            # Send a wake-up to prevent threads calling
            # _work_queue.get(block=True) from permanently blocking.
            self._work_queue.put(None)
        if wait:
            logger.debug(
                "Waiting for all thread done %s", self._thread_name_prefix or self
            )
            for t in self._threads:
                # One wake-up per thread before joining it.
                self._work_queue.put(None)
                t.join()
        logger.debug("shutdown finish %s", self._thread_name_prefix or self)
260 | 


--------------------------------------------------------------------------------
/babeldoc/document_il/utils/style_helper.py:
--------------------------------------------------------------------------------
 1 | from babeldoc.document_il import il_version_1
 2 | 
 3 | 
 4 | def create_pdf_style(r, g, b, font_id="china-ss", font_size=6):
 5 |     """
 6 |     Create a PdfStyle object from RGB values.
 7 | 
 8 |     Args:
 9 |         r: Red component in range 0-255
10 |         g: Green component in range 0-255
11 |         b: Blue component in range 0-255
12 |         font_id: Font identifier
13 |         font_size: Font size
14 | 
15 |     Returns:
16 |         PdfStyle object with the specified color
17 |     """
18 |     r, g, b = [x / 255.0 for x in (r, g, b)]
19 |     return il_version_1.PdfStyle(
20 |         font_id=font_id,
21 |         font_size=font_size,
22 |         graphic_state=il_version_1.GraphicState(
23 |             passthrough_per_char_instruction=f"{r:.10f} {g:.10f} {b:.10f} rg",
24 |         ),
25 |     )
26 | 
27 | 
# Predefined graphic states. Each instruction string sets the same colour
# twice: once with the lower-case operator (g / rg) and once with the
# upper-case one (G / RG) — in PDF these are the non-stroking and stroking
# colour operators respectively.
BLACK = il_version_1.GraphicState(passthrough_per_char_instruction="0 g 0 G")

WHITE = il_version_1.GraphicState(passthrough_per_char_instruction="1 g 1 G")

# Colour palette; the comment on each entry gives the equivalent 8-bit RGB
# triple (component * 255).
# RGB (255, 59, 48)
RED = il_version_1.GraphicState(
    passthrough_per_char_instruction="1.0000000000 0.2313725490 0.1882352941 rg "
    "1.0000000000 0.2313725490 0.1882352941 RG",
)

# RGB (255, 149, 0)
ORANGE = il_version_1.GraphicState(
    passthrough_per_char_instruction="1.0000000000 0.5843137255 0.0000000000 rg "
    "1.0000000000 0.5843137255 0.0000000000 RG",
)
# RGB (255, 204, 0)
YELLOW = il_version_1.GraphicState(
    passthrough_per_char_instruction="1.0000000000 0.8000000000 0.0000000000 rg "
    "1.0000000000 0.8000000000 0.0000000000 RG",
)

# RGB (52, 199, 89)
GREEN = il_version_1.GraphicState(
    passthrough_per_char_instruction="0.2039215686 0.7803921569 0.3490196078 rg "
    "0.2039215686 0.7803921569 0.3490196078 RG",
)

# RGB (0, 199, 190)
MINT = il_version_1.GraphicState(
    passthrough_per_char_instruction="0.0000000000 0.7803921569 0.7450980392 rg "
    "0.0000000000 0.7803921569 0.7450980392 RG",
)

# RGB (48, 176, 199)
TEAL = il_version_1.GraphicState(
    passthrough_per_char_instruction="0.1882352941 0.6901960784 0.7803921569 rg "
    "0.1882352941 0.6901960784 0.7803921569 RG",
)

# RGB (50, 173, 230)
CYAN = il_version_1.GraphicState(
    passthrough_per_char_instruction="0.1960784314 0.6784313725 0.9019607843 rg "
    "0.1960784314 0.6784313725 0.9019607843 RG",
)

# RGB (0, 122, 255)
BLUE = il_version_1.GraphicState(
    passthrough_per_char_instruction="0.0000000000 0.4784313725 1.0000000000 rg "
    "0.0000000000 0.4784313725 1.0000000000 RG",
)

# RGB (88, 86, 214)
INDIGO = il_version_1.GraphicState(
    passthrough_per_char_instruction="0.3450980392 0.3372549020 0.8392156863 rg "
    "0.3450980392 0.3372549020 0.8392156863 RG",
)

# RGB (175, 82, 222)
PURPLE = il_version_1.GraphicState(
    passthrough_per_char_instruction="0.6862745098 0.3215686275 0.8705882353 rg "
    "0.6862745098 0.3215686275 0.8705882353 RG",
)

# RGB (255, 45, 85)
PINK = il_version_1.GraphicState(
    passthrough_per_char_instruction="1.0000000000 0.1764705882 0.3333333333 rg "
    "1.0000000000 0.1764705882 0.3333333333 RG",
)

# RGB (162, 132, 94)
BROWN = il_version_1.GraphicState(
    passthrough_per_char_instruction="0.6352941176 0.5176470588 0.3686274510 rg "
    "0.6352941176 0.5176470588 0.3686274510 RG",
)
91 | 


--------------------------------------------------------------------------------
/babeldoc/document_il/xml_converter.py:
--------------------------------------------------------------------------------
 1 | import copy
 2 | from pathlib import Path
 3 | 
 4 | import orjson
 5 | from xsdata.formats.dataclass.context import XmlContext
 6 | from xsdata.formats.dataclass.parsers import XmlParser
 7 | from xsdata.formats.dataclass.serializers import XmlSerializer
 8 | from xsdata.formats.dataclass.serializers.config import SerializerConfig
 9 | 
10 | from babeldoc.document_il import il_version_1
11 | 
12 | 
class XMLConverter:
    """Convert IL documents to and from XML, and export them as JSON."""

    def __init__(self):
        """Create the XML parser and a serializer that indents with two spaces."""
        self.parser = XmlParser()
        config = SerializerConfig(indent="  ")
        context = XmlContext()
        self.serializer = XmlSerializer(context=context, config=config)

    def write_xml(self, document: il_version_1.Document, path: str):
        """Serialize *document* to XML and write it to *path* as UTF-8."""
        # Render before opening the file: opening in "w" mode truncates it, so
        # a serialization error must not leave an empty file behind.
        xml_text = self.to_xml(document)
        with Path(path).open("w", encoding="utf-8") as f:
            f.write(xml_text)

    def read_xml(self, path: str) -> il_version_1.Document:
        """Parse the UTF-8 XML file at *path* into a Document."""
        with Path(path).open(encoding="utf-8") as f:
            return self.from_xml(f.read())

    def to_xml(self, document: il_version_1.Document) -> str:
        """Return *document* rendered as an XML string."""
        return self.serializer.render(document)

    def from_xml(self, xml: str) -> il_version_1.Document:
        """Parse the XML string *xml* into a Document."""
        return self.parser.from_string(
            xml,
            il_version_1.Document,
        )

    def deepcopy(self, document: il_version_1.Document) -> il_version_1.Document:
        """Return an independent deep copy of *document* (via copy.deepcopy)."""
        return copy.deepcopy(document)

    def to_json(self, document: il_version_1.Document) -> str:
        """Return *document* as JSON text: 2-space indent, sorted keys, trailing newline."""
        return orjson.dumps(
            document,
            option=orjson.OPT_APPEND_NEWLINE
            | orjson.OPT_INDENT_2
            | orjson.OPT_SORT_KEYS,
        ).decode()

    def write_json(self, document: il_version_1.Document, path: str):
        """Serialize *document* to JSON and write it to *path* as UTF-8."""
        # Same ordering as write_xml: serialize first, then open (truncate).
        json_text = self.to_json(document)
        with Path(path).open("w", encoding="utf-8") as f:
            f.write(json_text)
52 | 


--------------------------------------------------------------------------------
/babeldoc/docvision/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/funstory-ai/BabelDOC/1c4c25415984777b615eb88f299e88d006ffd16c/babeldoc/docvision/README.md


--------------------------------------------------------------------------------
/babeldoc/docvision/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/funstory-ai/BabelDOC/1c4c25415984777b615eb88f299e88d006ffd16c/babeldoc/docvision/__init__.py


--------------------------------------------------------------------------------
/babeldoc/docvision/doclayout.py:
--------------------------------------------------------------------------------
  1 | import abc
  2 | import ast
  3 | import logging
  4 | import platform
  5 | import re
  6 | import threading
  7 | from collections.abc import Generator
  8 | 
  9 | import cv2
 10 | import numpy as np
 11 | 
 12 | try:
 13 |     import onnx
 14 |     import onnxruntime
 15 | except ImportError as e:
 16 |     if "DLL load failed" in str(e):
 17 |         raise OSError(
 18 |             "Microsoft Visual C++ Redistributable is not installed. "
 19 |             "Download it at https://aka.ms/vs/17/release/vc_redist.x64.exe"
 20 |         ) from e
 21 |     raise
 22 | import pymupdf
 23 | 
 24 | import babeldoc.document_il.il_version_1
 25 | from babeldoc.assets.assets import get_doclayout_onnx_model_path
 26 | 
 27 | # from huggingface_hub import hf_hub_download
 28 | 
 29 | logger = logging.getLogger(__name__)
 30 | 
 31 | 
 32 | class YoloResult:
 33 |     """Helper class to store detection results from ONNX model."""
 34 | 
 35 |     def __init__(self, names, boxes=None, boxes_data=None):
 36 |         if boxes is not None:
 37 |             self.boxes = boxes
 38 |         else:
 39 |             assert boxes_data is not None
 40 |             self.boxes = [YoloBox(data=d) for d in boxes_data]
 41 |         self.boxes.sort(key=lambda x: x.conf, reverse=True)
 42 |         self.names = names
 43 | 
 44 | 
 45 | class DocLayoutModel(abc.ABC):
 46 |     @staticmethod
 47 |     def load_onnx():
 48 |         logger.info("Loading ONNX model...")
 49 |         model = OnnxModel.from_pretrained()
 50 |         return model
 51 | 
 52 |     @staticmethod
 53 |     def load_available():
 54 |         return DocLayoutModel.load_onnx()
 55 | 
 56 |     @property
 57 |     @abc.abstractmethod
 58 |     def stride(self) -> int:
 59 |         """Stride of the model input."""
 60 | 
 61 |     @abc.abstractmethod
 62 |     def predict(self, image: bytes, imgsz: int = 1024, **kwargs) -> list[int]:
 63 |         """
 64 |         Predict the layout of a document page.
 65 | 
 66 |         Args:
 67 |             image: The image of the document page.
 68 |             imgsz: Resize the image to this size. Must be a multiple of the stride.
 69 |             **kwargs: Additional arguments.
 70 |         """
 71 | 
 72 |     @abc.abstractmethod
 73 |     def handle_document(
 74 |         self,
 75 |         pages: list[babeldoc.document_il.il_version_1.Page],
 76 |         mupdf_doc: pymupdf.Document,
 77 |         translate_config,
 78 |         save_debug_image,
 79 |     ) -> Generator[
 80 |         tuple[babeldoc.document_il.il_version_1.Page, YoloResult], None, None
 81 |     ]:
 82 |         """
 83 |         Handle a document.
 84 |         """
 85 | 
 86 | 
 87 | class YoloBox:
 88 |     """Helper class to store detection results from ONNX model."""
 89 | 
 90 |     def __init__(self, data=None, xyxy=None, conf=None, cls=None):
 91 |         if data is not None:
 92 |             self.xyxy = data[:4]
 93 |             self.conf = data[-2]
 94 |             self.cls = data[-1]
 95 |             return
 96 |         assert xyxy is not None and conf is not None and cls is not None
 97 |         self.xyxy = xyxy
 98 |         self.conf = conf
 99 |         self.cls = cls
100 | 
101 | 
# Detect the operating system type
os_name = platform.system()
104 | 
105 | 
class OnnxModel(DocLayoutModel):
    """Layout detection model executed through an ONNX Runtime session."""

    def __init__(self, model_path: str):
        """Load the ONNX file at *model_path* and create the inference session.

        ``stride`` and ``names`` are read from the model's metadata
        properties. Only execution providers whose name starts with "dml",
        "cuda" or "cpu" (case-insensitive) are passed to ONNX Runtime.
        """
        self.model_path = model_path

        model = onnx.load(model_path)
        metadata = {d.key: d.value for d in model.metadata_props}
        self._stride = ast.literal_eval(metadata["stride"])
        self._names = ast.literal_eval(metadata["names"])
        providers = []

        available_providers = onnxruntime.get_available_providers()
        for provider in available_providers:
            if re.match(r"dml|cuda|cpu", provider, re.IGNORECASE):
                logger.info(f"Available Provider: {provider}")
                providers.append(provider)
        self.model = onnxruntime.InferenceSession(
            model.SerializeToString(),
            providers=providers,
        )
        # Guards page rendering in handle_document().
        self.lock = threading.Lock()

    @staticmethod
    def from_pretrained():
        """Create the model from the path returned by get_doclayout_onnx_model_path()."""
        pth = get_doclayout_onnx_model_path()
        return OnnxModel(pth)

    @property
    def stride(self):
        """Stride of the model input, as stored in the model metadata."""
        return self._stride

    def resize_and_pad_image(self, image, new_shape):
        """
        Resize the image to fit inside the target size (keeping the aspect
        ratio), then pad it with gray so its dimensions align to the model stride.

        Parameters:
        - image: Input image
        - new_shape: Target size (integer or (height, width) tuple)

        Returns:
        - Processed image
        """
        if isinstance(new_shape, int):
            new_shape = (new_shape, new_shape)

        h, w = image.shape[:2]
        new_h, new_w = new_shape

        # Calculate scaling ratio
        r = min(new_h / h, new_w / w)
        resized_h, resized_w = int(round(h * r)), int(round(w * r))

        # Resize image
        image = cv2.resize(
            image,
            (resized_w, resized_h),
            interpolation=cv2.INTER_LINEAR,
        )

        # Calculate padding size and align to stride multiple
        pad_w = (new_w - resized_w) % self.stride
        pad_h = (new_h - resized_h) % self.stride
        # Split the padding between both sides; the odd pixel goes bottom/right.
        top, bottom = pad_h // 2, pad_h - pad_h // 2
        left, right = pad_w // 2, pad_w - pad_w // 2

        # Add padding
        image = cv2.copyMakeBorder(
            image,
            top,
            bottom,
            left,
            right,
            cv2.BORDER_CONSTANT,
            value=(114, 114, 114),
        )

        return image

    def scale_boxes(self, img1_shape, boxes, img0_shape):
        """
        Rescales bounding boxes (in the format of xyxy by default) from the shape of the image they were originally
        specified in (img1_shape) to the shape of a different image (img0_shape).

        Args:
            img1_shape (tuple): The shape of the image that the bounding boxes are for,
                in the format of (height, width).
            boxes (np.ndarray): the bounding boxes of the objects in the image, in the format of (x1, y1, x2, y2).
                The array is modified in place.
            img0_shape (tuple): the shape of the target image, in the format of (height, width).

        Returns:
            boxes (np.ndarray): The scaled bounding boxes, in the format of (x1, y1, x2, y2)
        """

        # Calculate scaling ratio
        gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])

        # Calculate padding size
        pad_x = round((img1_shape[1] - img0_shape[1] * gain) / 2 - 0.1)
        pad_y = round((img1_shape[0] - img0_shape[0] * gain) / 2 - 0.1)

        # Remove padding and scale boxes
        boxes[..., :4] = (boxes[..., :4] - [pad_x, pad_y, pad_x, pad_y]) / gain
        return boxes

    def predict(self, image, imgsz=800, batch_size=16, **kwargs):
        """
        Predict the layout of document pages.

        Args:
            image: A single image or a list of images of document pages.
            imgsz: Accepted for interface compatibility; currently ignored,
                images are always resized to 1024.
            batch_size: Accepted for interface compatibility; currently
                ignored, images are always processed one at a time.
            **kwargs: Additional arguments.

        Returns:
            A list of YoloResult objects, one for each input image.
        """
        # Handle single image input
        if isinstance(image, np.ndarray) and len(image.shape) == 3:
            image = [image]

        total_images = len(image)
        results = []
        # The batch size is pinned to 1 regardless of the argument.
        batch_size = 1

        # Process images in batches
        for i in range(0, total_images, batch_size):
            batch_images = image[i : i + batch_size]
            batch_size_actual = len(batch_images)

            # The target size is fixed regardless of the imgsz argument.
            target_imgsz = 1024

            # Preprocess batch
            processed_batch = []
            orig_shapes = []
            for img in batch_images:
                orig_h, orig_w = img.shape[:2]
                orig_shapes.append((orig_h, orig_w))

                pix = self.resize_and_pad_image(img, new_shape=target_imgsz)
                pix = np.transpose(pix, (2, 0, 1))  # CHW
                pix = pix.astype(np.float32) / 255.0  # Normalize to [0, 1]
                processed_batch.append(pix)

            # Stack batch
            batch_input = np.stack(processed_batch, axis=0)  # BCHW
            new_h, new_w = batch_input.shape[2:]

            # Run inference
            batch_preds = self.model.run(None, {"images": batch_input})[0]

            # Process each prediction in the batch
            for j in range(batch_size_actual):
                preds = batch_preds[j]
                # Keep rows whose column 4 exceeds 0.25 (presumably the
                # confidence score — YoloBox reads it as data[-2]; confirm).
                preds = preds[preds[..., 4] > 0.25]
                if len(preds) > 0:
                    # Map boxes from the padded model input back to the
                    # original image size.
                    preds[..., :4] = self.scale_boxes(
                        (new_h, new_w),
                        preds[..., :4],
                        orig_shapes[j],
                    )
                results.append(YoloResult(boxes_data=preds, names=self._names))

        return results

    def handle_document(
        self,
        pages: list[babeldoc.document_il.il_version_1.Page],
        mupdf_doc: pymupdf.Document,
        translate_config,
        save_debug_image,
    ) -> Generator[
        tuple[babeldoc.document_il.il_version_1.Page, YoloResult], None, None
    ]:
        """Render each page at 72 dpi, run layout prediction and yield the result.

        Args:
            pages: Pages to process; ``page_number`` indexes into *mupdf_doc*.
            mupdf_doc: Document the page images are rendered from.
            translate_config: Its ``raise_if_cancelled()`` is called before
                every page.
            save_debug_image: Callback invoked with the image, the prediction
                and the 1-based page number.

        Yields:
            ``(page, YoloResult)`` for every page, in order.
        """
        for page in pages:
            translate_config.raise_if_cancelled()
            with self.lock:
                pix = mupdf_doc[page.page_number].get_pixmap(dpi=72)
            # np.fromstring's binary mode is deprecated; frombuffer is the
            # documented replacement. The explicit copy keeps the array
            # writable and independent of the pixmap buffer, as before.
            image = (
                np.frombuffer(pix.samples, dtype=np.uint8)
                .copy()
                .reshape(
                    pix.height,
                    pix.width,
                    3,
                )[:, :, ::-1]  # reverse the channel order
            )
            predict_result = self.predict(image)[0]
            save_debug_image(
                image,
                predict_result,
                page.page_number + 1,
            )
            yield page, predict_result
298 | 


--------------------------------------------------------------------------------
/babeldoc/format/office/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/funstory-ai/BabelDOC/1c4c25415984777b615eb88f299e88d006ffd16c/babeldoc/format/office/__init__.py


--------------------------------------------------------------------------------
/babeldoc/result_merger.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | from pathlib import Path
  3 | 
  4 | from pymupdf import Document
  5 | 
  6 | from babeldoc.document_il.backend.pdf_creater import PDFCreater
  7 | from babeldoc.translation_config import TranslateResult
  8 | from babeldoc.translation_config import TranslationConfig
  9 | 
 10 | logger = logging.getLogger(__name__)
 11 | 
 12 | 
 13 | class ResultMerger:
 14 |     """Handles merging of split translation results"""
 15 | 
 16 |     def __init__(self, translation_config: TranslationConfig):
 17 |         self.config = translation_config
 18 | 
 19 |     def merge_results(self, results: dict[int, TranslateResult]) -> TranslateResult:
 20 |         """Merge multiple translation results into one"""
 21 |         if not results:
 22 |             raise ValueError("No results to merge")
 23 | 
 24 |         basename = Path(self.config.input_file).stem
 25 |         debug_suffix = ".debug" if self.config.debug else ""
 26 | 
 27 |         mono_file_name = f"{basename}{debug_suffix}.{self.config.lang_out}.mono.pdf"
 28 |         dual_file_name = f"{basename}{debug_suffix}.{self.config.lang_out}.dual.pdf"
 29 | 
 30 |         debug_suffix += ".no_watermark"
 31 | 
 32 |         mono_file_name_no_watermark = (
 33 |             f"{basename}{debug_suffix}.{self.config.lang_out}.mono.pdf"
 34 |         )
 35 |         dual_file_name_no_watermark = (
 36 |             f"{basename}{debug_suffix}.{self.config.lang_out}.dual.pdf"
 37 |         )
 38 | 
 39 |         # Sort results by part index
 40 |         sorted_results = dict(sorted(results.items()))
 41 |         first_result = next(iter(sorted_results.values()))
 42 | 
 43 |         # Initialize paths for merged files
 44 |         merged_mono_path = None
 45 |         merged_dual_path = None
 46 |         merged_no_watermark_mono_path = None
 47 |         merged_no_watermark_dual_path = None
 48 | 
 49 |         # Merge monolingual PDFs if they exist
 50 |         if any(r.mono_pdf_path for r in results.values()):
 51 |             merged_mono_path = self._merge_pdfs(
 52 |                 [r.mono_pdf_path for r in sorted_results.values() if r.mono_pdf_path],
 53 |                 mono_file_name,
 54 |                 tag="merged_mono",
 55 |             )
 56 | 
 57 |         # Merge dual-language PDFs if they exist
 58 |         if any(r.dual_pdf_path for r in results.values()):
 59 |             merged_dual_path = self._merge_pdfs(
 60 |                 [r.dual_pdf_path for r in sorted_results.values() if r.dual_pdf_path],
 61 |                 dual_file_name,
 62 |                 tag="merged_dual",
 63 |             )
 64 | 
 65 |         if any(
 66 |             r.dual_pdf_path != r.no_watermark_dual_pdf_path
 67 |             or r.mono_pdf_path != r.no_watermark_mono_pdf_path
 68 |             for r in results.values()
 69 |         ):
 70 |             # Merge no-watermark PDFs if they exist
 71 |             if any(r.no_watermark_mono_pdf_path for r in results.values()):
 72 |                 merged_no_watermark_mono_path = self._merge_pdfs(
 73 |                     [
 74 |                         r.no_watermark_mono_pdf_path
 75 |                         for r in sorted_results.values()
 76 |                         if r.no_watermark_mono_pdf_path
 77 |                     ],
 78 |                     mono_file_name_no_watermark,
 79 |                     tag="merged_no_watermark_mono",
 80 |                 )
 81 | 
 82 |             if any(r.no_watermark_dual_pdf_path for r in results.values()):
 83 |                 merged_no_watermark_dual_path = self._merge_pdfs(
 84 |                     [
 85 |                         r.no_watermark_dual_pdf_path
 86 |                         for r in sorted_results.values()
 87 |                         if r.no_watermark_dual_pdf_path
 88 |                     ],
 89 |                     "merged_no_watermark_dual.pdf",
 90 |                     tag="merged_no_watermark_dual",
 91 |                 )
 92 | 
 93 |         # Create merged result
 94 |         merged_result = TranslateResult(
 95 |             mono_pdf_path=merged_mono_path,
 96 |             dual_pdf_path=merged_dual_path,
 97 |         )
 98 |         merged_result.no_watermark_mono_pdf_path = merged_no_watermark_mono_path
 99 |         merged_result.no_watermark_dual_pdf_path = merged_no_watermark_dual_path
100 | 
101 |         if merged_result.no_watermark_mono_pdf_path is None:
102 |             merged_result.no_watermark_mono_pdf_path = merged_mono_path
103 |         elif merged_result.mono_pdf_path is None:
104 |             merged_result.mono_pdf_path = merged_no_watermark_mono_path
105 | 
106 |         if merged_result.no_watermark_dual_pdf_path is None:
107 |             merged_result.no_watermark_dual_pdf_path = merged_dual_path
108 |         elif merged_result.dual_pdf_path is None:
109 |             merged_result.dual_pdf_path = merged_no_watermark_dual_path
110 | 
111 |         # Calculate total time
112 |         total_time = sum(
113 |             r.total_seconds for r in results.values() if hasattr(r, "total_seconds")
114 |         )
115 |         merged_result.total_seconds = total_time
116 | 
117 |         return merged_result
118 | 
119 |     def _merge_pdfs(
120 |         self, pdf_paths: list[str | Path], output_name: str, tag: str
121 |     ) -> Path:
122 |         """Merge multiple PDFs into one"""
123 |         if not pdf_paths:
124 |             return None
125 | 
126 |         output_path = self.config.get_output_file_path(output_name)
127 |         merged_doc = Document()
128 | 
129 |         for pdf_path in pdf_paths:
130 |             doc = Document(str(pdf_path))
131 |             merged_doc.insert_pdf(doc)
132 | 
133 |         merged_doc = PDFCreater.subset_fonts_in_subprocess(
134 |             merged_doc, self.config, tag=tag
135 |         )
136 |         PDFCreater.save_pdf_with_timeout(
137 |             merged_doc, str(output_path), translation_config=self.config
138 |         )
139 | 
140 |         return output_path
141 | 


--------------------------------------------------------------------------------
/babeldoc/split_manager.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | from dataclasses import dataclass
 3 | 
 4 | logger = logging.getLogger(__name__)
 5 | 
 6 | 
 7 | @dataclass
 8 | class SplitPoint:
 9 |     """Represents a point where the document should be split"""
10 | 
11 |     start_page: int
12 |     end_page: int
13 |     estimated_complexity: float = 1.0
14 |     chapter_title: str | None = None
15 | 
16 | 
class BaseSplitStrategy:
    """Interface every split strategy implements."""

    def determine_split_points(self, config) -> list[SplitPoint]:
        """Return the parts to split the document into.

        Subclasses must override this; the base implementation always raises
        ``NotImplementedError``.
        """
        raise NotImplementedError
22 | 
23 | 
class PageCountStrategy(BaseSplitStrategy):
    """Split document based on page count.

    The document is cut into consecutive parts of at most
    ``max_pages_per_part`` pages each; the last part may be shorter.
    """

    def __init__(self, max_pages_per_part: int = 20):
        """Remember the maximum number of pages allowed in one part."""
        self.max_pages_per_part = max_pages_per_part

    def determine_split_points(self, config) -> list[SplitPoint]:
        """Compute the page ranges for ``config.input_file``.

        Args:
            config: Object exposing ``input_file``, the path of the PDF to split.

        Returns:
            Split points covering pages ``0 .. page_count - 1`` in order, with
            inclusive ``end_page`` values. Empty when the document has no pages.

        Raises:
            ValueError: If ``max_pages_per_part`` is smaller than 1. Such a
                value could never advance through the document.
        """
        from pymupdf import Document

        pages_per_part = self.max_pages_per_part
        if pages_per_part < 1:
            raise ValueError(
                f"max_pages_per_part must be at least 1, got {pages_per_part}"
            )

        # Only the page count is needed, so close the document right away.
        doc = Document(str(config.input_file))
        try:
            total_pages = doc.page_count
        finally:
            doc.close()

        return [
            SplitPoint(
                start_page=start,
                # end_page is inclusive
                end_page=min(start + pages_per_part, total_pages) - 1,
            )
            for start in range(0, total_pages, pages_per_part)
        ]
50 | 
51 | 
class SplitManager:
    """Manages document splitting process."""

    def __init__(self, config=None):
        """Pick up the split strategy from ``config``.

        Args:
            config: Object exposing ``split_strategy``. May be omitted, in
                which case the strategy is resolved later from the config
                passed to ``determine_split_points``.
        """
        # The signature advertises config=None, so do not dereference None.
        self.strategy = None if config is None else config.split_strategy

    def determine_split_points(self, config) -> list[SplitPoint]:
        """Determine where to split the document.

        Args:
            config: Translation config handed to the strategy.

        Returns:
            The split points produced by the active strategy.

        Raises:
            ValueError: If neither the manager nor ``config`` provides a strategy.
        """
        strategy = self.strategy
        if strategy is None:
            # No strategy at construction time: fall back to the one carried
            # by the config for this call.
            strategy = getattr(config, "split_strategy", None)
        if strategy is None:
            raise ValueError("No split strategy configured")
        return strategy.determine_split_points(config)

    def estimate_part_complexity(self, split_point: SplitPoint) -> float:
        """Estimate the complexity of a document part.

        Currently this is the number of pages in the part (both ends inclusive)
        multiplied by ``split_point.estimated_complexity``.
        """
        page_count = split_point.end_page - split_point.start_page + 1
        return page_count * split_point.estimated_complexity


--------------------------------------------------------------------------------
/babeldoc/tools/generate_font_metadata.py:
--------------------------------------------------------------------------------
  1 | # This script is used to automatically generate the following files:
  2 | # https://github.com/funstory-ai/BabelDOC-Assets/blob/main/font_metadata.json
  3 | 
  4 | 
  5 | import argparse
  6 | import hashlib
  7 | import io
  8 | import logging
  9 | from pathlib import Path
 10 | 
 11 | import babeldoc.high_level
 12 | import babeldoc.translation_config
 13 | import orjson
 14 | import pymupdf
 15 | from babeldoc.document_il import PdfFont
 16 | from rich.logging import RichHandler
 17 | 
 18 | logger = logging.getLogger(__name__)
 19 | 
 20 | 
 21 | def get_font_metadata(font_path) -> PdfFont:
 22 |     doc = pymupdf.open()
 23 |     page = doc.new_page(width=1000, height=1000)
 24 |     page.insert_font("test_font", font_path)
 25 |     translation_config = babeldoc.translation_config.TranslationConfig(
 26 |         *[None for _ in range(4)], doc_layout_model=1
 27 |     )
 28 |     translation_config.progress_monitor = babeldoc.high_level.ProgressMonitor(
 29 |         babeldoc.high_level.TRANSLATE_STAGES
 30 |     )
 31 |     translation_config.font = font_path
 32 |     il_creater = babeldoc.high_level.ILCreater(translation_config)
 33 |     il_creater.mupdf = doc
 34 |     buffer = io.BytesIO()
 35 |     doc.save(buffer)
 36 |     babeldoc.high_level.start_parse_il(
 37 |         buffer,
 38 |         doc_zh=doc,
 39 |         resfont="test_font",
 40 |         il_creater=il_creater,
 41 |         translation_config=translation_config,
 42 |     )
 43 | 
 44 |     il = il_creater.create_il()
 45 |     il_page = il.page[0]
 46 |     font_metadata = il_page.pdf_font[0]
 47 |     return font_metadata
 48 | 
 49 | 
 50 | def main():
 51 |     logging.basicConfig(level=logging.INFO, handlers=[RichHandler()])
 52 |     parser = argparse.ArgumentParser(description="Get font metadata.")
 53 |     parser.add_argument("assets_repo_path", type=str, help="Path to the font file.")
 54 |     args = parser.parse_args()
 55 |     repo_path = Path(args.assets_repo_path)
 56 |     assert repo_path.exists(), f"Assets repo path {repo_path} does not exist."
 57 |     assert (repo_path / "README.md").exists(), (
 58 |         f"Assets repo path {repo_path} does not contain a README.md file."
 59 |     )
 60 |     assert (repo_path / "fonts").exists(), (
 61 |         f"Assets repo path {repo_path} does not contain a fonts folder."
 62 |     )
 63 |     logger.info(f"Getting font metadata for {repo_path}")
 64 | 
 65 |     metadatas = {}
 66 |     for font_path in list((repo_path / "fonts").glob("**/*.ttf")):
 67 |         logger.info(f"Getting font metadata for {font_path}")
 68 |         with Path(font_path).open("rb") as f:
 69 |             # Read the file in chunks to handle large files efficiently
 70 |             hash_ = hashlib.sha3_256()
 71 |             while True:
 72 |                 chunk = f.read(1024 * 1024)
 73 |                 if not chunk:
 74 |                     break
 75 |                 hash_.update(chunk)
 76 |         extracted_metadata = get_font_metadata(font_path)
 77 |         metadata = {
 78 |             "file_name": font_path.name,
 79 |             "font_name": extracted_metadata.name,
 80 |             "encoding_length": extracted_metadata.encoding_length,
 81 |             "bold": extracted_metadata.bold,
 82 |             "italic": extracted_metadata.italic,
 83 |             "monospace": extracted_metadata.monospace,
 84 |             "serif": extracted_metadata.serif,
 85 |             "ascent": extracted_metadata.ascent,
 86 |             "descent": extracted_metadata.descent,
 87 |             "sha3_256": hash_.hexdigest(),
 88 |             "size": font_path.stat().st_size,
 89 |         }
 90 |         metadatas[font_path.name] = metadata
 91 |     metadatas = orjson.dumps(
 92 |         metadatas,
 93 |         option=orjson.OPT_APPEND_NEWLINE | orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS,
 94 |     ).decode()
 95 |     print(f"FONT METADATA: {metadatas}")
 96 |     with (repo_path / "font_metadata.json").open("w") as f:
 97 |         f.write(metadatas)
 98 | 
 99 | 
100 | if __name__ == "__main__":
101 |     main()
102 | 


--------------------------------------------------------------------------------
/babeldoc/tools/italic_assistance.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import json
  3 | import re
  4 | from pathlib import Path
  5 | 
  6 | import orjson
  7 | from babeldoc.const import CACHE_FOLDER
  8 | 
  9 | WORKING_FOLDER = Path(CACHE_FOLDER) / "working"
 10 | 
 11 | 
 12 | def find_latest_il_json() -> Path | None:
 13 |     """
 14 |     Find the latest il_translated.json file in ~/.cache/babeldoc/ subdirectories.
 15 | 
 16 |     Returns:
 17 |         Path to the most recently modified il_translated.json file, or None if not found.
 18 |     """
 19 |     base_dir = Path(WORKING_FOLDER)
 20 |     json_files = list(base_dir.glob("*/il_translated.json"))
 21 | 
 22 |     if not json_files:
 23 |         return None
 24 | 
 25 |     # Sort by modification time (newest first)
 26 |     json_files.sort(key=lambda p: p.stat().st_mtime, reverse=True)
 27 |     return json_files[0]
 28 | 
 29 | 
 30 | def extract_fonts_from_paragraph(
 31 |     paragraph: dict, page_font_map: dict[str, tuple[str, str]]
 32 | ) -> set[tuple[str, str]]:
 33 |     """
 34 |     Extract all font_ids and names used in a paragraph.
 35 | 
 36 |     Args:
 37 |         paragraph: The paragraph dictionary
 38 |         page_font_map: Dictionary mapping font_id to (font_id, name) tuples
 39 | 
 40 |     Returns:
 41 |         Set of (font_id, name) tuples
 42 |     """
 43 |     fonts = set()
 44 | 
 45 |     # Check if paragraph has a pdfStyle with font_id
 46 |     if (
 47 |         "pdf_style" in paragraph
 48 |         and paragraph["pdf_style"]
 49 |         and "font_id" in paragraph["pdf_style"]
 50 |     ):
 51 |         font_id = paragraph["pdf_style"]["font_id"]
 52 |         if font_id in page_font_map:
 53 |             fonts.add(page_font_map[font_id])
 54 | 
 55 |     # Process paragraph compositions if present
 56 |     if "pdf_paragraph_composition" in paragraph:
 57 |         for comp in paragraph["pdf_paragraph_composition"]:
 58 |             # Check different composition types that might contain font information
 59 | 
 60 |             # Direct pdfCharacter in composition
 61 |             if "pdf_character" in comp and comp["pdf_character"]:
 62 |                 char = comp["pdf_character"]
 63 |                 if "pdf_style" in char and "font_id" in char["pdf_style"]:
 64 |                     font_id = char["pdf_style"]["font_id"]
 65 |                     if font_id in page_font_map:
 66 |                         fonts.add(page_font_map[font_id])
 67 | 
 68 |             # PdfLine in composition
 69 |             elif "pdf_line" in comp and comp["pdf_line"]:
 70 |                 line = comp["pdf_line"]
 71 |                 if "pdf_character" in line:
 72 |                     for char in line["pdf_character"]:
 73 |                         if "pdf_style" in char and "font_id" in char["pdf_style"]:
 74 |                             font_id = char["pdf_style"]["font_id"]
 75 |                             if font_id in page_font_map:
 76 |                                 fonts.add(page_font_map[font_id])
 77 | 
 78 |             # PdfFormula in composition
 79 |             elif "pdf_formula" in comp and comp["pdf_formula"]:
 80 |                 formula = comp["pdf_formula"]
 81 |                 if "pdf_character" in formula:
 82 |                     for char in formula["pdf_character"]:
 83 |                         if "pdf_style" in char and "font_id" in char["pdf_style"]:
 84 |                             font_id = char["pdf_style"]["font_id"]
 85 |                             if font_id in page_font_map:
 86 |                                 fonts.add(page_font_map[font_id])
 87 | 
 88 |             # PdfSameStyleCharacters in composition
 89 |             elif (
 90 |                 "pdf_same_style_characters" in comp
 91 |                 and comp["pdf_same_style_characters"]
 92 |             ):
 93 |                 same_style = comp["pdf_same_style_characters"]
 94 |                 if "pdf_style" in same_style and "font_id" in same_style["pdf_style"]:
 95 |                     font_id = same_style["pdf_style"]["font_id"]
 96 |                     if font_id in page_font_map:
 97 |                         fonts.add(page_font_map[font_id])
 98 | 
 99 |             # PdfSameStyleUnicodeCharacters in composition
100 |             elif (
101 |                 "pdf_same_style_unicode_characters" in comp
102 |                 and comp["pdf_same_style_unicode_characters"]
103 |             ):
104 |                 same_style_unicode = comp["pdf_same_style_unicode_characters"]
105 |                 if (
106 |                     "pdf_style" in same_style_unicode
107 |                     and same_style_unicode["pdf_style"] is not None
108 |                     and "font_id" in same_style_unicode["pdf_style"]
109 |                 ):
110 |                     font_id = same_style_unicode["pdf_style"]["font_id"]
111 |                     if font_id in page_font_map:
112 |                         fonts.add(page_font_map[font_id])
113 | 
114 |     return fonts
115 | 
116 | 
def find_fonts_by_debug_id(json_path: Path, debug_id_regex: str) -> dict[str, str]:
    """
    Collect the fonts used by every paragraph whose debug_id matches a regex.

    Args:
        json_path: Path to the il_translated.json file
        debug_id_regex: Regular expression searched (case-insensitively) in
            each paragraph's debug_id; surrounding spaces and quote characters
            are stripped before compiling

    Returns:
        Dictionary mapping font_ids to font names
    """
    doc_data = orjson.loads(json_path.read_bytes())

    matcher = re.compile(debug_id_regex.strip(" \"'"), re.IGNORECASE)

    matched_fonts = set()
    for page in doc_data.get("page", []):
        # font_id -> (font_id, name) for the fonts declared on this page
        font_lookup = {
            entry["font_id"]: (entry["font_id"], entry["name"])
            for entry in page.get("pdf_font", [])
            if "font_id" in entry and "name" in entry
        }

        for paragraph in page.get("pdf_paragraph", []):
            debug_id = paragraph.get("debug_id")
            # Paragraphs without a debug_id can never match.
            if not debug_id or not matcher.search(debug_id):
                continue
            matched_fonts.update(extract_fonts_from_paragraph(paragraph, font_lookup))

    # Each element is a (font_id, name) pair, so dict() yields font_id -> name.
    return dict(matched_fonts)
157 | 
158 | 
def main():
    """Command-line entry point: print fonts of paragraphs matching debug_id patterns.

    Several patterns may be given; they are joined with ``|`` into one regex.
    The JSON file is either the one named by ``--json-path`` or the latest one
    found by ``find_latest_il_json``.

    Returns:
        1 when no usable JSON file is available, otherwise 0.
    """
    parser = argparse.ArgumentParser(
        description="Extract fonts from paragraphs with matching debug_id"
    )
    parser.add_argument(
        "debug_id_regex", nargs="+", help="Regular expression to match debug_id values"
    )
    parser.add_argument(
        "--json-path",
        help="Path to il_translated.json (if not provided, will use the latest file)",
    )
    args = parser.parse_args()

    # Resolve the input file and remember why it is unusable, if it is.
    if args.json_path:
        json_path = Path(args.json_path)
        failure = None if json_path.exists() else f"Error: File not found: {json_path}"
    else:
        json_path = find_latest_il_json()
        failure = (
            None if json_path else "Error: Could not find any il_translated.json file"
        )
    if failure is not None:
        print(failure)
        return 1

    print(f"Using JSON file: {json_path}")

    combined_pattern = "|".join(args.debug_id_regex)
    fonts = find_fonts_by_debug_id(json_path, combined_pattern)

    if not fonts:
        print(
            f"No fonts found for paragraphs matching debug_id pattern: {args.debug_id_regex}"
        )
        return 0

    print(
        f"Found {len(fonts)} fonts in paragraphs matching debug_id pattern: {args.debug_id_regex}"
    )
    print(json.dumps(fonts, indent=2, ensure_ascii=False))
    return 0
203 | 
204 | 
if __name__ == "__main__":
    # exit() is a helper installed by the site module and is not guaranteed to
    # exist (e.g. under ``python -S``); raising SystemExit always works and
    # still propagates main()'s return value as the exit status.
    raise SystemExit(main())
207 | 


--------------------------------------------------------------------------------
/babeldoc/tools/italic_recognize_tool.py:
--------------------------------------------------------------------------------
 1 | # Identify non-formula italic fonts that were incorrectly classified as formulas in BableDOC translation results (intermediate)
 2 | 
 3 | import json
 4 | 
 5 | import babeldoc.tools.italic_assistance as italic_assistance
 6 | from babeldoc.document_il.midend.styles_and_formulas import StylesAndFormulas
 7 | from babeldoc.translation_config import TranslationConfig
 8 | from rich.console import Console
 9 | from rich.table import Table
10 | 
11 | console = Console()
12 | 
13 | json_path = italic_assistance.find_latest_il_json()
14 | 
15 | fonts = []
16 | 
17 | # Read intermediate representation
18 | with json_path.open(encoding="utf-8") as f:
19 |     pdf_data = json.load(f)
20 | 
21 | for page_index, page in enumerate(pdf_data["page"]):
22 |     for paragraph_index, paragraph_content in enumerate(page["pdf_paragraph"]):
23 |         font_debug_id = paragraph_content["debug_id"]
24 |         if font_debug_id:
25 |             # Create page font mapping
26 |             page_font_map = {}
27 |             for font in page["pdf_font"]:
28 |                 if "font_id" in font and "name" in font:
29 |                     page_font_map[font["font_id"]] = (font["font_id"], font["name"])
30 | 
31 |             # Extract fonts from paragraph
32 |             name_list = []
33 |             paragraph_fonts = italic_assistance.extract_fonts_from_paragraph(
34 |                 paragraph_content, page_font_map
35 |             )
36 |             for _font_id, font_name in paragraph_fonts:
37 |                 name_list.append(font_name)
38 | 
39 |             font_list = []
40 |             for each in fonts:
41 |                 font_list.append(each[1])
42 | 
43 |             for each_name in name_list:
44 |                 if each_name not in font_list:
45 |                     fonts.append(
46 |                         (page_index, each_name, paragraph_index, font_debug_id)
47 |                     )
48 | 
49 | # Initialize checker
50 | translation_config = TranslationConfig(
51 |     *[None for _ in range(3)], lang_out="zh_cn", doc_layout_model=1
52 | )
53 | checker = StylesAndFormulas(translation_config)
54 | 
55 | # Create table
56 | table = Table(title="Font Recognition Results")
57 | table.add_column("Page #", justify="center", style="cyan")
58 | table.add_column("Paragraph #", justify="center", style="cyan")
59 | table.add_column("DEBUG_ID", justify="center", style="cyan")
60 | table.add_column("Font Name", style="magenta")
61 | table.add_column("Recognition Result", justify="center")
62 | 
63 | # Output results
64 | for each_font in fonts:
65 |     page_index, font_name, paragraph_index, font_debug_id = each_font
66 | 
67 |     if checker.is_formulas_font(font_name):
68 |         table.add_row(
69 |             str(page_index),
70 |             str(paragraph_index),
71 |             str(font_debug_id),
72 |             font_name,
73 |             "[bold red]Formula Font[/bold red]",
74 |         )
75 |     else:
76 |         table.add_row(
77 |             str(page_index),
78 |             str(paragraph_index),
79 |             str(font_debug_id),
80 |             font_name,
81 |             "[bold blue]Non-Formula Font[/bold blue]",
82 |         )
83 | 
84 | # Print table
85 | console.print(table)
86 | 


--------------------------------------------------------------------------------
/docs/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
  1 | # Contributor Covenant Code of Conduct
  2 | 
  3 | ## Our Pledge
  4 | 
  5 | We as members, contributors, and leaders pledge to make participation in our
  6 | community a harassment-free experience for everyone, regardless of age, body
  7 | size, visible or invisible disability, ethnicity, sex characteristics, gender
  8 | identity and expression, level of experience, education, socio-economic status,
  9 | nationality, personal appearance, race, religion, or sexual identity
 10 | and orientation.
 11 | 
 12 | We pledge to act and interact in ways that contribute to an open, welcoming,
 13 | diverse, inclusive, and healthy community.
 14 | 
 15 | ## Our Standards
 16 | 
 17 | Examples of behavior that contributes to a positive environment for our
 18 | community include:
 19 | 
 20 | * Demonstrating empathy and kindness toward other people
 21 | * Being respectful of differing opinions, viewpoints, and experiences
 22 | * Giving and gracefully accepting constructive feedback
 23 | * Accepting responsibility and apologizing to those affected by our mistakes,
 24 |   and learning from the experience
 25 | * Focusing on what is best not just for us as individuals, but for the
 26 |   overall community
 27 | 
 28 | Examples of unacceptable behavior include:
 29 | 
 30 | * The use of sexualized language or imagery, and sexual attention or
 31 |   advances of any kind
 32 | * Trolling, insulting or derogatory comments, and personal or political attacks
 33 | * Public or private harassment
 34 | * Publishing others' private information, such as a physical or email
 35 |   address, without their explicit permission
 36 | * Other conduct which could reasonably be considered inappropriate in a
 37 |   professional setting
 38 | 
 39 | ## Enforcement Responsibilities
 40 | 
 41 | Community leaders are responsible for clarifying and enforcing our standards of
 42 | acceptable behavior and will take appropriate and fair corrective action in
 43 | response to any behavior that they deem inappropriate, threatening, offensive,
 44 | or harmful.
 45 | 
 46 | Community leaders have the right and responsibility to remove, edit, or reject
 47 | comments, commits, code, wiki edits, issues, and other contributions that are
 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation
 49 | decisions when appropriate.
 50 | 
 51 | ## Scope
 52 | 
 53 | This Code of Conduct applies within all community spaces, and also applies when
 54 | an individual is officially representing the community in public spaces.
 55 | Examples of representing our community include using an official e-mail address,
 56 | posting via an official social media account, or acting as an appointed
 57 | representative at an online or offline event.
 58 | 
 59 | ## Enforcement
 60 | 
 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
 62 | reported to the community leaders responsible for enforcement at
 63 | aw@funstory.ai .
 64 | All complaints will be reviewed and investigated promptly and fairly.
 65 | 
 66 | All community leaders are obligated to respect the privacy and security of the
 67 | reporter of any incident.
 68 | 
 69 | ## Enforcement Guidelines
 70 | 
 71 | Community leaders will follow these Community Impact Guidelines in determining
 72 | the consequences for any action they deem in violation of this Code of Conduct:
 73 | 
 74 | ### 1. Correction
 75 | 
 76 | **Community Impact**: Use of inappropriate language or other behavior deemed
 77 | unprofessional or unwelcome in the community.
 78 | 
 79 | **Consequence**: A private, written warning from community leaders, providing
 80 | clarity around the nature of the violation and an explanation of why the
 81 | behavior was inappropriate. A public apology may be requested.
 82 | 
 83 | ### 2. Warning
 84 | 
 85 | **Community Impact**: A violation through a single incident or series
 86 | of actions.
 87 | 
 88 | **Consequence**: A warning with consequences for continued behavior. No
 89 | interaction with the people involved, including unsolicited interaction with
 90 | those enforcing the Code of Conduct, for a specified period of time. This
 91 | includes avoiding interactions in community spaces as well as external channels
 92 | like social media. Violating these terms may lead to a temporary or
 93 | permanent ban.
 94 | 
 95 | ### 3. Temporary Ban
 96 | 
 97 | **Community Impact**: A serious violation of community standards, including
 98 | sustained inappropriate behavior.
 99 | 
100 | **Consequence**: A temporary ban from any sort of interaction or public
101 | communication with the community for a specified period of time. No public or
102 | private interaction with the people involved, including unsolicited interaction
103 | with those enforcing the Code of Conduct, is allowed during this period.
104 | Violating these terms may lead to a permanent ban.
105 | 
106 | ### 4. Permanent Ban
107 | 
108 | **Community Impact**: Demonstrating a pattern of violation of community
109 | standards, including sustained inappropriate behavior, harassment of an
110 | individual, or aggression toward or disparagement of classes of individuals.
111 | 
112 | **Consequence**: A permanent ban from any sort of public interaction within
113 | the community.
114 | 
115 | ## Attribution
116 | 
117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage],
118 | version 2.0, available at
119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
120 | 
121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct
122 | enforcement ladder](https://github.com/mozilla/diversity).
123 | 
124 | [homepage]: https://www.contributor-covenant.org
125 | 
126 | For answers to common questions about this code of conduct, see the FAQ at
127 | https://www.contributor-covenant.org/faq. Translations are available at
128 | https://www.contributor-covenant.org/translations.
129 | 


--------------------------------------------------------------------------------
/docs/CONTRIBUTING.md:
--------------------------------------------------------------------------------
  1 | # Contributing to BabelDOC
  2 | 
  3 | ## How to contribute to BabelDOC
  4 | 
  5 | ### **About Language**
  6 | 
  7 | - Issues can be in Chinese or English
  8 | - PRs are limited to English
  9 | - All documents are provided in English only
 10 | 
 11 | ### **Did you find a bug?**
 12 | 
 13 | - **Ensure the bug was not already reported** by searching on GitHub under [Issues](https://github.com/funstory-ai/BabelDOC/issues).
 14 | 
 15 | Please pay special attention to:
 16 | 
 17 | 1. Known compatibility issues with pdf2zh - see [#20](https://github.com/funstory-ai/BabelDOC/issues/20) for details
 18 | 2. Reported edge cases and limitations from downstream applications - see [#23](https://github.com/funstory-ai/BabelDOC/issues/23) for discussion
 19 | 
 20 | - If you're unable to find an open issue addressing the problem, [open a new one](https://github.com/funstory-ai/BabelDOC/issues/new?template=bug_report.yaml). Be sure to include a **title and clear description**, with as much relevant information as possible.
 21 | 
 22 | ### **If you wish to request changes or new features**
 23 | 
 24 | - Suggest your change in the [Issues](https://github.com/funstory-ai/BabelDOC/issues/new?template=feature_request.yaml) section.
 25 | 
 26 | ### **If you wish to add more translators**
 27 | 
 28 | - This project is not intended for direct end-user use, and the supported translators are mainly for debugging purposes. Unless it clearly helps with development and debugging, PRs for directly adding translators will not be accepted.
 29 | - You can directly use [PDFMathTranslate](https://github.com/Byaidu/PDFMathTranslate) to get support for more translators.
 30 | 
 31 | ### **If you wish to contribute to BabelDOC**
 32 | 
 33 | > [!TIP]
 34 | >
 35 | > If you have any questions about the source code or related matters, please contact the maintainer at aw@funstory.ai .
 36 | > 
 37 | > You can also raise questions in [Issues](https://github.com/funstory-ai/BabelDOC/issues).
 38 | > 
 39 | > You can contact the maintainers in the pdf2zh discussion group.
 40 | > 
 41 | > Due to the current high rate of code changes, this project only accepts small PRs. If you would like to suggest a change and you include a patch as a proof-of-concept, that would be great. However, please do not be offended if we rewrite your patch from scratch.
 42 | 
 43 | [//]: # (> We welcome pull requests and will review your contributions.)
 44 | 
 45 | 
 46 | 1. Fork this repository and clone it locally.
 47 | 2. Use `doc/deploy.sh` to set up the development environment.
 48 | 3. Create a new branch and make code changes on that branch. `git checkout -b feature/<feature-name>`
 49 | 4. Perform development and ensure the code meets the requirements.
 50 | 
 51 | 5. Commit your changes to your new branch.
 52 | 
 53 | ```
 54 | git add .
 55 | 
 56 | git commit -m "<semantic commit message>"
 57 | ```
 58 | 
 59 | 6. Push to your repository: `git push origin feature/<feature-name>`.
 60 | 
 61 | 7. Create a PR on GitHub and provide a detailed description.
 62 | 
 63 | 8. Ensure all automated checks pass.
 64 | 
 65 | #### Basic Requirements
 66 | 
 67 | ##### Workflow
 68 | 
 69 | 1. Please fork the repository from the main branch and develop on a branch of your fork.
 70 | 
 71 | - When submitting a Pull Request (PR), please provide detailed descriptions of the changes.
 72 | 
 73 | - If the PR fails automated checks (showing checks failed and red cross marks), please review the corresponding details and modify the submission to ensure the new PR passes automated checks.
 74 | 
 75 | 2. Development and Testing
 76 | 
 77 | - Use the `uv run BabelDOC` command for development and testing.
 78 | 
 79 | - When you need to print logs, please use `log.debug()` to print the information. **DO NOT USE `print()`**
 80 | 
 81 | - Code formatting
 82 | 
 83 | 3. Dependency Updates
 84 | 
 85 | - If new dependencies are introduced, please update the dependency list in pyproject.toml accordingly.
 86 | 
 87 | - It is recommended to use the `uv add` command for adding dependencies.
 88 | 
 89 | 4. Documentation Updates
 90 | 
 91 | - If new command-line options are added, please update the command-line options list in README.md accordingly.
 92 | 
 93 | 5. Commit Messages
 94 | 
 95 | - Use [Conventional Commits](https://www.conventionalcommits.org/en/v1.0.0/), for example: feat(translator): add openai.
 96 | 
 97 | 6. Coding Style
 98 | 
 99 | - Please ensure submitted code follows basic coding style guidelines.
100 | - Use pep8-naming.
101 | - Comments should be in English.
102 | - Follow these specific Python coding style guidelines:
103 | 
104 |   a. Naming Conventions:
105 | 
106 |   - Class names should use CapWords (PascalCase): `class TranslatorConfig`
107 |   - Function and variable names should use snake_case: `def process_text()`, `word_count = 0`
108 |   - Constants should be UPPER_CASE: `MAX_RETRY_COUNT = 3`
109 |   - Private attributes should start with underscore: `_internal_state`
110 | 
111 |   b. Code Layout:
112 | 
113 |   - Use 4 spaces for indentation (no tabs)
114 |   - Maximum line length is 88 characters (compatible with black formatter)
115 |   - Add 2 blank lines before top-level classes and functions
116 |   - Add 1 blank line before class methods
117 |   - No trailing whitespace
118 | 
119 |   c. Imports:
120 | 
121 |   - Imports should be on separate lines: `import os\nimport sys`
122 |   - Imports should be grouped in the following order:
123 |     1.  Standard library imports
124 |     2.  Related third party imports
125 |     3.  Local application/library specific imports
126 |   - Use absolute imports over relative imports
127 | 
128 |   d. String Formatting:
129 | 
130 |   - Prefer f-strings for string formatting: `f"Count: {count}"`
131 |   - Use double quotes for docstrings
132 | 
133 |   e. Type Hints:
134 | 
135 |   - Use type hints for function arguments and return values
136 |   - Example: `def translate_text(text: str) -> str:`
137 | 
138 |   f. Documentation:
139 | 
140 |   - All public functions and classes must have docstrings
141 |   - Use Google style for docstrings
142 |   - Example:
143 | 
144 |     ```python
145 |     def function_name(arg1: str, arg2: int) -> bool:
146 |         """Short description of function.
147 | 
148 |         Args:
149 |             arg1: Description of arg1
150 |             arg2: Description of arg2
151 | 
152 |         Returns:
153 |             Description of return value
154 | 
155 |         Raises:
156 |             ValueError: Description of when this error occurs
157 |         """
158 |     ```
159 | 
160 | The existing codebase does not comply with the above specifications in some aspects. Contributions for modifications are welcome.
161 | 
162 | #### How to modify the intermediate representation
163 | 
164 | The intermediate representation is described by [il_version_1.rnc](https://github.com/funstory-ai/BabelDOC/blob/main/babeldoc/document_il/il_version_1.rnc). Corresponding Python data classes are generated using [xsdata](https://xsdata.readthedocs.io/en/latest/). The files `il_version_1.rng`, `il_version_1.xsd`, and `il_version_1.py` are auto-generated and must not be manually modified.
165 | 
166 | ##### Format RNC file
167 | 
168 | ```bash
169 | trang babeldoc/document_il/il_version_1.rnc babeldoc/document_il/il_version_1.rnc
170 | ```
171 | 
172 | ##### Generate RNG, XSD and Python classes
173 | 
174 | ```bash
175 | # Generate RNG from RNC
176 | trang babeldoc/document_il/il_version_1.rnc babeldoc/document_il/il_version_1.rng
177 | 
178 | # Generate XSD from RNC
179 | trang babeldoc/document_il/il_version_1.rnc babeldoc/document_il/il_version_1.xsd
180 | 
181 | # Generate Python classes from XSD
182 | xsdata generate babeldoc/document_il/il_version_1.xsd --package babeldoc.document_il
183 | ```
184 | 
185 | ##### Profile memory usage
186 | 
187 | ```bash
188 | uv run memray run --native --aggregate babeldoc/main.py -c yadt.toml
189 | ```


--------------------------------------------------------------------------------
/docs/CONTRIBUTOR_REWARD.md:
--------------------------------------------------------------------------------
 1 | # BabelDOC/PDFMathTranslate 贡献者奖励规则
 2 | 
 3 | ## 月度活跃贡献者奖励规则
 4 | 
 5 | ### 一、资格标准
 6 | #### **贡献类型要求**
 7 |    - 需提交 **至少 1 个有效 PR**（Pull Request），或进行 **PR 审核、文档编写** 等贡献。
 8 |    - 有效贡献定义：
 9 |      - 非简单的文档错别字修复
10 |      - 非简单的代码格式化调整（如仅调整缩进、空格等）
11 |      - 需做出实质性贡献（如功能开发、Bug 修复、性能优化、架构调整、技术文档编写、PR 审核等）
12 |    - 示例合格贡献：新增功能模块、修复逻辑错误、优化算法效率、编写技术文档等
13 | 
14 | #### **时间范围**
15 |    - 每月 1 日至月末最后一天合并的 PR 计入当月统计
16 | 
17 | ### 二、申请流程
18 | #### **申请条件**
19 |    - PR 需被成功合并至[funstory-ai/BabelDOC](https://github.com/funstory-ai/BabelDOC/pulls) 仓库或 [Byaidu/PDFMathTranslate](https://github.com/Byaidu/PDFMathTranslate/pulls)的主分支。
20 |    - 若目标为 [funstory-ai/BabelDOC](https://github.com/funstory-ai/BabelDOC/pulls) 的 PR 未被合并，但被维护者认定为有价值的概念验证，同样符合条件。
21 |    - 审核 PR、撰写 wiki 等贡献也必须是以上两个仓库。
22 |    - 同一贡献者每月仅可申请一次（无论提交 PR 数量）
23 |    - 同一贡献者每月最多可以获得 1 个兑换码
24 |    - 对于 PR，只有发起者可以申请兑换码
25 |    - 仅可使用当月的贡献申请兑换码（特殊情况请联系 aw@funstory.ai 说明）
26 | 
27 | #### **申请方式**
28 |    - 发送邮件至 **aw@funstory.ai**
29 |    - 邮件标题格式：`[贡献者会员兑换码申请] GitHub用户名-月份`（例：`[贡献者会员兑换码申请] awwaawwa-2024-07`）
30 |    - 邮件正文需包含：
31 |      - GitHub 用户名
32 |      - 合并 PR 的完整链接
33 |    - 附件要求：
34 |      - PR 页面完整截图（需包含合并状态、仓库名称及点击头像后弹出来的侧边栏，如下图所示）
35 | 
36 | ![附件示例](https://s.immersivetranslate.com/assets/r2-uploads/images/babeldoc-contributor_reward_example.png)
37 | 
38 | #### **奖励说明**
39 |    - 奖励内容：[沉浸式翻译（Immersive Translate）](https://immersivetranslate.com/zh-Hans/pricing/)月度会员兑换码
40 |    - 兑换码使用：在[沉浸式翻译官网兑换页](https://immersivetranslate.com/zh-Hans/exchange)输入即可激活
41 |    - 会员权益：沉浸式翻译 Pro 会员一个月（详见[官网价格页](https://immersivetranslate.com/zh-Hans/pricing/)说明）
42 |    - 兑换码为专属福利，不可转让
43 | 
44 | ### 三、审核与发放
45 | #### **审核周期**
46 |    - 我们会尽力在收到申请邮件后 1 个工作日内完成审核
47 |    - 审核时间可能因申请数量、审核复杂度等因素有所延长
48 |    - 审核通过后，兑换码将通过邮件方式发送
49 |    - 若审核未通过，我们会通过邮件说明原因
50 | 
51 | #### **兑换码规则**
52 |    - 使用方式：[官网兑换页](https://immersivetranslate.com/zh-Hans/exchange)输入兑换码激活
53 |    - 权益内容：月度会员（具体权益见[官网价格页](https://immersivetranslate.com/zh-Hans/pricing/)说明）
54 |    - 不可转让
55 | 
56 | ### 四、注意事项
57 | #### **禁止行为**
58 |    - 将完整功能拆分为多个无关 PR 提交
59 |    - 提交质量不合格或具有潜在危害的代码
60 |    - 提供虚假或误导性的申请材料
61 | 
62 | #### **特别说明**
63 |    - funstory.ai 保留对贡献价值的评估权、规则的最终解释权等所有必要权利
64 |    - 规则如有实质性更新（格式调整等除外），将提前 1 天在 [BabelDOC GitHub PR](https://github.com/funstory-ai/BabelDOC/pulls) 公告
65 |    - 过期未使用的兑换码不予补发
66 |    - 自 2025 年 2 月 1 日起的贡献可以申请兑换码
67 |    - 为了确认您是 Pull Request (PR) 的发起者，防止他人冒领，我们可能会要求您使用发起者账号在 PR 下方留言指定的随机数字。
68 | 
69 | ## 常见问题解答（FAQ）
70 | 
71 | **Q：如何判断文档翻译贡献是否有效？**
72 | 
73 | A：系统性的人工翻译（如完整章节的翻译并经过人工校对）视为有效贡献。零散段落翻译或仅依赖机器翻译的内容不计入有效贡献。
74 | 
75 | **Q：兑换码过期了可以补发吗？**
76 | 
77 |    A：为确保公平性，过期的兑换码将不予补发，请在有效期内及时使用。
78 | 
79 | **Q：为什么这个文档是中文的？**
80 | 
81 | A：因为目前应该是中文贡献者多吧，所以就先写中文的。后面再撰写英文版的。
82 | 
83 | ---
84 | **规则公示**：本规则文档存放于 BabelDOC 仓库 [CONTRIBUTOR_REWARD.md](https://github.com/funstory-ai/BabelDOC/blob/main/docs/CONTRIBUTOR_REWARD.md)，并在 [Contributor Reward - BabelDOC](https://funstory-ai.github.io/BabelDOC/CONTRIBUTOR_REWARD/) 展示。
85 | 


--------------------------------------------------------------------------------
/docs/ImplementationDetails/AsyncTranslate/AsyncTranslate.md:
--------------------------------------------------------------------------------
  1 | # Async Translation API
  2 | 
  3 | > [!NOTE]
  4 | > This documentation may contain AI-generated content. While we strive for accuracy, there might be inaccuracies. Please report any issues via:
  5 | >
  6 | > - [GitHub Issues](https://github.com/funstory-ai/BabelDOC/issues)
  7 | > - Community contribution (PRs welcome!)
  8 | 
  9 | ## Overview
 10 | 
 11 | The `babeldoc.high_level.async_translate` function provides an asynchronous interface for translating PDF files with real-time progress reporting. This function yields progress events that can be used to update progress bars or other UI elements.
 12 | 
 13 | ## Usage
 14 | 
 15 | ```python linenums="1"
 16 | async def translate_with_progress():
 17 |     config = TranslationConfig(
 18 |         input_file="example.pdf",
 19 |         translator=your_translator,
 20 |         # ... other configuration options
 21 |     )
 22 |     
 23 |     try:
 24 |         async for event in async_translate(config):
 25 |             if event["type"] == "progress_update":
 26 |                 print(f"Progress: {event['overall_progress']}%")
 27 |             elif event["type"] == "finish":
 28 |                 result = event["translate_result"]
 29 |                 print(f"Translation completed: {result.original_pdf_path}")
 30 |             elif event["type"] == "error":
 31 |                 print(f"Error occurred: {event['error']}")
 32 |                 break
 33 |     except asyncio.CancelledError:
 34 |         print("Translation was cancelled")
 35 |     except KeyboardInterrupt:
 36 |         print("Translation was interrupted")
 37 | ```
 38 | 
 39 | ## Event Types
 40 | 
 41 | The function yields different types of events during the translation process:
 42 | 
 43 | ### 1. Progress Start Event
 44 | 
 45 | Emitted when a translation stage begins:
 46 | 
 47 | ```python
 48 | {
 49 |     "type": "progress_start",
 50 |     "stage": str,              # Name of the current stage
 51 |     "stage_progress": float,   # Always 0.0
 52 |     "stage_current": int,      # Current progress count (0)
 53 |     "stage_total": int         # Total items to process in this stage
 54 | }
 55 | ```
 56 | 
 57 | ### 2. Progress Update Event
 58 | 
 59 | Emitted periodically during translation (controlled by report_interval, default 0.1s):
 60 | 
 61 | ```python
 62 | {
 63 |     "type": "progress_update",
 64 |     "stage": str,              # Name of the current stage
 65 |     "stage_progress": float,   # Progress percentage of current stage (0-100)
 66 |     "stage_current": int,      # Current items processed in this stage
 67 |     "stage_total": int,        # Total items to process in this stage
 68 |     "overall_progress": float  # Overall translation progress (0-100)
 69 | }
 70 | ```
 71 | 
 72 | ### 3. Progress End Event
 73 | 
 74 | Emitted when a stage completes:
 75 | 
 76 | ```python
 77 | {
 78 |     "type": "progress_end",
 79 |     "stage": str,              # Name of the completed stage
 80 |     "stage_progress": float,   # Always 100.0
 81 |     "stage_current": int,      # Equal to stage_total
 82 |     "stage_total": int,        # Total items processed in this stage
 83 |     "overall_progress": float  # Overall translation progress (0-100)
 84 | }
 85 | ```
 86 | 
 87 | ### 4. Finish Event
 88 | 
 89 | Emitted when translation completes successfully:
 90 | 
 91 | ```python
 92 | {
 93 |     "type": "finish",
 94 |     "translate_result": TranslateResult  # Contains paths to translated files and timing info
 95 | }
 96 | ```
 97 | 
 98 | ### 5. Error Event
 99 | 
100 | Emitted if an error occurs during translation:
101 | 
102 | ```python
103 | {
104 |     "type": "error",
105 |     "error": str  # Error message
106 | }
107 | ```
108 | 
109 | ## Translation Stages
110 | 
111 | The translation process goes through the following stages in order:
112 | 
113 | 1. ILCreater
114 | 2. LayoutParser
115 | 3. ParagraphFinder
116 | 4. StylesAndFormulas
117 | 5. ILTranslator
118 | 6. Typesetting
119 | 7. FontMapper
120 | 8. PDFCreater
121 | 
122 | Each stage will emit its own set of progress events.
123 | 
124 | ## Cancellation
125 | 
126 | The translation process can be cancelled in several ways:
127 | 
128 | 1. By raising a `CancelledError` (e.g., when using `asyncio.Task.cancel()`)
129 | 2. Through `KeyboardInterrupt` (e.g., when user presses Ctrl+C)
130 | 3. By calling `translation_config.cancel_translation()` method
131 | 
132 | Example of programmatic cancellation:
133 | 
134 | ```python linenums="1"
135 | async def translate_with_cancellation():
136 |     config = TranslationConfig(
137 |         input_file="example.pdf",
138 |         translator=your_translator,
139 |         # ... other configuration options
140 |     )
141 |     
142 |     try:
143 |         # Start translation in another task
144 |         translation_task = asyncio.create_task(process_translation(config))
145 |         
146 |         # Simulate some condition that requires cancellation
147 |         await asyncio.sleep(5)
148 |         config.cancel_translation()  # This will trigger cancellation
149 |         
150 |         await translation_task  # Wait for the task to finish
151 |     except asyncio.CancelledError:
152 |         print("Translation was cancelled")
153 | 
154 | async def process_translation(config):
155 |     async for event in async_translate(config):
156 |         if event["type"] == "error":
157 |             if isinstance(event["error"], asyncio.CancelledError):
158 |                 print("Translation was cancelled")
159 |                 break
160 |             print(f"Error occurred: {event['error']}")
161 |             break
162 |         # ... handle other events ...
163 | ```
164 | 
165 | When cancelled:
166 | - The function will log the cancellation reason
167 | - All resources will be cleaned up properly
168 | - Any ongoing translation tasks will be stopped
169 | - A final error event with `CancelledError` will be emitted
170 | - The function will exit gracefully
171 | 
172 | ## Error Handling
173 | 
174 | When an error occurs during translation:
175 | 1. It is logged with a full traceback (if debug mode is enabled)
176 | 2. It is reported through an error event
177 | 3. The event stream stops after the error event
178 | 4. Resources are cleaned up properly before exiting
179 | 
180 | It's recommended to handle these events appropriately in your application to provide feedback to users. The example in the Usage section shows a basic error handling pattern. 


--------------------------------------------------------------------------------
/docs/ImplementationDetails/ILTranslator/ILTranslator.md:
--------------------------------------------------------------------------------
  1 | # Intermediate Layer Translator
  2 | 
  3 | > [!NOTE]
  4 | > This documentation may contain AI-generated content. While we strive for accuracy, there might be inaccuracies. Please report any issues via:
  5 | >
  6 | > - [GitHub Issues](https://github.com/funstory-ai/BabelDOC/issues)
  7 | > - Community contribution (PRs welcome!)
  8 | 
  9 | ## Background
 10 | 
 11 | After formula and style processing, we need to translate the document while preserving all formatting, formulas, and styles. The intermediate layer translator handles this complex task by using placeholders and style preservation techniques.
 12 | 
 13 | ## Goal
 14 | 
 15 | 1. Translate text while preserving document structure
 16 | 2. Maintain formulas and special formatting
 17 | 3. Handle rich text with different styles
 18 | 4. Support concurrent translation for better performance
 19 | 
 20 | ## Specific Implementation
 21 | 
 22 | The translation process consists of several key steps:
 23 | 
 24 | ### Step 1: Translation Preparation
 25 | 
 26 | 1. Process paragraphs:
 27 |    - Skip vertical text
 28 |    - Handle single-component paragraphs directly
 29 |    - Process multi-component paragraphs with placeholders
 30 | 
 31 | 2. Create placeholders:
 32 |    - Formula placeholders for mathematical expressions
 33 |    - Rich text placeholders for styled text
 34 |    - Ensure placeholder uniqueness within each paragraph
 35 | 
 36 | ### Step 2: Translation Input Creation
 37 | 
 38 | 1. Analyze paragraph components:
 39 |    - Regular text components
 40 |    - Formula components
 41 |    - Styled text components
 42 | 
 43 | 2. Handle special cases:
 44 |    - Skip pure formula paragraphs
 45 |    - Preserve original text when style matches base style
 46 |    - Handle font mapping cases
 47 | 
 48 | ### Step 3: Translation Execution
 49 | 
 50 | 1. Concurrent translation:
 51 |    - Use thread pool for parallel processing
 52 |    - Control QPS (Queries Per Second)
 53 |    - Track translation progress
 54 | 
 55 | 2. Translation tracking:
 56 |    - Record original text
 57 |    - Record translated text
 58 |    - Save tracking information for debugging
 59 | 
 60 | ### Step 4: Translation Output Processing
 61 | 
 62 | 1. Parse translated text:
 63 |    - Extract text between placeholders
 64 |    - Restore formulas at placeholder positions
 65 |    - Restore rich text with original styles
 66 | 
 67 | 2. Create new paragraph components:
 68 |    - Maintain style information
 69 |    - Preserve formula positioning
 70 |    - Handle empty text segments
 71 | 
 72 | ## Additional Features
 73 | 
 74 | 1. Style preservation:
 75 |    - Maintains original text styles
 76 |    - Handles font size variations
 77 |    - Preserves formatting attributes
 78 | 
 79 | 2. Formula handling:
 80 |    - Preserves formula integrity
 81 |    - Maintains formula positioning
 82 |    - Supports complex mathematical expressions
 83 | 
 84 | 3. Debug support:
 85 |    - Translation tracking
 86 |    - JSON output for debugging
 87 |    - Detailed logging
 88 | 
 89 | ## Limitations
 90 | 
 91 | 1. Vertical text is not supported
 92 | 
 93 | 2. Complex nested styles might not be perfectly preserved
 94 | 
 95 | 3. Placeholder conflicts could occur in rare cases
 96 | 
 97 | 4. Translation quality depends on external translation engine
 98 | 
 99 | ## Configuration Options
100 | 
101 | The translation process can be customized through `TranslationConfig`:
102 | 
103 | 1. `qps`: Maximum queries per second for translation
104 | 2. `debug`: Enable/disable debug mode and tracking
105 | 3. Translation engine specific settings 


--------------------------------------------------------------------------------
/docs/ImplementationDetails/PDFCreation/PDFCreation.md:
--------------------------------------------------------------------------------
  1 | # PDF Creation
  2 | 
  3 | > [!NOTE]
  4 | > This documentation may contain AI-generated content. While we strive for accuracy, there might be inaccuracies. Please report any issues via:
  5 | >
  6 | > - [GitHub Issues](https://github.com/funstory-ai/BabelDOC/issues)
  7 | > - Community contribution (PRs welcome!)
  8 | 
  9 | ## Background
 10 | 
 11 | After translation and typesetting, we need to create the final PDF document that preserves all the formatting, styles, and layout of the original document while containing the translated text. The PDF creation process handles this final step.
 12 | 
 13 | ## Goal
 14 | 
 15 | 1. Create a new PDF document with translated content
 16 | 2. Preserve all original formatting and styles
 17 | 3. Support both monolingual and dual-language output
 18 | 4. Maintain font consistency and character encoding
 19 | 5. Optimize the output file size and performance
 20 | 
 21 | ## Specific Implementation
 22 | 
 23 | The PDF creation process consists of several key steps:
 24 | 
 25 | ### Step 1: Font Management
 26 | 
 27 | 1. Font initialization:
 28 |    - Add required fonts to the document
 29 |    - Map font identifiers
 30 |    - Handle font encoding lengths
 31 | 
 32 | 2. Font availability checking:
 33 |    - Check available fonts for each page
 34 |    - Handle XObject font requirements
 35 |    - Manage font resources
 36 | 
 37 | 3. Font subsetting:
 38 |    - Optimize font usage
 39 |    - Reduce file size
 40 |    - Maintain character support
 41 | 
 42 | ### Step 2: Content Rendering
 43 | 
 44 | 1. Character processing:
 45 |    - Handle individual characters
 46 |    - Process character encodings
 47 |    - Manage character positioning
 48 | 
 49 | 2. Graphics state handling:
 50 |    - Process color spaces
 51 |    - Handle transparency
 52 |    - Manage graphic state instructions
 53 | 
 54 | 3. XObject management:
 55 |    - Process form XObjects
 56 |    - Handle drawing operations
 57 |    - Maintain XObject hierarchy
 58 | 
 59 | ### Step 3: Document Assembly
 60 | 
 61 | 1. Page construction:
 62 |    - Build page content
 63 |    - Process page resources
 64 |    - Handle page boundaries
 65 | 
 66 | 2. Content stream creation:
 67 |    - Generate drawing operations
 68 |    - Handle text positioning
 69 |    - Manage content streams
 70 | 
 71 | 3. Resource management:
 72 |    - Handle font resources
 73 |    - Manage XObject resources
 74 |    - Process graphic states
 75 | 
 76 | ### Step 4: Output Generation
 77 | 
 78 | 1. Monolingual output:
 79 |    - Create translated-only PDF
 80 |    - Optimize file size
 81 |    - Apply compression
 82 | 
 83 | 2. Dual-language output:
 84 |    - Combine original and translated pages
 85 |    - Handle page ordering
 86 |    - Maintain document structure
 87 | 
 88 | 3. File optimization:
 89 |    - Apply garbage collection
 90 |    - Enable compression
 91 |    - Optimize for linear reading
 92 | 
 93 | ## Additional Features
 94 | 
 95 | 1. Font handling:
 96 |    - Support for CID fonts
 97 |    - Font subsetting
 98 |    - Font resource management
 99 | 
100 | 2. Document optimization:
101 |    - File size reduction
102 |    - Performance optimization
103 |    - Resource cleanup
104 | 
105 | 3. Debug support:
106 |    - Decompressed output
107 |    - Debug information
108 |    - Progress tracking
109 | 
110 | ## Limitations
111 | 
112 | 1. Font support:
113 |    - Limited to available font formats
114 |    - Font subsetting restrictions
115 |    - Character encoding constraints
116 | 
117 | 2. File size:
118 |    - Dual-language output increases size
119 |    - Font embedding impact
120 |    - Resource duplication
121 | 
122 | 3. Performance considerations:
123 |    - Processing time for large documents
124 |    - Memory usage during creation
125 |    - Optimization overhead
126 | 
127 | ## Configuration Options
128 | 
129 | The PDF creation process can be customized through `TranslationConfig`:
130 | 
131 | 1. Output options:
132 |    - `no_mono`: Disable monolingual output
133 |    - `no_dual`: Disable dual-language output
134 |    - Output file naming patterns
135 | 
136 | 2. Optimization settings:
137 |    - Compression options
138 |    - Garbage collection
139 |    - Font subsetting
140 | 
141 | 3. Debug options:
142 |    - Debug mode
143 |    - Decompressed output
144 |    - Progress tracking 


--------------------------------------------------------------------------------
/docs/ImplementationDetails/PDFParsing/PDFParsing.md:
--------------------------------------------------------------------------------
  1 | # PDF Parsing and Intermediate Layer Creation
  2 | 
  3 | > [!NOTE]
  4 | > This documentation may contain AI-generated content. While we strive for accuracy, there might be inaccuracies. Please report any issues via:
  5 | >
  6 | > - [GitHub Issues](https://github.com/funstory-ai/BabelDOC/issues)
  7 | > - Community contribution (PRs welcome!)
  8 | 
  9 | ## Background
 10 | 
 11 | The first step in the translation process is to parse the PDF document and create an intermediate layer (IL) representation. This step involves extracting text, styles, formulas, and layout information from the PDF while maintaining their relationships and properties.
 12 | 
 13 | ## Goal
 14 | 
 15 | 1. Extract text content while preserving character-level information
 16 | 2. Maintain font and style information
 17 | 3. Preserve document structure and layout
 18 | 4. Handle special elements like XObjects and graphics
 19 | 5. Create a structured intermediate representation for later processing
 20 | 
 21 | ## Specific Implementation
 22 | 
 23 | The parsing process consists of several key components working together:
 24 | 
 25 | ### Step 1: PDF Interpreter (PDFPageInterpreterEx)
 26 | 
 27 | 1. Page content processing:
 28 |    - Parse PDF operators and their parameters
 29 |    - Handle graphics state operations
 30 |    - Process text and font operations
 31 |    - Manage XObject rendering
 32 | 
 33 | 2. Graphics filtering:
 34 |    - Filter non-formula lines
 35 |    - Handle color space operations
 36 |    - Process stroke and fill operations
 37 | 
 38 | 3. XObject handling:
 39 |    - Process form XObjects
 40 |    - Handle image XObjects
 41 |    - Maintain XObject hierarchy
 42 | 
 43 | ### Step 2: PDF Converter (PDFConverterEx)
 44 | 
 45 | 1. Character processing:
 46 |    - Extract character information
 47 |    - Maintain character positions
 48 |    - Preserve style attributes
 49 | 
 50 | 2. Layout management:
 51 |    - Handle page boundaries
 52 |    - Process figure elements
 53 |    - Manage coordinate systems
 54 | 
 55 | 3. Font handling:
 56 |    - Map font identifiers
 57 |    - Process font metadata
 58 |    - Handle CID fonts
 59 | 
 60 | ### Step 3: Intermediate Layer Creator (ILCreater)
 61 | 
 62 | 1. Document structure creation:
 63 |    - Build page hierarchy
 64 |    - Create character objects
 65 |    - Maintain font registry
 66 | 
 67 | 2. Resource management:
 68 |    - Process font resources
 69 |    - Handle color spaces
 70 |    - Manage graphic states
 71 | 
 72 | 3. XObject tracking:
 73 |    - Track XObject hierarchy
 74 |    - Maintain XObject states
 75 |    - Process form content
 76 | 
 77 | ### Step 4: High-level Coordination
 78 | 
 79 | 1. Process management:
 80 |    - Initialize resources
 81 |    - Coordinate component interactions
 82 |    - Handle progress tracking
 83 | 
 84 | 2. Resource initialization:
 85 |    - Set up font management
 86 |    - Initialize graphics resources
 87 |    - Prepare document structure
 88 | 
 89 | 3. Error handling:
 90 |    - Handle malformed content
 91 |    - Manage resource errors
 92 |    - Provide debug information
 93 | 
 94 | ## Additional Features
 95 | 
 96 | 1. Font management:
 97 |    - Support for CID fonts
 98 |    - Font metadata extraction
 99 |    - Font mapping capabilities
100 | 
101 | 2. Graphics state tracking:
102 |    - Color space management
103 |    - Line style preservation
104 |    - Transparency handling
105 | 
106 | 3. Coordinate system handling:
107 |    - Support for transformations
108 |    - Bounding box calculations
109 |    - Position normalization
110 | 
111 | 4. Debug support:
112 |    - Detailed logging
113 |    - Intermediate file generation
114 |    - Progress tracking
115 | 
116 | ## Limitations
117 | 
118 | 1. Complex PDF features:
119 |    - Limited support for some PDF extensions
120 |    - Simplified graphics model
121 |    - Basic transparency support
122 | 
123 | 2. Font handling:
124 |    - Limited support for some font formats
125 |    - Simplified font metrics
126 |    - Basic font feature support
127 | 
128 | 3. Performance considerations:
129 |    - Memory usage for large documents
130 |    - Processing time for complex layouts
131 |    - Resource management overhead
132 | 
133 | ## Configuration Options
134 | 
135 | The parsing process can be customized through `TranslationConfig`:
136 | 
137 | 1. `debug`: Enable/disable debug mode and intermediate file generation
138 | 2. Font-related settings:
139 |    - Font mapping configurations
140 |    - CID font handling options
141 | 3. Layout processing options:
142 |    - Page selection
143 |    - Content filtering rules 


--------------------------------------------------------------------------------
/docs/ImplementationDetails/ParagraphFinding/ParagraphFinding.md:
--------------------------------------------------------------------------------
 1 | # Paragraph Finding
 2 | 
 3 | > [!NOTE]
 4 | > This documentation may contain AI-generated content. While we strive for accuracy, there might be inaccuracies. Please report any issues via:
 5 | >
 6 | > - [GitHub Issues](https://github.com/funstory-ai/BabelDOC/issues)
 7 | > - Community contribution (PRs welcome!)
 8 | 
 9 | ## Background
10 | 
11 | After PDF analysis, we need to identify paragraphs from individual characters. This is a crucial step before translation and typesetting, as it helps maintain the logical structure of the document.
12 | 
13 | ## Goal
14 | 
15 | 1. Group characters into meaningful paragraphs while preserving the document's logical structure
16 | 2. Handle special cases like table of contents, short lines, and multi-line paragraphs
17 | 3. Maintain layout information for later typesetting
18 | 
19 | ## Specific Implementation
20 | 
21 | The paragraph finding process consists of four main steps:
22 | 
23 | ### Step 1: Create Initial Paragraphs
24 | 
25 | 1. Group characters into lines based on their spatial relationships
26 | 2. Create paragraphs based on layout information and XObject IDs
27 | 3. Characters that don't belong to text layouts are skipped
28 | 
29 | ### Step 2: Process Paragraph Spacing
30 | 
31 | 1. Remove completely empty lines
32 | 2. Handle trailing spaces within lines
33 | 3. Update paragraph bounding boxes and metadata
34 | 
35 | ### Step 3: Calculate Line Width Statistics
36 | 
37 | 1. Calculate the median width of all lines
38 | 2. This information is used for identifying potential paragraph breaks
39 | 
40 | ### Step 4: Process Independent Paragraphs
41 | 
42 | 1. Analyze paragraphs with multiple lines
43 | 2. Split paragraphs in two cases:
44 |    - When encountering table of contents entries (identified by consecutive dots)
45 |    - When finding lines significantly shorter than the median width (configurable via `short_line_split_factor`)
46 | 
47 | ## Additional Features
48 | 
49 | 1. Layout-aware processing:
50 |    - Respects different layout types (plain text, title, figure caption, etc.)
51 |    - Maintains layout priority order for overlapping regions
52 | 
53 | 2. First line indent detection:
54 |    - Automatically detects and marks paragraphs with first line indentation
55 | 
56 | 3. Flexible character position detection:
57 |    - Uses multiple position detection modes (middle, topleft, bottomright)
58 |    - Special handling for characters with unreliable height information
59 | 
60 | ## Limitations
61 | 
62 | 1. The current implementation assumes left-to-right text direction
63 | 
64 | 2. May not perfectly handle complex layouts with overlapping regions
65 | 
66 | 3. Table of contents detection relies on consecutive dots pattern
67 | 
68 | 4. Short line splitting might occasionally create incorrect paragraph breaks
69 | 
70 | ## Configuration Options
71 | 
72 | The paragraph finding behavior can be customized through `TranslationConfig`:
73 | 
74 | 1. `split_short_lines`: Enable/disable splitting paragraphs at short lines
75 | 2. `short_line_split_factor`: Threshold factor for short line detection (relative to median width) 


--------------------------------------------------------------------------------
/docs/ImplementationDetails/README.md:
--------------------------------------------------------------------------------
 1 | # Implementation Details
 2 | 
 3 | > [!NOTE]
 4 | > This documentation may contain AI-generated content. While we strive for accuracy, there might be inaccuracies. Please report any issues via:
 5 | >
 6 | > - [GitHub Issues](https://github.com/funstory-ai/yadt/issues)
 7 | > - Community contribution (PRs welcome!)
 8 | 
 9 | ## Core Processing Flow
10 | 
11 | Main processing stages in order of actual execution and corresponding documentation:
12 | 
13 | 1. [PDFParsing.md](PDFParsing/PDFParsing.md): **PDF Parsing and Intermediate Layer Creation**
14 | 
15 | 2. [LayoutParser](https://github.com/funstory-ai/BabelDOC/blob/main/babeldoc/document_il/midend/layout_parser.py): **Layout OCR**
16 | 
17 | 3. [ParagraphFinding.md](ParagraphFinding/ParagraphFinding.md): **Paragraph Recognition**
18 | 
19 | 4. [StylesAndFormulas.md](StylesAndFormulas/StylesAndFormulas.md): **Style and Formula Processing**
20 | 
21 | 5. [ILTranslator.md](ILTranslator/ILTranslator.md): **Intermediate Layer Translation**
22 | 
23 | 6. [Typesetting.md](Typesetting/Typesetting.md): **Typesetting Processing**
24 | 
25 | 7. [FontMapper](https://github.com/funstory-ai/BabelDOC/blob/main/babeldoc/document_il/utils/fontmap.py): **Font Mapping**
26 | 
27 | 8. [PDFCreation.md](PDFCreation/PDFCreation.md): **PDF Generation**
28 | 
29 | ## API
30 | 
31 | 1. [Async Translation API](AsyncTranslate/AsyncTranslate.md): **Async Translation API**
32 | 
33 | > [!TIP]
34 | >
35 | > Click on document links to view detailed implementation principles and configuration options
36 | 


--------------------------------------------------------------------------------
/docs/ImplementationDetails/StylesAndFormulas/StylesAndFormulas.md:
--------------------------------------------------------------------------------
 1 | # Styles and Formulas Processing
 2 | 
 3 | > [!NOTE]
 4 | > This documentation may contain AI-generated content. While we strive for accuracy, there might be inaccuracies. Please report any issues via:
 5 | >
 6 | > - [GitHub Issues](https://github.com/funstory-ai/yadt/issues)
 7 | > - Community contribution (PRs welcome!)
 8 | 
 9 | ## Background
10 | 
11 | After paragraph finding, we need to identify formulas and text styles within each paragraph. This step is crucial for maintaining mathematical expressions and text formatting during translation.
12 | 
13 | ## Goal
14 | 
15 | 1. Identify and preserve mathematical formulas
16 | 2. Detect and maintain consistent text styles
17 | 3. Handle special cases like subscripts and superscripts
18 | 4. Calculate proper offsets for formula positioning
19 | 
20 | ## Specific Implementation
21 | 
22 | The processing consists of several main steps:
23 | 
24 | ### Step 1: Formula Detection
25 | 
26 | 1. Identify formula characters based on:
27 |    - Formula-specific fonts
28 |    - Special Unicode characters
29 |    - Vertical text
30 |    - Corner marks (subscripts/superscripts)
31 | 
32 | 2. Group consecutive formula characters into formula units
33 | 
34 | ### Step 2: Formula Processing
35 | 
36 | 1. Process comma-containing formulas:
37 |    - Split complex formulas at commas when appropriate
38 |    - Preserve brackets and their contents
39 |    - Convert simple number-only formulas to regular text
40 | 
41 | 2. Merge overlapping formulas:
42 |    - Handle cases where subscripts/superscripts are detected as separate formulas
43 |    - Maintain proper character ordering
44 | 
45 | ### Step 3: Style Analysis
46 | 
47 | 1. Calculate base style for each paragraph:
48 |    - Find common style attributes across all text
49 |    - Handle font variations
50 |    - Process graphic states
51 | 
52 | 2. Group characters with identical styles:
53 |    - Font properties
54 |    - Size properties
55 |    - Graphic state properties
56 | 
57 | ### Step 4: Position Calculation
58 | 
59 | 1. Calculate formula offsets:
60 |    - Compute x-offset relative to surrounding text
61 |    - Compute y-offset for proper vertical alignment
62 |    - Handle line spacing variations
63 | 
64 | ## Additional Features
65 | 
66 | 1. Font mapping:
67 |    - Maps different fonts to standard ones
68 |    - Special handling for formula fonts
69 | 
70 | 2. Style inheritance:
71 |    - Maintains style hierarchy
72 |    - Handles partial style overrides
73 | 
74 | 3. Formula classification:
75 |    - Distinguishes between translatable and non-translatable formulas
76 |    - Special handling for numeric formulas with commas
77 | 
78 | ## Limitations
79 | 
80 | 1. Formula detection relies on font and character patterns
81 | 
82 | 2. May not handle all types of mathematical notations
83 | 
84 | 3. Complex subscript/superscript combinations might be misidentified
85 | 
86 | 4. Limited support for vertical formulas
87 | 
88 | ## Configuration Options
89 | 
90 | The formula and style processing can be customized through `TranslationConfig`:
91 | 
92 | 1. `formular_font_pattern`: Regex pattern for identifying formula fonts
93 | 2. `formular_char_pattern`: Regex pattern for identifying formula characters 


--------------------------------------------------------------------------------
/docs/ImplementationDetails/Typesetting/Typesetting.md:
--------------------------------------------------------------------------------
  1 | # Typesetting
  2 | 
  3 | > [!NOTE]
  4 | > This documentation may contain AI-generated content. While we strive for accuracy, there might be inaccuracies. Please report any issues via:
  5 | >
  6 | > - [GitHub Issues](https://github.com/funstory-ai/yadt/issues)
  7 | > - Community contribution (PRs welcome!)
  8 | 
  9 | ## Background
 10 | 
 11 | After translation, text needs to be typeset before placing into PDF.
 12 | 
 13 | Translated paragraphs can contain any combination of the following types:
 14 | 
 15 | 1. PDF formulas
 16 | 
 17 | 2. Single PDF original character
 18 | 
 19 | 3. PDF original string with same style
 20 | 
 21 | 4. Translated Unicode string with same style
 22 | 
 23 | Let's discuss different cases:
 24 | 
 25 | The following 3 types can be passed through unchanged and simply moved to their new positions:
 26 | 
 27 | 1. PDF formulas
 28 | 
 29 | 2. Single PDF original character
 30 | 
 31 | 3. PDF original string with same style
 32 | 
 33 | Only "translated Unicode string with same style" needs typesetting operation, as this step loses original layout information. However, since paragraphs may contain other components that need transparent transmission, their positions may also change and need to participate in typesetting.
 34 | 
 35 | ## Goal
 36 | 
 37 | Try to fit all components within the original paragraph bounding box. If impossible, try to expand the bounding box in writing direction.
 38 | 
 39 | ## Specific Implementation
 40 | 
 41 | First perform reflow judgment to determine if the paragraph needs reflow. If all elements can be transmitted transparently, no reflow is needed. Then, if reflow is needed, execute Algorithm 1:
 42 | 
 43 | 1. Convert all elements to typesetting unit type, which records length and width information.
 44 | 
 45 | 2. Start from top-left of original paragraph bounding box, place elements sequentially.
 46 | 
 47 | 3. If current line cannot fit next element, wrap to next line.
 48 | 
 49 | 4. Repeat 2-3 until all elements are placed or exceed original bounding box.
 50 | 
 51 | Algorithm 1 works normally when translated text is shorter than original. When translated text is longer, Algorithm 2 needs to be added:
 52 | 
 53 | 1. Initialize element scaling factor as 1.0.
 54 | 
 55 | 2. Initialize line spacing as 1.5.
 56 | 
 57 | 3. Try typesetting using Algorithm 1.
 58 | 
 59 | 4. If it cannot fit all elements:
 60 | 
 61 |    - First try to reduce line spacing by 0.1 step until reaching minimum line spacing (1.4)
 62 |    - If still cannot fit:
 63 |      - When scale > 0.6, reduce element scaling by 0.05
 64 |      - When scale <= 0.6, reduce element scaling by 0.1
 65 |      - Reset line spacing to 1.5
 66 |    - When scale becomes less than 0.7, adjust minimum line spacing to 1.1
 67 | 
 68 | 5. Report error if element scaling is less than 0.1.
 69 | 
 70 | Algorithm 2 can fit translations of almost all languages in original position.
 71 | 
 72 | However, for special cases like "图 1" translated to "Figure 1", even with the above algorithms some text may still overflow. So Algorithm 3:
 73 | 
 74 | 1. Before reducing scale, first try to expand the bounding box in writing direction.
 75 | 
 76 | 2. Calculate paragraph's right whitespace by:
 77 | 
 78 |    - Using 90% of page crop box width as maximum limit
 79 |    - Checking for overlapping paragraphs on the right
 80 |    - Checking for overlapping figures on the right
 81 | 
 82 | 3. Expand paragraph bounding box based on available whitespace.
 83 | 
 84 | 4. If still cannot fit all elements, continue with scale reduction as in Algorithm 2.
 85 | 
 86 | ## Additional Features
 87 | 
 88 | 1. Mixed Chinese-English text handling:
 89 |    - Adds 0.5 character width spacing between Chinese and English text transitions
 90 |    - Excludes certain punctuation marks from this spacing rule
 91 | 2. First line indent:
 92 | 
 93 |    - Adds 2 Chinese characters width indent for the first line when specified
 94 | 
 95 | 3. Hanging punctuation:
 96 |    - Allows certain punctuation marks to extend beyond the right margin
 97 |    - Helps maintain better visual alignment
 98 | 
 99 | ## Limitations
100 | 
101 | 1. Currently, we use PDFPlumber for PDF analysis; typesetting is only implemented for paragraphs and only handles left-to-right writing.
102 | 
103 | 2. Cannot handle table of contents alignment by dots.
104 | 
105 | 3. Poor performance, needs optimization.
106 | 
107 | 4. No global page information consideration, inconsistent text sizes.
108 | 
109 | 5. No advanced typography features, poor reading experience.
110 | 
111 | ## Related Resources
112 | 
113 | [UTR #59: East Asian Spacing](https://www.unicode.org/reports/tr59/) specifies which characters need spacing between them.
114 | 


--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
1 | YADT Spec
2 | ===
3 | 
4 | ## YADT Document Intermediate Language
5 | 
6 | [il_version_1.rnc](https://github.com/funstory-ai/BabelDOC/blob/main/babeldoc/document_il/il_version_1.rnc): The definition of the intermediate language used between PDF parsing and rendering stages.
7 | 
8 | For other implementation details, please refer to [Implementation Details](ImplementationDetails/README.md).


--------------------------------------------------------------------------------
/docs/deploy.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Development-environment bootstrap: ensure `uv` is available (installing it
 3 | # when missing), then check that babeldoc runs and install pre-commit hooks.
 4 | # -e aborts on the first failing command; pipefail makes a failed download in
 5 | # `curl ... | sh` count as a failure instead of being masked by `sh`.
 6 | set -eo pipefail
 7 | 
 8 | # Succeed when the command named by $1 can be found by the shell.
 9 | command_exists() {
10 |   command -v "$1" >/dev/null 2>&1
11 | }
12 | 
13 | # Print the given message to stderr and abort the script with status 1.
14 | die() {
15 |   echo "$*" >&2
16 |   exit 1
17 | }
18 | 
19 | echo "check uv installed ……"
20 | if command_exists uv; then
21 |   echo "uv installed !"
22 |   # NOTE(review): exiting here also skips the project setup at the bottom of
23 |   # this script -- confirm that this is intended.
24 |   exit 0
25 | fi
26 | 
27 | echo "uv is not installed, start installing ……"
28 | 
29 | OS=$(uname -s)
30 | case "$OS" in
31 |   Linux)
32 |     if command_exists curl; then
33 |       curl -LsSf https://astral.sh/uv/install.sh | sh
34 |     elif command_exists wget; then
35 |       wget -qO- https://astral.sh/uv/install.sh | sh
36 |     else
37 |       die "curl or wget not found. uv installation failed."
38 |     fi
39 |     # The standalone installer puts uv in a per-user bin directory that is
40 |     # typically not on PATH until a new shell starts; add its documented
41 |     # default locations so the check below can see the fresh install.
42 |     export PATH="${XDG_BIN_HOME:-$HOME/.local/bin}:$HOME/.cargo/bin:$PATH"
43 |     ;;
44 |   Darwin)
45 |     if command_exists brew; then
46 |       brew install uv
47 |     else
48 |       die "Homebrew is not installed, please install uv manually."
49 |     fi
50 |     ;;
51 |   *)
52 |     die "unsupported OS: $OS"
53 |     ;;
54 | esac
55 | 
56 | if command_exists uv; then
57 |   uv run babeldoc --version
58 |   pre-commit install
59 | else
60 |   die "uv still cannot be found on PATH after installation; open a new shell and re-run this script."
61 | fi
62 | 


--------------------------------------------------------------------------------
/docs/images/babeldoc-banner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/funstory-ai/BabelDOC/1c4c25415984777b615eb88f299e88d006ffd16c/docs/images/babeldoc-banner.png


--------------------------------------------------------------------------------
/docs/images/babeldoc-big-logo-darkmode-with-transparent-background.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/funstory-ai/BabelDOC/1c4c25415984777b615eb88f299e88d006ffd16c/docs/images/babeldoc-big-logo-darkmode-with-transparent-background.png


--------------------------------------------------------------------------------
/docs/images/babeldoc-big-logo-darkmode-with-transparent-background.svg:
--------------------------------------------------------------------------------
 1 | <svg width="3033" height="1024" viewBox="0 0 3033 1024" fill="none" xmlns="http://www.w3.org/2000/svg">
 2 | <path d="M545.97 191.828L7.28418 333.753L176.393 975.617L715.079 833.692L545.97 191.828Z" fill="#D7DAE6"/>
 3 | <path d="M1023.27 162.578L572.766 44.2217L416.264 639.924L866.77 758.28L1023.27 162.578Z" fill="#D7DAE6"/>
 4 | <path d="M777.884 119.796H220.815V904.673H777.884V119.796Z" fill="#024AFF"/>
 5 | <path d="M571.439 253.802H99.1982V483.38H571.439V253.802Z" fill="white"/>
 6 | <path d="M279.087 328.642H152.667V353.926H279.087V328.642Z" fill="#024AFF"/>
 7 | <path d="M234.84 379.209H152.667V404.493H234.84V379.209Z" fill="#024AFF"/>
 8 | <path d="M902.217 541.153H429.976V770.732H902.217V541.153Z" fill="#FFBE19"/>
 9 | <path d="M669.862 512.262C669.862 513.463 669.799 514.664 669.799 515.865H695.083C695.083 514.664 695.146 513.463 695.146 512.262C695.146 439.697 655.45 376.298 596.729 342.48V372.379C640.912 403.225 669.862 454.425 669.862 512.262Z" fill="white"/>
10 | <path d="M404.693 654.049C358.929 623.455 328.778 571.307 328.778 512.269C328.778 511.068 328.841 509.867 328.841 508.666H303.557C303.557 509.867 303.494 511.068 303.494 512.269C303.494 585.972 344.391 650.256 404.693 683.631V654.049Z" fill="#FFBD14"/>
11 | <path d="M866.938 619.407H740.519V644.691H866.938V619.407Z" fill="#024AFF"/>
12 | <path d="M866.938 669.975H784.766V695.259H866.938V669.975Z" fill="#024AFF"/>
13 | <path d="M454.76 304.9H523.33V408.135H454.76V465.023H424.419V408.135H354.938V304.9H424.419V271.753L454.76 272.815V304.9ZM385.052 334.862V378.628H424.343V334.862H385.052ZM454.76 378.552H493.216V334.786H454.76V378.552Z" fill="#024AFF"/>
14 | <path d="M550.839 695.334L539.385 728.861H500.321L554.935 581.48H605.452L659.61 728.861H619.712L608.486 695.334H550.914H550.839ZM598.17 664.69L581.179 613.945H578.6L561.154 664.69H598.094H598.17Z" fill="#024AFF"/>
15 | <path d="M1189.6 740.358V305.79H1307.38C1338.99 305.79 1361.95 313.527 1376.27 329C1390.76 344.309 1398 369.659 1398 405.05V424.062C1398 444.474 1394.3 461.017 1386.89 473.692C1379.65 486.367 1368.54 494.679 1353.56 498.63C1373.14 503.568 1386.23 515.173 1392.81 533.445C1399.56 551.552 1402.94 573.692 1402.94 599.865C1402.94 628.013 1400.3 652.622 1395.04 673.692C1389.77 694.762 1380.22 711.14 1366.4 722.828C1352.57 734.515 1332.9 740.358 1307.38 740.358H1189.6ZM1274.54 469.494H1292.32C1300.39 469.494 1305.57 466.367 1307.88 460.111C1310.18 453.856 1311.33 446.367 1311.33 437.642V393.939C1311.33 379.947 1305.16 372.951 1292.81 372.951H1274.54V469.494ZM1283.19 663.815C1305.74 663.815 1317.01 653.116 1317.01 631.716V577.395C1317.01 565.05 1315.12 555.338 1311.33 548.26C1307.71 541.017 1300.8 537.395 1290.59 537.395H1274.54V663.321C1278.16 663.651 1281.05 663.815 1283.19 663.815ZM1497.51 744.309C1476.93 744.309 1461.21 739.535 1450.35 729.988C1439.65 720.441 1432.32 707.354 1428.37 690.729C1424.42 674.103 1422.44 655.256 1422.44 634.186C1422.44 611.634 1424.67 593.198 1429.11 578.877C1433.56 564.391 1441.13 552.539 1451.83 543.321C1462.69 534.103 1477.59 526.284 1496.52 519.865L1551.33 501.099V463.074C1551.33 443.157 1544.83 433.198 1531.83 433.198C1519.98 433.198 1514.05 441.264 1514.05 457.395V479.618H1428.12C1427.96 478.301 1427.88 476.655 1427.88 474.679C1427.88 472.539 1427.88 470.153 1427.88 467.519C1427.88 430.646 1436.52 404.556 1453.8 389.247C1471.25 373.774 1498.82 366.037 1536.52 366.037C1556.27 366.037 1573.97 369.577 1589.6 376.655C1605.24 383.568 1617.59 393.856 1626.64 407.519C1635.86 421.181 1640.47 438.054 1640.47 458.136V740.358H1552.57V696.408C1548.29 711.881 1541.21 723.733 1531.33 731.963C1521.46 740.194 1510.18 744.309 1497.51 744.309ZM1532.32 675.667C1539.56 675.667 1544.5 672.622 1547.14 666.531C1549.77 660.441 1551.09 653.856 1551.09 646.778V543.568C1538.91 548.507 1529.44 554.762 1522.69 562.334C1515.94 569.741 1512.57 580.688 1512.57 
595.173V642.828C1512.57 664.721 1519.15 675.667 1532.32 675.667ZM1822.44 744.309C1807.96 744.309 1796.19 740.77 1787.14 733.692C1778.08 726.449 1770.84 715.502 1765.41 700.852V740.358H1675.53V305.79H1765.41V409.494C1770.84 395.832 1778.33 385.214 1787.88 377.642C1797.59 369.906 1811.42 366.037 1829.36 366.037C1853.56 366.037 1870.59 374.844 1880.47 392.457C1890.51 410.07 1895.53 433.116 1895.53 461.593V643.815C1895.53 662.745 1892.9 679.865 1887.63 695.173C1882.53 710.317 1874.54 722.334 1863.68 731.223C1852.98 739.947 1839.23 744.309 1822.44 744.309ZM1785.9 675.42C1795.28 675.42 1800.96 670.811 1802.94 661.593C1804.91 652.375 1805.9 641.017 1805.9 627.519V482.828C1805.9 469.165 1804.91 457.807 1802.94 448.753C1800.96 439.535 1795.37 434.926 1786.15 434.926C1776.6 434.926 1770.76 439.618 1768.62 449C1766.48 458.218 1765.41 469.494 1765.41 482.828V627.519C1765.41 640.852 1766.48 652.21 1768.62 661.593C1770.76 670.811 1776.52 675.42 1785.9 675.42ZM2032.07 744.309C2005.74 744.309 1984.75 739.371 1969.11 729.494C1953.47 719.618 1942.28 705.379 1935.53 686.778C1928.78 668.177 1925.41 645.79 1925.41 619.618V470.729C1925.41 436.325 1935.28 410.317 1955.04 392.704C1974.79 374.926 2001.95 366.037 2036.52 366.037C2107.63 366.037 2143.19 400.935 2143.19 470.729V497.642C2143.19 530.235 2142.86 552.128 2142.2 563.321H2013.8V634.926C2013.8 641.511 2014.21 648.013 2015.04 654.432C2015.86 660.688 2017.67 665.873 2020.47 669.988C2023.43 674.103 2028.12 676.161 2034.54 676.161C2043.76 676.161 2049.52 672.21 2051.83 664.309C2054.13 656.243 2055.28 645.79 2055.28 632.951V597.395H2143.19V618.383C2143.19 646.037 2139.73 669.247 2132.81 688.013C2126.07 706.614 2114.54 720.688 2098.25 730.235C2082.12 739.618 2060.06 744.309 2032.07 744.309ZM2013.31 519.371H2055.28V469.494C2055.28 456.161 2053.64 446.614 2050.35 440.852C2047.05 434.926 2042.12 431.963 2035.53 431.963C2028.45 431.963 2022.94 434.762 2018.99 440.358C2015.2 445.955 2013.31 455.667 2013.31 469.494V519.371ZM2175.28 
740.358V305.79H2265.16V740.358H2175.28ZM2302.2 740.358V305.79H2421.7C2452.98 305.79 2476.52 314.515 2492.32 331.963C2508.12 349.247 2516.02 374.597 2516.02 408.013V611.47C2516.02 652.622 2508.78 684.391 2494.3 706.778C2479.98 729.165 2454.79 740.358 2418.74 740.358H2302.2ZM2389.61 663.568H2404.67C2420.63 663.568 2428.62 655.832 2428.62 640.358V416.161C2428.62 401.675 2426.64 392.375 2422.69 388.26C2418.91 383.98 2411.09 381.84 2399.23 381.84H2389.61V663.568ZM2657.75 744.309C2584.01 744.309 2547.14 706.037 2547.14 629.494V480.852C2547.14 445.955 2556.93 418.136 2576.52 397.395C2596.11 376.49 2623.19 366.037 2657.75 366.037C2692.49 366.037 2719.65 376.49 2739.23 397.395C2758.82 418.136 2768.62 445.955 2768.62 480.852V629.494C2768.62 706.037 2731.66 744.309 2657.75 744.309ZM2657.75 676.161C2665.16 676.161 2670.43 673.527 2673.56 668.26C2676.85 662.828 2678.49 655.996 2678.49 647.766V466.778C2678.49 445.214 2671.58 434.432 2657.75 434.432C2643.93 434.432 2637.01 445.214 2637.01 466.778V647.766C2637.01 655.996 2638.58 662.828 2641.7 668.26C2645 673.527 2650.35 676.161 2657.75 676.161ZM2912.81 744.309C2871.33 744.309 2841.87 733.692 2824.42 712.457C2807.14 691.223 2798.49 660.029 2798.49 618.877V504.803C2798.49 473.856 2801.79 448.095 2808.37 427.519C2814.95 406.943 2826.48 391.552 2842.94 381.346C2859.4 371.14 2882.28 366.037 2911.58 366.037C2931.99 366.037 2950.26 369.659 2966.4 376.902C2982.69 384.144 2995.53 394.762 3004.91 408.753C3014.3 422.745 3018.99 439.865 3018.99 460.111V516.655H2928.86V464.803C2928.86 456.243 2927.63 449.083 2925.16 443.321C2922.69 437.395 2917.34 434.432 2909.11 434.432C2894.63 434.432 2887.38 444.721 2887.38 465.297V644.803C2887.38 652.375 2889.03 659.371 2892.32 665.79C2895.61 672.046 2901.05 675.173 2908.62 675.173C2916.35 675.173 2921.7 672.128 2924.67 666.037C2927.79 659.782 2929.36 652.539 2929.36 644.309V582.087H3018.99V646.778C3018.99 667.19 3014.38 684.721 3005.16 699.371C2996.11 713.856 2983.6 724.967 2967.63 732.704C2951.66 
740.441 2933.39 744.309 2912.81 744.309Z" fill="white"/>
16 | </svg>
17 | 


--------------------------------------------------------------------------------
/docs/images/babeldoc-big-logo-with-transparent-background.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/funstory-ai/BabelDOC/1c4c25415984777b615eb88f299e88d006ffd16c/docs/images/babeldoc-big-logo-with-transparent-background.png


--------------------------------------------------------------------------------
/docs/images/babeldoc-big-logo-with-transparent-background.svg:
--------------------------------------------------------------------------------
 1 | <svg width="3033" height="1024" viewBox="0 0 3033 1024" fill="none" xmlns="http://www.w3.org/2000/svg">
 2 | <path d="M545.97 191.828L7.28418 333.753L176.393 975.617L715.079 833.692L545.97 191.828Z" fill="#D7DAE6"/>
 3 | <path d="M1023.27 162.578L572.766 44.2217L416.264 639.924L866.77 758.28L1023.27 162.578Z" fill="#D7DAE6"/>
 4 | <path d="M777.884 119.796H220.815V904.673H777.884V119.796Z" fill="#024AFF"/>
 5 | <path d="M571.439 253.802H99.1982V483.38H571.439V253.802Z" fill="white"/>
 6 | <path d="M279.087 328.642H152.667V353.926H279.087V328.642Z" fill="#024AFF"/>
 7 | <path d="M234.84 379.209H152.667V404.493H234.84V379.209Z" fill="#024AFF"/>
 8 | <path d="M902.217 541.153H429.976V770.732H902.217V541.153Z" fill="#FFBE19"/>
 9 | <path d="M669.862 512.262C669.862 513.463 669.799 514.664 669.799 515.865H695.083C695.083 514.664 695.146 513.463 695.146 512.262C695.146 439.697 655.45 376.298 596.729 342.48V372.379C640.912 403.225 669.862 454.425 669.862 512.262Z" fill="white"/>
10 | <path d="M404.693 654.049C358.929 623.455 328.778 571.307 328.778 512.269C328.778 511.068 328.841 509.867 328.841 508.666H303.557C303.557 509.867 303.494 511.068 303.494 512.269C303.494 585.972 344.391 650.256 404.693 683.631V654.049Z" fill="#FFBD14"/>
11 | <path d="M866.938 619.407H740.519V644.691H866.938V619.407Z" fill="#024AFF"/>
12 | <path d="M866.938 669.975H784.766V695.259H866.938V669.975Z" fill="#024AFF"/>
13 | <path d="M454.76 304.9H523.33V408.135H454.76V465.023H424.419V408.135H354.938V304.9H424.419V271.753L454.76 272.815V304.9ZM385.052 334.862V378.628H424.343V334.862H385.052ZM454.76 378.552H493.216V334.786H454.76V378.552Z" fill="#024AFF"/>
14 | <path d="M550.839 695.334L539.385 728.861H500.321L554.935 581.48H605.452L659.61 728.861H619.712L608.486 695.334H550.914H550.839ZM598.17 664.69L581.179 613.945H578.6L561.154 664.69H598.094H598.17Z" fill="#024AFF"/>
15 | <path d="M1189.6 740.358V305.79H1307.38C1338.99 305.79 1361.95 313.527 1376.27 329C1390.76 344.309 1398 369.659 1398 405.05V424.062C1398 444.474 1394.3 461.017 1386.89 473.692C1379.65 486.367 1368.54 494.679 1353.56 498.63C1373.14 503.568 1386.23 515.173 1392.81 533.445C1399.56 551.552 1402.94 573.692 1402.94 599.865C1402.94 628.013 1400.3 652.622 1395.04 673.692C1389.77 694.762 1380.22 711.14 1366.4 722.828C1352.57 734.515 1332.9 740.358 1307.38 740.358H1189.6ZM1274.54 469.494H1292.32C1300.39 469.494 1305.57 466.367 1307.88 460.111C1310.18 453.856 1311.33 446.367 1311.33 437.642V393.939C1311.33 379.947 1305.16 372.951 1292.81 372.951H1274.54V469.494ZM1283.19 663.815C1305.74 663.815 1317.01 653.116 1317.01 631.716V577.395C1317.01 565.05 1315.12 555.338 1311.33 548.26C1307.71 541.017 1300.8 537.395 1290.59 537.395H1274.54V663.321C1278.16 663.651 1281.05 663.815 1283.19 663.815ZM1497.51 744.309C1476.93 744.309 1461.21 739.535 1450.35 729.988C1439.65 720.441 1432.32 707.354 1428.37 690.729C1424.42 674.103 1422.44 655.256 1422.44 634.186C1422.44 611.634 1424.67 593.198 1429.11 578.877C1433.56 564.391 1441.13 552.539 1451.83 543.321C1462.69 534.103 1477.59 526.284 1496.52 519.865L1551.33 501.099V463.074C1551.33 443.157 1544.83 433.198 1531.83 433.198C1519.98 433.198 1514.05 441.264 1514.05 457.395V479.618H1428.12C1427.96 478.301 1427.88 476.655 1427.88 474.679C1427.88 472.539 1427.88 470.153 1427.88 467.519C1427.88 430.646 1436.52 404.556 1453.8 389.247C1471.25 373.774 1498.82 366.037 1536.52 366.037C1556.27 366.037 1573.97 369.577 1589.6 376.655C1605.24 383.568 1617.59 393.856 1626.64 407.519C1635.86 421.181 1640.47 438.054 1640.47 458.136V740.358H1552.57V696.408C1548.29 711.881 1541.21 723.733 1531.33 731.963C1521.46 740.194 1510.18 744.309 1497.51 744.309ZM1532.32 675.667C1539.56 675.667 1544.5 672.622 1547.14 666.531C1549.77 660.441 1551.09 653.856 1551.09 646.778V543.568C1538.91 548.507 1529.44 554.762 1522.69 562.334C1515.94 569.741 1512.57 580.688 1512.57 
595.173V642.828C1512.57 664.721 1519.15 675.667 1532.32 675.667ZM1822.44 744.309C1807.96 744.309 1796.19 740.77 1787.14 733.692C1778.08 726.449 1770.84 715.502 1765.41 700.852V740.358H1675.53V305.79H1765.41V409.494C1770.84 395.832 1778.33 385.214 1787.88 377.642C1797.59 369.906 1811.42 366.037 1829.36 366.037C1853.56 366.037 1870.59 374.844 1880.47 392.457C1890.51 410.07 1895.53 433.116 1895.53 461.593V643.815C1895.53 662.745 1892.9 679.865 1887.63 695.173C1882.53 710.317 1874.54 722.334 1863.68 731.223C1852.98 739.947 1839.23 744.309 1822.44 744.309ZM1785.9 675.42C1795.28 675.42 1800.96 670.811 1802.94 661.593C1804.91 652.375 1805.9 641.017 1805.9 627.519V482.828C1805.9 469.165 1804.91 457.807 1802.94 448.753C1800.96 439.535 1795.37 434.926 1786.15 434.926C1776.6 434.926 1770.76 439.618 1768.62 449C1766.48 458.218 1765.41 469.494 1765.41 482.828V627.519C1765.41 640.852 1766.48 652.21 1768.62 661.593C1770.76 670.811 1776.52 675.42 1785.9 675.42ZM2032.07 744.309C2005.74 744.309 1984.75 739.371 1969.11 729.494C1953.47 719.618 1942.28 705.379 1935.53 686.778C1928.78 668.177 1925.41 645.79 1925.41 619.618V470.729C1925.41 436.325 1935.28 410.317 1955.04 392.704C1974.79 374.926 2001.95 366.037 2036.52 366.037C2107.63 366.037 2143.19 400.935 2143.19 470.729V497.642C2143.19 530.235 2142.86 552.128 2142.2 563.321H2013.8V634.926C2013.8 641.511 2014.21 648.013 2015.04 654.432C2015.86 660.688 2017.67 665.873 2020.47 669.988C2023.43 674.103 2028.12 676.161 2034.54 676.161C2043.76 676.161 2049.52 672.21 2051.83 664.309C2054.13 656.243 2055.28 645.79 2055.28 632.951V597.395H2143.19V618.383C2143.19 646.037 2139.73 669.247 2132.81 688.013C2126.07 706.614 2114.54 720.688 2098.25 730.235C2082.12 739.618 2060.06 744.309 2032.07 744.309ZM2013.31 519.371H2055.28V469.494C2055.28 456.161 2053.64 446.614 2050.35 440.852C2047.05 434.926 2042.12 431.963 2035.53 431.963C2028.45 431.963 2022.94 434.762 2018.99 440.358C2015.2 445.955 2013.31 455.667 2013.31 469.494V519.371ZM2175.28 
740.358V305.79H2265.16V740.358H2175.28ZM2302.2 740.358V305.79H2421.7C2452.98 305.79 2476.52 314.515 2492.32 331.963C2508.12 349.247 2516.02 374.597 2516.02 408.013V611.47C2516.02 652.622 2508.78 684.391 2494.3 706.778C2479.98 729.165 2454.79 740.358 2418.74 740.358H2302.2ZM2389.61 663.568H2404.67C2420.63 663.568 2428.62 655.832 2428.62 640.358V416.161C2428.62 401.675 2426.64 392.375 2422.69 388.26C2418.91 383.98 2411.09 381.84 2399.23 381.84H2389.61V663.568ZM2657.75 744.309C2584.01 744.309 2547.14 706.037 2547.14 629.494V480.852C2547.14 445.955 2556.93 418.136 2576.52 397.395C2596.11 376.49 2623.19 366.037 2657.75 366.037C2692.49 366.037 2719.65 376.49 2739.23 397.395C2758.82 418.136 2768.62 445.955 2768.62 480.852V629.494C2768.62 706.037 2731.66 744.309 2657.75 744.309ZM2657.75 676.161C2665.16 676.161 2670.43 673.527 2673.56 668.26C2676.85 662.828 2678.49 655.996 2678.49 647.766V466.778C2678.49 445.214 2671.58 434.432 2657.75 434.432C2643.93 434.432 2637.01 445.214 2637.01 466.778V647.766C2637.01 655.996 2638.58 662.828 2641.7 668.26C2645 673.527 2650.35 676.161 2657.75 676.161ZM2912.81 744.309C2871.33 744.309 2841.87 733.692 2824.42 712.457C2807.14 691.223 2798.49 660.029 2798.49 618.877V504.803C2798.49 473.856 2801.79 448.095 2808.37 427.519C2814.95 406.943 2826.48 391.552 2842.94 381.346C2859.4 371.14 2882.28 366.037 2911.58 366.037C2931.99 366.037 2950.26 369.659 2966.4 376.902C2982.69 384.144 2995.53 394.762 3004.91 408.753C3014.3 422.745 3018.99 439.865 3018.99 460.111V516.655H2928.86V464.803C2928.86 456.243 2927.63 449.083 2925.16 443.321C2922.69 437.395 2917.34 434.432 2909.11 434.432C2894.63 434.432 2887.38 444.721 2887.38 465.297V644.803C2887.38 652.375 2889.03 659.371 2892.32 665.79C2895.61 672.046 2901.05 675.173 2908.62 675.173C2916.35 675.173 2921.7 672.128 2924.67 666.037C2927.79 659.782 2929.36 652.539 2929.36 644.309V582.087H3018.99V646.778C3018.99 667.19 3014.38 684.721 3005.16 699.371C2996.11 713.856 2983.6 724.967 2967.63 732.704C2951.66 
740.441 2933.39 744.309 2912.81 744.309Z" fill="#333333"/>
16 | </svg>
17 | 


--------------------------------------------------------------------------------
/docs/images/babeldoc-big-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/funstory-ai/BabelDOC/1c4c25415984777b615eb88f299e88d006ffd16c/docs/images/babeldoc-big-logo.png


--------------------------------------------------------------------------------
/docs/images/babeldoc-contributor_reward_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/funstory-ai/BabelDOC/1c4c25415984777b615eb88f299e88d006ffd16c/docs/images/babeldoc-contributor_reward_example.png


--------------------------------------------------------------------------------
/docs/images/babeldoc-preview.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/funstory-ai/BabelDOC/1c4c25415984777b615eb88f299e88d006ffd16c/docs/images/babeldoc-preview.gif


--------------------------------------------------------------------------------
/docs/images/babeldoc-small-logo-with-transparent-background.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/funstory-ai/BabelDOC/1c4c25415984777b615eb88f299e88d006ffd16c/docs/images/babeldoc-small-logo-with-transparent-background.png


--------------------------------------------------------------------------------
/docs/images/babeldoc-small-logo-with-transparent-background.svg:
--------------------------------------------------------------------------------
 1 | <svg width="1024" height="1024" viewBox="0 0 1024 1024" fill="none" xmlns="http://www.w3.org/2000/svg">
 2 | <path d="M545.008 191.828L6.32129 333.753L175.43 975.617L714.116 833.692L545.008 191.828Z" fill="#D7DAE6"/>
 3 | <path d="M1022.31 162.578L571.803 44.2217L415.301 639.924L865.807 758.28L1022.31 162.578Z" fill="#D7DAE6"/>
 4 | <path d="M776.921 119.796H219.853V904.673H776.921V119.796Z" fill="#024AFF"/>
 5 | <path d="M570.476 253.802H98.2354V483.38H570.476V253.802Z" fill="white"/>
 6 | <path d="M278.124 328.642H151.704V353.926H278.124V328.642Z" fill="#024AFF"/>
 7 | <path d="M233.877 379.209H151.704V404.493H233.877V379.209Z" fill="#024AFF"/>
 8 | <path d="M901.254 541.153H429.013V770.732H901.254V541.153Z" fill="#FFBE19"/>
 9 | <path d="M668.899 512.262C668.899 513.463 668.836 514.664 668.836 515.865H694.12C694.12 514.664 694.183 513.463 694.183 512.262C694.183 439.697 654.488 376.298 595.766 342.48V372.379C639.949 403.225 668.899 454.425 668.899 512.262Z" fill="white"/>
10 | <path d="M403.73 654.049C357.966 623.455 327.815 571.307 327.815 512.269C327.815 511.068 327.878 509.867 327.878 508.666H302.594C302.594 509.867 302.531 511.068 302.531 512.269C302.531 585.972 343.428 650.256 403.73 683.631V654.049Z" fill="#FFBD14"/>
11 | <path d="M865.976 619.407H739.557V644.691H865.976V619.407Z" fill="#024AFF"/>
12 | <path d="M865.976 669.975H783.803V695.259H865.976V669.975Z" fill="#024AFF"/>
13 | <path d="M453.797 304.9H522.367V408.135H453.797V465.023H423.456V408.135H353.976V304.9H423.456V271.753L453.797 272.815V304.9ZM384.089 334.862V378.628H423.38V334.862H384.089ZM453.797 378.552H492.254V334.786H453.797V378.552Z" fill="#024AFF"/>
14 | <path d="M549.876 695.334L538.422 728.861H499.358L553.972 581.48H604.489L658.647 728.861H618.749L607.523 695.334H549.952H549.876ZM597.207 664.69L580.217 613.945H577.638L560.192 664.69H597.131H597.207Z" fill="#024AFF"/>
15 | </svg>
16 | 


--------------------------------------------------------------------------------
/docs/images/babeldoc-small-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/funstory-ai/BabelDOC/1c4c25415984777b615eb88f299e88d006ffd16c/docs/images/babeldoc-small-logo.png


--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | 
2 | {!README.md!}
3 | 


--------------------------------------------------------------------------------
/docs/intro-to-pdf-object.md:
--------------------------------------------------------------------------------
  1 | An Introduction to PDF Object Definitions in dpml
  2 | ===
  3 | 
  4 | ## 1. Understanding PDF Structure
  5 | A PDF file is fundamentally an indexed collection of objects, where each object represents a structured data unit. The file structure consists of four main components:
  6 | 
  7 | 1. A header
  8 | 2. Object definitions
  9 | 3. A cross-reference table
 10 | 4. A trailer
 11 | 
 12 | The cross-reference table serves as a lookup directory, mapping each numbered object to its byte offset location within the file. The trailer contains critical metadata, including the location of the root object (document catalog), which serves as the entry point for PDF interpretation. The file concludes with a byte offset pointing to the cross-reference table.
 13 | 
 14 | Here's an illustrative example of a PDF file structure:
 15 | 
 16 | ```pdf
 17 | %PDF-2.0
 18 | 1 0 obj
 19 | <<
 20 |   /Pages 2 0 R
 21 |   /Type /Catalog
 22 | >>
 23 | endobj
 24 | 2 0 obj
 25 | <<
 26 |   /Count 1
 27 |   /Kids [
 28 |     3 0 R
 29 |   ]
 30 |   /Type /Pages
 31 | >>
 32 | endobj
 33 | 3 0 obj
 34 | <<
 35 |   /Contents 4 0 R
 36 |   /MediaBox [ 0 0 612 792 ]
 37 |   /Parent 2 0 R
 38 |   /Resources <<
 39 |     /Font << /F1 5 0 R >>
 40 |   >>
 41 |   /Type /Page
 42 | >>
 43 | endobj
 44 | 4 0 obj
 45 | <<
 46 |   /Length 44
 47 | >>
 48 | stream
 49 | BT
 50 |   /F1 24 Tf
 51 |   72 720 Td
 52 |   (Potato) Tj
 53 | ET
 54 | endstream
 55 | endobj
 56 | 5 0 obj
 57 | <<
 58 |   /BaseFont /Helvetica
 59 |   /Encoding /WinAnsiEncoding
 60 |   /Subtype /Type1
 61 |   /Type /Font
 62 | >>
 63 | endobj
 64 | 
 65 | xref
 66 | 0 6
 67 | 0000000000 65535 f 
 68 | 0000000009 00000 n 
 69 | 0000000062 00000 n 
 70 | 0000000133 00000 n 
 71 | 0000000277 00000 n 
 72 | 0000000372 00000 n 
 73 | trailer <<
 74 |   /Root 1 0 R
 75 |   /Size 6
 76 |   /ID [<42841c13bbf709d79a200fa1691836f8><b1d8b5838eeafe16125317aa78e666aa>]
 77 | >>
 78 | startxref
 79 | 478
 80 | %%EOF
 81 | ```
 82 | 
 83 | ### PDF File Interpretation
 84 | When a PDF viewer processes a file, it follows these steps:
 85 | 
 86 | 1. Starts at the file's end to locate the cross-reference table offset
 87 | 2. Accesses the cross-reference table to find object locations
 88 | 3. Reads the trailer dictionary to identify the document catalog
 89 | 4. Uses the document catalog to access various document components:
 90 |    - Pages
 91 |    - Outlines
 92 |    - Thumbnails
 93 |    - Annotations
 94 |    - Other PDF elements
 95 | 
 96 | The pages tree root is particularly crucial as it enables navigation to specific pages within the document.
 97 | 
 98 | ### Example Interpretation Flow
 99 | Let's trace through our example:
100 | 
101 | 1. The cross-reference table begins at byte offset 478 (indicated after `startxref`)
102 | 2. The trailer identifies object 1 as the document catalog (`/Root 1 0 R`)
103 | 3. Object 1 is located at byte offset 9
104 | 4. The document catalog points to object 2 as the pages tree root
105 | 5. Object 2 is found at byte offset 62
106 | 6. The pages tree identifies object 3 as the first (and only) page
107 | 7. Object 3 is positioned at byte offset 133
108 | 8. Object 3 defines the page properties and links to object 4 for content
109 | 9. Object 4, at byte offset 277, contains the drawing instructions for rendering "Potato"
110 | 
111 | This structure enables efficient random access to any part of the PDF document.
112 | 
113 | ## 2. PDF Objects
114 | 
115 | Earlier, we discussed PDF objects and introduced the concept of dictionaries. At the top level of a PDF file, objects are identified by two numbers followed by the keyword "obj". The first number serves as the object number, while the second—known as the generation number—is typically 0. Everything between these identifiers and the "endobj" keyword constitutes the object's body.
116 | 
117 | The PDF specification provides a mechanism for modifying files by appending object updates and cross-reference table entries. When an object's contents are completely replaced (rather than modified), its generation number can be incremented. This allows object numbers to be reused while preventing old indirect references from resolving to new objects. However, such files are rare in practice, and generation numbers can generally be disregarded. Objects stored in object streams (introduced in PDF 1.5) do not even carry an explicit generation number; it is implicitly 0.
118 | 
119 | PDF objects share similarities with data structures found in JSON, YAML, and modern programming languages, though PDF includes some unique object types. Here are the available PDF object types:
120 | 
121 | - String: A text sequence enclosed in parentheses, e.g., (potato). Note that PDF strings typically don't support full Unicode encoding, though there are specific cases where this is possible. (A detailed discussion of character encoding is beyond our current scope.)
122 | 
123 | - Number: Both integers and floating-point numbers (e.g., 12, 3.14159). While the PDF specification distinguishes between integers and real numbers, they're often interchangeable in practice—integers can be used where real numbers are expected, and viewers typically handle real numbers appropriately when integers are required.
124 | 
125 | - Boolean: Simple true/false values
126 | 
127 | - Null: Represented by the keyword "null"
128 | 
129 | - Name: A keyword or dictionary key identifier starting with a forward slash (/), e.g., /Type
130 | 
131 | - Array: An ordered collection of objects enclosed in square brackets, with no separators between items. Arrays support nested structures, including other arrays and dictionaries. Example: `[1 (two) 3.14 false]`
132 | 
133 | - Dictionary: A collection of key-value pairs where keys are Names and values can be any object type. Dictionaries are enclosed in << and >> with no separators between entries. Example: `<< /A 1 /B [2 3 <</Four 4>>] >>`
134 | 
135 | - Indirect object reference: A reference to a numbered object in the file, consisting of two numbers (object and generation) followed by 'R', e.g., 1 0 R. While some objects must be direct per the PDF specification, most can be defined at the top level and referenced indirectly.
136 | 
137 | - Stream: A container for binary data, structured as a dictionary (containing at least a /Length key and other format-specific entries) followed by the specified number of bytes between "stream" and "endstream" keywords. 🔍 The stream length can be specified as an indirect object, enabling single-pass PDF generation where the stream length isn't known in advance—a common practice in PDF creation.
138 | 
139 | ## 3. PDF Object Definitions In dpml
140 | 
141 | ### Coordinate system definition
142 | 
143 | The positive x-axis extends horizontally to the right, while the positive y-axis extends vertically upward, following
144 | standard mathematical conventions. The unit length along both the x and y axes is defined as 1/72 inch (or 1 point).
145 | 
146 | ## 4. Useful Information
147 | 
148 | - [PDF32000_2008](https://opensource.adobe.com/dc-acrobat-sdk-docs/pdfstandards/PDF32000_2008.pdf) page 111: Table 51 - Operator Categories


--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx>=8.2.0
2 | sphinx-click>=5.1.0
3 | furo>=2024.1.29
4 | myst-parser[linkify,html_meta,html_admonition]>=2.0.0 


--------------------------------------------------------------------------------
/examples/basic.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <wp:document xmlns:wp="urn:ns:yadt:dpml">
 3 |     <wp:page>
 4 |         <wp:p offsetX="120pt" offsetY="100pt" width="100%" height="auto" align="justify">
 5 |             <wp:run font-family="Arial" color="000000">
 6 |                 This is a simple paragraph with some text.
 7 |                 <wp:break type="line"/>
 8 |                 And this is a new line.
 9 |             </wp:run>
10 |         </wp:p>
11 |     </wp:page>
12 | </wp:document>


--------------------------------------------------------------------------------
/examples/ci/test.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/funstory-ai/BabelDOC/1c4c25415984777b615eb88f299e88d006ffd16c/examples/ci/test.pdf


--------------------------------------------------------------------------------
/examples/code-figure.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <wp:document xmlns:wp="urn:ns:yadt:dpml">
 3 |     <wp:page>
 4 |         <wp:object offsetX="20pt" offsetY="20pt" width="90%" height="auto">
 5 |             <wp:codeblock language="python">
 6 |                 <wp:codeline number="1" highlight="true">def hello_world():</wp:codeline>
 7 |                 <wp:codeline number="2">    print("Hello, World!")</wp:codeline>
 8 |                 <wp:codeline number="3" highlight="false">    return None</wp:codeline>
 9 |             </wp:codeblock>
10 |         </wp:object>
11 |         <wp:object offsetX="20pt" offsetY="20pt" width="90%" height="auto">
12 |             <wp:figure 
13 |                 src="/path/to/image.png" 
14 |                 width="300px" 
15 |                 height="200px"
16 |                 caption="Sample figure caption"
17 |                 title="Sample figure title"/>
18 |         </wp:object>
19 |     </wp:page>
20 | </wp:document>


--------------------------------------------------------------------------------
/examples/formular.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <wp:document xmlns:wp="urn:ns:yadt:dpml">
 3 |     <wp:page>
 4 |         <wp:p align="center">
 5 |             <wp:run>Here's a mathematical formula: 
 6 |             <wp:math>\frac{-b \pm \sqrt{b^2-4ac}}{2a}</wp:math>
 7 |             And here's a special symbol: 
 8 |             <wp:symbol src="/path/to/symbol.svg"/>
 9 |             </wp:run>
10 | 
11 |         </wp:p>
12 |     </wp:page>
13 | </wp:document>


--------------------------------------------------------------------------------
/examples/table.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <wp:document xmlns:wp="urn:ns:yadt:dpml">
 3 |     <wp:page>
 4 |         <wp:table offsetX="20pt" offsetY="20pt" width="90%" height="auto" frame="all" framestyle="single">
 5 |             <wp:cols>
 6 |                 <wp:col colwidth="30%"/>
 7 |                 <wp:col colwidth="70%"/>
 8 |             </wp:cols>
 9 |             <wp:thead>
10 |                 <wp:tr>
11 |                     <wp:td align="center" shade="E0E0E0">
12 |                         <wp:p>
13 |                             <wp:run font-family="Arial Bold">Header 1</wp:run>
14 |                         </wp:p>
15 |                     </wp:td>
16 |                     <wp:td align="center" shade="E0E0E0">
17 |                         <wp:p>
18 |                             <wp:run font-family="Arial Bold">Header 2</wp:run>
19 |                         </wp:p>
20 |                     </wp:td>
21 |                 </wp:tr>
22 |             </wp:thead>
23 |             <wp:tbody>
24 |                 <wp:tr>
25 |                     <wp:td>
26 |                         <wp:p>
27 |                             <wp:run>Cell 1</wp:run>
28 |                         </wp:p>
29 |                     </wp:td>
30 |                     <wp:td>
31 |                         <wp:p>
32 |                             <wp:run>Cell 2</wp:run>
33 |                         </wp:p>
34 |                     </wp:td>
35 |                 </wp:tr>
36 |             </wp:tbody>
37 |         </wp:table>
38 |     </wp:page>
39 | </wp:document>


--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2016-2025 Martin Donath <martin.donath@squidfunk.com>
  2 | 
  3 | # Permission is hereby granted, free of charge, to any person obtaining a copy
  4 | # of this software and associated documentation files (the "Software"), to
  5 | # deal in the Software without restriction, including without limitation the
  6 | # rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  7 | # sell copies of the Software, and to permit persons to whom the Software is
  8 | # furnished to do so, subject to the following conditions:
  9 | 
 10 | # The above copyright notice and this permission notice shall be included in
 11 | # all copies or substantial portions of the Software.
 12 | 
 13 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 14 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 15 | # FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE
 16 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 17 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 18 | # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 19 | # IN THE SOFTWARE.
 20 | 
 21 | # Project information
 22 | site_name: BabelDOC
 23 | site_url: https://squidfunk.github.io/mkdocs-material/
 24 | site_author: funstory.ai
 25 | site_description: >-
 26 |   BabelDOC – Yet Another Document Translator: a Python library and
 27 |   command-line tool for translating PDF documents
 28 | 
 29 | # Repository
 30 | repo_name: funstory-ai/BabelDOC
 31 | repo_url: https://github.com/funstory-ai/BabelDOC
 32 | edit_uri: edit/main/docs/
 33 | 
 34 | # Copyright
 35 | copyright: Copyright &copy; 2025 funstory.ai
 36 | 
 37 | # Configuration
 38 | theme:
 39 |   name: material
 40 |   # custom_dir: material/overrides
 41 |   features:
 42 |     - announce.dismiss
 43 |     - content.action.edit
 44 |     - content.action.view
 45 |     - content.code.annotate
 46 |     - content.code.copy
 47 |     - content.code.select
 48 |     # - content.footnote.tooltips
 49 |     # - content.tabs.link
 50 |     - content.tooltips
 51 |     # - header.autohide
 52 |     # - navigation.expand
 53 |     - navigation.footer
 54 |     - navigation.indexes
 55 |     # - navigation.instant
 56 |     # - navigation.instant.prefetch
 57 |     # - navigation.instant.progress
 58 |     # - navigation.prune
 59 |     - navigation.sections
 60 |     - navigation.tabs
 61 |     # - navigation.tabs.sticky
 62 |     - navigation.top
 63 |     - navigation.tracking
 64 |     - search.highlight
 65 |     - search.share
 66 |     - search.suggest
 67 |     - toc.follow
 68 |     # - toc.integrate
 69 |   palette:
 70 |     - media: "(prefers-color-scheme)"
 71 |       toggle:
 72 |         icon: material/brightness-auto
 73 |         name: Switch to light mode
 74 |     - media: "(prefers-color-scheme: light)"
 75 |       scheme: default
 76 |       primary: white
 77 |       accent: indigo
 78 |       toggle:
 79 |         icon: material/brightness-7
 80 |         name: Switch to dark mode
 81 |     - media: "(prefers-color-scheme: dark)"
 82 |       scheme: slate
 83 |       primary: black
 84 |       accent: indigo
 85 |       toggle:
 86 |         icon: material/brightness-4
 87 |         name: Switch to system preference
 88 |   font:
 89 |     text: Roboto
 90 |     code: Roboto Mono
 91 |   # favicon: assets/favicon.png
 92 |   favicon: images/babeldoc-small-logo-with-transparent-background.svg
 93 |   logo: images/babeldoc-small-logo-with-transparent-background.svg
 94 | 
 95 | # Plugins
 96 | plugins:
 97 |   - search:
 98 |       separator: '[\s\u200b\-_,:!=\[\]()"`/]+|\.(?!\d)|&[lg]t;|(?!\b)(?=[A-Z][a-z])'
 99 |   - minify:
100 |       minify_html: true
101 |   - git-authors
102 |   - git-revision-date-localized:
103 |       enable_creation_date: true
104 | # Additional configuration
105 | extra:
106 |   status:
107 |     new: Recently added
108 |     deprecated: Deprecated
109 |   social:
110 |     - icon: fontawesome/brands/github
111 |       link: https://github.com/funstory-ai/BabelDOC
112 |     - icon: fontawesome/brands/python
113 |       link: https://pypi.org/project/BabelDOC/
114 | 
115 | # Extensions
116 | markdown_extensions:
117 |   - github-callouts
118 |   - markdown_include.include
119 |   - pymdownx.highlight:
120 |       anchor_linenums: true
121 |       line_spans: __span
122 |       pygments_lang_class: true
123 |   - pymdownx.inlinehilite
124 |   - pymdownx.snippets
125 |   - pymdownx.superfences
126 |   - def_list
127 |   - pymdownx.tasklist:
128 |       custom_checkbox: true
129 | not_in_nav: |
130 |   /tutorials/**/*.md
131 | 
132 | # Page tree
133 | nav:
134 |   - Home: index.md
135 |   - API:
136 |     - Async Translation API: ImplementationDetails/AsyncTranslate/AsyncTranslate.md
137 |   - Implementation Details:
138 |     - ImplementationDetails/README.md
139 |     - PDF Parsing: ImplementationDetails/PDFParsing/PDFParsing.md
140 |     - Layout Parser(.py): https://github.com/funstory-ai/BabelDOC/blob/main/babeldoc/document_il/midend/layout_parser.py
141 |     - Paragraph Finding: ImplementationDetails/ParagraphFinding/ParagraphFinding.md
142 |     - Styles and Formulas: ImplementationDetails/StylesAndFormulas/StylesAndFormulas.md
143 |     - IL Translator: ImplementationDetails/ILTranslator/ILTranslator.md
144 |     - Typesetting: ImplementationDetails/Typesetting/Typesetting.md
145 |     - Font Mapper(.py): https://github.com/funstory-ai/BabelDOC/blob/main/babeldoc/document_il/utils/fontmap.py
146 |     - PDF Creation: ImplementationDetails/PDFCreation/PDFCreation.md
147 |     - Intro To PDF Object: intro-to-pdf-object.md
148 |   - Community:
149 |     - Code of Conduct: CODE_OF_CONDUCT.md
150 |     - Contributing:
151 |       - Contributing: CONTRIBUTING.md
152 |       - Contributor Reward: CONTRIBUTOR_REWARD.md


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
  1 | [project]
  2 | name = "BabelDOC"
  3 | version = "0.3.21"
  4 | description = "Yet Another Document Translator"
  5 | license = "AGPL-3.0"
  6 | readme = "README.md"
  7 | requires-python = ">=3.10,<3.13"
  8 | authors = [
  9 |     { name = "awwaawwa", email = "aw@funstory.ai" }
 10 | ]
 11 | maintainers = [
 12 |     { name = "awwaawwa", email = "aw@funstory.ai" }
 13 | ]
 14 | classifiers = [
 15 |     "Programming Language :: Python :: 3",
 16 |     "Operating System :: OS Independent",
 17 | ]
 18 | keywords = ["PDF"]
 19 | dependencies = [
 20 |     "bitstring>=4.3.0",
 21 |     "configargparse>=1.7",
 22 |     "httpx[socks]>=0.27.0",
 23 |     "huggingface-hub>=0.27.0",
 24 |     "numpy>=2.0.2",
 25 |     "onnx>=1.17.0",
 26 |     "onnxruntime>=1.16.1",
 27 |     "openai>=1.59.3",
 28 |     "orjson>=3.10.14",
 29 |     "pdfminer-six>=20240706",
 30 |     "peewee>=3.17.8",
 31 |     "psutil>=7.0.0",
 32 |     "pymupdf>=1.25.1",
 33 |     "rich>=13.9.4",
 34 |     "toml>=0.10.2",
 35 |     "tqdm>=4.67.1",
 36 |     "xsdata[cli,lxml,soap]>=24.12",
 37 |     "msgpack>=1.1.0",
 38 |     "pydantic>=2.10.6",
 39 |     "tenacity>=9.0.0",
 40 |     "scikit-image>=0.25.2",
 41 |     "freetype-py>=2.5.1",
 42 |     "tiktoken>=0.9.0",
 43 |     "python-levenshtein>=0.27.1",
 44 |     "opencv-python-headless>=4.10.0.84",
 45 |     "rapidocr-onnxruntime>=1.4.4",
 46 | ]
 47 | 
 48 | [project.optional-dependencies]
 49 | directml = ["onnxruntime-directml>=1.16.1"]
 50 | cuda = ["onnxruntime-gpu>=1.16.1"]
 51 | memray = ["memray>=1.17.1"]
 52 | 
 53 | [project.urls]
 54 | Homepage = "https://github.com/funstory-ai/BabelDOC"
 55 | Issues = "https://github.com/funstory-ai/BabelDOC/issues"
 56 | 
 57 | [project.scripts]
 58 | babeldoc = "babeldoc.main:cli"
 59 | 
 60 | [build-system]
 61 | requires = ["hatchling"]
 62 | build-backend = "hatchling.build"
 63 | 
 64 | [tool.flake8]
 65 | ignore = ["E203", "E261", "E501", "W503", "E741", "E501"]
 66 | max-line-length = 88
 67 | 
 68 | [tool.ruff]
 69 | src = ["babeldoc"]
 70 | target-version = "py310"
 71 | show-fixes = true
 72 | 
 73 | [tool.ruff.format]
 74 | # Enable reformatting of code snippets in docstrings.
 75 | docstring-code-format = true
 76 | 
 77 | [tool.ruff.lint]
 78 | ignore = [
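 79 |     "E203",   # whitespace before ':'
 80 |     "E261",   # at least two spaces before an inline comment
 81 |     "E501",   # line too long
 82 |     "E741",   # ambiguous variable name
 83 |     "F841",   # unused variable
 84 |     "C901",   # function is too complex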
 85 |     "S101",   # use assert
 86 |     "SIM",    # flake8-simplify
 87 |     "ARG002", # unused argument
 88 |     "S110",   # `try`-`except`-`pass` detected, consider logging the exception
 89 |     "B024",   # abstract class without abstract methods
 90 |     "S112",   # `try`-`except`-`continue` detected, consider logging the exception
 91 |     "COM812", # missing-trailing-comma
 92 | 
 93 | ]
 94 | select = [
 95 |     "E",   # pycodestyle errors
 96 |     "F",   # Pyflakes
 97 |     "N",   # PEP 8 naming
 98 |     "B",   # flake8-bugbear
 99 |     "I",   # isort
100 |     "C",   # mccabe
101 |     "UP",  # pyupgrade
102 |     "S",   # flake8-bandit
103 |     "A",   # flake8-builtins
104 |     "COM", # flake8-commas
105 |     "ARG", # flake8-unused-arguments
106 |     "PTH", # use pathlib
107 | ]
108 | 
109 | [tool.ruff.lint.flake8-quotes]
110 | docstring-quotes = "double"
111 | 
112 | [tool.ruff.lint.flake8-annotations]
113 | suppress-none-returning = true
114 | 
115 | [tool.ruff.lint.isort]
116 | force-single-line = true
117 | 
118 | [tool.ruff.lint.pydocstyle]
119 | convention = "google"
120 | 
121 | # Rule-specific configuration
122 | [tool.ruff.lint.mccabe]
123 | max-complexity = 10 # cyclomatic complexity threshold for functions
124 | 
125 | [tool.ruff.lint.per-file-ignores]
126 | "babeldoc/pdfinterp.py" = ["N"] # ignore naming conventions
127 | "tests/*" = ["S101"]            # allow assert in test files
128 | "**/__init__.py" = ["F401"]     # allow unused imports
129 | # Ignore S311 warnings here because the usage is intentional
130 | "babeldoc/document_il/midend/paragraph_finder.py" = ["S311"]
131 | "docs/*" = ["A001"]
132 | [dependency-groups]
133 | dev = [
134 |     "bumpver>=2024.1130",
135 |     "markdown-callouts>=0.4.0",
136 |     "markdown-include>=0.8.1",
137 |     "mkdocs-git-authors-plugin>=0.9.2",
138 |     "mkdocs-git-committers-plugin-2>=2.5.0",
139 |     "mkdocs-git-revision-date-localized-plugin>=1.3.0",
140 |     "mkdocs-material[recommended]>=9.6.4",
141 |     "pre-commit>=4.1.0",
142 |     "pygments>=2.19.1",
143 |     "ruff>=0.9.2",
144 |     "pytest>=8.3.4",
145 | ]
146 | 
147 | [tool.pytest.ini_options]
148 | pythonpath = [".", "src"]
149 | testpaths = ["tests"]
150 | 
151 | [bumpver]
152 | current_version = "0.3.21"
153 | version_pattern = "MAJOR.MINOR.PATCH[.PYTAGNUM]"
154 | 
155 | [bumpver.file_patterns]
156 | "pyproject.toml" = [
157 |     'current_version = "{version}"',
158 |     'version = "{version}"'
159 | ]
160 | "babeldoc/__init__.py" = [
161 |     '__version__ = "{version}"'
162 | ]
163 | "babeldoc/main.py" = [
164 |     '__version__ = "{version}"'
165 | ]
166 | "babeldoc/const.py" = [
167 |     '__version__ = "{version}"'
168 | ]
169 | 


--------------------------------------------------------------------------------
/tests/test_translation_config.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | from babeldoc.translation_config import ConfigModel
 4 | 
 5 | # Since it is necessary to test whether the functionality meets the expected requirements,
 6 | # private functions and private methods are allowed to be called.
 7 | # pyright: reportPrivateUsage=false
 8 | 
 9 | 
10 | class TestConfigArgs:
11 |     def test_page_range_regex(self):
12 |         test_strings = [
13 |             "1,3,5,7,9",
14 |             "1-3,5,7-9",
15 |             "1,2-4,5,6-8,9",
16 |             "10-12,14,16-18",
17 |             "1-,5",
18 |             "-5,10",
19 |             "1-, 5, -3, 10-12",
20 |         ]
21 |         pattern = ConfigModel._page_range_pattern()
22 |         for string in test_strings:
23 |             assert re.match(pattern, string)
24 | 


--------------------------------------------------------------------------------