├── .env_example ├── .github └── workflows │ └── deploy_docs.yml ├── .gitignore ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── LICENSE ├── Makefile ├── README.md ├── docs ├── .nojekyll ├── Makefile ├── api.rst ├── benchmark.csv ├── benchmark.rst ├── conf.py ├── contributing.rst ├── index.rst ├── installation.rst ├── make.bat ├── requirements.txt └── update_benchmarks.py ├── examples ├── example_notebook.ipynb ├── example_notebook_colab.ipynb ├── inputs │ ├── bench_md.pdf │ ├── benchmark.pdf │ ├── costco_bill.jpg │ ├── cvs_coupon.jpg │ ├── grocery_bill.jpg │ ├── medical_invoice_sample1.png │ ├── medical_travel_request_OWCP_957.png │ ├── sample.docx │ ├── sample.pptx │ ├── sample.xlsx │ ├── sample_test.txt │ ├── sample_test_doc.pdf │ ├── screenshot-1.png │ ├── stress_test │ │ ├── large_doc_1.pdf │ │ └── large_doc_2.pdf │ ├── test_1.pdf │ ├── test_2.pdf │ ├── test_3.pdf │ ├── test_4.jpg │ ├── test_5.jpg │ ├── test_explicit_hyperlink_n_img.pdf │ ├── test_hidden_link_with_image.pdf │ └── test_with_hidden_links_no_img.pdf └── outputs │ ├── benchmark.md │ ├── costco_bill.md │ ├── cvs_coupon.md │ ├── grocery_bill.md │ ├── medical_invoice_sample1.md │ ├── medical_travel_request_OWCP_957.md │ ├── test_1.md │ ├── test_2.md │ ├── test_3.md │ ├── test_4.md │ └── test_5.md ├── lexoid ├── api.py └── core │ ├── parse_type │ ├── llm_parser.py │ └── static_parser.py │ ├── prompt_templates.py │ └── utils.py ├── poetry.lock ├── pyproject.toml └── tests ├── api_cost_mapping.json ├── benchmark.py ├── env_template └── test_parser.py /.env_example: -------------------------------------------------------------------------------- 1 | GOOGLE_API_KEY="" 2 | OPENAI_API_KEY="" 3 | HUGGINGFACEHUB_API_TOKEN="" 4 | TOGETHER_API_KEY="" -------------------------------------------------------------------------------- /.github/workflows/deploy_docs.yml: -------------------------------------------------------------------------------- 1 | name: Docs 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - '**docs**' 8 | paths: 9 | - 'docs/**' 10 | - '.github/workflows/deploy_docs.yml' 11 | 12 | jobs: 13 | build-docs: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - name: Checkout repository 17 | uses: actions/checkout@v4 18 | 19 | - name: Set up Python 20 | uses: actions/setup-python@v5 21 | with: 22 | python-version: "3.11" 23 | 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install sphinx docutils 28 | pip install -r docs/requirements.txt || true 29 | 30 | - name: Build Sphinx documentation 31 | run: | 32 | sphinx-build -b html docs/ docs/_build/html 33 | 34 | - name: Upload documentation artifact 35 | uses: actions/upload-pages-artifact@v3 36 | with: 37 | path: docs/_build/html 38 | 39 | deploy: 40 | needs: build-docs 41 | runs-on: ubuntu-latest 42 | permissions: 43 | pages: write 44 | id-token: write 45 | environment: 46 | name: github-pages 47 | url: ${{ steps.deployment.outputs.page_url }} 48 | steps: 49 | - name: Deploy to GitHub Pages 50 | id: deployment 51 | uses: actions/deploy-pages@v4 52 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 
23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. 
For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 162 | #.idea/ 163 | 164 | # Custom 165 | tests/outputs/ 166 | outputs/ 167 | inputs/ 168 | 169 | # Others 170 | .DS_Store -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Change Log 2 | 3 | ## [0.1.1] - 2024-10-28 4 | 5 | ### Added 6 | - Support for URL parsing 7 | 8 | ### Changed 9 | 10 | ### Fixed 11 | 12 | ## [0.1.2] - 2024-11-04 13 | 14 | ### Added 15 | - Initial testing code 16 | - Benchmarking code 17 | 18 | ### Changed 19 | - Improvements in OpenAI prompt 20 | - Conversion of PDFs to images before parsing with OpenAI models 21 | 22 | ### Fixed 23 | 24 | 25 | ## [0.1.3] - 2024-11-12 26 | 27 | ### Added 28 | - `AUTO` parse mode 29 | 30 | ### Changed 31 | - Switch from multithreading to multiprocessing 32 | 33 | ### Fixed 34 | 35 | ## [0.1.4] - 2024-11-22 36 | 37 | ### Added 38 | - Support for structured parsing of HTML pages 39 | - Support for recursive URL parsing in websites and PDFs 40 | 41 | ### Changed 42 | - URL extraction regex 43 | 44 | ### Fixed 45 | - Bug in document appending logic 46 | - Bug caused by split PDFs being in the same directory as the source PDF 47 | 48 | ## [0.1.5] - 2024-12-06 49 | 50 | ### Added 51 | 52 | ### Changed 53 | - Improved pdfplumber parsing to format markdown and detect hyperlinks 54 | 55 | ### Fixed 56 | 57 | ## [0.1.6] - 2024-12-10 58 | 59 | ### Added 60 | * Support for parsing .csv, .txt, .html, and .docx files 61 | * Support for parsing links to documents during recursive HTML parsing 62 | 63 | ### Changed 64 | 65 | ### Fixed 66 | 67 | ## [0.1.7] - 2025-01-08 68 | 69 | ### Added 70 | * Colab example notebook 71 | * Support for bold and italic formatting in PDFPlumber 72 | * Support for Llama 3.2 models through HuggingFace and Together AI 73 | 74 | ### Changed 75 | * Improved PDFPlumber table parsing 76 | 77 | ### Fixed 78 | * PDFPlumber text detection bug 79 | 80 | ## [0.1.8] - 2025-01-23 81 | 82 | ### Added 83 | * Retry and error handling for LLM_PARSE 84 | 85 | ### Changed 86 | * Remove together Python client dependency and use REST API calls instead 87 | 88 | ## [0.1.8.post1] - 2025-01-28 89 | 90 | ### Added 91 | * Documentation 92 | 93 | ### Changed 94 | * Specify headers for Playwright web page retrieval 95 | 96 | ## [0.1.9] - 2025-02-17 97 | 98 | ### Added 99 | - Parameters to specify intermediate PDF save path when `as_pdf=True`. 100 | - Return `token_usage` and `pdf_path` with `parse()` output where applicable 101 | 102 | ### Changed 103 | - Switched back to together Python client 104 | - Improved `parse()` function return format to be a dictionary.
105 | 106 | 107 | ## [0.1.10] - 2025-02-23 108 | 109 | ### Added 110 | - Parameter to specify page numbers for parsing 111 | 112 | ### Fixed 113 | - Errors caused by empty token_usage 114 | 115 | ## [0.1.11] - 2025-02-27 116 | 117 | ### Added 118 | - Priority setting to AUTO routing 119 | - More models to benchmark 120 | 121 | ### Changed 122 | - Set default parse_type to AUTO 123 | - Set default LLM to Gemini 2.0 Flash 124 | - Updated benchmark script to aggregate over multiple runs 125 | 126 | ### Fixed 127 | - Incorrect title when `as_pdf=True` 128 | 129 | 130 | ## [0.1.11.post1] - 2025-03-05 131 | 132 | ### Added 133 | - Code of Conduct 134 | 135 | ### Fixed 136 | - Segmentation fault when PyQT app is reinitialized 137 | 138 | ## [0.1.12] - 2025-04-11 139 | 140 | ### Added 141 | * Support for OpenRouter models 142 | * Return token cost when cost mapping is provided 143 | * Support for custom prompts 144 | * Support for parsing Excel and PowerPoint files 145 | 146 | ### Changed 147 | * Set default `router_priority` to `speed` 148 | 149 | ## [0.1.13] - 2025-04-20 150 | 151 | ### Added 152 | * `STATIC_PARSE` improvements 153 | * Horizontal line detection 154 | * Strikethrough text detection 155 | * Email address formatting 156 | * Improved heading level detection 157 | * Monospace font detection 158 | * Indentation detection 159 | 160 | ## [0.1.14] - 2025-06-05 161 | 162 | ### Added 163 | * Add support for Fireworks API 164 | * Add support for matching data in document to pre-defined schema or template 165 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | At Oid Labs we are committed to enabling a safe, welcoming and collaborative environment for everyone. 2 | 3 | # Contributor Covenant Code of Conduct 4 | 5 | ## Our Pledge 6 | 7 | We as members, contributors, and leaders pledge to make participation in our 8 | community a harassment-free experience for everyone, regardless of age, body 9 | size, visible or invisible disability, ethnicity, sex characteristics, gender 10 | identity and expression, level of experience, education, socio-economic status, 11 | nationality, personal appearance, race, caste, color, religion, or sexual 12 | identity and orientation. 13 | 14 | We pledge to act and interact in ways that contribute to an open, welcoming, 15 | diverse, inclusive, and healthy community. 
16 | 17 | ## Our Standards 18 | 19 | Examples of behavior that contributes to a positive environment for our 20 | community include: 21 | 22 | - Demonstrating empathy and kindness toward other people 23 | - Being respectful of differing opinions, viewpoints, and experiences 24 | - Giving and gracefully accepting constructive feedback 25 | - Accepting responsibility and apologizing to those affected by our mistakes, 26 | and learning from the experience 27 | - Focusing on what is best not just for us as individuals, but for the overall 28 | community 29 | 30 | Examples of unacceptable behavior include: 31 | 32 | - The use of sexualized language or imagery, and sexual attention or advances of 33 | any kind 34 | - Trolling, insulting or derogatory comments, and personal or political attacks 35 | - Public or private harassment 36 | - Publishing others' private information, such as a physical or email address, 37 | without their explicit permission 38 | - Other conduct which could reasonably be considered inappropriate in a 39 | professional setting 40 | 41 | ## Enforcement Responsibilities 42 | 43 | Community leaders are responsible for clarifying and enforcing our standards of 44 | acceptable behavior and will take appropriate and fair corrective action in 45 | response to any behavior that they deem inappropriate, threatening, offensive, 46 | or harmful. 47 | 48 | Community leaders have the right and responsibility to remove, edit, or reject 49 | comments, commits, code, wiki edits, issues, and other contributions that are 50 | not aligned to this Code of Conduct, and will communicate reasons for moderation 51 | decisions when appropriate. 52 | 53 | ## Scope 54 | 55 | This Code of Conduct applies within all community spaces, and also applies when 56 | an individual is officially representing the community in public spaces. 57 | Examples of representing our community include using an official email address, 58 | posting via an official social media account, or acting as an appointed 59 | representative at an online or offline event. 60 | 61 | ## Enforcement 62 | 63 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 64 | reported to the community leaders responsible for enforcement at 65 | [INSERT CONTACT METHOD]. 66 | All complaints will be reviewed and investigated promptly and fairly. 67 | 68 | All community leaders are obligated to respect the privacy and security of the 69 | reporter of any incident. 70 | 71 | ## Enforcement Guidelines 72 | 73 | Community leaders will follow these Community Impact Guidelines in determining 74 | the consequences for any action they deem in violation of this Code of Conduct: 75 | 76 | ### 1. Correction 77 | 78 | **Community Impact**: Use of inappropriate language or other behavior deemed 79 | unprofessional or unwelcome in the community. 80 | 81 | **Consequence**: A private, written warning from community leaders, providing 82 | clarity around the nature of the violation and an explanation of why the 83 | behavior was inappropriate. A public apology may be requested. 84 | 85 | ### 2. Warning 86 | 87 | **Community Impact**: A violation through a single incident or series of 88 | actions. 89 | 90 | **Consequence**: A warning with consequences for continued behavior. No 91 | interaction with the people involved, including unsolicited interaction with 92 | those enforcing the Code of Conduct, for a specified period of time. This 93 | includes avoiding interactions in community spaces as well as external channels 94 | like social media. 
Violating these terms may lead to a temporary or permanent 95 | ban. 96 | 97 | ### 3. Temporary Ban 98 | 99 | **Community Impact**: A serious violation of community standards, including 100 | sustained inappropriate behavior. 101 | 102 | **Consequence**: A temporary ban from any sort of interaction or public 103 | communication with the community for a specified period of time. No public or 104 | private interaction with the people involved, including unsolicited interaction 105 | with those enforcing the Code of Conduct, is allowed during this period. 106 | Violating these terms may lead to a permanent ban. 107 | 108 | ### 4. Permanent Ban 109 | 110 | **Community Impact**: Demonstrating a pattern of violation of community 111 | standards, including sustained inappropriate behavior, harassment of an 112 | individual, or aggression toward or disparagement of classes of individuals. 113 | 114 | **Consequence**: A permanent ban from any sort of public interaction within the 115 | community. 116 | 117 | ## Attribution 118 | 119 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 120 | version 2.1, available at 121 | [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. 122 | 123 | Community Impact Guidelines were inspired by 124 | [Mozilla's code of conduct enforcement ladder][Mozilla CoC]. 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | [https://www.contributor-covenant.org/faq][FAQ]. Translations are available at 128 | [https://www.contributor-covenant.org/translations][translations]. 129 | 130 | [homepage]: https://www.contributor-covenant.org 131 | [v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html 132 | [Mozilla CoC]: https://github.com/mozilla/diversity 133 | [FAQ]: https://www.contributor-covenant.org/faq 134 | [translations]: https://www.contributor-covenant.org/translations 135 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: help setup install dev clean build 2 | 3 | help: 4 | @echo "make setup - Create the virtual environment and install Poetry" 5 | @echo "make install - Install runtime dependencies" 6 | @echo "make dev - Install development dependencies" 7 | @echo "make build - Build the distributable package" 8 | @echo "make clean - Remove the virtual environment and build artifacts" 9 | 10 | setup: 11 | python3 -m venv .venv 12 | .venv/bin/python3 -m pip install --upgrade pip 13 | .venv/bin/python3 -m pip install poetry 14 | .venv/bin/poetry update 15 | 16 | install: setup 17 | .venv/bin/poetry install --without dev 18 | .venv/bin/playwright install --with-deps --only-shell chromium 19 | 20 | dev: setup 21 | .venv/bin/poetry install --with dev 22 | .venv/bin/playwright install --with-deps --only-shell chromium 23 | 24 | clean: 25 | rm -rf .venv 26 | rm -rf lexoid.egg-info 27 | rm -rf dist 28 | 29 | build: 30 | .venv/bin/poetry update && .venv/bin/poetry build 31 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | ``` 4 | ___ _______ __ __ _______ ___ ______ 5 | | | | || |_| || || | | | 6 | | | | ___|| || _ || | | _ | 7 | | | | |___ | || | | || | | | | | 8 | | |___ | ___| | | | |_| || | | |_| | 9 | | || |___ | _ || || | | | 10 | |_______||_______||__| |__||_______||___| |______| 11 | 12 | ``` 13 | 14 |
15 | 16 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/oidlabs-com/Lexoid/blob/main/examples/example_notebook_colab.ipynb) 17 | [![Hugging Face](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-yellow)](https://huggingface.co/spaces/oidlabs/Lexoid) 18 | [![GitHub license](https://img.shields.io/badge/License-Apache_2.0-turquoise.svg)](https://github.com/oidlabs-com/Lexoid/blob/main/LICENSE) 19 | [![PyPI](https://img.shields.io/pypi/v/lexoid)](https://pypi.org/project/lexoid/) 20 | [![Docs](https://github.com/oidlabs-com/Lexoid/actions/workflows/deploy_docs.yml/badge.svg)](https://oidlabs-com.github.io/Lexoid/) 21 | 22 | Lexoid is an efficient document parsing library that supports both LLM-based and non-LLM-based (static) PDF document parsing. 23 | 24 | [Documentation](https://oidlabs-com.github.io/Lexoid/) 25 | 26 | ## Motivation 27 | 28 | - Make use of the multi-modal capabilities of modern LLMs 29 | - Offer a simple, convenient interface for parsing documents 30 | - Encourage collaboration through a permissive license 31 | 32 | ## Installation 33 | 34 | ### Installing with pip 35 | 36 | ``` 37 | pip install lexoid 38 | ``` 39 | 40 | To use LLM-based parsing, define the following environment variables or add them to a `.env` file: 41 | 42 | ``` 43 | OPENAI_API_KEY="" 44 | GOOGLE_API_KEY="" 45 | ``` 46 | 47 | Optionally, to use `Playwright` for retrieving web content (instead of the `requests` library): 48 | 49 | ``` 50 | playwright install --with-deps --only-shell chromium 51 | ``` 52 | 53 | ### Building `.whl` from source 54 | 55 | ``` 56 | make build 57 | ``` 58 | 59 | ### Creating a local installation 60 | 61 | To install dependencies: 62 | 63 | ``` 64 | make install 65 | ``` 66 | 67 | or, to install with dev-dependencies: 68 | 69 | ``` 70 | make dev 71 | ``` 72 | 73 | To activate the virtual environment: 74 | 75 | ``` 76 | source .venv/bin/activate 77 | ``` 78 | 79 | ## Usage 80 | 81 | [Example Notebook](https://github.com/oidlabs-com/Lexoid/blob/main/examples/example_notebook.ipynb) 82 | 83 | [Example Colab Notebook](https://colab.research.google.com/github/oidlabs-com/Lexoid/blob/main/examples/example_notebook_colab.ipynb) 84 | 85 | Here's a quick example of parsing documents with Lexoid: 86 | 87 | ```python 88 | from lexoid.api import parse 89 | from lexoid.api import ParserType 90 | 91 | parsed_md = parse("https://www.justice.gov/eoir/immigration-law-advisor", parser_type=ParserType.LLM_PARSE)["raw"] 92 | # or, equivalently, pass the parser type as a string 93 | pdf_path = "path/to/immigration-law-advisor.pdf" 94 | parsed_md = parse(pdf_path, parser_type="LLM_PARSE")["raw"] 95 | 96 | print(parsed_md) 97 | ``` 98 | 99 | ### Parameters 100 | 101 | - path (str): The file path or URL. 102 | - parser_type (str, optional): The type of parser to use ("LLM_PARSE", "STATIC_PARSE", or "AUTO"). Defaults to "AUTO". 103 | - pages_per_split (int, optional): Number of pages per split for chunking. Defaults to 4. 104 | - max_threads (int, optional): Maximum number of threads for parallel processing. Defaults to 4. 105 | - \*\*kwargs: Additional arguments for the parser. 106 | 107 | ## Supported API Providers 108 | * Google 109 | * OpenAI 110 | * Hugging Face 111 | * Together AI 112 | * OpenRouter 113 | * Fireworks 114 | 115 | ## Benchmark 116 | 117 | Results aggregated across 5 iterations each for 5 documents. 118 | 119 | _Note:_ Benchmarks are currently done in the zero-shot setting. 120 | 121 | | Rank | Model | Mean Similarity | Std. Dev.
| Time (s) | Cost ($) | 122 | | --- | --- | --- | --- | --- | --- | 123 | | 1 | gemini-2.0-flash | 0.829 | 0.102 | 7.41 | 0.00048 | 124 | | 2 | gemini-2.0-flash-001 | 0.814 | 0.176 | 6.85 | 0.000421 | 125 | | 3 | gemini-1.5-flash | 0.797 | 0.143 | 9.54 | 0.000238 | 126 | | 4 | gemini-2.0-pro-exp | 0.764 | 0.227 | 11.95 | TBA | 127 | | 5 | AUTO | 0.76 | 0.184 | 5.14 | 0.000217 | 128 | | 6 | gemini-2.0-flash-thinking-exp | 0.746 | 0.266 | 10.46 | TBA | 129 | | 7 | gemini-1.5-pro | 0.732 | 0.265 | 11.44 | 0.003332 | 130 | | 8 | accounts/fireworks/models/llama4-maverick-instruct-basic (via Fireworks) | 0.687 | 0.221 | 8.07 | 0.000419 | 131 | | 9 | gpt-4o | 0.687 | 0.247 | 10.16 | 0.004736 | 132 | | 10 | accounts/fireworks/models/llama4-scout-instruct-basic (via Fireworks) | 0.675 | 0.184 | 5.98 | 0.000226 | 133 | | 11 | gpt-4o-mini | 0.642 | 0.213 | 9.71 | 0.000275 | 134 | | 12 | gemma-3-27b-it (via OpenRouter) | 0.628 | 0.299 | 18.79 | 0.000096 | 135 | | 13 | gemini-1.5-flash-8b | 0.551 | 0.223 | 3.91 | 0.000055 | 136 | | 14 | Llama-Vision-Free (via Together AI) | 0.531 | 0.198 | 6.93 | 0 | 137 | | 15 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) | 0.524 | 0.192 | 3.68 | 0.00006 | 138 | | 16 | qwen/qwen-2.5-vl-7b-instruct (via OpenRouter) | 0.482 | 0.209 | 11.53 | 0.000052 | 139 | | 17 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) | 0.461 | 0.306 | 19.26 | 0.000426 | 140 | | 18 | Llama-3.2-11B-Vision-Instruct (via Hugging Face) | 0.451 | 0.257 | 4.54 | 0 | 141 | | 19 | microsoft/phi-4-multimodal-instruct (via OpenRouter) | 0.366 | 0.287 | 10.8 | 0.000019 | 142 | -------------------------------------------------------------------------------- /docs/.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/docs/.nojekyll -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | API Reference 2 | ============= 3 | 4 | Core Function 5 | ------------- 6 | 7 | parse 8 | ^^^^^ 9 | 10 | .. py:function:: lexoid.api.parse(path: str, parser_type: Union[str, ParserType] = "LLM_PARSE", pages_per_split: int = 4, max_processes: int = 4, **kwargs) -> Dict 11 | 12 | Parse a document using specified strategy. 
13 | 14 | :param path: File path or URL to parse 15 | :param parser_type: Parser type to use ("LLM_PARSE", "STATIC_PARSE", or "AUTO") 16 | :param pages_per_split: Number of pages per chunk for processing 17 | :param max_processes: Maximum number of parallel processes 18 | :param kwargs: Additional keyword arguments (see below) 19 | :return: Dictionary containing the parsed content and metadata (see the return value format below) 20 | 21 | Additional keyword arguments: 22 | 23 | * ``model`` (str): LLM model to use 24 | * ``framework`` (str): Static parsing framework 25 | * ``temperature`` (float): Temperature for LLM generation 26 | * ``depth`` (int): Depth for recursive URL parsing 27 | * ``as_pdf`` (bool): Convert input to PDF before processing 28 | * ``verbose`` (bool): Enable verbose logging 29 | * ``x_tolerance`` (int): X-axis tolerance for text extraction 30 | * ``y_tolerance`` (int): Y-axis tolerance for text extraction 31 | * ``save_dir`` (str): Directory to save intermediate PDFs 32 | * ``page_nums`` (List[int]): List of page numbers to parse 33 | * ``api_cost_mapping`` (Union[dict, str]): Dictionary containing API cost details or the string path to a JSON file containing 34 | the cost details. Sample file available at ``tests/api_cost_mapping.json`` 35 | * ``router_priority`` (str): What the routing strategy should prioritize. Options are ``"speed"`` and ``"accuracy"``. The router directs a file to either ``"STATIC_PARSE"`` or ``"LLM_PARSE"`` based on its type and the selected priority. If priority is "accuracy", it prefers LLM_PARSE unless the PDF has no images but contains embedded/hidden hyperlinks, in which case it uses ``STATIC_PARSE`` (because LLMs currently fail to parse hidden hyperlinks). If priority is "speed", it uses ``STATIC_PARSE`` for documents without images and ``LLM_PARSE`` for documents with images. 36 | * ``api_provider`` (str): The API provider to use for LLM parsing. Options are ``openai``, ``huggingface``, ``together``, ``openrouter``, and ``fireworks``. This parameter is only relevant when using LLM parsing. 37 | 38 | Return value format: 39 | A dictionary containing a subset or all of the following keys: 40 | 41 | * ``raw``: Full markdown content as string 42 | * ``segments``: List of dictionaries with metadata and content of each segment. For PDFs, a segment denotes a page. For webpages, a segment denotes a section (a heading and its content). 43 | * ``title``: Title of the document 44 | * ``url``: URL if applicable 45 | * ``parent_title``: Title of parent doc if recursively parsed 46 | * ``recursive_docs``: List of dictionaries for recursively parsed documents 47 | * ``token_usage``: Token usage statistics 48 | * ``pdf_path``: Path to the intermediate PDF generated when ``as_pdf`` is enabled and the kwarg ``save_dir`` is specified. 49 | 50 | 51 | parse_with_schema 52 | ^^^^^^^^^^^^^^^^^ 53 | 54 | .. py:function:: lexoid.api.parse_with_schema(path: str, schema: Dict, api: str = "openai", model: str = "gpt-4o-mini", **kwargs) -> List[List[Dict]] 55 | 56 | Parses a PDF using an LLM to generate structured output conforming to a given JSON schema. 57 | 58 | :param path: Path to the PDF file. 59 | :param schema: JSON schema to which the parsed output should conform. 60 | :param api: LLM API provider to use (``"openai"``, ``"huggingface"``, ``"together"``, ``"openrouter"``, or ``"fireworks"``). 61 | :param model: LLM model name. 62 | :param kwargs: Additional keyword arguments passed to the LLM (e.g., ``temperature``, ``max_tokens``).
63 | :return: A list where each element represents a page, which in turn contains a list of dictionaries conforming to the provided schema. 64 | 65 | Additional keyword arguments: 66 | 67 | * ``temperature`` (float): Sampling temperature for LLM generation. 68 | * ``max_tokens`` (int): Maximum number of tokens to generate. 69 | 70 | Return value format: 71 | A list of pages, where each page is represented as a list of dictionaries. Each dictionary conforms to the structure defined by the input ``schema``. 72 | 73 | 74 | Examples 75 | -------- 76 | 77 | Basic Usage 78 | ^^^^^^^^^^^ 79 | 80 | .. code-block:: python 81 | 82 | from lexoid.api import parse 83 | 84 | # Basic parsing 85 | result = parse("document.pdf") 86 | 87 | # Raw text output 88 | parsed_md = result["raw"] 89 | 90 | # Segmented output with metadata 91 | parsed_segments = result["segments"] 92 | 93 | # Automatic parser selection 94 | result = parse("document.pdf", parser_type="AUTO") 95 | 96 | LLM-Based Parsing 97 | ^^^^^^^^^^^^^^^^^ 98 | 99 | .. code-block:: python 100 | 101 | # Parse using GPT-4o 102 | result = parse("document.pdf", parser_type="LLM_PARSE", model="gpt-4o") 103 | 104 | # Parse using Gemini 1.5 Pro 105 | result = parse("document.pdf", parser_type="LLM_PARSE", model="gemini-1.5-pro") 106 | 107 | 108 | Static Parsing 109 | ^^^^^^^^^^^^^^ 110 | 111 | .. code-block:: python 112 | 113 | # Parse using PDFPlumber 114 | result = parse("document.pdf", parser_type="STATIC_PARSE", model="pdfplumber") 115 | 116 | # Parse using PDFMiner 117 | result = parse("document.pdf", parser_type="STATIC_PARSE", model="pdfminer") 118 | 119 | 120 | Parse with Schema 121 | ^^^^^^^^^^^^^^^^^ 122 | 123 | .. code-block:: python 124 | 125 | from lexoid.api import parse_with_schema 126 | 127 | sample_schema = [ 128 | { 129 | "Disability Category": "string", 130 | "Participants": "int", 131 | "Ballots Completed": "int", 132 | "Ballots Incomplete/Terminated": "int", 133 | "Accuracy": ["string"], 134 | "Time to complete": ["string"] 135 | } 136 | ] 137 | 138 | pdf_path = "inputs/test_1.pdf" 139 | result = parse_with_schema(path=pdf_path, schema=sample_schema, model="gpt-4o") 140 | 141 | Web Content 142 | ^^^^^^^^^^^ 143 | 144 | .. code-block:: python 145 | 146 | # Parse webpage 147 | result = parse("https://example.com") 148 | 149 | # Parse webpage and the pages linked within the page 150 | result = parse("https://example.com", depth=2) -------------------------------------------------------------------------------- /docs/benchmark.csv: -------------------------------------------------------------------------------- 1 | Model,Mean Similarity,Std. 
Dev.,Time (s),Cost($) 2 | gemini-2.0-flash,0.829,0.102,7.41,0.00048 3 | gemini-2.0-flash-001,0.814,0.176,6.85,0.000421 4 | gemini-1.5-flash,0.797,0.143,9.54,0.000238 5 | gemini-2.0-pro-exp,0.764,0.227,11.95,TBA 6 | AUTO,0.760,0.184,5.14,0.000217 7 | gemini-2.0-flash-thinking-exp,0.746,0.266,10.46,TBA 8 | gemini-1.5-pro,0.732,0.265,11.44,0.003332 9 | accounts/fireworks/models/llama4-maverick-instruct-basic (via Fireworks),0.687,0.221,8.07,0.000419 10 | gpt-4o,0.687,0.247,10.16,0.004736 11 | accounts/fireworks/models/llama4-scout-instruct-basic (via Fireworks),0.675,0.184,5.98,0.000226 12 | gpt-4o-mini,0.642,0.213,9.71,0.000275 13 | gemma-3-27b-it (via OpenRouter),0.628,0.299,18.79,0.000096 14 | gemini-1.5-flash-8b,0.551,0.223,3.91,0.000055 15 | Llama-Vision-Free (via Together AI),0.531,0.198,6.93,0 16 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI),0.524,0.192,3.68,0.00006 17 | qwen/qwen-2.5-vl-7b-instruct (via OpenRouter),0.482,0.209,11.53,0.000052 18 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI),0.461,0.306,19.26,0.000426 19 | Llama-3.2-11B-Vision-Instruct (via Hugging Face),0.451,0.257,4.54,0 20 | microsoft/phi-4-multimodal-instruct (via OpenRouter),0.366,0.287,10.8,0.000019 21 | -------------------------------------------------------------------------------- /docs/benchmark.rst: -------------------------------------------------------------------------------- 1 | Benchmark Report 2 | ================ 3 | 4 | Overview 5 | -------- 6 | 7 | This benchmark evaluates the performance of various Large Language Models (LLMs) and parsing strategies in extracting and parsing document content using Lexoid. 8 | 9 | Each approach is evaluated based on a comparison between the parsed content and the manually created ground truths of several documents, with a similarity metric indicating the accuracy of the parsing process. 10 | 11 | Similarity Metric 12 | ^^^^^^^^^^^^^^^^^ 13 | 14 | The similarity metric is calculated using the following steps (see `calculate_similarity()` in `lexoid/core/utils.py` for the implementation). 15 | 16 | 1. Markdown Conversion 17 | Both parsed and ground truth documents are converted to HTML, standardizing their format across structural elements like tables and lists. 18 | 19 | 2. HTML Tag Removal 20 | All HTML markup is stripped away, leaving only the pure textual content. This ensures the comparison focuses on the actual text rather than formatting. 21 | 22 | 3. Sequence Matching 23 | Python's ``SequenceMatcher`` compares the extracted text sequences, calculating a similarity ratio between 0 and 1 that reflects content preservation and accuracy. 24 | 25 | Running the Benchmarks 26 | ---------------------- 27 | 28 | Setup Environment Variables 29 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^ 30 | 31 | Create a ``.env`` file with the necessary API keys: 32 | 33 | .. code-block:: bash 34 | 35 | OPENAI_API_KEY=your_openai_key 36 | GOOGLE_API_KEY=your_google_key 37 | HUGGINGFACEHUB_API_TOKEN=your_huggingface_token 38 | TOGETHER_API_KEY=your_together_api_key 39 | 40 | Running the Benchmark Script 41 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 42 | 43 | .. 
code-block:: bash 44 | 45 | # Clone the repository 46 | git clone https://github.com/oidlabs-com/lexoid.git 47 | cd lexoid 48 | 49 | # Install dependencies 50 | pip install -r requirements.txt 51 | 52 | # Run benchmarks 53 | python tests/benchmark.py 54 | 55 | Customizing Benchmarks 56 | ^^^^^^^^^^^^^^^^^^^^^^ 57 | 58 | You can modify the ``test_attributes`` list in the ``main()`` function to test different configurations: 59 | 60 | * ``parser_type``: Switch between LLM and static parsing 61 | * ``model``: Test different LLM models 62 | * ``framework``: Test different static parsing frameworks 63 | * ``pages_per_split``: Adjust document chunking 64 | * ``max_threads``: Control parallel processing 65 | 66 | Benchmark Results 67 | ----------------- 68 | 69 | Here are the detailed parsing performance results for various models: 70 | 71 | .. list-table:: 72 | :widths: auto 73 | :header-rows: 1 74 | 75 | * - Rank 76 | - Model 77 | - Mean Similarity 78 | - Std. Dev. 79 | - Time (s) 80 | - Cost ($) 81 | * - 1 82 | - gemini-2.0-flash 83 | - 0.829 84 | - 0.102 85 | - 7.41 86 | - 0.00048 87 | * - 2 88 | - gemini-2.0-flash-001 89 | - 0.814 90 | - 0.176 91 | - 6.85 92 | - 0.000421 93 | * - 3 94 | - gemini-1.5-flash 95 | - 0.797 96 | - 0.143 97 | - 9.54 98 | - 0.000238 99 | * - 4 100 | - gemini-2.0-pro-exp 101 | - 0.764 102 | - 0.227 103 | - 11.95 104 | - TBA 105 | * - 5 106 | - AUTO 107 | - 0.76 108 | - 0.184 109 | - 5.14 110 | - 0.000217 111 | * - 6 112 | - gemini-2.0-flash-thinking-exp 113 | - 0.746 114 | - 0.266 115 | - 10.46 116 | - TBA 117 | * - 7 118 | - gemini-1.5-pro 119 | - 0.732 120 | - 0.265 121 | - 11.44 122 | - 0.003332 123 | * - 8 124 | - accounts/fireworks/models/llama4-maverick-instruct-basic (via Fireworks) 125 | - 0.687 126 | - 0.221 127 | - 8.07 128 | - 0.000419 129 | * - 9 130 | - gpt-4o 131 | - 0.687 132 | - 0.247 133 | - 10.16 134 | - 0.004736 135 | * - 10 136 | - accounts/fireworks/models/llama4-scout-instruct-basic (via Fireworks) 137 | - 0.675 138 | - 0.184 139 | - 5.98 140 | - 0.000226 141 | * - 11 142 | - gpt-4o-mini 143 | - 0.642 144 | - 0.213 145 | - 9.71 146 | - 0.000275 147 | * - 12 148 | - gemma-3-27b-it (via OpenRouter) 149 | - 0.628 150 | - 0.299 151 | - 18.79 152 | - 0.000096 153 | * - 13 154 | - gemini-1.5-flash-8b 155 | - 0.551 156 | - 0.223 157 | - 3.91 158 | - 0.000055 159 | * - 14 160 | - Llama-Vision-Free (via Together AI) 161 | - 0.531 162 | - 0.198 163 | - 6.93 164 | - 0 165 | * - 15 166 | - Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) 167 | - 0.524 168 | - 0.192 169 | - 3.68 170 | - 0.00006 171 | * - 16 172 | - qwen/qwen-2.5-vl-7b-instruct (via OpenRouter) 173 | - 0.482 174 | - 0.209 175 | - 11.53 176 | - 0.000052 177 | * - 17 178 | - Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) 179 | - 0.461 180 | - 0.306 181 | - 19.26 182 | - 0.000426 183 | * - 18 184 | - Llama-3.2-11B-Vision-Instruct (via Hugging Face) 185 | - 0.451 186 | - 0.257 187 | - 4.54 188 | - 0 189 | * - 19 190 | - microsoft/phi-4-multimodal-instruct (via OpenRouter) 191 | - 0.366 192 | - 0.287 193 | - 10.8 194 | - 0.000019 195 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 
2 | # 3 | # For the full list of built-in configuration values, see the documentation: 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 5 | 6 | # -- Project information ----------------------------------------------------- 7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 8 | 9 | project = "Lexoid" 10 | copyright = "2025, Lexoid Contributors" 11 | author = "Lexoid Contributors" 12 | release = "0.1.14" 13 | 14 | # -- General configuration --------------------------------------------------- 15 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 16 | 17 | extensions = [] 18 | 19 | templates_path = ["_templates"] 20 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 21 | 22 | 23 | # -- Options for HTML output ------------------------------------------------- 24 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 25 | 26 | html_theme = "pydata_sphinx_theme" 27 | html_static_path = ["_build/html/_static"] 28 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | Contributing to Lexoid 2 | ====================== 3 | 4 | Thank you for your interest in contributing to Lexoid! We welcome contributions from the community to make our document parsing library even better. 5 | 6 | Getting Started 7 | --------------- 8 | 9 | 1. Fork the repository and clone your fork: 10 | 11 | .. code-block:: bash 12 | 13 | git clone https://github.com/YOUR_USERNAME/lexoid.git 14 | cd lexoid 15 | 16 | 2. Set up your development environment: 17 | 18 | .. code-block:: bash 19 | 20 | make dev 21 | 22 | 3. Activate the virtual environment: 23 | 24 | .. code-block:: bash 25 | 26 | source .venv/bin/activate 27 | 28 | Development Setup 29 | ----------------- 30 | 31 | Environment Variables 32 | ^^^^^^^^^^^^^^^^^^^^^ 33 | 34 | Create a ``.env`` file in the root directory with the following API keys (as needed): 35 | 36 | .. code-block:: bash 37 | 38 | GOOGLE_API_KEY=your_google_api_key 39 | OPENAI_API_KEY=your_openai_api_key 40 | HUGGINGFACEHUB_API_TOKEN=your_huggingface_token 41 | TOGETHER_API_KEY=your_together_api_key 42 | 43 | Running Tests 44 | ^^^^^^^^^^^^^ 45 | 46 | Run the test suite: 47 | 48 | .. code-block:: bash 49 | 50 | python3 -m pytest tests/test_parser.py -v 51 | 52 | To see test logs: 53 | 54 | .. code-block:: bash 55 | 56 | python3 -m pytest tests/test_parser.py -v -s 57 | 58 | Contributing Guidelines 59 | ----------------------- 60 | 61 | Code Style 62 | ^^^^^^^^^^ 63 | 64 | * We use Python's `PEP 8 `_ style guide 65 | * If using VS Code, install the `Black Formatter `_ extension 66 | * Use type hints for function parameters and return values 67 | 68 | Pull Request Process 69 | ^^^^^^^^^^^^^^^^^^^^ 70 | 71 | 1. Create a new branch for your feature or bugfix: 72 | 73 | .. code-block:: bash 74 | 75 | git checkout -b feature-name 76 | 77 | 2. Make your changes and commit them with clear, descriptive commit messages 78 | 3. Add tests for any new functionality 79 | 4. Update documentation as needed 80 | 5. Push your changes and create a pull request 81 | 82 | Areas for Contribution 83 | ^^^^^^^^^^^^^^^^^^^^^^ 84 | 85 | * When starting out, check out the `Issues `_ page and look for tickets tagged with ``good first issue`` 86 | * However, don't let the above restrict you. Feel free to have a go at any ticket or suggest any new features! 
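Whichever ticket you pick up, try to pair your change with a test. Below is a minimal sketch of what a new test in ``tests/test_parser.py`` might look like (the sample file and assertions are illustrative; follow the conventions already used in the existing tests):

.. code-block:: python

   from lexoid.api import parse


   def test_static_parse_sample_pdf():
       # Parse one of the bundled sample documents with the static parser
       result = parse("examples/inputs/test_1.pdf", parser_type="STATIC_PARSE")
       # parse() returns a dictionary; the full markdown output is under "raw"
       assert isinstance(result, dict)
       assert result["raw"].strip()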
87 | 88 | Testing Your Changes 89 | ^^^^^^^^^^^^^^^^^^^^ 90 | 91 | 1. Add test cases to ``tests/test_parser.py`` along with changes if appropriate 92 | 2. Test with different file formats and parsing strategies 93 | 94 | Documentation 95 | ------------- 96 | 97 | When adding new features, please: 98 | 99 | 1. Update the main ``README.md`` if needed 100 | 2. Add docstrings to new functions and classes 101 | 3. Include example usage in the documentation 102 | 4. Update type hints and function signatures in the docs 103 | 104 | Reporting Issues 105 | ---------------- 106 | 107 | When reporting bugs, please include: 108 | 109 | * A clear description of the problem 110 | * Steps to reproduce 111 | * Expected vs actual behavior 112 | * Sample files (if possible) 113 | * Environment information (Python version, OS, etc.) 114 | 115 | Thank you for helping improve Lexoid! -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to Lexoid's Documentation 2 | ================================= 3 | 4 | Lexoid is an efficient document parsing library that supports both LLM-based and non-LLM-based (static) PDF document parsing. 5 | 6 | .. toctree:: 7 | :maxdepth: 2 8 | :caption: Contents: 9 | 10 | installation 11 | api 12 | contributing 13 | benchmark 14 | 15 | Key Features 16 | ------------ 17 | 18 | * Multiple parsing strategies (LLM-based and static parsing) 19 | * Automatic parsing strategy selection 20 | * Support for multiple LLM providers (OpenAI, Google, Meta/Llama, Together AI) 21 | * Table detection and markdown conversion 22 | * Hyperlink detection and preservation 23 | * Recursive URL parsing 24 | * Multi-format support 25 | * Parallel processing support 26 | * Permissive license 27 | 28 | Supported API Providers 29 | ----------------------- 30 | 31 | * Google 32 | * OpenAI 33 | * Hugging Face 34 | * Together AI 35 | * OpenRouter 36 | * Fireworks 37 | 38 | Indices and tables 39 | ================== 40 | 41 | * :ref:`genindex` 42 | * :ref:`modindex` 43 | * :ref:`search` -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | 4 | Installing with pip 5 | ------------------- 6 | 7 | .. code-block:: bash 8 | 9 | pip install lexoid 10 | 11 | Environment Setup 12 | ----------------- 13 | 14 | To use LLM-based parsing, define the following environment variables or create a ``.env`` file with the following definitions: 15 | 16 | .. code-block:: bash 17 | 18 | GOOGLE_API_KEY=your_google_api_key 19 | OPENAI_API_KEY=your_openai_api_key 20 | HUGGINGFACEHUB_API_TOKEN=your_huggingface_token 21 | TOGETHER_API_KEY=your_together_api_key 22 | 23 | Optional Dependencies 24 | --------------------- 25 | 26 | To use ``Playwright`` for retrieving web content (instead of the ``requests`` library): 27 | 28 | .. code-block:: bash 29 | 30 | playwright install --with-deps --only-shell chromium 31 | 32 | Building from Source 33 | -------------------- 34 | 35 | To build the ``.whl`` file: 36 | 37 | .. code-block:: bash 38 | 39 | make build 40 | 41 | Local Development Setup 42 | ----------------------- 43 | 44 | To install dependencies: 45 | 46 | .. code-block:: bash 47 | 48 | make install 49 | 50 | Or, to install with dev-dependencies: 51 | 52 | .. 
code-block:: bash 53 | 54 | make dev 55 | 56 | To activate virtual environment: 57 | 58 | .. code-block:: bash 59 | 60 | source .venv/bin/activate -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx 2 | pydata-sphinx-theme 3 | docutils -------------------------------------------------------------------------------- /docs/update_benchmarks.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pandas as pd 3 | import re 4 | 5 | 6 | def update_markdown(content, table_md): 7 | pattern = r"(##\s*Benchmark\s*\n(?:.*?\n)*?\n)(\| Rank .*?\n\|.*?\n(?:\|.*?\n)+)" 8 | replacement = r"\1" + table_md + "\n" 9 | return re.sub(pattern, replacement, content, flags=re.DOTALL) 10 | 11 | 12 | def update_rst(content, table_rst): 13 | pattern = r"(Benchmark Results\s*-+\n.*?\n)(\s*\* - .*\n)+" 14 | return re.sub(pattern, f"\\1{table_rst}\n", content, flags=re.DOTALL) 15 | 16 | 17 | def generate_markdown_table(df): 18 | header = "| Rank | Model | Mean Similarity | Std. Dev. | Time (s) | Cost ($) |\n" 19 | sep = "| --- | --- | --- | --- | --- | --- |\n" 20 | rows = [ 21 | f"| {i+1} | {row['Model']} | {row['Mean Similarity']} | {row['Std. Dev.']} | {row['Time (s)']} | {row['Cost($)']} |" 22 | for i, row in df.iterrows() 23 | ] 24 | return header + sep + "\n".join(rows) 25 | 26 | 27 | def generate_rst_table(df): 28 | header = "\n * - Rank\n - Model\n - Mean Similarity\n - Std. Dev.\n - Time (s)\n - Cost ($)" 29 | rows = [ 30 | f" * - {i+1}\n - {row['Model']}\n - {row['Mean Similarity']}\n - {row['Std. 
Dev.']}\n - {row['Time (s)']}\n - {row['Cost($)']}" 31 | for i, row in df.iterrows() 32 | ] 33 | return header + "\n" + "\n".join(rows) 34 | 35 | 36 | def main(csv_path, md_path, rst_path): 37 | df = pd.read_csv(csv_path) 38 | df = df.sort_values(by="Mean Similarity", ascending=False).reset_index(drop=True) 39 | 40 | with open(md_path, "r", encoding="utf-8") as f: 41 | md_content = f.read() 42 | with open(rst_path, "r", encoding="utf-8") as f: 43 | rst_content = f.read() 44 | 45 | table_md = generate_markdown_table(df) 46 | table_rst = generate_rst_table(df) 47 | 48 | updated_md = update_markdown(md_content, table_md) 49 | updated_rst = update_rst(rst_content, table_rst) 50 | 51 | with open(md_path, "w", encoding="utf-8") as f: 52 | f.write(updated_md) 53 | with open(rst_path, "w", encoding="utf-8") as f: 54 | f.write(updated_rst) 55 | 56 | 57 | if __name__ == "__main__": 58 | parser = argparse.ArgumentParser( 59 | description="Update benchmark tables in README.md and benchmark.rst from CSV" 60 | ) 61 | parser.add_argument("--csv", default="benchmark.csv", help="Path to benchmark.csv") 62 | parser.add_argument("--md", default="../README.md", help="Path to README.md") 63 | parser.add_argument("--rst", default="benchmark.rst", help="Path to benchmark.rst") 64 | args = parser.parse_args() 65 | 66 | main(args.csv, args.md, args.rst) 67 | -------------------------------------------------------------------------------- /examples/inputs/bench_md.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/examples/inputs/bench_md.pdf -------------------------------------------------------------------------------- /examples/inputs/benchmark.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/examples/inputs/benchmark.pdf -------------------------------------------------------------------------------- /examples/inputs/costco_bill.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/examples/inputs/costco_bill.jpg -------------------------------------------------------------------------------- /examples/inputs/cvs_coupon.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/examples/inputs/cvs_coupon.jpg -------------------------------------------------------------------------------- /examples/inputs/grocery_bill.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/examples/inputs/grocery_bill.jpg -------------------------------------------------------------------------------- /examples/inputs/medical_invoice_sample1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/examples/inputs/medical_invoice_sample1.png -------------------------------------------------------------------------------- /examples/inputs/medical_travel_request_OWCP_957.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/examples/inputs/medical_travel_request_OWCP_957.png -------------------------------------------------------------------------------- /examples/inputs/sample.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/examples/inputs/sample.docx -------------------------------------------------------------------------------- /examples/inputs/sample.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/examples/inputs/sample.pptx -------------------------------------------------------------------------------- /examples/inputs/sample.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/examples/inputs/sample.xlsx -------------------------------------------------------------------------------- /examples/inputs/sample_test.txt: -------------------------------------------------------------------------------- 1 | Large language models (LLMs) have shown impressive performance on complex reasoning by leveraging chain-of-thought (CoT) prompting to generate intermediate reasoning chains as the rationale to infer the answer. However, existing CoT studies have primarily focused on the language modality. We propose Multimodal-CoT that incorporates language (text) and vision (images) modalities into a two-stage framework that separates rationale generation and answer inference. In this way, answer inference can leverage better generated rationales that are based on multimodal information. Experimental results on ScienceQA and A-OKVQA benchmark datasets show the effectiveness of our proposed approach. With Multimodal-CoT, our model under 1 billion parameters achieves state-of-the-art performance on the ScienceQA benchmark. Our analysis indicates that Multimodal-CoT offers the advantages of mitigating hallucination and enhancing convergence speed. 
2 | 3 | -------------------------------------------------------------------------------- /examples/inputs/sample_test_doc.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/examples/inputs/sample_test_doc.pdf -------------------------------------------------------------------------------- /examples/inputs/screenshot-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/examples/inputs/screenshot-1.png -------------------------------------------------------------------------------- /examples/inputs/stress_test/large_doc_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/examples/inputs/stress_test/large_doc_1.pdf -------------------------------------------------------------------------------- /examples/inputs/stress_test/large_doc_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/examples/inputs/stress_test/large_doc_2.pdf -------------------------------------------------------------------------------- /examples/inputs/test_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/examples/inputs/test_1.pdf -------------------------------------------------------------------------------- /examples/inputs/test_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/examples/inputs/test_2.pdf -------------------------------------------------------------------------------- /examples/inputs/test_3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/examples/inputs/test_3.pdf -------------------------------------------------------------------------------- /examples/inputs/test_4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/examples/inputs/test_4.jpg -------------------------------------------------------------------------------- /examples/inputs/test_5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/examples/inputs/test_5.jpg -------------------------------------------------------------------------------- /examples/inputs/test_explicit_hyperlink_n_img.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/examples/inputs/test_explicit_hyperlink_n_img.pdf -------------------------------------------------------------------------------- /examples/inputs/test_hidden_link_with_image.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/examples/inputs/test_hidden_link_with_image.pdf -------------------------------------------------------------------------------- /examples/inputs/test_with_hidden_links_no_img.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/examples/inputs/test_with_hidden_links_no_img.pdf -------------------------------------------------------------------------------- /examples/outputs/benchmark.md: -------------------------------------------------------------------------------- 1 | # Heading level 1 2 | First paragraph. 3 | 4 | Second paragraph. 5 | 6 | Third paragraph. 7 | 8 | ## Heading level 2 9 | This is **bold text**. 10 | 11 | This is *italic text*. 12 | 13 | This is ***bold and italic text***. 14 | 15 | This is ~~strikethrough~~. 16 | 17 | ### Heading level 3 18 | > This is a level one blockquote. 19 | 20 | > > This is a level two blockquote. 21 | 22 | > > > This is a level three blockquote. 23 | 24 | > > This is a level two blockquote. 25 | 26 | 1. First item on the ordered list 27 | 2. Second item on the ordered list 28 | 3. Third item on the ordered list 29 | 30 | - First item on the unordered list 31 | - Second item on the unordered list 32 | - Third item on the unordered list 33 | 34 | Before a horizontal line 35 | 36 | --- 37 | 38 | After horizontal line 39 | 40 | Here comes a link: [example-link](https://www.example.com). 41 | 42 | Email: 43 | 44 | Here comes Python code: 45 | 46 | ```python 47 | def add_integer(a: int, b: int) -> int: 48 | return a + b 49 | ``` 50 | 51 | And here comes a Bash command: 52 | 53 | ```bash 54 | curl -o thatpage.html http://www.example.com/ 55 | ``` 56 | 57 | Here comes a table: 58 | 59 | | **Column L** | **Column C** | **Column R** | 60 | |:-------------|:------------:|-------------:| 61 | | 11 | 12 | 13 | 62 | | 21 | 22 | 23 | 63 | | 31 | 32 | 33 | 64 | 65 | And a second table: 66 | 67 | | | **B1** | **C1** | 68 | |--------|-----------|-----------| 69 | | **A2** | _data 11_ | _data 12_ | 70 | | **A3** | _data 21_ | _data 22_ | 71 | 72 | 73 |
74 | 75 |
76 | V-February Flow 77 |
78 | 79 |
80 | Data Components: 81 |
82 | 83 |
84 | Code: 85 |
86 | 87 |
88 | The-Stack-V2 89 |
90 | 91 |
92 | CodeText: 93 |
94 | 95 |
96 | SE, whatever we've scraped 97 |
98 | 99 |
100 | WebText: 101 |
102 | 103 |
104 | HQ DCLM 105 |
106 | 107 |
108 | 109 |
110 |
111 | 112 |
113 | DATA MIXES 114 |
115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 |
~85% Source Code]Deepseek
Coder
~10% CodeText
~ 5% Webtext
125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 |
~ 85% The-stack-v2]Starcoder
2
~ 15% CodeText
~ 0% webtext
135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 |
~100% Source Code]Arctic
143 | 144 | 145 |
146 |
147 | 148 |
149 | 150 |
151 | 152 |
153 | 154 | 155 | 156 | 157 | 163 | 164 |
Summary of Care 158 | Patient Adam Everyman
159 | D.O.B October 22, 1962 160 | Sex Male
161 | Patient Detail 162 |
165 | 166 |
167 |
🚑 Reason for referral
168 | Pulmonary function tests, Dr. Penny Puffer, Tel: 555-555-1049,
169 | 1047 Healthcare Drive, Portland, OR 97005, Scheduled date:
170 | 08/17/2012 171 |
172 | 173 |
174 |
💊 Medications
175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 |
MedicationInstructionsDosageEffective Dates (start - stop)Status
Albuterol 0.09 MG/ACTUAT2 puffs every 6 hours PRN wheezingAug 10, 2012 -Active
195 |
196 | 197 |
198 |
H Immunizations
199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 |
VaccineLot NumberDateStatus
Influenza Virus Vaccine18/15/2010Completed
217 |
218 | 219 |
220 |
❤️ Vital signs
221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 |
DateTestResultDetails
15-Aug-2012Height70 in
Weight195 lb
Body Mass Index Calculated28
BP Systolic155 mm[Hg]
BP Diastolic92 mm[Hg]
259 |
260 | 261 |
-------------------------------------------------------------------------------- /examples/outputs/costco_bill.md: -------------------------------------------------------------------------------- 1 |
2 |

Costco

3 |

WHOLESALE

4 |

5 | Irvine #454
6 | 115 Technology Drive W
7 | Irvine, CA 92618
8 | (949) 453-0435 9 |

10 |

MT Member 1234

11 |
12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 |
****Bottom of Basket****
1714849SCOOP AWAY16.49A
****BOB Count 1 ****
E5536YNG COCO 3CT9.99
E57554BLUEBERRIES6.99
E370586ORG. DATES11.99
E1280655ORG CSR KIT8.99
E1280655ORG CSR KIT8.99
E161750KS UNS CASHE13.99
E1308623SUJA WELLNES15.39
E1900000000CA REDEMP VA0.50
E1308623SUJA WELLNES15.39
E1900000000CA REDEMP VA0.50
F504882TYL RR 290CT21.49A
F652782ALEVE GEL16019.99A
F1830585ABGMYS90CT19.99A
F1566153MUCINEX 56CT31.99A
0000345923/15661536.50-A
E1801060LIVSFVARIETY28.99
SUBTOTAL225.16
TAX8.02
**** TOTAL233.18
37 | -------------------------------------------------------------------------------- /examples/outputs/cvs_coupon.md: -------------------------------------------------------------------------------- 1 |

2 | CVSpharmacy 3 |

4 |

5 | $2.00 off
6 | $2 off CVS HEALTH Topical
7 | Pain products 8 |

9 |

10 | Expires 11/02/2024 (Up to $2.00 value) 11 |

12 |

13 | 14 | Barcode representing coupon with numbers 7168 4009 6700 2002 15 |

16 |

17 | ExtraCare card required. See
18 | www.cvs.com/COUPONpolicy or policy at register for details.
19 |

20 | ExtraCare Card # 21 |

22 | -------------------------------------------------------------------------------- /examples/outputs/grocery_bill.md: -------------------------------------------------------------------------------- 1 |

2 | Ralphs Logo 3 |

4 | 5 |

6 | 6300 Irvine Blvd
7 | (949) 559-1139
8 | Your Cashier was BROOKE S
9 | VERIFIED TOTAL SAVINGS $ 1.90 10 |

11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 |
DANN OIKOS P PC YT1.99F
SCRALPHS SAVED YOU0.20
DANN OIKOS P SB YT1.99F
SCRALPHS SAVED YOU0.20
COKE ZERO SUGAR RC7.49B
CRV0.50B
2.92 lb @ 0.89 /lb
WTBANANA ORGNC2.60F
2.39 lb @ 0.89 /lb
WTBANANAS ORGANIC2.13F
1 @ 3/5.00
LUNA BAR1.67F
1 @ 3/5.00
LUNA BAR1.67F
PPB RAW CASHEWS RC8.99F
SCRALPHS SAVED YOU1.50
MRCHECKOUT BAG TAX0.10
RALPHS REWARDS CUSTOMER******3844
TAX0.62
**** BALANCE29.75
135 |
136 | 137 |
138 |
139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 174 | 175 |
VISA29.75
CHANGE0.00
TOTAL NUMBER OF ITEMS SOLD =9
RALPHS rewards SAVINGS$1.90
TOTAL COUPONS$1.90
01/13/25 11:09pm 299 6 528 355
163 | ******************************
164 | ANNUAL CARD SAVINGS $29.76
165 | ******************************
166 | Fuel Points Earned Today: 29
167 | Total Jan Fuel Points: 474
168 | ******************************
169 | Next Reward: 256 points
170 | ******************************
171 | Remaining Dec Fuel Points: 365
172 | ****************************** 173 |
176 | 177 |

178 | Apply Now
179 | $100 Statement Credit
180 | When you spend $500 with your card
in the first 90 days* and
181 | get up to 5% CASH BACK
182 | on eligible purchases* with your
183 | Ralphs Rewards World Elite Mastercard 184 |

185 | 186 |

187 | APPLY TODAY!
188 | www.RalphsMastercard.com/42472 189 | 190 |

191 |

192 | *Restrictions apply, see website
for details. 193 |

194 | 195 |

196 | ******************************
197 | With Card & Coupons
198 | VERIFIED TOTAL SAVINGS $ 1. 199 |

200 |

201 | TRY OUR PHARMACY (949) 559-1739
202 | MGR: ADOLFO VERGARA (949) 559-1139
203 | THANK YOU FOR SHOPPING AT RALPHS! 204 |

205 | 206 |

207 | Fresh opportunity awaits
208 | Join our team today! 209 |

210 | 211 |

212 | QR Code - link not provided 213 |

214 |

215 | jobs.ralphs.com
216 | www.ralphs.com 217 |

-------------------------------------------------------------------------------- /examples/outputs/medical_invoice_sample1.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 12 | 40 | 41 |
MAKE CHECK PAYABLE TOIF PAYING BY CREDIT CARD FILL OUT BELOW
8 | Providence Anesthesiology Associates
9 | PO Box 371863
10 | Pittsburgh, PA 15250-7863 11 |
13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 |
MASTERCARD VISA DISCOVER AMERICAN EXPRESS
CARD NUMBEREXP. DATESECURITY CODE
NAME ON CARDSIGNATURE
39 |
42 | 43 | 44 | 45 | 50 | 76 | 77 |
46 |
Return Service Requested
47 |
For all billing questions, call (704)749-5801
48 |
Hrs. 8:00am - 6:00pm EST / M-F
49 |
51 | 52 | 53 | 57 | 61 | 65 | 66 |
54 | (A) STATEMENT DATE
55 | 08/21/2020 56 |
58 | (B) PAY THIS AMOUNT
59 | 463.30 60 |
62 | (C) ACCOUNT NO.
63 | PAA284850 64 |
67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 |
CHARGES AND CREDITS MADE AFTER STATEMENT
DATE WILL APPEAR ON NEXT STATEMENT
Show Amount
Paid Here
$(D)
75 |
78 | 79 | 80 | 81 | 93 | 101 | 102 |
82 |
SEND TO
83 |
84 | JOHN SMITH
85 | 100 S TRYON ST
86 | UNIT 001
87 | CHARLOTTE, NC 28202-3258 88 |
89 |
90 | Please check box if above address is incorrect or insurance information has changed, and indicate changes on reverse side 91 |
92 |
94 |
REMIT TO
95 |
96 | Providence Anesthesiology Associates
97 | PO Box 371863
98 | Pittsburgh, PA 15250-7863 99 |
100 |
103 | 104 |
105 | 106 | 107 | 108 | 109 | 110 | 111 |
STATEMENTPLEASE DETACH AND RETURN TOP PORTION
WITH YOUR PAYMENT
112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 |
(E) Date(F) Patient(G) Description(H) Charge(I) Insurance
Receipts
(J) Patient
Receipts
(K) Adjustments(L) Insurance
Pending
(M) Patient
Resp.
Facility : Southpark Surgery Center
03/09/2020JOHNProfessional Anesthesia Services - Physician863.00517.8054.50463.30
         
161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 |
Under 3031 - 6061 - 9091 - 120121 - 150Over 151Total
0.000.000.00463.300.000.00463.30
190 | 191 | 192 | 193 | 196 | 213 | 214 |
194 | Your payment is below the acceptable payment amount. 195 | 197 | 198 | 199 | 200 | 201 | 202 |
Amount Due(N) 463.30
203 |
204 | Providence Anesthesiology Associates
205 | PO Box 371863
206 | Pittsburgh, PA 15250-7863 207 |
208 |
209 | Billing questions? Call (704)749-5801
210 | Hrs. 8:00am - 6:00pm EST / M-F 211 |
212 |
215 | 216 |
Page : 1 of 1
-------------------------------------------------------------------------------- /examples/outputs/medical_travel_request_OWCP_957.md: -------------------------------------------------------------------------------- 1 |

Instructions – Form OWCP-957 Part A – Medical Travel Refund Request – Mileage

2 |

This is a mileage-only reimbursement form. If you need other travel expenses reimbursed, complete Form OWCP-957 Part 3 | B Medical Travel Refund Request - Expenses.

4 |
    5 |
  1. Enter claimant's full name: last name, first name, middle initial (M.I.).
  2. 6 |
  3. Enter claimant's claim/case file number.
  4. 7 |
  5. Enter payee's full name (if a person other than the claimant is to be reimbursed): last name, first name, middle 8 | initial. A payee other than the claimant must submit proof of special authorization. Not applicable to the FECA 9 | Program.
  6. 10 |
  7. Enter the Claimant's or Payee's phone number to be reached with questions about this form.
  8. 11 |
  9. Enter the street address of the person to be reimbursed, including the: Street or Rural Route (RR), City, State, 12 | and Zip Code.
  10. 13 |
14 |

Note: For the FECA program to process your request, a FECA claimant must provide the home address where the 15 | claimant resides. A Post Office (PO) Box or attorney/representative address is not an acceptable address. 16 |

17 |
    18 |
  1. Enter the Claimant's or Payee's email address to be reached with questions about this form.
  2. 19 |
  3. Complete a separate block for each medical facility, pharmacy, therapist, etc., visited as follows:
  4. 20 |
21 |

Sample: Multiple trips to a physical therapy office 31 miles from home.

22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 37 | 38 | 39 | 41 | 42 | 43 |
7a. Date(s) of Travel7b. Reason for Travel7c. From (Full name and street address)7d. To (Full name and street address)7e. One-way / Round trip7f. Total # Miles
3/2/2022
3/6/2022
3/10/2022
Hospital
Medical Appt.
34 | Therapy/Rehab
Pharmacy
Med. Supply
Other 36 |
Home
123 Oak St.
Everytown, OH 12345
Therapy and Rehab
8000 Main St
Anytown, OH 54321
One-way
Round trip 40 |
62
62
62
44 |
    45 |
  1. Enter date(s) of travel. If you made multiple trips to the same location, you may enter multiple dates in this 46 | column.
  2. 47 |
  3. Mark one box only.
  4. 48 |
  5. Enter the full name and street address of the address where your trip started.
  6. 49 |
  7. Enter the full name and street address of the address where your trip ended.
    If column c or d is a medical 50 | provider, pharmacy, therapist, etc., provide the name of the medical provider or business along with their 51 | address.
  8. 52 |
  9. Mark one box only.
  10. 53 |
  11. If it was a one-way trip, enter the number of miles. If it was a round trip, enter the total miles traveled for 54 | both legs of the trip.
  12. 55 |
56 |

8. The person claiming reimbursement must sign and enter the date here.

-------------------------------------------------------------------------------- /examples/outputs/test_1.md: -------------------------------------------------------------------------------- 1 | ## Example table 2 | This is an example of a data table. 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 |
Disability CategoryParticipantsBallots CompletedBallots Incomplete/ TerminatedResults
AccuracyTime to complete
Blind51434.5%, n=11199 sec, n=1
Low Vision52398.3% n=2
(97.7%, n=3)
1716 sec, n=3
(1934 sec, n=2)
Dexterity54198.3%, n=41672.1 sec, n=4
Mobility33095.4%, n=31416 sec, n=3
-------------------------------------------------------------------------------- /examples/outputs/test_2.md: -------------------------------------------------------------------------------- 1 |

Cumulative Total Shareholder Return (5 Years)

2 | 3 |
4 |
5 | A line chart showing the cumulative total shareholder return over 5 years. The Bank of New York Mellon Corporation is represented by a green line with circle markers. The S&P 500 Index is represented by a gray line with triangle markers. The S&P 500 Financials Index is represented by a blue line with square markers. 6 |
7 |
8 |
9 |

10 | The Bank of New York Mellon Corporation
11 | S&P 500 Index
12 | S&P 500 Financials Index 13 |

14 |
15 | 16 |
17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 |
Cumulative shareholder returns
(in dollars)
Dec. 31,
201820192020202120222023
The Bank of New York Mellon Corporation$ 100.0$ 109.5$ 95.4$ 134.0$ 108.4$ 128.4
S&P 500 Financials Index (a)100.0132.1129.9175.4156.9176.0
S&P 500 Index (a)100.0131.5155.7200.4164.1207.2
59 | 60 |

(a) Returns are weighted by market capitalization at the beginning of the measurement period.

-------------------------------------------------------------------------------- /examples/outputs/test_3.md: -------------------------------------------------------------------------------- 1 |

The Bank of New York Mellon Corporation (and its subsidiaries)

2 |

Financial Summary

3 | 4 |

(dollars in millions, except per share amounts and unless otherwise noted)

5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 |
Selected income statement information:
Fee and other revenue$ 13,157$ 12,873$ 13,313
Net interest revenue4,3453,5042,618
Total revenue17,50216,37715,931
Provision for credit losses11939(231)
Noninterest expense13,29513,01011,514
Income before income taxes4,0883,3284,648
Provision for income taxes800768877
Net income3,2882,5603,771
Net (income) loss attributable to noncontrolling interests related to consolidated investment management funds(2)13(12)
Preferred stock dividends(235)(211)(207)
Net income applicable to common shareholders of The Bank of New York Mellon Corporation$ 3,051$ 2,362$ 3,552
Earnings per share applicable to common shareholders of The Bank of New York Mellon Corporation:
Basic$ 3.89$ 2.91$ 4.17
Diluted$ 3.87$ 2.90$ 4.14
Average common shares and equivalents outstanding (in thousands):
Basic784,069811,068851,905
Diluted787,798814,795856,359
At Dec. 31
Assets under custody and/or administration ("AUC/A") (in trillions) (a)$ 47.8$ 44.3$ 46.7
Assets under management ("AUM") (in trillions) (b)2.01.82.4
Selected ratios:
Return on common equity8.5%6.5%8.9%
Return on tangible common equity - Non-GAAP (c)16.613.417.1
Pre-tax operating margin232029
Net interest margin1.250.970.68
Cash dividends per common share$ 1.58$ 1.42$ 1.30
Common dividend payout ratio41%49%32%
Common dividend yield3.0%3.1%2.2%
At Dec. 31
Closing stock price per common share$ 52.05$ 45.52$ 58.08
Market capitalization$ 39,524$ 36,800$ 46,705
Book value per common share$ 48.11$ 44.40$ 47.50
Tangible book value per common share - Non-GAAP (c)$ 25.39$ 23.11$ 24.31
Full-time employees53,40051,70049,100
Common shares outstanding (in thousands)759,344808,445804,145
Regulatory capital ratios (d)
Common Equity Tier 1 ("CET1") ratio11.5%11.2%11.2%
Tier 1 capital ratio14.214.114.0
Total capital ratio15.014.914.9
Tier 1 leverage ratio6.05.85.5
Supplementary leverage ratio ("SLR")7.36.86.6
241 | 242 |

(a) Consists of AUC/A primarily from the Asset Servicing line of business and, to a lesser extent, the Clearance and Collateral Management, Issuer Services, Pershing and Wealth Management lines of business. Includes the AUC/A of CIBC Mellon Global Securities Services Company ("CIBC Mellon"), a joint venture with the Canadian Imperial Bank of Commerce, of $1.7 trillion at Dec. 31, 2023, $1.5 trillion at Dec. 31, 2022 and $1.7 trillion at Dec. 31, 2021.

243 |

(b) Excludes assets managed outside of the Investment and Wealth Management business segment.

244 |

(c) Return on tangible common equity and tangible book value per common share, both Non-GAAP measures, exclude goodwill and intangible assets, net of deferred tax liabilities. See "Supplemental Information - Explanation of GAAP and Non-GAAP financial measures" beginning on page 111 for the reconciliation of these Non-GAAP measures.

245 |

(d) For our CET1, Tier 1 and Total capital ratios, our effective capital ratios under U.S. capital rules are the lower of the ratios as calculated under the Standardized and Advanced Approaches. For additional information on our regulatory capital ratios, see "Capital" beginning on page 39.

-------------------------------------------------------------------------------- /examples/outputs/test_4.md: -------------------------------------------------------------------------------- 1 |

The following represents analytical data compiled from smoke analyses of segments 1 and 2 and also includes the physical evaluation conducted on the filter rods.

2 | 3 |

Smoke Analysis

4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 |
No FilterWith Filter(Percent)
a. Tobacco Rod Burned, mm5151
b. Putts/Cigt.7.27.88.3
c. TPM (Wet), mg/cigt.24.016.431.7
d. Nicotine, mg/cigt.1.06.8519.8
e. FTC “Tar”, mg/cigt.20.314.230.1
-------------------------------------------------------------------------------- /examples/outputs/test_5.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 |
RPE SCALEEMOJIINTENSITY LEVEL...
9 - 10🥵MAXIMUM INTENSITY
7 - 8😬VIGOROUS INTENSITY
5 - 6😐MODERATE INTENSITY
3 - 4😉LIGHT INTENSITY
1 - 2😁VERY LIGHT INTENSITY
-------------------------------------------------------------------------------- /lexoid/api.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import re 4 | import tempfile 5 | from concurrent.futures import ProcessPoolExecutor 6 | from enum import Enum 7 | from glob import glob 8 | from time import time 9 | from typing import Union, Dict, List 10 | 11 | from loguru import logger 12 | 13 | from lexoid.core.parse_type.llm_parser import ( 14 | parse_llm_doc, 15 | create_response, 16 | convert_doc_to_base64_images, 17 | ) 18 | from lexoid.core.parse_type.static_parser import parse_static_doc 19 | from lexoid.core.utils import ( 20 | convert_to_pdf, 21 | download_file, 22 | is_supported_url_file_type, 23 | is_supported_file_type, 24 | recursive_read_html, 25 | router, 26 | split_pdf, 27 | create_sub_pdf, 28 | get_webpage_soup, 29 | ) 30 | 31 | 32 | class ParserType(Enum): 33 | LLM_PARSE = "LLM_PARSE" 34 | STATIC_PARSE = "STATIC_PARSE" 35 | AUTO = "AUTO" 36 | 37 | 38 | def parse_chunk(path: str, parser_type: ParserType, **kwargs) -> Dict: 39 | """ 40 | Parses a file using the specified parser type. 41 | 42 | Args: 43 | path (str): The file path or URL. 44 | parser_type (ParserType): The type of parser to use (LLM_PARSE, STATIC_PARSE, or AUTO). 45 | **kwargs: Additional arguments for the parser. 46 | 47 | Returns: 48 | Dict: Dictionary containing: 49 | - raw: Full markdown content as string 50 | - segments: List of dictionaries with metadata and content 51 | - title: Title of the document 52 | - url: URL if applicable 53 | - parent_title: Title of parent doc if recursively parsed 54 | - recursive_docs: List of dictionaries for recursively parsed documents 55 | - token_usage: Dictionary containing token usage statistics 56 | - parser_used: Which parser was actually used 57 | """ 58 | if parser_type == ParserType.AUTO: 59 | router_priority = kwargs.get("router_priority", "speed") 60 | parser_type = ParserType[router(path, router_priority)] 61 | logger.debug(f"Auto-detected parser type: {parser_type}") 62 | 63 | kwargs["start"] = ( 64 | int(os.path.basename(path).split("_")[1]) - 1 if kwargs.get("split") else 0 65 | ) 66 | if parser_type == ParserType.STATIC_PARSE: 67 | logger.debug("Using static parser") 68 | result = parse_static_doc(path, **kwargs) 69 | else: 70 | logger.debug("Using LLM parser") 71 | result = parse_llm_doc(path, **kwargs) 72 | 73 | result["parser_used"] = parser_type 74 | return result 75 | 76 | 77 | def parse_chunk_list( 78 | file_paths: List[str], parser_type: ParserType, kwargs: Dict 79 | ) -> Dict: 80 | """ 81 | Parses a list of files using the specified parser type. 82 | 83 | Args: 84 | file_paths (list): List of file paths. 85 | parser_type (ParserType): The type of parser to use. 86 | kwargs (dict): Additional arguments for the parser. 
87 | 88 | Returns: 89 | Dict: Dictionary containing parsed document data 90 | """ 91 | combined_segments = [] 92 | raw_texts = [] 93 | token_usage = {"input": 0, "output": 0, "llm_page_count": 0} 94 | for file_path in file_paths: 95 | result = parse_chunk(file_path, parser_type, **kwargs) 96 | combined_segments.extend(result["segments"]) 97 | raw_texts.append(result["raw"]) 98 | if ( 99 | result.get("parser_used") == ParserType.LLM_PARSE 100 | and "token_usage" in result 101 | ): 102 | token_usage["input"] += result["token_usage"]["input"] 103 | token_usage["output"] += result["token_usage"]["output"] 104 | token_usage["llm_page_count"] += len(result["segments"]) 105 | token_usage["total"] = token_usage["input"] + token_usage["output"] 106 | 107 | return { 108 | "raw": "\n\n".join(raw_texts), 109 | "segments": combined_segments, 110 | "title": kwargs.get("title", ""), 111 | "url": kwargs.get("url", ""), 112 | "parent_title": kwargs.get("parent_title", ""), 113 | "recursive_docs": [], 114 | "token_usage": token_usage, 115 | } 116 | 117 | 118 | def parse( 119 | path: str, 120 | parser_type: Union[str, ParserType] = "AUTO", 121 | pages_per_split: int = 4, 122 | max_processes: int = 4, 123 | **kwargs, 124 | ) -> Dict: 125 | """ 126 | Parses a document or URL, optionally splitting it into chunks and using multiprocessing. 127 | 128 | Args: 129 | path (str): The file path or URL. 130 | parser_type (Union[str, ParserType], optional): Parser type ("LLM_PARSE", "STATIC_PARSE", or "AUTO"). 131 | pages_per_split (int, optional): Number of pages per split for chunking. 132 | max_processes (int, optional): Maximum number of processes for parallel processing. 133 | **kwargs: Additional arguments for the parser. 134 | 135 | Returns: 136 | Dict: Dictionary containing: 137 | - raw: Full markdown content as string 138 | - segments: List of dictionaries with metadata and content 139 | - title: Title of the document 140 | - url: URL if applicable 141 | - parent_title: Title of parent doc if recursively parsed 142 | - recursive_docs: List of dictionaries for recursively parsed documents 143 | - token_usage: Dictionary containing token usage statistics 144 | """ 145 | kwargs["title"] = os.path.basename(path) 146 | kwargs["pages_per_split_"] = pages_per_split 147 | as_pdf = kwargs.get("as_pdf", False) 148 | depth = kwargs.get("depth", 1) 149 | 150 | if type(parser_type) is str: 151 | parser_type = ParserType[parser_type] 152 | if ( 153 | path.lower().endswith((".doc", ".docx")) 154 | and parser_type != ParserType.STATIC_PARSE 155 | ): 156 | as_pdf = True 157 | if path.lower().endswith(".xlsx") and parser_type == ParserType.LLM_PARSE: 158 | logger.warning("LLM_PARSE does not support .xlsx files. Using STATIC_PARSE.") 159 | parser_type = ParserType.STATIC_PARSE 160 | if path.lower().endswith(".pptx") and parser_type == ParserType.LLM_PARSE: 161 | logger.warning("LLM_PARSE does not support .pptx files. 
Using STATIC_PARSE.") 162 | parser_type = ParserType.STATIC_PARSE 163 | 164 | with tempfile.TemporaryDirectory() as temp_dir: 165 | kwargs["temp_dir"] = temp_dir 166 | if path.startswith(("http://", "https://")): 167 | kwargs["url"] = path 168 | download_dir = kwargs.get("save_dir", os.path.join(temp_dir, "downloads/")) 169 | os.makedirs(download_dir, exist_ok=True) 170 | if is_supported_url_file_type(path): 171 | path = download_file(path, download_dir) 172 | elif as_pdf: 173 | kwargs["title"] = get_webpage_soup(path).title.string.strip() 174 | pdf_filename = kwargs.get("save_filename", f"webpage_{int(time())}.pdf") 175 | if not pdf_filename.endswith(".pdf"): 176 | pdf_filename += ".pdf" 177 | pdf_path = os.path.join(download_dir, pdf_filename) 178 | path = convert_to_pdf(path, pdf_path) 179 | else: 180 | return recursive_read_html(path, depth) 181 | 182 | assert is_supported_file_type( 183 | path 184 | ), f"Unsupported file type {os.path.splitext(path)[1]}" 185 | 186 | if as_pdf and not path.lower().endswith(".pdf"): 187 | pdf_path = os.path.join(temp_dir, "converted.pdf") 188 | path = convert_to_pdf(path, pdf_path) 189 | 190 | if "page_nums" in kwargs and path.lower().endswith(".pdf"): 191 | sub_pdf_dir = os.path.join(temp_dir, "sub_pdfs") 192 | os.makedirs(sub_pdf_dir, exist_ok=True) 193 | sub_pdf_path = os.path.join(sub_pdf_dir, f"{os.path.basename(path)}") 194 | path = create_sub_pdf(path, sub_pdf_path, kwargs["page_nums"]) 195 | 196 | if not path.lower().endswith(".pdf") or parser_type == ParserType.STATIC_PARSE: 197 | kwargs["split"] = False 198 | result = parse_chunk_list([path], parser_type, kwargs) 199 | else: 200 | kwargs["split"] = True 201 | split_dir = os.path.join(temp_dir, "splits/") 202 | os.makedirs(split_dir, exist_ok=True) 203 | split_pdf(path, split_dir, pages_per_split) 204 | split_files = sorted(glob(os.path.join(split_dir, "*.pdf"))) 205 | 206 | chunk_size = max(1, len(split_files) // max_processes) 207 | file_chunks = [ 208 | split_files[i : i + chunk_size] 209 | for i in range(0, len(split_files), chunk_size) 210 | ] 211 | 212 | process_args = [(chunk, parser_type, kwargs) for chunk in file_chunks] 213 | 214 | if max_processes == 1 or len(file_chunks) == 1: 215 | chunk_results = [parse_chunk_list(*args) for args in process_args] 216 | else: 217 | with ProcessPoolExecutor(max_workers=max_processes) as executor: 218 | chunk_results = list( 219 | executor.map(parse_chunk_list, *zip(*process_args)) 220 | ) 221 | 222 | # Combine results from all chunks 223 | result = { 224 | "raw": "\n\n".join(r["raw"] for r in chunk_results), 225 | "segments": [seg for r in chunk_results for seg in r["segments"]], 226 | "title": kwargs["title"], 227 | "url": kwargs.get("url", ""), 228 | "parent_title": kwargs.get("parent_title", ""), 229 | "recursive_docs": [], 230 | "token_usage": { 231 | "input": sum(r["token_usage"]["input"] for r in chunk_results), 232 | "output": sum(r["token_usage"]["output"] for r in chunk_results), 233 | "llm_page_count": sum( 234 | r["token_usage"]["llm_page_count"] for r in chunk_results 235 | ), 236 | "total": sum(r["token_usage"]["total"] for r in chunk_results), 237 | }, 238 | } 239 | 240 | if "api_cost_mapping" in kwargs and "token_usage" in result: 241 | api_cost_mapping = kwargs["api_cost_mapping"] 242 | if isinstance(api_cost_mapping, dict): 243 | api_cost_mapping = api_cost_mapping 244 | elif isinstance(api_cost_mapping, str) and os.path.exists(api_cost_mapping): 245 | with open(api_cost_mapping, "r") as f: 246 | api_cost_mapping = json.load(f) 247 
| else: 248 | raise ValueError(f"Unsupported API cost value: {api_cost_mapping}.") 249 | 250 | api_cost = api_cost_mapping.get( 251 | kwargs.get("model", "gemini-2.0-flash"), None 252 | ) 253 | if api_cost: 254 | token_usage = result["token_usage"] 255 | token_cost = { 256 | "input": token_usage["input"] * api_cost["input"] / 1_000_000, 257 | "input-image": api_cost.get("input-image", 0) 258 | * token_usage.get("llm_page_count", 0), 259 | "output": token_usage["output"] * api_cost["output"] / 1_000_000, 260 | } 261 | token_cost["total"] = ( 262 | token_cost["input"] 263 | + token_cost["input-image"] 264 | + token_cost["output"] 265 | ) 266 | result["token_cost"] = token_cost 267 | 268 | if as_pdf: 269 | result["pdf_path"] = path 270 | 271 | if depth > 1: 272 | recursive_docs = [] 273 | for segment in result["segments"]: 274 | urls = re.findall( 275 | r'https?://[^\s<>"\']+|www\.[^\s<>"\']+(?:\.[^\s<>"\']+)*', 276 | segment["content"], 277 | ) 278 | for url in urls: 279 | if "](" in url: 280 | url = url.split("](")[-1] 281 | logger.debug(f"Reading content from {url}") 282 | if not url.startswith("http"): 283 | url = "https://" + url 284 | 285 | kwargs_cp = kwargs.copy() 286 | kwargs_cp["depth"] = depth - 1 287 | kwargs_cp["parent_title"] = result["title"] 288 | sub_doc = parse( 289 | url, 290 | parser_type=parser_type, 291 | pages_per_split=pages_per_split, 292 | max_processes=max_processes, 293 | **kwargs_cp, 294 | ) 295 | recursive_docs.append(sub_doc) 296 | 297 | result["recursive_docs"] = recursive_docs 298 | 299 | return result 300 | 301 | 302 | def parse_with_schema( 303 | path: str, schema: Dict, api: str = "openai", model: str = "gpt-4o-mini", **kwargs 304 | ) -> List[List[Dict]]: 305 | """ 306 | Parses a PDF using an LLM to generate structured output conforming to a given JSON schema. 307 | 308 | Args: 309 | path (str): Path to the PDF file. 310 | schema (Dict): JSON schema to which the parsed output should conform. 311 | api (str, optional): LLM API provider (One of "openai", "huggingface", "together", "openrouter", and "fireworks"). 312 | model (str, optional): LLM model name. 313 | **kwargs: Additional arguments for the parser (e.g.: temperature, max_tokens). 314 | 315 | Returns: 316 | List[List[Dict]]: List of dictionaries for each page, each conforming to the provided schema. 317 | """ 318 | system_prompt = f""" 319 | The output should be formatted as a JSON instance that conforms to the JSON schema below. 320 | 321 | As an example, for the schema {{ 322 | "properties": {{ 323 | "foo": {{ 324 | "title": "Foo", 325 | "description": "a list of strings", 326 | "type": "array", 327 | "items": {{"type": "string"}} 328 | }} 329 | }}, 330 | "required": ["foo"] 331 | }}, the object {{"foo": ["bar", "baz"]}} is valid. The object {{"properties": {{"foo": ["bar", "baz"]}}}} is not. 332 | 333 | Here is the output schema: 334 | {json.dumps(schema, indent=2)} 335 | 336 | """ 337 | 338 | user_prompt = "You are an AI agent that parses documents and returns them in the specified JSON format. Please parse the document and return it in the required format." 
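    # Each page image is sent to the vision model together with the schema prompt;
    # any ```json fences are stripped from the reply before json.loads, so the
    # result is one schema-conforming dict per page.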
339 | 340 | responses = [] 341 | images = convert_doc_to_base64_images(path) 342 | for i, (page_num, image) in enumerate(images): 343 | resp_dict = create_response( 344 | api=api, 345 | model=model, 346 | user_prompt=user_prompt, 347 | system_prompt=system_prompt, 348 | image_url=image, 349 | temperature=kwargs.get("temperature", 0.0), 350 | max_tokens=kwargs.get("max_tokens", 1024), 351 | ) 352 | 353 | response = resp_dict.get("response", "") 354 | response = response.split("```json")[-1].split("```")[0].strip() 355 | logger.debug(f"Processing page {page_num + 1} with response: {response}") 356 | new_dict = json.loads(response) 357 | responses.append(new_dict) 358 | 359 | return responses 360 | -------------------------------------------------------------------------------- /lexoid/core/parse_type/llm_parser.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import io 3 | import mimetypes 4 | import os 5 | import time 6 | from functools import wraps 7 | from typing import Dict, List, Optional, Tuple 8 | 9 | import pypdfium2 as pdfium 10 | import requests 11 | from huggingface_hub import InferenceClient 12 | from loguru import logger 13 | from openai import OpenAI 14 | from requests.exceptions import HTTPError 15 | from together import Together 16 | 17 | from lexoid.core.prompt_templates import ( 18 | INSTRUCTIONS_ADD_PG_BREAK, 19 | LLAMA_PARSER_PROMPT, 20 | OPENAI_USER_PROMPT, 21 | PARSER_PROMPT, 22 | ) 23 | from lexoid.core.utils import convert_image_to_pdf 24 | 25 | 26 | def retry_on_http_error(func): 27 | @wraps(func) 28 | def wrapper(*args, **kwargs): 29 | try: 30 | return func(*args, **kwargs) 31 | except HTTPError as e: 32 | logger.error(f"HTTPError encountered: {e}. Retrying in 10 seconds...") 33 | time.sleep(10) 34 | try: 35 | logger.debug(f"Retry {func.__name__}") 36 | return func(*args, **kwargs) 37 | except HTTPError as e: 38 | logger.error(f"Retry failed: {e}") 39 | return { 40 | "raw": "", 41 | "segments": [], 42 | "title": kwargs["title"], 43 | "url": kwargs.get("url", ""), 44 | "parent_title": kwargs.get("parent_title", ""), 45 | "recursive_docs": [], 46 | "error": f"HTTPError encountered on page {kwargs.get('start', 0)}: {e}", 47 | } 48 | 49 | return wrapper 50 | 51 | 52 | @retry_on_http_error 53 | def parse_llm_doc(path: str, **kwargs) -> List[Dict] | str: 54 | if "api_provider" in kwargs and kwargs["api_provider"]: 55 | return parse_with_api(path, api=kwargs["api_provider"], **kwargs) 56 | if "model" not in kwargs: 57 | kwargs["model"] = "gemini-2.0-flash" 58 | model = kwargs.get("model") 59 | if model.startswith("gemini"): 60 | return parse_with_gemini(path, **kwargs) 61 | if model.startswith("gpt"): 62 | return parse_with_api(path, api="openai", **kwargs) 63 | if model.startswith("meta-llama"): 64 | if "Turbo" in model or model == "meta-llama/Llama-Vision-Free": 65 | return parse_with_api(path, api="together", **kwargs) 66 | return parse_with_api(path, api="huggingface", **kwargs) 67 | if any(model.startswith(prefix) for prefix in ["microsoft", "google", "qwen"]): 68 | return parse_with_api(path, api="openrouter", **kwargs) 69 | if model.startswith("accounts/fireworks"): 70 | return parse_with_api(path, api="fireworks", **kwargs) 71 | raise ValueError(f"Unsupported model: {model}") 72 | 73 | 74 | def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str: 75 | logger.debug(f"Parsing with Gemini API and model {kwargs['model']}") 76 | api_key = os.environ.get("GOOGLE_API_KEY") 77 | if not api_key: 78 | raise 
ValueError("GOOGLE_API_KEY environment variable is not set") 79 | 80 | url = f"https://generativelanguage.googleapis.com/v1beta/models/{kwargs['model']}:generateContent?key={api_key}" 81 | 82 | # Check if the file is an image and convert to PDF if necessary 83 | mime_type, _ = mimetypes.guess_type(path) 84 | if mime_type and mime_type.startswith("image"): 85 | pdf_content = convert_image_to_pdf(path) 86 | mime_type = "application/pdf" 87 | base64_file = base64.b64encode(pdf_content).decode("utf-8") 88 | else: 89 | with open(path, "rb") as file: 90 | file_content = file.read() 91 | base64_file = base64.b64encode(file_content).decode("utf-8") 92 | 93 | if "system_prompt" in kwargs: 94 | prompt = kwargs["system_prompt"] 95 | else: 96 | # Ideally, we do this ourselves. But, for now this might be a good enough. 97 | custom_instruction = f"""- Total number of pages: {kwargs["pages_per_split_"]}. {INSTRUCTIONS_ADD_PG_BREAK}""" 98 | if kwargs["pages_per_split_"] == 1: 99 | custom_instruction = "" 100 | prompt = PARSER_PROMPT.format(custom_instructions=custom_instruction) 101 | 102 | payload = { 103 | "contents": [ 104 | { 105 | "parts": [ 106 | {"text": prompt}, 107 | {"inline_data": {"mime_type": mime_type, "data": base64_file}}, 108 | ] 109 | } 110 | ], 111 | "generationConfig": { 112 | "temperature": kwargs.get("temperature", 0.2), 113 | }, 114 | } 115 | 116 | headers = {"Content-Type": "application/json"} 117 | try: 118 | response = requests.post(url, json=payload, headers=headers, timeout=120) 119 | response.raise_for_status() 120 | except requests.Timeout as e: 121 | raise HTTPError(f"Timeout error occurred: {e}") 122 | 123 | result = response.json() 124 | 125 | raw_text = "".join( 126 | part["text"] 127 | for candidate in result.get("candidates", []) 128 | for part in candidate.get("content", {}).get("parts", []) 129 | if "text" in part 130 | ) 131 | 132 | combined_text = "" 133 | if "" in raw_text: 134 | combined_text = raw_text.split("")[-1].strip() 135 | if "" in result: 136 | combined_text = result.split("")[0].strip() 137 | 138 | token_usage = result["usageMetadata"] 139 | input_tokens = token_usage.get("promptTokenCount", 0) 140 | output_tokens = token_usage.get("candidatesTokenCount", 0) 141 | total_tokens = input_tokens + output_tokens 142 | 143 | return { 144 | "raw": combined_text.replace("", "\n\n"), 145 | "segments": [ 146 | {"metadata": {"page": kwargs.get("start", 0) + page_no}, "content": page} 147 | for page_no, page in enumerate(combined_text.split(""), start=1) 148 | ], 149 | "title": kwargs["title"], 150 | "url": kwargs.get("url", ""), 151 | "parent_title": kwargs.get("parent_title", ""), 152 | "recursive_docs": [], 153 | "token_usage": { 154 | "input": input_tokens, 155 | "output": output_tokens, 156 | "total": total_tokens, 157 | }, 158 | } 159 | 160 | 161 | def convert_pdf_page_to_base64( 162 | pdf_document: pdfium.PdfDocument, page_number: int 163 | ) -> str: 164 | """Convert a PDF page to a base64-encoded PNG string.""" 165 | page = pdf_document[page_number] 166 | # Render with 4x scaling for better quality 167 | pil_image = page.render(scale=4).to_pil() 168 | 169 | # Convert to base64 170 | img_byte_arr = io.BytesIO() 171 | pil_image.save(img_byte_arr, format="PNG") 172 | img_byte_arr.seek(0) 173 | return base64.b64encode(img_byte_arr.getvalue()).decode("utf-8") 174 | 175 | 176 | def get_messages( 177 | system_prompt: Optional[str], user_prompt: Optional[str], image_url: Optional[str] 178 | ) -> List[Dict]: 179 | messages = [] 180 | if system_prompt: 181 | 
messages.append( 182 | { 183 | "role": "system", 184 | "content": system_prompt, 185 | } 186 | ) 187 | base_message = ( 188 | [ 189 | {"type": "text", "text": user_prompt}, 190 | ] 191 | if user_prompt 192 | else [] 193 | ) 194 | image_message = ( 195 | [ 196 | { 197 | "type": "image_url", 198 | "image_url": {"url": image_url}, 199 | } 200 | ] 201 | if image_url 202 | else [] 203 | ) 204 | 205 | messages.append( 206 | { 207 | "role": "user", 208 | "content": base_message + image_message, 209 | } 210 | ) 211 | 212 | return messages 213 | 214 | 215 | def create_response( 216 | api: str, 217 | model: str, 218 | system_prompt: Optional[str] = None, 219 | user_prompt: Optional[str] = None, 220 | image_url: Optional[str] = None, 221 | temperature: float = 0.2, 222 | max_tokens: int = 1024, 223 | ) -> Dict: 224 | # Initialize appropriate client 225 | clients = { 226 | "openai": lambda: OpenAI(), 227 | "huggingface": lambda: InferenceClient( 228 | token=os.environ["HUGGINGFACEHUB_API_TOKEN"] 229 | ), 230 | "together": lambda: Together(), 231 | "openrouter": lambda: OpenAI( 232 | base_url="https://openrouter.ai/api/v1", 233 | api_key=os.environ["OPENROUTER_API_KEY"], 234 | ), 235 | "fireworks": lambda: OpenAI( 236 | base_url="https://api.fireworks.ai/inference/v1", 237 | api_key=os.environ["FIREWORKS_API_KEY"], 238 | ), 239 | } 240 | assert api in clients, f"Unsupported API: {api}" 241 | client = clients[api]() 242 | 243 | # Prepare messages for the API call 244 | messages = get_messages(system_prompt, user_prompt, image_url) 245 | 246 | # Common completion parameters 247 | completion_params = { 248 | "model": model, 249 | "messages": messages, 250 | "max_tokens": max_tokens, 251 | "temperature": temperature, 252 | } 253 | 254 | # Get completion from selected API 255 | response = client.chat.completions.create(**completion_params) 256 | token_usage = response.usage 257 | 258 | # Extract the response text 259 | page_text = response.choices[0].message.content 260 | 261 | return { 262 | "response": page_text, 263 | "usage": token_usage, 264 | } 265 | 266 | 267 | def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str: 268 | """ 269 | Parse documents (PDFs or images) using various vision model APIs. 270 | 271 | Args: 272 | path (str): Path to the document to parse 273 | api (str): Which API to use ("openai", "huggingface", or "together") 274 | **kwargs: Additional arguments including model, temperature, title, etc. 
275 | 276 | Returns: 277 | Dict: Dictionary containing parsed document data 278 | """ 279 | logger.debug(f"Parsing with {api} API and model {kwargs['model']}") 280 | 281 | # Handle different input types 282 | mime_type, _ = mimetypes.guess_type(path) 283 | if mime_type and mime_type.startswith("image"): 284 | # Single image processing 285 | with open(path, "rb") as img_file: 286 | image_base64 = base64.b64encode(img_file.read()).decode("utf-8") 287 | images = [(0, f"data:{mime_type};base64,{image_base64}")] 288 | else: 289 | # PDF processing 290 | pdf_document = pdfium.PdfDocument(path) 291 | images = [ 292 | ( 293 | page_num, 294 | f"data:image/png;base64,{convert_pdf_page_to_base64(pdf_document, page_num)}", 295 | ) 296 | for page_num in range(len(pdf_document)) 297 | ] 298 | 299 | # Process each page/image 300 | all_results = [] 301 | for page_num, image_url in images: 302 | if api == "openai": 303 | system_prompt = kwargs.get( 304 | "system_prompt", PARSER_PROMPT.format(custom_instructions="") 305 | ) 306 | user_prompt = kwargs.get("user_prompt", OPENAI_USER_PROMPT) 307 | else: 308 | system_prompt = kwargs.get("system_prompt", None) 309 | user_prompt = kwargs.get("user_prompt", LLAMA_PARSER_PROMPT) 310 | 311 | response = create_response( 312 | api=api, 313 | model=kwargs["model"], 314 | system_prompt=system_prompt, 315 | user_prompt=user_prompt, 316 | image_url=image_url, 317 | temperature=kwargs.get("temperature", 0.2), 318 | max_tokens=kwargs.get("max_tokens", 1024), 319 | ) 320 | 321 | # Get completion from selected API 322 | page_text = response["response"] 323 | token_usage = response["usage"] 324 | 325 | if kwargs.get("verbose", None): 326 | logger.debug(f"Page {page_num + 1} response: {page_text}") 327 | 328 | # Extract content between output tags if present 329 | result = page_text 330 | if "" in page_text: 331 | result = page_text.split("")[-1].strip() 332 | if "" in result: 333 | result = result.split("")[0].strip() 334 | all_results.append( 335 | ( 336 | page_num, 337 | result, 338 | token_usage.prompt_tokens, 339 | token_usage.completion_tokens, 340 | token_usage.total_tokens, 341 | ) 342 | ) 343 | 344 | # Sort results by page number and combine 345 | all_results.sort(key=lambda x: x[0]) 346 | all_texts = [text for _, text, _, _, _ in all_results] 347 | combined_text = "\n\n".join(all_texts) 348 | 349 | return { 350 | "raw": combined_text, 351 | "segments": [ 352 | { 353 | "metadata": { 354 | "page": kwargs.get("start", 0) + page_no + 1, 355 | "token_usage": { 356 | "input": input_tokens, 357 | "output": output_tokens, 358 | "total": total_tokens, 359 | }, 360 | }, 361 | "content": page, 362 | } 363 | for page_no, page, input_tokens, output_tokens, total_tokens in all_results 364 | ], 365 | "title": kwargs["title"], 366 | "url": kwargs.get("url", ""), 367 | "parent_title": kwargs.get("parent_title", ""), 368 | "recursive_docs": [], 369 | "token_usage": { 370 | "input": sum(input_tokens for _, _, input_tokens, _, _ in all_results), 371 | "output": sum(output_tokens for _, _, _, output_tokens, _ in all_results), 372 | "total": sum(total_tokens for _, _, _, _, total_tokens in all_results), 373 | }, 374 | } 375 | 376 | 377 | def convert_doc_to_base64_images(path: str) -> List[Tuple[int, str]]: 378 | """ 379 | Converts a document (PDF or image) to a base64 encoded string. 380 | 381 | Args: 382 | path (str): Path to the PDF file. 383 | 384 | Returns: 385 | str: Base64 encoded string of the PDF content. 
386 | """ 387 | if path.endswith(".pdf"): 388 | pdf_document = pdfium.PdfDocument(path) 389 | return [ 390 | ( 391 | page_num, 392 | f"data:image/png;base64,{convert_pdf_page_to_base64(pdf_document, page_num)}", 393 | ) 394 | for page_num in range(len(pdf_document)) 395 | ] 396 | elif mimetypes.guess_type(path)[0].startswith("image"): 397 | with open(path, "rb") as img_file: 398 | image_base64 = base64.b64encode(img_file.read()).decode("utf-8") 399 | return [(0, f"data:image/png;base64,{image_base64}")] 400 | -------------------------------------------------------------------------------- /lexoid/core/prompt_templates.py: -------------------------------------------------------------------------------- 1 | # Initial prompt, 2 | # This might go through further changes as the library evolves. 3 | PARSER_PROMPT = """\ 4 | You are a specialized document parsing (including OCR) and conversion agent. 5 | Your primary task is to analyze various types of documents and reproduce their content in a format that, when rendered, visually replicates the original input as closely as possible. 6 | Your output should use a combination of Markdown and HTML to achieve this goal. 7 | Think step-by-step. 8 | 9 | **Instructions:** 10 | - Analyze the given document thoroughly, identify formatting patterns, choose optimal markup, implement conversion and verify quality. 11 | - Your primary goal is to ensure structural fidelity of the input is replicated. Preserve all content without loss. 12 | - Use a combination of Markdown and HTML in your output. HTML can be used anywhere in the document, not just for complex structures. Choose the format that best replicates the original structural appearance. However, keep the font colors black and the background colors white. 13 | - When reproducing tables, use HTML tables (, ,
) if they better represent the original layout. Utilize `colspan` and `rowspan` attributes as necessary to accurately represent merged cells. 14 | - Preserve all formatting elements such as bold, italic, underline, strikethrough text, font sizes, and colors using appropriate HTML tags and inline styles if needed. 15 | - Maintain the hierarchy (h1-h6) and styling of headings and subheadings using appropriate HTML tags or Markdown. 16 | - Visual Elements: 17 | * Images: If there is text within the image, try to recreate the structure within the image. If there is no text, describe the image content and position, and use placeholder `` tags to represent their location in the document. Capture the image meaning in the alt attribute. Don't specify src if not known. 18 | * Emojis: Use Unicode characters instead of images. 19 | * Charts/Diagrams: For content that cannot be accurately represented in text format, provide a detailed textual description within an HTML element that visually represents its position in the document. 20 | * Complex visuals: Mark with [?] and make a note for ambiguities or uncertain interpretations in the document. Use HTML comments for conversion notes. Only output notes with comment tags. 21 | - Special Characters: 22 | * Letters with ascenders are usually: b, d, f, h, k, l, t 23 | * Letters with descenders are usually: g, j, p, q, y. Lowercase f and z also have descenders in many typefaces. 24 | * Pay special attention to these commonly confused character pairs, 25 | Letter 'l' vs number '1' vs exclamation mark '!' 26 | Number '2' vs letter 'Z' 27 | Number '5' vs letter 'S' 28 | Number '51' vs number '±1' 29 | Number '6' vs letter 'G' vs letter 'b' 30 | Number '0' vs letter 'O' 31 | Number '8' vs letter 'B' 32 | Letter 'f' vs letter 't' 33 | * Contextual clues to differentiate: 34 | - If in a numeric column, interpret 'O' as '0' 35 | - If preceded/followed by numbers, interpret 'l' as '1' 36 | - Consider font characteristics, e.g. 37 | '1' typically has no serif 38 | '2' has a curved bottom vs 'Z's straight line 39 | '5' has more rounded features than 'S' 40 | '6' has a closed loop vs 'G's open curve 41 | '0' is typically more oval than 'O' 42 | '8' has a more angular top than 'B' 43 | {custom_instructions} 44 | - Return only the correct markdown without additional text or explanations. 45 | - DO NOT use code blocks such as "```html" or "```markdown" in the output unless there is a code block in the content. 46 | - Think before generating the output in tags. 47 | 48 | Remember, your primary objective is to create an output that, when rendered, structurally replicates the original document's content as closely as possible without losing any textual details. 49 | Prioritize replicating structure above all else. 50 | Use tables without borders to represent column-like structures. 51 | Keep the font color black (#000000) and the background white (#ffffff). 52 | 53 | OUTPUT FORMAT: 54 | Enclose the response within XML tags as follows: 55 | 56 | [Step-by-step analysis and generation strategy] 57 | 58 | 59 | "Your converted document content here in markdown format" 60 | 61 | 62 | Quality Checks: 63 | 1. Verify structural and layout accuracy 64 | 2. Verify content completeness 65 | 3. Visual element handling 66 | 4. Hierarchy preservation 67 | 5. Confirm table alignment and cell merging accuracy 68 | 6. Spacing fidelity 69 | 7. Verify that numbers fall within expected ranges for their column 70 | 8. Flag any suspicious characters that could be OCR errors 71 | 9. 
Validate markdown syntax 72 | """ 73 | 74 | OPENAI_USER_PROMPT = """\ 75 | Convert the following document to markdown. 76 | Ensure accurate representation of all content, including tables and visual elements, per your instructions. 77 | """ 78 | 79 | INSTRUCTIONS_ADD_PG_BREAK = "Insert a `` tag between the content of each page to maintain the original page structure." 80 | 81 | LLAMA_PARSER_PROMPT = """\ 82 | You are a document conversion assistant. Your task is to accurately reproduce the content of an image in Markdown and HTML format, maintaining the visual structure and layout of the original document as closely as possible. 83 | 84 | Instructions: 85 | 1. Use a combination of Markdown and HTML to replicate the document's layout and formatting. 86 | 2. Reproduce all text content exactly as it appears, including preserving capitalization, punctuation, and any apparent errors or inconsistencies in the original. 87 | 3. Use appropriate Markdown syntax for headings, emphasis (bold, italic), and lists where applicable. 88 | 4. Always use HTML (``, ``, `
`) to represent tabular data. Include `colspan` and `rowspan` attributes if needed. 89 | 5. For figures, graphs, or diagrams, represent them using `` tags and use appropriate `alt` text. 90 | 6. For handwritten documents, reproduce the content as typed text, maintaining the original structure and layout. 91 | 7. Do not include any descriptions of the document's appearance, paper type, or writing implements used. 92 | 8. Do not add any explanatory notes, comments, or additional information outside of the converted content. 93 | 9. Ensure all special characters, symbols, and equations are accurately represented. 94 | 10. Provide the output only once, without any duplication. 95 | 11. Enclose the entire output within and tags. 96 | 97 | Output the converted content directly in Markdown and HTML without any additional explanations, descriptions, or notes. 98 | """ 99 | -------------------------------------------------------------------------------- /lexoid/core/utils.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import io 3 | import mimetypes 4 | import os 5 | import re 6 | import sys 7 | from difflib import SequenceMatcher 8 | from hashlib import md5 9 | from typing import Dict, List, Optional 10 | from urllib.parse import urlparse 11 | 12 | import nest_asyncio 13 | import pikepdf 14 | import pypdfium2 15 | import requests 16 | from bs4 import BeautifulSoup 17 | from docx2pdf import convert 18 | from loguru import logger 19 | from markdown import markdown 20 | from markdownify import markdownify as md 21 | from PIL import Image 22 | from PyQt5.QtCore import QMarginsF, QUrl 23 | from PyQt5.QtGui import QPageLayout, QPageSize 24 | from PyQt5.QtPrintSupport import QPrinter 25 | from PyQt5.QtWebEngineWidgets import QWebEngineView 26 | from PyQt5.QtWidgets import QApplication 27 | 28 | # Source: https://stackoverflow.com/a/12982689 29 | HTML_TAG_PATTERN = re.compile("<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});") 30 | 31 | 32 | def split_pdf(input_path: str, output_dir: str, pages_per_split: int): 33 | paths = [] 34 | with pikepdf.open(input_path) as pdf: 35 | total_pages = len(pdf.pages) 36 | for start in range(0, total_pages, pages_per_split): 37 | end = min(start + pages_per_split, total_pages) 38 | output_path = os.path.join( 39 | output_dir, f"split_{str(start + 1).zfill(4)}_{end}.pdf" 40 | ) 41 | with pikepdf.new() as new_pdf: 42 | new_pdf.pages.extend(pdf.pages[start:end]) 43 | new_pdf.save(output_path) 44 | paths.append(output_path) 45 | return paths 46 | 47 | 48 | def create_sub_pdf( 49 | input_path: str, output_path: str, page_nums: Optional[tuple[int, ...] 
| int] = None 50 | ) -> str: 51 | if isinstance(page_nums, int): 52 | page_nums = (page_nums,) 53 | page_nums = tuple(sorted(set(page_nums))) 54 | with pikepdf.open(input_path) as pdf: 55 | indices = page_nums if page_nums else range(len(pdf.pages)) 56 | with pikepdf.new() as new_pdf: 57 | new_pdf.pages.extend([pdf.pages[i - 1] for i in indices]) 58 | new_pdf.save(output_path) 59 | return output_path 60 | 61 | 62 | def convert_image_to_pdf(image_path: str) -> bytes: 63 | with Image.open(image_path) as img: 64 | img_rgb = img.convert("RGB") 65 | pdf_buffer = io.BytesIO() 66 | img_rgb.save(pdf_buffer, format="PDF") 67 | return pdf_buffer.getvalue() 68 | 69 | 70 | def remove_html_tags(text: str): 71 | html = markdown(text, extensions=["tables"]) 72 | return re.sub(HTML_TAG_PATTERN, "", html) 73 | 74 | 75 | def calculate_similarity(text1: str, text2: str, ignore_html=True) -> float: 76 | """Calculate similarity ratio between two texts using SequenceMatcher.""" 77 | if ignore_html: 78 | text1 = remove_html_tags(text1) 79 | text2 = remove_html_tags(text2) 80 | return SequenceMatcher(None, text1, text2).ratio() 81 | 82 | 83 | def convert_pdf_page_to_image( 84 | pdf_document: pypdfium2.PdfDocument, page_number: int 85 | ) -> bytes: 86 | """Convert a PDF page to an image.""" 87 | page = pdf_document[page_number] 88 | # Render with 4x scaling for better quality 89 | pil_image = page.render(scale=4).to_pil() 90 | 91 | # Convert to bytes 92 | img_byte_arr = io.BytesIO() 93 | pil_image.save(img_byte_arr, format="PNG") 94 | img_byte_arr.seek(0) 95 | return img_byte_arr.getvalue() 96 | 97 | 98 | def get_file_type(path: str) -> str: 99 | """Get the file type of a file based on its extension.""" 100 | return mimetypes.guess_type(path)[0] 101 | 102 | 103 | def is_supported_file_type(path: str) -> bool: 104 | """Check if the file type is supported for parsing.""" 105 | file_type = get_file_type(path) 106 | if ( 107 | file_type == "application/pdf" 108 | or "wordprocessing" in file_type 109 | or "spreadsheet" in file_type 110 | or "presentation" in file_type 111 | or file_type.startswith("image/") 112 | or file_type.startswith("text") 113 | ): 114 | return True 115 | return False 116 | 117 | 118 | def is_supported_url_file_type(url: str) -> bool: 119 | """ 120 | Check if the file type from the URL is supported. 121 | 122 | Args: 123 | url (str): The URL of the file. 124 | 125 | Returns: 126 | bool: True if the file type is supported, False otherwise. 127 | """ 128 | supported_extensions = [".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".gif"] 129 | parsed_url = urlparse(url) 130 | ext = os.path.splitext(parsed_url.path)[1].lower() 131 | 132 | if ext in supported_extensions: 133 | return True 134 | 135 | # If no extension in URL, try to get content type from headers 136 | try: 137 | response = requests.head(url) 138 | except requests.exceptions.ConnectionError: 139 | return False 140 | content_type = response.headers.get("Content-Type", "") 141 | ext = mimetypes.guess_extension(content_type) 142 | 143 | return ext in supported_extensions 144 | 145 | 146 | def download_file(url: str, temp_dir: str) -> str: 147 | """ 148 | Downloads a file from the given URL and saves it to a temporary directory. 149 | 150 | Args: 151 | url (str): The URL of the file to download. 152 | temp_dir (str): The temporary directory to save the file. 153 | 154 | Returns: 155 | str: The path to the downloaded file. 
156 | """ 157 | response = requests.get(url) 158 | file_name = os.path.basename(urlparse(url).path) 159 | if not file_name: 160 | content_type = response.headers.get("Content-Type", "") 161 | ext = mimetypes.guess_extension(content_type) 162 | file_name = f"downloaded_file{ext}" if ext else "downloaded_file" 163 | 164 | file_path = os.path.join(temp_dir, file_name) 165 | with open(file_path, "wb") as f: 166 | f.write(response.content) 167 | return file_path 168 | 169 | 170 | def find_dominant_heading_level(markdown_content: str) -> str: 171 | """ 172 | Finds the most common heading level that occurs more than once. 173 | Also checks for underline style headings (---). 174 | 175 | Args: 176 | markdown_content (str): The markdown content to analyze 177 | 178 | Returns: 179 | str: The dominant heading pattern (e.g., '##' or 'underline') 180 | """ 181 | # Check for underline style headings first 182 | underline_pattern = r"^[^\n]+\n-+$" 183 | underline_matches = re.findall(underline_pattern, markdown_content, re.MULTILINE) 184 | if len(underline_matches) > 1: 185 | return "underline" 186 | 187 | # Find all hash-style headings in the markdown content 188 | heading_patterns = ["#####", "####", "###", "##", "#"] 189 | heading_counts = {} 190 | 191 | for pattern in heading_patterns: 192 | # Look for headings at the start of a line 193 | regex = f"^{pattern} .*$" 194 | matches = re.findall(regex, markdown_content, re.MULTILINE) 195 | if len(matches) > 1: # Only consider headings that appear more than once 196 | heading_counts[pattern] = len(matches) 197 | 198 | if not heading_counts: 199 | return "#" # Default to h1 if no repeated headings found 200 | 201 | return min(heading_counts.keys(), key=len) 202 | 203 | 204 | def split_md_by_headings(markdown_content: str, heading_pattern: str) -> List[Dict]: 205 | """ 206 | Splits markdown content by the specified heading pattern and structures it. 
207 | 208 | Args: 209 | markdown_content (str): The markdown content to split 210 | heading_pattern (str): The heading pattern to split on (e.g., '##' or 'underline') 211 | 212 | Returns: 213 | List[Dict]: List of dictionaries containing metadata and content 214 | """ 215 | structured_content = [] 216 | 217 | if heading_pattern == "underline": 218 | # Split by underline headings 219 | pattern = r"^([^\n]+)\n-+$" 220 | sections = re.split(pattern, markdown_content, flags=re.MULTILINE) 221 | # Remove empty sections and strip whitespace 222 | sections = [section.strip() for section in sections] 223 | 224 | # Handle content before first heading if it exists 225 | if sections and not re.match(r"^[^\n]+\n-+$", sections[0], re.MULTILINE): 226 | structured_content.append( 227 | { 228 | "metadata": {"page": "Introduction"}, 229 | "content": sections.pop(0), 230 | } 231 | ) 232 | 233 | # Process sections pairwise (heading, content) 234 | for i in range(0, len(sections), 2): 235 | if i + 1 < len(sections): 236 | structured_content.append( 237 | { 238 | "metadata": {"page": sections[i]}, 239 | "content": sections[i + 1], 240 | } 241 | ) 242 | else: 243 | # Split by hash headings 244 | regex = f"^{heading_pattern} .*$" 245 | sections = re.split(regex, markdown_content, flags=re.MULTILINE) 246 | headings = re.findall(regex, markdown_content, flags=re.MULTILINE) 247 | 248 | # Remove empty sections and strip whitespace 249 | sections = [section.strip() for section in sections] 250 | 251 | # Handle content before first heading if it exists 252 | if len(sections) > len(headings): 253 | structured_content.append( 254 | { 255 | "metadata": {"page": "Introduction"}, 256 | "content": sections.pop(0), 257 | } 258 | ) 259 | 260 | # Process remaining sections 261 | for heading, content in zip(headings, sections): 262 | clean_heading = heading.replace(heading_pattern, "").strip() 263 | structured_content.append( 264 | { 265 | "metadata": {"page": clean_heading}, 266 | "content": content, 267 | } 268 | ) 269 | 270 | return structured_content 271 | 272 | 273 | def html_to_markdown(html: str, title: str, url: str) -> str: 274 | """ 275 | Converts HTML content to markdown. 276 | 277 | Args: 278 | html (str): The HTML content to convert. 
279 | title (str): The title of the HTML page 280 | url (str): The URL of the HTML page 281 | 282 | Returns: 283 | Dict: Dictionary containing parsed document data 284 | """ 285 | markdown_content = md(html) 286 | 287 | # Find the dominant heading level 288 | heading_pattern = find_dominant_heading_level(markdown_content) 289 | 290 | # Split content by headings and structure it 291 | split_md = split_md_by_headings(markdown_content, heading_pattern) 292 | 293 | content = { 294 | "raw": markdown_content, 295 | "segments": split_md, 296 | "title": title, 297 | "url": url, 298 | "parent_title": "", 299 | "recursive_docs": [], 300 | } 301 | 302 | return content 303 | 304 | 305 | def get_webpage_soup(url: str) -> BeautifulSoup: 306 | try: 307 | from playwright.async_api import async_playwright 308 | 309 | nest_asyncio.apply() 310 | 311 | async def fetch_page(): 312 | async with async_playwright() as p: 313 | browser = await p.chromium.launch( 314 | headless=True, 315 | args=[ 316 | "--disable-blink-features=AutomationControlled", 317 | "--no-sandbox", 318 | "--window-size=1920,1080", 319 | ], 320 | ) 321 | context = await browser.new_context( 322 | viewport={"width": 1920, "height": 1080}, 323 | user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", 324 | bypass_csp=True, 325 | ) 326 | page = await context.new_page() 327 | 328 | # Add headers to appear more like a real browser 329 | await page.set_extra_http_headers( 330 | { 331 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 332 | "Accept-Language": "en-US,en;q=0.5", 333 | "Sec-Fetch-Dest": "document", 334 | "Sec-Fetch-Mode": "navigate", 335 | "Sec-Fetch-Site": "none", 336 | "Sec-Fetch-User": "?1", 337 | } 338 | ) 339 | 340 | await page.goto(url) 341 | 342 | # Wait for Cloudflare check to complete 343 | await page.wait_for_load_state("networkidle") 344 | 345 | # Additional wait for any dynamic content 346 | try: 347 | await page.wait_for_selector("body", timeout=30000) 348 | except Exception: 349 | pass 350 | 351 | html = await page.content() 352 | await browser.close() 353 | return html 354 | 355 | loop = asyncio.get_event_loop() 356 | html = loop.run_until_complete(fetch_page()) 357 | soup = BeautifulSoup(html, "html.parser") 358 | except Exception as e: 359 | logger.debug( 360 | f"Error reading HTML content from URL, attempting with default https request: {str(e)}" 361 | ) 362 | response = requests.get(url) 363 | soup = BeautifulSoup( 364 | response.content, "html.parser", from_encoding="iso-8859-1" 365 | ) 366 | return soup 367 | 368 | 369 | def read_html_content(url: str) -> Dict: 370 | """ 371 | Reads the content of an HTML page from the given URL and converts it to markdown or structured content. 372 | 373 | Args: 374 | url (str): The URL of the HTML page. 375 | 376 | Returns: 377 | Dict: Dictionary containing parsed document data 378 | """ 379 | 380 | soup = get_webpage_soup(url) 381 | title = soup.title.string.strip() if soup.title else "No title" 382 | url_hash = md5(url.encode("utf-8")).hexdigest()[:8] 383 | full_title = f"{title} - {url_hash}" 384 | return html_to_markdown(str(soup), title=full_title, url=url) 385 | 386 | 387 | def extract_urls_from_markdown(content: str) -> List[str]: 388 | """ 389 | Extracts URLs from markdown content using regex. 390 | Matches both [text](url) and bare http(s):// URLs. 
391 | 392 | Args: 393 | content (str): Markdown content to search for URLs 394 | 395 | Returns: 396 | List[str]: List of unique URLs found 397 | """ 398 | # Match markdown links [text](url) and bare URLs 399 | markdown_pattern = r"\[([^\]]+)\]\((https?://[^\s\)]+)\)" 400 | bare_url_pattern = r"(? Dict: 412 | """ 413 | Recursively reads HTML content from URLs up to specified depth. 414 | 415 | Args: 416 | url (str): The URL to parse 417 | depth (int): How many levels deep to recursively parse 418 | visited_urls (set): Set of already visited URLs to prevent cycles 419 | 420 | Returns: 421 | Dict: Dictionary containing parsed document data 422 | """ 423 | if visited_urls is None: 424 | visited_urls = set() 425 | 426 | if url in visited_urls: 427 | return { 428 | "raw": "", 429 | "segments": [], 430 | "title": "", 431 | "url": url, 432 | "parent_title": "", 433 | "recursive_docs": [], 434 | } 435 | 436 | visited_urls.add(url) 437 | 438 | try: 439 | content = read_html_content(url) 440 | except Exception as e: 441 | print(f"Error processing URL {url}: {str(e)}") 442 | return { 443 | "raw": "", 444 | "segments": [], 445 | "title": "", 446 | "url": url, 447 | "parent_title": "", 448 | "recursive_docs": [], 449 | } 450 | 451 | if depth <= 1: 452 | return content 453 | 454 | # Extract URLs from all content sections 455 | urls = extract_urls_from_markdown(content["raw"]) 456 | 457 | # Recursively process each URL 458 | recursive_docs = [] 459 | for sub_url in urls: 460 | if sub_url not in visited_urls: 461 | sub_content = recursive_read_html(sub_url, depth - 1, visited_urls) 462 | recursive_docs.append(sub_content) 463 | 464 | content["recursive_docs"] = recursive_docs 465 | return content 466 | 467 | 468 | def save_webpage_as_pdf(url: str, output_path: str) -> str: 469 | """ 470 | Saves a webpage as a PDF file using PyQt5. 471 | 472 | Args: 473 | url (str): The URL of the webpage. 474 | output_path (str): The path to save the PDF file. 475 | 476 | Returns: 477 | str: The path to the saved PDF file. 478 | """ 479 | if not QApplication.instance(): 480 | app = QApplication(sys.argv) 481 | else: 482 | app = QApplication.instance() 483 | web = QWebEngineView() 484 | web.load(QUrl(url)) 485 | 486 | def handle_print_finished(filename, status): 487 | print(f"PDF saved to: {filename}") 488 | app.quit() 489 | 490 | def handle_load_finished(status): 491 | if status: 492 | printer = QPrinter(QPrinter.HighResolution) 493 | printer.setOutputFormat(QPrinter.PdfFormat) 494 | printer.setOutputFileName(output_path) 495 | 496 | page_layout = QPageLayout( 497 | QPageSize(QPageSize.A4), QPageLayout.Portrait, QMarginsF(15, 15, 15, 15) 498 | ) 499 | printer.setPageLayout(page_layout) 500 | 501 | web.page().printToPdf(output_path) 502 | web.page().pdfPrintingFinished.connect(handle_print_finished) 503 | 504 | web.loadFinished.connect(handle_load_finished) 505 | app.exec_() 506 | 507 | return output_path 508 | 509 | 510 | def convert_to_pdf(input_path: str, output_path: str) -> str: 511 | """ 512 | Converts a file or webpage to PDF. 513 | 514 | Args: 515 | input_path (str): The path to the input file or URL. 516 | output_path (str): The path to save the output PDF file. 517 | 518 | Returns: 519 | str: The path to the saved PDF file. 
520 | """ 521 | if input_path.startswith(("http://", "https://")): 522 | return save_webpage_as_pdf(input_path, output_path) 523 | file_type = get_file_type(input_path) 524 | if file_type.startswith("image/"): 525 | img_data = convert_image_to_pdf(input_path) 526 | with open(output_path, "wb") as f: 527 | f.write(img_data) 528 | elif "word" in file_type: 529 | return convert_doc_to_pdf(input_path, os.path.dirname(output_path)) 530 | else: 531 | # Assume it's already a PDF, just copy it 532 | with open(input_path, "rb") as src, open(output_path, "wb") as dst: 533 | dst.write(src.read()) 534 | 535 | return output_path 536 | 537 | 538 | def has_image_in_pdf(path: str): 539 | with open(path, "rb") as fp: 540 | content = fp.read() 541 | return "Image".lower() in list( 542 | map(lambda x: x.strip(), (str(content).lower().split("/"))) 543 | ) 544 | 545 | 546 | def has_hyperlink_in_pdf(path: str): 547 | with open(path, "rb") as fp: 548 | content = fp.read() 549 | # URI tag is used if Links are hidden. 550 | return "URI".lower() in list( 551 | map(lambda x: x.strip(), (str(content).lower().split("/"))) 552 | ) 553 | 554 | 555 | def router(path: str, priority: str = "speed") -> str: 556 | """ 557 | Routes the file path to the appropriate parser based on the file type. 558 | 559 | Args: 560 | path (str): The file path to route. 561 | priority (str): The priority for routing: "accuracy" (preference to LLM_PARSE) or "speed" (preference to STATIC_PARSE). 562 | """ 563 | file_type = get_file_type(path) 564 | if ( 565 | file_type.startswith("text/") 566 | or "spreadsheet" in file_type 567 | or "presentation" in file_type 568 | ): 569 | return "STATIC_PARSE" 570 | 571 | if priority == "accuracy": 572 | # If the file is a PDF without images but has hyperlinks, use STATIC_PARSE 573 | # Otherwise, use LLM_PARSE 574 | has_image = has_image_in_pdf(path) 575 | has_hyperlink = has_hyperlink_in_pdf(path) 576 | if file_type == "application/pdf" and not has_image and has_hyperlink: 577 | logger.debug("Using STATIC_PARSE for PDF with hyperlinks and no images.") 578 | return "STATIC_PARSE" 579 | logger.debug( 580 | f"Using LLM_PARSE because PDF has image ({has_image}) or has no hyperlink ({has_hyperlink})." 581 | ) 582 | return "LLM_PARSE" 583 | else: 584 | # If the file is a PDF without images, use STATIC_PARSE 585 | # Otherwise, use LLM_PARSE 586 | if file_type == "application/pdf" and not has_image_in_pdf(path): 587 | logger.debug("Using STATIC_PARSE for PDF without images.") 588 | return "STATIC_PARSE" 589 | logger.debug("Using LLM_PARSE because PDF has images") 590 | return "LLM_PARSE" 591 | 592 | 593 | def convert_doc_to_pdf(input_path: str, temp_dir: str) -> str: 594 | temp_path = os.path.join( 595 | temp_dir, os.path.splitext(os.path.basename(input_path))[0] + ".pdf" 596 | ) 597 | 598 | # Convert the document to PDF 599 | # docx2pdf is not supported in linux. Use LibreOffice in linux instead. 600 | # May need to install LibreOffice if not already installed. 
601 | if "linux" in sys.platform.lower(): 602 | os.system( 603 | f'lowriter --headless --convert-to pdf --outdir {temp_dir} "{input_path}"' 604 | ) 605 | else: 606 | convert(input_path, temp_path) 607 | 608 | # Return the path of the converted PDF 609 | return temp_path 610 | 611 | 612 | def get_uri_rect(path): 613 | with open(path, "rb") as fp: 614 | byte_str = str(fp.read()) 615 | pattern = r"\((https?://[^\s)]+)\)" 616 | uris = re.findall(pattern, byte_str) 617 | rect_splits = byte_str.split("/Rect [")[1:] 618 | rects = [ 619 | list(map(float, rect_split.split("]")[0].split())) for rect_split in rect_splits 620 | ] 621 | return {uri: rect for uri, rect in zip(uris, rects)} 622 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "lexoid" 3 | version = "0.1.14" 4 | description = "" 5 | authors = [] 6 | readme = "README.md" 7 | 8 | [tool.poetry.dependencies] 9 | python = "^3.10" 10 | google-generativeai = "^0.8.1" 11 | openai = "^1.47.0" 12 | pikepdf = "^9.3.0" 13 | pdfplumber = "^0.11.4" 14 | pandas = "^2.2.3" 15 | tabulate = "^0.9.0" 16 | bs4 = "^0.0.2" 17 | markdownify = "^0.13.1" 18 | opencv-python = "^4.10.0.84" 19 | pypdfium2 = "^4.30.0" 20 | markdown = "^3.7" 21 | python-dotenv = "^1.0.0" 22 | loguru = "^0.7.2" 23 | playwright = "^1.49.0" 24 | docx2pdf = "^0.1.8" 25 | python-docx = "^1.1.2" 26 | nest-asyncio ="^1.6.0" 27 | pyqt5 = {version = "^5.15.11", markers = "platform_system != 'debian'"} 28 | pyqtwebengine = {version = "^5.15.7", markers = "platform_system != 'debian'"} 29 | huggingface-hub = "^0.27.0" 30 | together = "^1.4.0" 31 | openpyxl = "^3.1.5" 32 | pptx2md = "^2.0.6" 33 | 34 | [tool.poetry.group.dev.dependencies] 35 | ipykernel = "^6.29.5" 36 | pytest-asyncio = "^0.23.8" 37 | pytest = "^8.3.2" 38 | 39 | 40 | [tool.poetry.group.docs.dependencies] 41 | sphinx = "^8.1.3" 42 | pydata-sphinx-theme = "^0.16.1" 43 | docutils = "^0.21.2" 44 | 45 | [build-system] 46 | requires = ["poetry-core", "wheel"] 47 | build-backend = "poetry.core.masonry.api" 48 | -------------------------------------------------------------------------------- /tests/api_cost_mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "gemini-2.5-flash-preview-04-17": { 3 | "input": 0.15, 4 | "output": 0.6 5 | }, 6 | "gemini-2.5-pro-preview-03-25": { 7 | "input": 1.25, 8 | "output": 10 9 | }, 10 | "gemini-2.0-flash": { 11 | "input": 0.1, 12 | "output": 0.4 13 | }, 14 | "gemini-2.0-pro-exp": {}, 15 | "gemini-2.0-flash-thinking-exp": {}, 16 | "gemini-2.0-flash-001": { 17 | "input": 0.1, 18 | "output": 0.4 19 | }, 20 | "gemini-1.5-flash-8b": { 21 | "input": 0.0375, 22 | "output": 0.15 23 | }, 24 | "gemini-1.5-flash": { 25 | "input": 0.075, 26 | "output": 0.3 27 | }, 28 | "gemini-1.5-pro": { 29 | "input": 1.25, 30 | "output": 5 31 | }, 32 | "gpt-4o": { 33 | "input": 2.5, 34 | "output": 10 35 | }, 36 | "gpt-4o-mini": { 37 | "input": 0.15, 38 | "output": 0.6 39 | }, 40 | "meta-llama/Llama-3.2-11B-Vision-Instruct": { 41 | "input": 0, 42 | "output": 0 43 | }, 44 | "meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo": { 45 | "input": 0.18, 46 | "output": 0.18 47 | }, 48 | "meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo": { 49 | "input": 1.2, 50 | "output": 1.2 51 | }, 52 | "meta-llama/Llama-Vision-Free": { 53 | "input": 0, 54 | "output": 0 55 | }, 56 | "google/gemma-3-27b-it": { 57 | "input": 0.1, 58 | "input-image": 
0.0000256, 59 | "output": 0.2 60 | }, 61 | "qwen/qwen-2.5-vl-7b-instruct": { 62 | "input": 0.2, 63 | "input-image": 0.0001445, 64 | "output": 0.2 65 | }, 66 | "microsoft/phi-4-multimodal-instruct": { 67 | "input": 0.05, 68 | "input-image": 0.0001769, 69 | "output": 0.1 70 | }, 71 | "accounts/fireworks/models/llama4-maverick-instruct-basic": { 72 | "input": 0.22, 73 | "output": 0.88 74 | }, 75 | "accounts/fireworks/models/llama4-scout-instruct-basic": { 76 | "input": 0.15, 77 | "output": 0.6 78 | } 79 | } -------------------------------------------------------------------------------- /tests/benchmark.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from dataclasses import dataclass 4 | from glob import glob 5 | from pathlib import Path 6 | from statistics import mean, stdev 7 | from typing import Dict, List, Optional, Tuple 8 | 9 | import pandas as pd 10 | from dotenv import load_dotenv 11 | 12 | from lexoid.api import parse 13 | from lexoid.core.utils import calculate_similarity 14 | 15 | load_dotenv() 16 | 17 | 18 | @dataclass 19 | class BenchmarkResult: 20 | config: Dict 21 | similarity: List[float] # Store all similarity scores for iterations 22 | execution_time: List[float] # Store all execution times for iterations 23 | cost: Optional[List[float]] = None 24 | error: Optional[str] = None 25 | 26 | 27 | def get_input_output_pairs(input_path: str, output_dir: str) -> List[Tuple[str, str]]: 28 | """Get matching input and ground truth file pairs.""" 29 | if os.path.isfile(input_path): 30 | # Single file mode 31 | base_name = Path(input_path).stem 32 | ground_truth_path = os.path.join(output_dir, f"{base_name}.md") 33 | if os.path.exists(ground_truth_path): 34 | return [(input_path, ground_truth_path)] 35 | return [] 36 | 37 | # Directory mode 38 | input_files = sorted(glob(os.path.join(input_path, "*"))) 39 | pairs = [] 40 | 41 | for input_file in input_files: 42 | base_name = Path(input_file).stem 43 | ground_truth_path = os.path.join(output_dir, f"{base_name}.md") 44 | 45 | if os.path.exists(ground_truth_path): 46 | pairs.append((input_file, ground_truth_path)) 47 | 48 | return pairs 49 | 50 | 51 | def run_benchmark_config( 52 | input_path: str, 53 | ground_truth: str, 54 | config: Dict, 55 | output_save_dir: str = None, 56 | iterations: int = 1, 57 | ) -> BenchmarkResult: 58 | """Run a single benchmark configuration for a specified number of iterations.""" 59 | similarities = [] 60 | execution_times = [] 61 | costs = [] 62 | error = None 63 | 64 | for _ in range(iterations): 65 | try: 66 | start_time = time.time() 67 | config["parser_type"] = config.get( 68 | "parser_type", 69 | ( 70 | "LLM_PARSE" 71 | if "model" in config 72 | else ("STATIC_PARSE" if "framework" in config else "AUTO") 73 | ), 74 | ) 75 | result = parse( 76 | input_path, 77 | pages_per_split=1, 78 | api_cost_mapping="tests/api_cost_mapping.json", 79 | **config, 80 | ) 81 | execution_time = time.time() - start_time 82 | 83 | if output_save_dir: 84 | filename = ( 85 | f"{Path(input_path).stem}_" 86 | + ", ".join( 87 | [ 88 | f"{key}={str(value).replace('/', '_')}" 89 | for key, value in config.items() 90 | ] 91 | ) 92 | + f"{int(start_time)}.md" 93 | ) 94 | with open(os.path.join(output_save_dir, filename), "w") as fp: 95 | fp.write(result["raw"]) 96 | 97 | similarity = calculate_similarity(result["raw"], ground_truth) 98 | similarities.append(similarity) 99 | execution_times.append(execution_time) 100 | costs.append( 101 | 
result["token_cost"]["output"] if "token_cost" in result else 0.0 102 | ) 103 | except Exception as e: 104 | print(f"Error running benchmark for config: {config}\n{e}") 105 | error = str(e) 106 | break # Stop further iterations if an error occurs 107 | 108 | return BenchmarkResult( 109 | config=config, 110 | similarity=similarities, 111 | execution_time=execution_times, 112 | cost=costs, 113 | error=error, 114 | ) 115 | 116 | 117 | def aggregate_results(results: List[BenchmarkResult]) -> BenchmarkResult: 118 | """Aggregate multiple benchmark results into a single result.""" 119 | if not results: 120 | return None 121 | 122 | valid_results = [r for r in results if r.error is None] 123 | if valid_results: 124 | all_similarities = [s for r in valid_results for s in r.similarity] 125 | all_execution_times = [t for r in valid_results for t in r.execution_time] 126 | all_costs = [c for r in valid_results for c in r.cost] 127 | avg_similarity = mean(all_similarities) 128 | std_similarity = stdev(all_similarities) if len(all_similarities) > 1 else 0.0 129 | avg_execution_time = mean(all_execution_times) 130 | avg_cost = mean(all_costs) 131 | error = ( 132 | None 133 | if len(valid_results) == len(results) 134 | else f"Failed: {len(results) - len(valid_results)}/{len(results)}" 135 | ) 136 | else: 137 | avg_similarity = 0.0 138 | std_similarity = 0.0 139 | avg_execution_time = 0.0 140 | avg_cost = 0.0 141 | error = f"Failed: {len(results)}/{len(results)}" 142 | 143 | return BenchmarkResult( 144 | config=results[0].config, 145 | similarity=[avg_similarity, std_similarity], # Store mean and std dev 146 | execution_time=[avg_execution_time], 147 | cost=[avg_cost], 148 | error=error, 149 | ) 150 | 151 | 152 | def generate_test_configs(input_path: str, test_attributes: List[str]) -> List[Dict]: 153 | """ 154 | Generate different configuration combinations to test based on specified attributes. 
155 | """ 156 | config_options = { 157 | "parser_type": ["LLM_PARSE", "STATIC_PARSE", "AUTO"], 158 | "model": [ 159 | # # Google models 160 | "gemini-2.5-flash-preview-04-17", 161 | # "gemini-2.5-pro-preview-03-25", 162 | # "gemini-2.0-pro-exp", 163 | "gemini-2.0-flash", 164 | # "gemini-2.0-flash-thinking-exp", 165 | # "gemini-2.0-flash-001", 166 | # "gemini-1.5-flash-8b", 167 | # "gemini-1.5-flash", 168 | # "gemini-1.5-pro", 169 | # # OpenAI models 170 | "gpt-4o", 171 | "gpt-4o-mini", 172 | # # Meta-LLAMA models through HF Hub 173 | # "meta-llama/Llama-3.2-11B-Vision-Instruct", 174 | # # Meta-LLAMA models through Together AI 175 | # "meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo", 176 | "meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo", 177 | # "meta-llama/Llama-Vision-Free", 178 | # # Model through OpenRouter 179 | "google/gemma-3-27b-it", 180 | "qwen/qwen-2.5-vl-7b-instruct", 181 | # "microsoft/phi-4-multimodal-instruct", 182 | # # Model through fireworks 183 | "accounts/fireworks/models/llama4-maverick-instruct-basic", 184 | # "accounts/fireworks/models/llama4-scout-instruct-basic", 185 | ], 186 | "framework": ["pdfminer", "pdfplumber"], 187 | "pages_per_split": [1, 2, 4, 8], 188 | "max_threads": [1, 2, 4, 8], 189 | "as_pdf": [True, False], 190 | "temperature": [0.2, 0.7], 191 | } 192 | 193 | # Only test as_pdf if input is not a PDF 194 | is_pdf = input_path.lower().endswith(".pdf") 195 | if is_pdf and "as_pdf" in test_attributes: 196 | test_attributes.remove("as_pdf") 197 | 198 | configs = [{}] 199 | 200 | for attr in test_attributes: 201 | new_configs = [] 202 | for config in configs: 203 | if attr == "parser_type" or attr == "temperature": 204 | for value in config_options[attr]: 205 | new_config = config.copy() 206 | new_config[attr] = value 207 | new_configs.append(new_config) 208 | elif attr == "model" and ( 209 | "parser_type" not in config or config.get("parser_type") == "LLM_PARSE" 210 | ): 211 | for value in config_options[attr]: 212 | new_config = config.copy() 213 | new_config[attr] = value 214 | new_configs.append(new_config) 215 | elif attr == "framework" and ( 216 | "parser_type" not in config 217 | or config.get("parser_type") == "STATIC_PARSE" 218 | ): 219 | for value in config_options[attr]: 220 | new_config = config.copy() 221 | new_config[attr] = value 222 | new_configs.append(new_config) 223 | elif attr in ("pages_per_split", "max_threads"): 224 | for value in config_options[attr]: 225 | new_config = config.copy() 226 | new_config[attr] = value 227 | new_configs.append(new_config) 228 | elif attr == "as_pdf" and not is_pdf: 229 | for value in config_options[attr]: 230 | new_config = config.copy() 231 | new_config[attr] = value 232 | new_configs.append(new_config) 233 | else: 234 | new_configs.append(config) 235 | configs = new_configs 236 | 237 | return configs 238 | 239 | 240 | def format_results(results: List[BenchmarkResult], test_attributes: List[str]) -> str: 241 | """Format benchmark results as a markdown table, including only tested attributes.""" 242 | sorted_results = sorted(results, key=lambda x: x.similarity[0], reverse=True) 243 | 244 | # Dynamically generate table headers based on test_attributes 245 | headers = ["Rank"] 246 | for attr in test_attributes: 247 | headers.append(attr.replace("_", " ").title()) 248 | headers.extend(["Mean Similarity", "Std. 
Dev.", "Time (s)", "Cost ($)", "Error"]) 249 | 250 | md_lines = [ 251 | "# Parser Benchmark Results\n", 252 | "| " + " | ".join(headers) + " |", 253 | "|" + "|".join(["---"] * len(headers)) + "|", 254 | ] 255 | 256 | for i, result in enumerate(sorted_results, 1): 257 | config = result.config 258 | error_msg = result.error if result.error else "-" 259 | 260 | row = [str(i)] 261 | for attr in test_attributes: 262 | row.append(str(config.get(attr, "-"))) 263 | row.extend( 264 | [ 265 | f"{result.similarity[0]:.3f}", 266 | f"{result.similarity[1]:.3f}", 267 | f"{result.execution_time[0]:.2f}", 268 | f"{result.cost[0]}", 269 | error_msg, 270 | ] 271 | ) 272 | md_lines.append("| " + " | ".join(row) + " |") 273 | 274 | return "\n".join(md_lines) 275 | 276 | 277 | def run_benchmarks( 278 | input_path: str, 279 | output_dir: str, 280 | test_attributes: List[str], 281 | benchmark_output_dir: str, 282 | iterations: int = 3, 283 | ) -> List[BenchmarkResult]: 284 | """Run all benchmarks for given input(s) and return results.""" 285 | # Get input/output file pairs 286 | file_pairs = get_input_output_pairs(input_path, output_dir) 287 | if not file_pairs: 288 | print("No matching input/output file pairs found!") 289 | return [] 290 | 291 | # Generate test configurations based on first input file 292 | configs = generate_test_configs(file_pairs[0][0], test_attributes) 293 | 294 | # Run benchmarks 295 | results = [] 296 | total_configs = len(configs) 297 | total_files = len(file_pairs) 298 | 299 | print( 300 | f"Running {total_configs} configurations across {total_files} file(s) for {iterations} iterations..." 301 | ) 302 | 303 | all_results = [] 304 | for i, config in enumerate(configs, 1): 305 | print(f"Progress: {i}/{total_configs} - Testing config: {config}") 306 | 307 | # Run benchmark for each file 308 | file_results = [] 309 | for input_file, ground_truth_path in file_pairs: 310 | print(f"Running benchmark for file: {input_file}") 311 | with open(ground_truth_path, "r", encoding="utf-8") as f: 312 | ground_truth = f.read() 313 | result = run_benchmark_config( 314 | input_file, ground_truth, config, benchmark_output_dir, iterations 315 | ) 316 | file_results.append(result) 317 | all_results.append((input_file, result)) 318 | 319 | result = aggregate_results(file_results) 320 | 321 | results.append(result) 322 | 323 | # Format and save results 324 | save_format = "csv" 325 | if save_format == "markdown": 326 | markdown_report = format_results(results, test_attributes) 327 | result_path = os.path.join(benchmark_output_dir, "results.md") 328 | with open(result_path, "w", encoding="utf-8") as f: 329 | f.write(markdown_report) 330 | elif save_format == "csv": 331 | df = pd.DataFrame( 332 | [ 333 | { 334 | "Model": result.config.get("model", "-"), 335 | "Mean Similarity": result.similarity[0], 336 | "Std. Dev.": result.similarity[1], 337 | "Time (s)": result.execution_time[0], 338 | "Cost($)": result.cost[0], 339 | } 340 | for result in results 341 | ] 342 | ) 343 | result_path = os.path.join(benchmark_output_dir, "results.csv") 344 | df.to_csv(result_path, index=False) 345 | 346 | print(f"\nBenchmark complete! 
Results saved to {result_path}") 347 | 348 | # Save document-wise results to CSV 349 | doc_results = [] 350 | for input_file, result in all_results: 351 | doc_result = { 352 | "Input File": os.path.basename(input_file), 353 | "Mean Similarity": result.similarity[0], 354 | "Time (s)": result.execution_time[0], 355 | "Cost($)": result.cost[0], 356 | } 357 | for key, value in result.config.items(): 358 | doc_result[key] = value 359 | doc_results.append(doc_result) 360 | doc_df = pd.DataFrame(doc_results) 361 | doc_result_path = os.path.join(benchmark_output_dir, "document_results.csv") 362 | doc_df.to_csv(doc_result_path, index=False) 363 | print(f"Document-wise results saved to {doc_result_path}") 364 | 365 | return results 366 | 367 | 368 | def main(): 369 | # Can be either a single file or directory 370 | input_path = "examples/inputs" 371 | output_dir = "examples/outputs" 372 | 373 | benchmark_output_dir = f"tests/outputs/benchmark_{int(time.time())}/" 374 | os.makedirs(benchmark_output_dir, exist_ok=True) 375 | 376 | # Specify which attributes to test 377 | test_attributes = [ 378 | # "parser_type", 379 | "model", 380 | # "framework", 381 | # "pages_per_split", 382 | # "max_threads", 383 | # "as_pdf", 384 | # "temperature", 385 | ] 386 | 387 | # Number of iterations for each benchmark 388 | iterations = 5 389 | 390 | results = run_benchmarks( 391 | input_path, output_dir, test_attributes, benchmark_output_dir, iterations 392 | ) 393 | 394 | # Print top 3 configurations 395 | top_results = sorted(results, key=lambda x: x.similarity[0], reverse=True)[:3] 396 | print("\nTop 3 Configurations:") 397 | for i, result in enumerate(top_results, 1): 398 | print( 399 | f"{i}. Similarity: {result.similarity[0]:.3f} (±{result.similarity[1]:.3f}), Time: {result.execution_time[0]:.2f}s" 400 | ) 401 | print(f" Config: {result.config}") 402 | 403 | 404 | if __name__ == "__main__": 405 | main() 406 | -------------------------------------------------------------------------------- /tests/env_template: -------------------------------------------------------------------------------- 1 | GOOGLE_API_KEY = "" 2 | OPENAI_API_KEY = "" 3 | HUGGINGFACEHUB_API_TOKEN = "" 4 | TOGETHER_API_KEY = "" -------------------------------------------------------------------------------- /tests/test_parser.py: -------------------------------------------------------------------------------- 1 | # python3 -m pytest tests/test_parser.py -v 2 | # With logs: python3 -m pytest tests/test_parser.py -v -s 3 | 4 | import os 5 | 6 | import pytest 7 | from dotenv import load_dotenv 8 | from loguru import logger 9 | 10 | from lexoid.api import parse 11 | from lexoid.core.utils import calculate_similarity 12 | 13 | load_dotenv() 14 | output_dir = "tests/outputs" 15 | os.makedirs(output_dir, exist_ok=True) 16 | models = [ 17 | # Google models 18 | "gemini-2.0-pro-exp", 19 | "gemini-2.0-flash", 20 | "gemini-1.5-flash", 21 | "gemini-1.5-flash-8b", 22 | "gemini-1.5-pro", 23 | # OpenAI models 24 | "gpt-4o", 25 | "gpt-4o-mini", 26 | # Meta-LLAMA models through HF Hub 27 | "meta-llama/Llama-3.2-11B-Vision-Instruct", 28 | # Meta-LLAMA models through Together AI 29 | "meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo", 30 | "meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo", 31 | "meta-llama/Llama-Vision-Free", 32 | ] 33 | 34 | 35 | @pytest.mark.asyncio 36 | @pytest.mark.parametrize("model", models) 37 | async def test_llm_parse(model): 38 | input_data = "examples/inputs/test_1.pdf" 39 | expected_ouput_path = "examples/outputs/test_1.md" 40 | config = 
{"parser_type": "LLM_PARSE", "model": model, "verbose": True} 41 | result = parse(input_data, **config)["raw"] 42 | assert isinstance(result, str) 43 | 44 | # Compare the result with the expected output 45 | expected_ouput = open(expected_ouput_path, "r").read() 46 | # save the result to a file 47 | with open(f"{output_dir}/input_table_{model.replace('/', '_')}.md", "w") as f: 48 | f.write(result) 49 | score = calculate_similarity(result, expected_ouput) 50 | assert round(score, 3) > 0.75 51 | 52 | 53 | @pytest.mark.asyncio 54 | @pytest.mark.parametrize("model", models) 55 | async def test_jpg_parse(model): 56 | input_data = "examples/inputs/test_4.jpg" 57 | expected_ouput_path = "examples/outputs/test_4.md" 58 | config = {"parser_type": "LLM_PARSE", "model": model} 59 | result = parse(input_data, **config)["raw"] 60 | assert isinstance(result, str) 61 | 62 | # Compare the result with the expected output 63 | expected_ouput = open(expected_ouput_path, "r").read() 64 | # save the result to a file 65 | m_name = model.replace("/", "_") 66 | with open(f"{output_dir}/input_image_{m_name}.md", "w") as f: 67 | f.write(result) 68 | score = calculate_similarity(result, expected_ouput) 69 | assert round(score, 3) > 0.8 70 | 71 | 72 | @pytest.mark.asyncio 73 | @pytest.mark.parametrize( 74 | "sample", 75 | [ 76 | "examples/inputs/test_explicit_hyperlink_n_img.pdf", 77 | "examples/inputs/test_hidden_link_with_image.pdf", # currently fails 78 | "examples/inputs/test_with_hidden_links_no_img.pdf", 79 | ], 80 | ) 81 | async def test_url_detection_auto_routing(sample): 82 | patterns = ["http", "https", "www"] 83 | model_type = "gemini-1.5-pro" 84 | config = {"parser_type": "AUTO", "model": model_type, "verbose": True} 85 | result = parse(sample, **config)["raw"] 86 | assert isinstance(result, str) 87 | found = [True if p in result else False for p in patterns] 88 | assert any(found) 89 | 90 | 91 | @pytest.mark.asyncio 92 | @pytest.mark.parametrize( 93 | "sample", 94 | [ 95 | "examples/inputs/test_explicit_hyperlink_n_img.pdf", 96 | "examples/inputs/test_hidden_link_with_image.pdf", 97 | "examples/inputs/test_with_hidden_links_no_img.pdf", 98 | ], 99 | ) 100 | async def test_url_detection_pdfplumber(sample): 101 | patterns = ["http", "https", "www"] 102 | framework = "pdfplumber" 103 | config = {"parser_type": "STATIC_PARSE", "framework": framework} 104 | result = parse(sample, **config)["raw"] 105 | assert isinstance(result, str) 106 | found = [True if p in result else False for p in patterns] 107 | assert any(found) 108 | 109 | 110 | @pytest.mark.parametrize("model", models) 111 | @pytest.mark.asyncio 112 | async def test_url_detection_multi_page_auto_routing(model): 113 | sample = "examples/inputs/sample_test_doc.pdf" 114 | patterns = ["http", "https", "www"] 115 | config = {"parser_type": "AUTO", "model": model, "verbose": True} 116 | results = parse(sample, pages_per_split=1, **config)["segments"] 117 | 118 | assert len(results) == 6 119 | for res in results: 120 | content = res["content"] 121 | if res["metadata"]["page"] == 1: 122 | # Page 1: Fails to detect the URL 123 | found = [p in content for p in patterns] 124 | assert not any(found) 125 | elif res["metadata"]["page"] == 2: 126 | # Page 2: Detects the URL 127 | found = [p in content for p in patterns] 128 | assert any(found) 129 | elif res["metadata"]["page"] == 3: 130 | # Page 3: Does not contain any URL 131 | found = [p in content for p in patterns] 132 | assert not any(found) 133 | elif res["metadata"]["page"] == 4: 134 | # Page 4: Detects 
the URL 135 | found = [p in content for p in patterns] 136 | assert any(found) 137 | elif res["metadata"]["page"] == 5: 138 | # Page 5: Detects all the URLs 139 | found = [p in content for p in patterns] 140 | assert all(found) 141 | elif res["metadata"]["page"] == 6: 142 | # Page 6: Detects the URL 143 | found = "https://github" in content 144 | assert found 145 | 146 | 147 | @pytest.mark.asyncio 148 | @pytest.mark.parametrize("depth", [1, 2]) 149 | async def test_recursive_url_parsing(depth): 150 | results = parse("https://example.com/", depth=depth)["segments"] 151 | 152 | # Not necessarily always the case. Just the case for "example.com". 153 | assert len(results) == depth 154 | 155 | 156 | @pytest.mark.asyncio 157 | async def test_recursive_url_parsing_in_pdf(): 158 | sample = "examples/inputs/sample_test_doc.pdf" 159 | parser_type = "AUTO" 160 | results = parse(sample, parser_type, pages_per_split=1, depth=2) 161 | assert len(results["recursive_docs"]) >= 7, results 162 | 163 | 164 | @pytest.mark.asyncio 165 | async def test_parsing_txt_type(): 166 | sample = "examples/inputs/sample_test.txt" 167 | parser_type = "AUTO" 168 | results = parse(sample, parser_type)["segments"] 169 | assert len(results) == 1 170 | assert results[0]["content"] is not None 171 | 172 | 173 | @pytest.mark.asyncio 174 | async def test_parsing_docx_type(): 175 | sample = "examples/inputs/sample.docx" 176 | parser_type = "STATIC_PARSE" 177 | results = parse(sample, parser_type)["segments"] 178 | assert len(results) >= 1 179 | assert results[0]["content"] is not None 180 | 181 | parser_type = "LLM_PARSE" 182 | results = parse(sample, parser_type)["segments"] 183 | assert len(results) > 1 184 | assert results[0]["content"] is not None 185 | 186 | 187 | @pytest.mark.asyncio 188 | async def test_parsing_xlsx_type(): 189 | sample = "examples/inputs/sample.xlsx" 190 | parser_type = "STATIC_PARSE" 191 | results = parse(sample, parser_type)["segments"] 192 | assert len(results) >= 1 193 | assert results[0]["content"] is not None 194 | 195 | 196 | @pytest.mark.asyncio 197 | async def test_parsing_pptx_type(): 198 | sample = "examples/inputs/sample.pptx" 199 | parser_type = "STATIC_PARSE" 200 | results = parse(sample, parser_type)["segments"] 201 | assert len(results) >= 1 202 | assert results[0]["content"] is not None 203 | 204 | 205 | @pytest.mark.asyncio 206 | async def test_dynamic_js_parsing(): 207 | test_url = "https://go.contentsquare.com/ab-testing-playbook" 208 | results = parse(test_url, parser_type="AUTO")["raw"] 209 | # Check if the content contains the expected information 210 | should_contain_info = "6 Types of experimentation" 211 | assert should_contain_info.lower() in results.strip().lower() 212 | 213 | 214 | @pytest.mark.asyncio 215 | async def test_pdfplumber_table_parsing(): 216 | sample = "examples/inputs/test_1.pdf" 217 | parser_type = "STATIC_PARSE" 218 | results = parse(sample, parser_type, framework="pdfplumber")["raw"] 219 | assert [token in results for token in ["|", "Results", "Accuracy"]] 220 | 221 | 222 | @pytest.mark.asyncio 223 | @pytest.mark.parametrize( 224 | "sample", 225 | [ 226 | ("examples/inputs/stress_test/large_doc_1.pdf", 527), 227 | ("examples/inputs/stress_test/large_doc_2.pdf", 117), 228 | ], 229 | ) 230 | async def test_large_pdf_parsing(sample): 231 | parser_type = "AUTO" 232 | file_name = sample[0] 233 | n_pages = sample[1] 234 | results = parse(file_name, parser_type, pages_per_split=1)["segments"] 235 | assert len(results) == n_pages 236 | assert results[0]["content"] is 
not None 237 | 238 | 239 | token_usage_models = [ 240 | # Google models 241 | "gemini-2.0-flash-001", 242 | # OpenAI models 243 | "gpt-4o", 244 | # Meta-LLAMA models through HF Hub 245 | "meta-llama/Llama-3.2-11B-Vision-Instruct", 246 | # Meta-LLAMA models through Together AI 247 | "meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo", 248 | ] 249 | 250 | 251 | @pytest.mark.parametrize("model", token_usage_models) 252 | @pytest.mark.asyncio 253 | async def test_token_usage_api(model): 254 | sample = "examples/inputs/test_1.pdf" 255 | parser_type = "LLM_PARSE" 256 | config = {"parser_type": parser_type, "model": model} 257 | token_usage = parse(sample, **config)["token_usage"] 258 | logger.info(f"Token usage: {token_usage}") 259 | assert token_usage["input"] > 0 260 | assert token_usage["output"] > 0 261 | assert token_usage["total"] > 0 262 | 263 | 264 | @pytest.mark.asyncio 265 | async def test_pdf_save_path(): 266 | sample = "https://example.com/" 267 | parser_type = "LLM_PARSE" 268 | result = parse( 269 | sample, 270 | parser_type, 271 | as_pdf=True, 272 | save_dir="tests/outputs/temp", 273 | save_filename="test_output.pdf", 274 | ) 275 | assert "pdf_path" in result 276 | assert result["pdf_path"].endswith(".pdf") 277 | assert os.path.exists(result["pdf_path"]) 278 | 279 | # Clean up 280 | os.remove(result["pdf_path"]) 281 | os.rmdir("tests/outputs/temp") 282 | 283 | 284 | @pytest.mark.asyncio 285 | async def test_page_nums(): 286 | sample = "examples/inputs/sample_test_doc.pdf" 287 | result = parse(sample, "LLM_PARSE", page_nums=(3, 4), pages_per_split=1) 288 | assert len(result["segments"]) == 2 289 | assert all(keyword in result["raw"] for keyword in ["Table 24", "apple"]) 290 | assert all(keyword not in result["raw"] for keyword in ["Aenean", "Lexoid"]) 291 | 292 | result = parse(sample, "LLM_PARSE", page_nums=(3, 3), pages_per_split=1) 293 | assert len(result["segments"]) == 1 294 | assert "Table 24" in result["raw"] 295 | 296 | sample = "https://www.dca.ca.gov/acp/pdf_files/lemonlaw_qa.pdf" 297 | result = parse(sample, "STATIC_PARSE", page_nums=2, pages_per_split=1) 298 | assert len(result["segments"]) == 1 299 | assert "ATTEMPTS" in result["raw"] 300 | assert "acp@dca.ca.gov" not in result["raw"] 301 | 302 | 303 | @pytest.mark.parametrize( 304 | "model", 305 | [ 306 | "gemini-2.0-flash", 307 | "gpt-4o", 308 | "meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo", 309 | ], 310 | ) 311 | @pytest.mark.asyncio 312 | async def test_token_cost(model): 313 | sample = "examples/inputs/test_1.pdf" 314 | parser_type = "LLM_PARSE" 315 | api_cost_path = os.path.join(os.path.dirname(__file__), "api_cost_mapping.json") 316 | config = { 317 | "parser_type": parser_type, 318 | "model": model, 319 | "api_cost_mapping": api_cost_path, 320 | } 321 | result = parse(sample, **config) 322 | assert "token_cost" in result 323 | assert result["token_cost"]["input"] > 0 324 | assert result["token_cost"]["output"] > 0 325 | assert result["token_cost"]["total"] > 0 326 | 327 | 328 | @pytest.mark.asyncio 329 | async def test_blockquote(): 330 | sample = "examples/inputs/bench_md.pdf" 331 | parser_type = "STATIC_PARSE" 332 | results = parse(sample, parser_type, framework="pdfplumber")["raw"] 333 | # Assert that there is at least one fenced code block 334 | assert " " * 3 in results 335 | 336 | 337 | @pytest.mark.asyncio 338 | async def test_monospace_code_block(): 339 | sample = "examples/inputs/bench_md.pdf" 340 | parser_type = "STATIC_PARSE" 341 | results = parse(sample, parser_type, framework="pdfplumber")["raw"] 
342 | # Assert that there is at least one fenced code block 343 | assert "```" in results 344 | 345 | 346 | @pytest.mark.asyncio 347 | async def test_pdf_headings(): 348 | sample_path = "examples/inputs/bench_md.pdf" 349 | parser_type = "STATIC_PARSE" 350 | results = parse(sample_path, parser_type, framework="pdfplumber")["raw"] 351 | 352 | # Test for h1 (should have # in markdown) 353 | assert "#" in results 354 | assert "##" in results 355 | 356 | 357 | @pytest.mark.asyncio 358 | async def test_email_address(): 359 | sample = "examples/inputs/bench_md.pdf" 360 | parser_type = "STATIC_PARSE" 361 | results = parse(sample, parser_type, framework="pdfplumber")["raw"] 362 | assert "" in results 363 | 364 | 365 | @pytest.mark.asyncio 366 | async def test_horizontal_lines(): 367 | sample = "examples/inputs/bench_md.pdf" 368 | parser_type = "STATIC_PARSE" 369 | results = parse(sample, parser_type, framework="pdfplumber")["raw"] 370 | assert "\n---\n" in results, "Markdown horizontal rule not found" 371 | 372 | 373 | @pytest.mark.asyncio 374 | async def test_strikethrough_words(): 375 | sample = "examples/inputs/bench_md.pdf" 376 | parser_type = "STATIC_PARSE" 377 | results = parse(sample, parser_type, framework="pdfplumber")["raw"] 378 | assert "~~" in results, "Markdown strikethrough text not found" 379 | --------------------------------------------------------------------------------