├── .env_example ├── .github └── workflows │ └── deploy_docs.yml ├── .gitignore ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── LICENSE ├── Makefile ├── README.md ├── docs ├── .nojekyll ├── Makefile ├── api.rst ├── benchmark.csv ├── benchmark.rst ├── conf.py ├── contributing.rst ├── index.rst ├── installation.rst ├── make.bat ├── requirements.txt └── update_benchmarks.py ├── examples ├── example_notebook.ipynb ├── example_notebook_colab.ipynb ├── inputs │ ├── bench_md.pdf │ ├── benchmark.pdf │ ├── costco_bill.jpg │ ├── cvs_coupon.jpg │ ├── grocery_bill.jpg │ ├── medical_invoice_sample1.png │ ├── medical_travel_request_OWCP_957.png │ ├── sample.docx │ ├── sample.pptx │ ├── sample.xlsx │ ├── sample_test.txt │ ├── sample_test_doc.pdf │ ├── screenshot-1.png │ ├── stress_test │ │ ├── large_doc_1.pdf │ │ └── large_doc_2.pdf │ ├── test_1.pdf │ ├── test_2.pdf │ ├── test_3.pdf │ ├── test_4.jpg │ ├── test_5.jpg │ ├── test_explicit_hyperlink_n_img.pdf │ ├── test_hidden_link_with_image.pdf │ └── test_with_hidden_links_no_img.pdf └── outputs │ ├── benchmark.md │ ├── costco_bill.md │ ├── cvs_coupon.md │ ├── grocery_bill.md │ ├── medical_invoice_sample1.md │ ├── medical_travel_request_OWCP_957.md │ ├── test_1.md │ ├── test_2.md │ ├── test_3.md │ ├── test_4.md │ └── test_5.md ├── lexoid ├── api.py └── core │ ├── parse_type │ ├── llm_parser.py │ └── static_parser.py │ ├── prompt_templates.py │ └── utils.py ├── poetry.lock ├── pyproject.toml └── tests ├── api_cost_mapping.json ├── benchmark.py ├── env_template └── test_parser.py /.env_example: -------------------------------------------------------------------------------- 1 | GOOGLE_API_KEY="" 2 | OPENAI_API_KEY="" 3 | HUGGINGFACEHUB_API_TOKEN="" 4 | TOGETHER_API_KEY="" -------------------------------------------------------------------------------- /.github/workflows/deploy_docs.yml: -------------------------------------------------------------------------------- 1 | name: Docs 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - '**docs**' 8 | paths: 9 | - 'docs/**' 10 | - '.github/workflows/deploy_docs.yml' 11 | 12 | jobs: 13 | build-docs: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - name: Checkout repository 17 | uses: actions/checkout@v4 18 | 19 | - name: Set up Python 20 | uses: actions/setup-python@v5 21 | with: 22 | python-version: "3.11" 23 | 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install sphinx docutils 28 | pip install -r docs/requirements.txt || true 29 | 30 | - name: Build Sphinx documentation 31 | run: | 32 | sphinx-build -b html docs/ docs/_build/html 33 | 34 | - name: Upload documentation artifact 35 | uses: actions/upload-pages-artifact@v3 36 | with: 37 | path: docs/_build/html 38 | 39 | deploy: 40 | needs: build-docs 41 | runs-on: ubuntu-latest 42 | permissions: 43 | pages: write 44 | id-token: write 45 | environment: 46 | name: github-pages 47 | url: ${{ steps.deployment.outputs.page_url }} 48 | steps: 49 | - name: Deploy to GitHub Pages 50 | id: deployment 51 | uses: actions/deploy-pages@v4 52 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 
23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. 
For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 162 | #.idea/ 163 | 164 | # Custom 165 | tests/outputs/ 166 | outputs/ 167 | inputs/ 168 | 169 | # Others 170 | .DS_Store -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Change Log 2 | 3 | ## [0.1.1] - 2024-10-28 4 | 5 | ### Added 6 | - Support for URL parsing 7 | 8 | ### Changed 9 | 10 | ### Fixed 11 | 12 | ## [0.1.2] - 2024-11-04 13 | 14 | ### Added 15 | - Initial testing code 16 | - Benchmarking code 17 | 18 | ### Changed 19 | - Improvements in OpenAI prompt 20 | - Conversion of PDFs to images before parsing with OpenAI models 21 | 22 | ### Fixed 23 | 24 | 25 | ## [0.1.3] - 2024-11-12 26 | 27 | ### Added 28 | - `AUTO` parse mode 29 | 30 | ### Changed 31 | - Switch from multithreading to multiprocessing 32 | 33 | ### Fixed 34 | 35 | ## [0.1.4] - 2024-11-22 36 | 37 | ### Added 38 | - Support for structured parsing of HTML pages 39 | - Support for recursive URL parsing in websites and PDFs 40 | 41 | ### Changed 42 | - URL extraction regex 43 | 44 | ### Fixed 45 | - Bug in document appending logic 46 | - Bug caused by split PDFs being in the same directory as the source PDF 47 | 48 | ## [0.1.5] - 2024-12-06 49 | 50 | ### Added 51 | 52 | ### Changed 53 | - Improved pdfplumber parsing to format markdown and detect hyperlinks 54 | 55 | ### Fixed 56 | 57 | ## [0.1.6] - 2024-12-10 58 | 59 | ### Added 60 | * Support for parsing .csv, .txt, .html, and .docx files 61 | * Support for parsing links to documents during recursive HTML parsing 62 | 63 | ### Changed 64 | 65 | ### Fixed 66 | 67 | ## [0.1.7] - 2025-01-08 68 | 69 | ### Added 70 | * Colab example notebook 71 | * Support for bold and italic formatting in PDFPlumber 72 | * Support for Llama 3.2 models through HuggingFace and Together AI 73 | 74 | ### Changed 75 | * Improved PDFPlumber table parsing 76 | 77 | ### Fixed 78 | * PDFPlumber text detection bug 79 | 80 | ## [0.1.8] - 2025-01-23 81 | 82 | ### Added 83 | * Retry and error handling for LLM_PARSE 84 | 85 | ### Changed 86 | * Remove together Python client dependency and use REST API calls instead 87 | 88 | ## [0.1.8.post1] - 2025-01-28 89 | 90 | ### Added 91 | * Documentation 92 | 93 | ### Changed 94 | * Specify headers for Playwright web page retrieval 95 | 96 | ## [0.1.9] - 2025-02-17 97 | 98 | ### Added 99 | - Parameters to specify intermediate PDF save path when `as_pdf=True`. 100 | - Return `token_usage` and `pdf_path` with `parse()` output where applicable 101 | 102 | ### Changed 103 | - Switched back to together Python client 104 | - Improved `parse()` function return format to be a dictionary.
105 | 106 | 107 | ## [0.1.10] - 2025-02-23 108 | 109 | ### Added 110 | - Parameter to specify page numbers for parsing 111 | 112 | ### Fixed 113 | - Errors caused by empty token_usage 114 | 115 | ## [0.1.11] - 2025-02-27 116 | 117 | ### Added 118 | - Priority setting to AUTO routing 119 | - More models to benchmark 120 | 121 | ### Changed 122 | - Set default parse_type to AUTO 123 | - Set default LLM to Gemini 2.0 Flash 124 | - Updated benchmark script to aggregate over multiple runs 125 | 126 | ### Fixed 127 | - Incorrect title when `as_pdf=True` 128 | 129 | 130 | ## [0.1.11.post1] - 2025-03-05 131 | 132 | ### Added 133 | - Code of Conduct 134 | 135 | ### Fixed 136 | - Segmentation fault when PyQT app is reinitialized 137 | 138 | ## [0.1.12] - 2025-04-11 139 | 140 | ### Added 141 | * Support for OpenRouter models 142 | * Return token cost when cost mapping is provided 143 | * Support for custom prompts 144 | * Support for parsing Excel and PowerPoint files 145 | 146 | ### Changed 147 | * Set default `router_priority` to `speed` 148 | 149 | ## [0.1.13] - 2025-04-20 150 | 151 | ### Added 152 | * `STATIC_PARSE` improvements 153 | * Horizontal line detection 154 | * Strikethrough text detection 155 | * Email address formatting 156 | * Improved heading level detection 157 | * Monospace font detection 158 | * Indentation detection 159 | 160 | ## [0.1.14] - 2025-06-05 161 | 162 | ### Added 163 | * Add support for Fireworks API 164 | * Add support for matching data in document to pre-defined schema or template 165 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | At Oid Labs we are committed to enabling a safe, welcoming and collaborative environment for everyone. 2 | 3 | # Contributor Covenant Code of Conduct 4 | 5 | ## Our Pledge 6 | 7 | We as members, contributors, and leaders pledge to make participation in our 8 | community a harassment-free experience for everyone, regardless of age, body 9 | size, visible or invisible disability, ethnicity, sex characteristics, gender 10 | identity and expression, level of experience, education, socio-economic status, 11 | nationality, personal appearance, race, caste, color, religion, or sexual 12 | identity and orientation. 13 | 14 | We pledge to act and interact in ways that contribute to an open, welcoming, 15 | diverse, inclusive, and healthy community. 
16 | 17 | ## Our Standards 18 | 19 | Examples of behavior that contributes to a positive environment for our 20 | community include: 21 | 22 | - Demonstrating empathy and kindness toward other people 23 | - Being respectful of differing opinions, viewpoints, and experiences 24 | - Giving and gracefully accepting constructive feedback 25 | - Accepting responsibility and apologizing to those affected by our mistakes, 26 | and learning from the experience 27 | - Focusing on what is best not just for us as individuals, but for the overall 28 | community 29 | 30 | Examples of unacceptable behavior include: 31 | 32 | - The use of sexualized language or imagery, and sexual attention or advances of 33 | any kind 34 | - Trolling, insulting or derogatory comments, and personal or political attacks 35 | - Public or private harassment 36 | - Publishing others' private information, such as a physical or email address, 37 | without their explicit permission 38 | - Other conduct which could reasonably be considered inappropriate in a 39 | professional setting 40 | 41 | ## Enforcement Responsibilities 42 | 43 | Community leaders are responsible for clarifying and enforcing our standards of 44 | acceptable behavior and will take appropriate and fair corrective action in 45 | response to any behavior that they deem inappropriate, threatening, offensive, 46 | or harmful. 47 | 48 | Community leaders have the right and responsibility to remove, edit, or reject 49 | comments, commits, code, wiki edits, issues, and other contributions that are 50 | not aligned to this Code of Conduct, and will communicate reasons for moderation 51 | decisions when appropriate. 52 | 53 | ## Scope 54 | 55 | This Code of Conduct applies within all community spaces, and also applies when 56 | an individual is officially representing the community in public spaces. 57 | Examples of representing our community include using an official email address, 58 | posting via an official social media account, or acting as an appointed 59 | representative at an online or offline event. 60 | 61 | ## Enforcement 62 | 63 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 64 | reported to the community leaders responsible for enforcement at 65 | [INSERT CONTACT METHOD]. 66 | All complaints will be reviewed and investigated promptly and fairly. 67 | 68 | All community leaders are obligated to respect the privacy and security of the 69 | reporter of any incident. 70 | 71 | ## Enforcement Guidelines 72 | 73 | Community leaders will follow these Community Impact Guidelines in determining 74 | the consequences for any action they deem in violation of this Code of Conduct: 75 | 76 | ### 1. Correction 77 | 78 | **Community Impact**: Use of inappropriate language or other behavior deemed 79 | unprofessional or unwelcome in the community. 80 | 81 | **Consequence**: A private, written warning from community leaders, providing 82 | clarity around the nature of the violation and an explanation of why the 83 | behavior was inappropriate. A public apology may be requested. 84 | 85 | ### 2. Warning 86 | 87 | **Community Impact**: A violation through a single incident or series of 88 | actions. 89 | 90 | **Consequence**: A warning with consequences for continued behavior. No 91 | interaction with the people involved, including unsolicited interaction with 92 | those enforcing the Code of Conduct, for a specified period of time. This 93 | includes avoiding interactions in community spaces as well as external channels 94 | like social media. 
Violating these terms may lead to a temporary or permanent 95 | ban. 96 | 97 | ### 3. Temporary Ban 98 | 99 | **Community Impact**: A serious violation of community standards, including 100 | sustained inappropriate behavior. 101 | 102 | **Consequence**: A temporary ban from any sort of interaction or public 103 | communication with the community for a specified period of time. No public or 104 | private interaction with the people involved, including unsolicited interaction 105 | with those enforcing the Code of Conduct, is allowed during this period. 106 | Violating these terms may lead to a permanent ban. 107 | 108 | ### 4. Permanent Ban 109 | 110 | **Community Impact**: Demonstrating a pattern of violation of community 111 | standards, including sustained inappropriate behavior, harassment of an 112 | individual, or aggression toward or disparagement of classes of individuals. 113 | 114 | **Consequence**: A permanent ban from any sort of public interaction within the 115 | community. 116 | 117 | ## Attribution 118 | 119 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 120 | version 2.1, available at 121 | [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. 122 | 123 | Community Impact Guidelines were inspired by 124 | [Mozilla's code of conduct enforcement ladder][Mozilla CoC]. 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | [https://www.contributor-covenant.org/faq][FAQ]. Translations are available at 128 | [https://www.contributor-covenant.org/translations][translations]. 129 | 130 | [homepage]: https://www.contributor-covenant.org 131 | [v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html 132 | [Mozilla CoC]: https://github.com/mozilla/diversity 133 | [FAQ]: https://www.contributor-covenant.org/faq 134 | [translations]: https://www.contributor-covenant.org/translations 135 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: help setup install dev clean build 2 | 3 | help: 4 | @echo "make setup - Create the virtual environment and install Poetry" 5 | @echo "make install - Install runtime dependencies" 6 | @echo "make dev - Install development dependencies" 7 | @echo "make build - Build the distributable package" 8 | @echo "make clean - Remove the virtual environment and build artifacts" 9 | 10 | setup: 11 | python3 -m venv .venv 12 | .venv/bin/python3 -m pip install --upgrade pip 13 | .venv/bin/python3 -m pip install poetry 14 | .venv/bin/poetry update 15 | 16 | install: setup 17 | .venv/bin/poetry install --without dev 18 | .venv/bin/playwright install --with-deps --only-shell chromium 19 | 20 | dev: setup 21 | .venv/bin/poetry install --with dev 22 | .venv/bin/playwright install --with-deps --only-shell chromium 23 | 24 | clean: 25 | rm -rf .venv 26 | rm -rf lexoid.egg-info 27 | rm -rf dist 28 | 29 | build: 30 | .venv/bin/poetry update && .venv/bin/poetry build 31 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | ``` 4 | ___ _______ __ __ _______ ___ ______ 5 | | | | || |_| || || | | | 6 | | | | ___|| || _ || | | _ | 7 | | | | |___ | || | | || | | | | | 8 | | |___ | ___| | | | |_| || | | |_| | 9 | | || |___ | _ || || | | | 10 | |_______||_______||__| |__||_______||___| |______| 11 | 12 | ``` 13 | 14 |
15 | 16 | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/oidlabs-com/Lexoid/blob/main/examples/example_notebook_colab.ipynb) 17 | [![Hugging Face](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-yellow)](https://huggingface.co/spaces/oidlabs/Lexoid) 18 | [![GitHub license](https://img.shields.io/badge/License-Apache_2.0-turquoise.svg)](https://github.com/oidlabs-com/Lexoid/blob/main/LICENSE) 19 | [![PyPI](https://img.shields.io/pypi/v/lexoid)](https://pypi.org/project/lexoid/) 20 | [![Docs](https://github.com/oidlabs-com/Lexoid/actions/workflows/deploy_docs.yml/badge.svg)](https://oidlabs-com.github.io/Lexoid/) 21 | 22 | Lexoid is an efficient document parsing library that supports both LLM-based and non-LLM-based (static) PDF document parsing. 23 | 24 | [Documentation](https://oidlabs-com.github.io/Lexoid/) 25 | 26 | ## Motivation 27 | 28 | - Make use of the multi-modal capabilities of modern LLMs 29 | - Offer a simple, convenient interface for parsing documents 30 | - Encourage collaboration through a permissive license 31 | 32 | ## Installation 33 | 34 | ### Installing with pip 35 | 36 | ``` 37 | pip install lexoid 38 | ``` 39 | 40 | To use LLM-based parsing, define the following environment variables or add them to a `.env` file: 41 | 42 | ``` 43 | OPENAI_API_KEY="" 44 | GOOGLE_API_KEY="" 45 | ``` 46 | 47 | Optionally, to use `Playwright` for retrieving web content (instead of the `requests` library): 48 | 49 | ``` 50 | playwright install --with-deps --only-shell chromium 51 | ``` 52 | 53 | ### Building `.whl` from source 54 | 55 | ``` 56 | make build 57 | ``` 58 | 59 | ### Creating a local installation 60 | 61 | To install dependencies: 62 | 63 | ``` 64 | make install 65 | ``` 66 | 67 | or, to install with dev-dependencies: 68 | 69 | ``` 70 | make dev 71 | ``` 72 | 73 | To activate the virtual environment: 74 | 75 | ``` 76 | source .venv/bin/activate 77 | ``` 78 | 79 | ## Usage 80 | 81 | [Example Notebook](https://github.com/oidlabs-com/Lexoid/blob/main/examples/example_notebook.ipynb) 82 | 83 | [Example Colab Notebook](https://colab.research.google.com/github/oidlabs-com/Lexoid/blob/main/examples/example_notebook_colab.ipynb) 84 | 85 | Here's a quick example of parsing documents with Lexoid: 86 | 87 | ```python 88 | from lexoid.api import parse 89 | from lexoid.api import ParserType 90 | 91 | parsed_md = parse("https://www.justice.gov/eoir/immigration-law-advisor", parser_type=ParserType.LLM_PARSE)["raw"] 92 | # or, equivalently, pass the parser type as a string 93 | pdf_path = "path/to/immigration-law-advisor.pdf" 94 | parsed_md = parse(pdf_path, parser_type="LLM_PARSE")["raw"] 95 | 96 | print(parsed_md) 97 | ``` 98 | 99 | ### Parameters 100 | 101 | - path (str): The file path or URL. 102 | - parser_type (str, optional): The type of parser to use ("LLM_PARSE", "STATIC_PARSE", or "AUTO"). Defaults to "AUTO". 103 | - pages_per_split (int, optional): Number of pages per split for chunking. Defaults to 4. 104 | - max_threads (int, optional): Maximum number of threads for parallel processing. Defaults to 4. 105 | - \*\*kwargs: Additional arguments for the parser. 106 | 107 | ## Supported API Providers 108 | * Google 109 | * OpenAI 110 | * Hugging Face 111 | * Together AI 112 | * OpenRouter 113 | * Fireworks 114 | 115 | ## Benchmark 116 | 117 | Results aggregated across 5 iterations each for 5 documents. 118 | 119 | _Note:_ Benchmarks are currently done in the zero-shot setting. 120 | 121 | | Rank | Model | Mean Similarity | Std. Dev.
| Time (s) | Cost ($) | 122 | | --- | --- | --- | --- | --- | --- | 123 | | 1 | gemini-2.0-flash | 0.829 | 0.102 | 7.41 | 0.00048 | 124 | | 2 | gemini-2.0-flash-001 | 0.814 | 0.176 | 6.85 | 0.000421 | 125 | | 3 | gemini-1.5-flash | 0.797 | 0.143 | 9.54 | 0.000238 | 126 | | 4 | gemini-2.0-pro-exp | 0.764 | 0.227 | 11.95 | TBA | 127 | | 5 | AUTO | 0.76 | 0.184 | 5.14 | 0.000217 | 128 | | 6 | gemini-2.0-flash-thinking-exp | 0.746 | 0.266 | 10.46 | TBA | 129 | | 7 | gemini-1.5-pro | 0.732 | 0.265 | 11.44 | 0.003332 | 130 | | 8 | accounts/fireworks/models/llama4-maverick-instruct-basic (via Fireworks) | 0.687 | 0.221 | 8.07 | 0.000419 | 131 | | 9 | gpt-4o | 0.687 | 0.247 | 10.16 | 0.004736 | 132 | | 10 | accounts/fireworks/models/llama4-scout-instruct-basic (via Fireworks) | 0.675 | 0.184 | 5.98 | 0.000226 | 133 | | 11 | gpt-4o-mini | 0.642 | 0.213 | 9.71 | 0.000275 | 134 | | 12 | gemma-3-27b-it (via OpenRouter) | 0.628 | 0.299 | 18.79 | 0.000096 | 135 | | 13 | gemini-1.5-flash-8b | 0.551 | 0.223 | 3.91 | 0.000055 | 136 | | 14 | Llama-Vision-Free (via Together AI) | 0.531 | 0.198 | 6.93 | 0 | 137 | | 15 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) | 0.524 | 0.192 | 3.68 | 0.00006 | 138 | | 16 | qwen/qwen-2.5-vl-7b-instruct (via OpenRouter) | 0.482 | 0.209 | 11.53 | 0.000052 | 139 | | 17 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) | 0.461 | 0.306 | 19.26 | 0.000426 | 140 | | 18 | Llama-3.2-11B-Vision-Instruct (via Hugging Face) | 0.451 | 0.257 | 4.54 | 0 | 141 | | 19 | microsoft/phi-4-multimodal-instruct (via OpenRouter) | 0.366 | 0.287 | 10.8 | 0.000019 | 142 | -------------------------------------------------------------------------------- /docs/.nojekyll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/docs/.nojekyll -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/api.rst: -------------------------------------------------------------------------------- 1 | API Reference 2 | ============= 3 | 4 | Core Function 5 | ------------- 6 | 7 | parse 8 | ^^^^^ 9 | 10 | .. py:function:: lexoid.api.parse(path: str, parser_type: Union[str, ParserType] = "LLM_PARSE", pages_per_split: int = 4, max_processes: int = 4, **kwargs) -> Dict 11 | 12 | Parse a document using specified strategy. 
13 | 14 | :param path: File path or URL to parse 15 | :param parser_type: Parser type to use ("LLM_PARSE", "STATIC_PARSE", or "AUTO") 16 | :param pages_per_split: Number of pages per chunk for processing 17 | :param max_processes: Maximum number of parallel processes 18 | :param kwargs: Additional keyword arguments (see below) 19 | :return: Dictionary containing the parsed content and metadata (see the return value format below) 20 | 21 | Additional keyword arguments: 22 | 23 | * ``model`` (str): LLM model to use 24 | * ``framework`` (str): Static parsing framework 25 | * ``temperature`` (float): Temperature for LLM generation 26 | * ``depth`` (int): Depth for recursive URL parsing 27 | * ``as_pdf`` (bool): Convert input to PDF before processing 28 | * ``verbose`` (bool): Enable verbose logging 29 | * ``x_tolerance`` (int): X-axis tolerance for text extraction 30 | * ``y_tolerance`` (int): Y-axis tolerance for text extraction 31 | * ``save_dir`` (str): Directory to save intermediate PDFs 32 | * ``page_nums`` (List[int]): List of page numbers to parse 33 | * ``api_cost_mapping`` (Union[dict, str]): Dictionary containing API cost details or the string path to a JSON file containing 34 | the cost details. Sample file available at ``tests/api_cost_mapping.json`` 35 | * ``router_priority`` (str): What the routing strategy should prioritize. Options are ``"speed"`` and ``"accuracy"``. The router directs a file to either ``"STATIC_PARSE"`` or ``"LLM_PARSE"`` based on its type and the selected priority. If priority is "accuracy", it prefers LLM_PARSE unless the PDF has no images but contains embedded/hidden hyperlinks, in which case it uses ``STATIC_PARSE`` (because LLMs currently fail to parse hidden hyperlinks). If priority is "speed", it uses ``STATIC_PARSE`` for documents without images and ``LLM_PARSE`` for documents with images. 36 | * ``api_provider`` (str): The API provider to use for LLM parsing. Options are ``openai``, ``huggingface``, ``together``, ``openrouter``, and ``fireworks``. This parameter is only relevant when using LLM parsing. 37 | 38 | Return value format: 39 | A dictionary containing a subset or all of the following keys: 40 | 41 | * ``raw``: Full markdown content as string 42 | * ``segments``: List of dictionaries with metadata and content of each segment. For PDFs, a segment denotes a page. For webpages, a segment denotes a section (a heading and its content). 43 | * ``title``: Title of the document 44 | * ``url``: URL if applicable 45 | * ``parent_title``: Title of parent doc if recursively parsed 46 | * ``recursive_docs``: List of dictionaries for recursively parsed documents 47 | * ``token_usage``: Token usage statistics 48 | * ``pdf_path``: Path to the intermediate PDF generated when ``as_pdf`` is enabled and the kwarg ``save_dir`` is specified. 49 | 50 | 51 | parse_with_schema 52 | ^^^^^^^^^^^^^^^^^ 53 | 54 | .. py:function:: lexoid.api.parse_with_schema(path: str, schema: Dict, api: str = "openai", model: str = "gpt-4o-mini", **kwargs) -> List[List[Dict]] 55 | 56 | Parses a PDF using an LLM to generate structured output conforming to a given JSON schema. 57 | 58 | :param path: Path to the PDF file. 59 | :param schema: JSON schema to which the parsed output should conform. 60 | :param api: LLM API provider to use (``"openai"``, ``"huggingface"``, ``"together"``, ``"openrouter"``, or ``"fireworks"``). 61 | :param model: LLM model name. 62 | :param kwargs: Additional keyword arguments passed to the LLM (e.g., ``temperature``, ``max_tokens``).
63 | :return: A list where each element represents a page, which in turn contains a list of dictionaries conforming to the provided schema. 64 | 65 | Additional keyword arguments: 66 | 67 | * ``temperature`` (float): Sampling temperature for LLM generation. 68 | * ``max_tokens`` (int): Maximum number of tokens to generate. 69 | 70 | Return value format: 71 | A list of pages, where each page is represented as a list of dictionaries. Each dictionary conforms to the structure defined by the input ``schema``. 72 | 73 | 74 | Examples 75 | -------- 76 | 77 | Basic Usage 78 | ^^^^^^^^^^^ 79 | 80 | .. code-block:: python 81 | 82 | from lexoid.api import parse 83 | 84 | # Basic parsing 85 | result = parse("document.pdf") 86 | 87 | # Raw text output 88 | parsed_md = result["raw"] 89 | 90 | # Segmented output with metadata 91 | parsed_segments = result["segments"] 92 | 93 | # Automatic parser selection 94 | result = parse("document.pdf", parser_type="AUTO") 95 | 96 | LLM-Based Parsing 97 | ^^^^^^^^^^^^^^^^^ 98 | 99 | .. code-block:: python 100 | 101 | # Parse using GPT-4o 102 | result = parse("document.pdf", parser_type="LLM_PARSE", model="gpt-4o") 103 | 104 | # Parse using Gemini 1.5 Pro 105 | result = parse("document.pdf", parser_type="LLM_PARSE", model="gemini-1.5-pro") 106 | 107 | 108 | Static Parsing 109 | ^^^^^^^^^^^^^^ 110 | 111 | .. code-block:: python 112 | 113 | # Parse using PDFPlumber 114 | result = parse("document.pdf", parser_type="STATIC_PARSE", model="pdfplumber") 115 | 116 | # Parse using PDFMiner 117 | result = parse("document.pdf", parser_type="STATIC_PARSE", model="pdfminer") 118 | 119 | 120 | Parse with Schema 121 | ^^^^^^^^^^^^^^^^^ 122 | 123 | .. code-block:: python 124 | 125 | from lexoid.api import parse_with_schema 126 | 127 | sample_schema = [ 128 | { 129 | "Disability Category": "string", 130 | "Participants": "int", 131 | "Ballots Completed": "int", 132 | "Ballots Incomplete/Terminated": "int", 133 | "Accuracy": ["string"], 134 | "Time to complete": ["string"] 135 | } 136 | ] 137 | 138 | pdf_path = "inputs/test_1.pdf" 139 | result = parse_with_schema(path=pdf_path, schema=sample_schema, model="gpt-4o") 140 | 141 | Web Content 142 | ^^^^^^^^^^^ 143 | 144 | .. code-block:: python 145 | 146 | # Parse webpage 147 | result = parse("https://example.com") 148 | 149 | # Parse webpage and the pages linked within the page 150 | result = parse("https://example.com", depth=2) -------------------------------------------------------------------------------- /docs/benchmark.csv: -------------------------------------------------------------------------------- 1 | Model,Mean Similarity,Std. 
Dev.,Time (s),Cost($) 2 | gemini-2.0-flash,0.829,0.102,7.41,0.00048 3 | gemini-2.0-flash-001,0.814,0.176,6.85,0.000421 4 | gemini-1.5-flash,0.797,0.143,9.54,0.000238 5 | gemini-2.0-pro-exp,0.764,0.227,11.95,TBA 6 | AUTO,0.760,0.184,5.14,0.000217 7 | gemini-2.0-flash-thinking-exp,0.746,0.266,10.46,TBA 8 | gemini-1.5-pro,0.732,0.265,11.44,0.003332 9 | accounts/fireworks/models/llama4-maverick-instruct-basic (via Fireworks),0.687,0.221,8.07,0.000419 10 | gpt-4o,0.687,0.247,10.16,0.004736 11 | accounts/fireworks/models/llama4-scout-instruct-basic (via Fireworks),0.675,0.184,5.98,0.000226 12 | gpt-4o-mini,0.642,0.213,9.71,0.000275 13 | gemma-3-27b-it (via OpenRouter),0.628,0.299,18.79,0.000096 14 | gemini-1.5-flash-8b,0.551,0.223,3.91,0.000055 15 | Llama-Vision-Free (via Together AI),0.531,0.198,6.93,0 16 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI),0.524,0.192,3.68,0.00006 17 | qwen/qwen-2.5-vl-7b-instruct (via OpenRouter),0.482,0.209,11.53,0.000052 18 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI),0.461,0.306,19.26,0.000426 19 | Llama-3.2-11B-Vision-Instruct (via Hugging Face),0.451,0.257,4.54,0 20 | microsoft/phi-4-multimodal-instruct (via OpenRouter),0.366,0.287,10.8,0.000019 21 | -------------------------------------------------------------------------------- /docs/benchmark.rst: -------------------------------------------------------------------------------- 1 | Benchmark Report 2 | ================ 3 | 4 | Overview 5 | -------- 6 | 7 | This benchmark evaluates the performance of various Large Language Models (LLMs) and parsing strategies in extracting and parsing document content using Lexoid. 8 | 9 | Each approach is evaluated based on a comparison between the parsed content and the manually created ground truths of several documents, with a similarity metric indicating the accuracy of the parsing process. 10 | 11 | Similarity Metric 12 | ^^^^^^^^^^^^^^^^^ 13 | 14 | The similarity metric is calculated using the following steps (see `calculate_similarity()` in `lexoid/core/utils.py` for the implementation). 15 | 16 | 1. Markdown Conversion 17 | Both parsed and ground truth documents are converted to HTML, standardizing their format across structural elements like tables and lists. 18 | 19 | 2. HTML Tag Removal 20 | All HTML markup is stripped away, leaving only the pure textual content. This ensures the comparison focuses on the actual text rather than formatting. 21 | 22 | 3. Sequence Matching 23 | Python's ``SequenceMatcher`` compares the extracted text sequences, calculating a similarity ratio between 0 and 1 that reflects content preservation and accuracy. 24 | 25 | Running the Benchmarks 26 | ---------------------- 27 | 28 | Setup Environment Variables 29 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^ 30 | 31 | Create a ``.env`` file with the necessary API keys: 32 | 33 | .. code-block:: bash 34 | 35 | OPENAI_API_KEY=your_openai_key 36 | GOOGLE_API_KEY=your_google_key 37 | HUGGINGFACEHUB_API_TOKEN=your_huggingface_token 38 | TOGETHER_API_KEY=your_together_api_key 39 | 40 | Running the Benchmark Script 41 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 42 | 43 | .. 
code-block:: bash 44 | 45 | # Clone the repository 46 | git clone https://github.com/oidlabs-com/lexoid.git 47 | cd lexoid 48 | 49 | # Install dependencies 50 | pip install -r requirements.txt 51 | 52 | # Run benchmarks 53 | python tests/benchmark.py 54 | 55 | Customizing Benchmarks 56 | ^^^^^^^^^^^^^^^^^^^^^^ 57 | 58 | You can modify the ``test_attributes`` list in the ``main()`` function to test different configurations: 59 | 60 | * ``parser_type``: Switch between LLM and static parsing 61 | * ``model``: Test different LLM models 62 | * ``framework``: Test different static parsing frameworks 63 | * ``pages_per_split``: Adjust document chunking 64 | * ``max_threads``: Control parallel processing 65 | 66 | Benchmark Results 67 | ----------------- 68 | 69 | Here are the detailed parsing performance results for various models: 70 | 71 | .. list-table:: 72 | :widths: auto 73 | :header-rows: 1 74 | 75 | * - Rank 76 | - Model 77 | - Mean Similarity 78 | - Std. Dev. 79 | - Time (s) 80 | - Cost ($) 81 | * - 1 82 | - gemini-2.0-flash 83 | - 0.829 84 | - 0.102 85 | - 7.41 86 | - 0.00048 87 | * - 2 88 | - gemini-2.0-flash-001 89 | - 0.814 90 | - 0.176 91 | - 6.85 92 | - 0.000421 93 | * - 3 94 | - gemini-1.5-flash 95 | - 0.797 96 | - 0.143 97 | - 9.54 98 | - 0.000238 99 | * - 4 100 | - gemini-2.0-pro-exp 101 | - 0.764 102 | - 0.227 103 | - 11.95 104 | - TBA 105 | * - 5 106 | - AUTO 107 | - 0.76 108 | - 0.184 109 | - 5.14 110 | - 0.000217 111 | * - 6 112 | - gemini-2.0-flash-thinking-exp 113 | - 0.746 114 | - 0.266 115 | - 10.46 116 | - TBA 117 | * - 7 118 | - gemini-1.5-pro 119 | - 0.732 120 | - 0.265 121 | - 11.44 122 | - 0.003332 123 | * - 8 124 | - accounts/fireworks/models/llama4-maverick-instruct-basic (via Fireworks) 125 | - 0.687 126 | - 0.221 127 | - 8.07 128 | - 0.000419 129 | * - 9 130 | - gpt-4o 131 | - 0.687 132 | - 0.247 133 | - 10.16 134 | - 0.004736 135 | * - 10 136 | - accounts/fireworks/models/llama4-scout-instruct-basic (via Fireworks) 137 | - 0.675 138 | - 0.184 139 | - 5.98 140 | - 0.000226 141 | * - 11 142 | - gpt-4o-mini 143 | - 0.642 144 | - 0.213 145 | - 9.71 146 | - 0.000275 147 | * - 12 148 | - gemma-3-27b-it (via OpenRouter) 149 | - 0.628 150 | - 0.299 151 | - 18.79 152 | - 0.000096 153 | * - 13 154 | - gemini-1.5-flash-8b 155 | - 0.551 156 | - 0.223 157 | - 3.91 158 | - 0.000055 159 | * - 14 160 | - Llama-Vision-Free (via Together AI) 161 | - 0.531 162 | - 0.198 163 | - 6.93 164 | - 0 165 | * - 15 166 | - Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) 167 | - 0.524 168 | - 0.192 169 | - 3.68 170 | - 0.00006 171 | * - 16 172 | - qwen/qwen-2.5-vl-7b-instruct (via OpenRouter) 173 | - 0.482 174 | - 0.209 175 | - 11.53 176 | - 0.000052 177 | * - 17 178 | - Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) 179 | - 0.461 180 | - 0.306 181 | - 19.26 182 | - 0.000426 183 | * - 18 184 | - Llama-3.2-11B-Vision-Instruct (via Hugging Face) 185 | - 0.451 186 | - 0.257 187 | - 4.54 188 | - 0 189 | * - 19 190 | - microsoft/phi-4-multimodal-instruct (via OpenRouter) 191 | - 0.366 192 | - 0.287 193 | - 10.8 194 | - 0.000019 195 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 
2 | # 3 | # For the full list of built-in configuration values, see the documentation: 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 5 | 6 | # -- Project information ----------------------------------------------------- 7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 8 | 9 | project = "Lexoid" 10 | copyright = "2025, Lexoid Contributors" 11 | author = "Lexoid Contributors" 12 | release = "0.1.14" 13 | 14 | # -- General configuration --------------------------------------------------- 15 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 16 | 17 | extensions = [] 18 | 19 | templates_path = ["_templates"] 20 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 21 | 22 | 23 | # -- Options for HTML output ------------------------------------------------- 24 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 25 | 26 | html_theme = "pydata_sphinx_theme" 27 | html_static_path = ["_build/html/_static"] 28 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | Contributing to Lexoid 2 | ====================== 3 | 4 | Thank you for your interest in contributing to Lexoid! We welcome contributions from the community to make our document parsing library even better. 5 | 6 | Getting Started 7 | --------------- 8 | 9 | 1. Fork the repository and clone your fork: 10 | 11 | .. code-block:: bash 12 | 13 | git clone https://github.com/YOUR_USERNAME/lexoid.git 14 | cd lexoid 15 | 16 | 2. Set up your development environment: 17 | 18 | .. code-block:: bash 19 | 20 | make dev 21 | 22 | 3. Activate the virtual environment: 23 | 24 | .. code-block:: bash 25 | 26 | source .venv/bin/activate 27 | 28 | Development Setup 29 | ----------------- 30 | 31 | Environment Variables 32 | ^^^^^^^^^^^^^^^^^^^^^ 33 | 34 | Create a ``.env`` file in the root directory with the following API keys (as needed): 35 | 36 | .. code-block:: bash 37 | 38 | GOOGLE_API_KEY=your_google_api_key 39 | OPENAI_API_KEY=your_openai_api_key 40 | HUGGINGFACEHUB_API_TOKEN=your_huggingface_token 41 | TOGETHER_API_KEY=your_together_api_key 42 | 43 | Running Tests 44 | ^^^^^^^^^^^^^ 45 | 46 | Run the test suite: 47 | 48 | .. code-block:: bash 49 | 50 | python3 -m pytest tests/test_parser.py -v 51 | 52 | To see test logs: 53 | 54 | .. code-block:: bash 55 | 56 | python3 -m pytest tests/test_parser.py -v -s 57 | 58 | Contributing Guidelines 59 | ----------------------- 60 | 61 | Code Style 62 | ^^^^^^^^^^ 63 | 64 | * We use Python's `PEP 8 `_ style guide 65 | * If using VS Code, install the `Black Formatter `_ extension 66 | * Use type hints for function parameters and return values 67 | 68 | Pull Request Process 69 | ^^^^^^^^^^^^^^^^^^^^ 70 | 71 | 1. Create a new branch for your feature or bugfix: 72 | 73 | .. code-block:: bash 74 | 75 | git checkout -b feature-name 76 | 77 | 2. Make your changes and commit them with clear, descriptive commit messages 78 | 3. Add tests for any new functionality 79 | 4. Update documentation as needed 80 | 5. Push your changes and create a pull request 81 | 82 | Areas for Contribution 83 | ^^^^^^^^^^^^^^^^^^^^^^ 84 | 85 | * When starting out, check out the `Issues `_ page and look for tickets tagged with ``good first issue`` 86 | * However, don't let the above restrict you. Feel free to have a go at any ticket or suggest any new features! 
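Whichever ticket you pick up, try to pair your change with a test. Below is a minimal sketch of what a new test in ``tests/test_parser.py`` might look like (the sample file and assertions are illustrative; follow the conventions already used in the existing tests):

.. code-block:: python

   from lexoid.api import parse


   def test_static_parse_sample_pdf():
       # Parse one of the bundled sample documents with the static parser
       result = parse("examples/inputs/test_1.pdf", parser_type="STATIC_PARSE")
       # parse() returns a dictionary; the full markdown output is under "raw"
       assert isinstance(result, dict)
       assert result["raw"].strip()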
87 | 88 | Testing Your Changes 89 | ^^^^^^^^^^^^^^^^^^^^ 90 | 91 | 1. Add test cases to ``tests/test_parser.py`` along with changes if appropriate 92 | 2. Test with different file formats and parsing strategies 93 | 94 | Documentation 95 | ------------- 96 | 97 | When adding new features, please: 98 | 99 | 1. Update the main ``README.md`` if needed 100 | 2. Add docstrings to new functions and classes 101 | 3. Include example usage in the documentation 102 | 4. Update type hints and function signatures in the docs 103 | 104 | Reporting Issues 105 | ---------------- 106 | 107 | When reporting bugs, please include: 108 | 109 | * A clear description of the problem 110 | * Steps to reproduce 111 | * Expected vs actual behavior 112 | * Sample files (if possible) 113 | * Environment information (Python version, OS, etc.) 114 | 115 | Thank you for helping improve Lexoid! -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to Lexoid's Documentation 2 | ================================= 3 | 4 | Lexoid is an efficient document parsing library that supports both LLM-based and non-LLM-based (static) PDF document parsing. 5 | 6 | .. toctree:: 7 | :maxdepth: 2 8 | :caption: Contents: 9 | 10 | installation 11 | api 12 | contributing 13 | benchmark 14 | 15 | Key Features 16 | ------------ 17 | 18 | * Multiple parsing strategies (LLM-based and static parsing) 19 | * Automatic parsing strategy selection 20 | * Support for multiple LLM providers (OpenAI, Google, Meta/Llama, Together AI) 21 | * Table detection and markdown conversion 22 | * Hyperlink detection and preservation 23 | * Recursive URL parsing 24 | * Multi-format support 25 | * Parallel processing support 26 | * Permissive license 27 | 28 | Supported API Providers 29 | ----------------------- 30 | 31 | * Google 32 | * OpenAI 33 | * Hugging Face 34 | * Together AI 35 | * OpenRouter 36 | * Fireworks 37 | 38 | Indices and tables 39 | ================== 40 | 41 | * :ref:`genindex` 42 | * :ref:`modindex` 43 | * :ref:`search` -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | 4 | Installing with pip 5 | ------------------- 6 | 7 | .. code-block:: bash 8 | 9 | pip install lexoid 10 | 11 | Environment Setup 12 | ----------------- 13 | 14 | To use LLM-based parsing, define the following environment variables or create a ``.env`` file with the following definitions: 15 | 16 | .. code-block:: bash 17 | 18 | GOOGLE_API_KEY=your_google_api_key 19 | OPENAI_API_KEY=your_openai_api_key 20 | HUGGINGFACEHUB_API_TOKEN=your_huggingface_token 21 | TOGETHER_API_KEY=your_together_api_key 22 | 23 | Optional Dependencies 24 | --------------------- 25 | 26 | To use ``Playwright`` for retrieving web content (instead of the ``requests`` library): 27 | 28 | .. code-block:: bash 29 | 30 | playwright install --with-deps --only-shell chromium 31 | 32 | Building from Source 33 | -------------------- 34 | 35 | To build the ``.whl`` file: 36 | 37 | .. code-block:: bash 38 | 39 | make build 40 | 41 | Local Development Setup 42 | ----------------------- 43 | 44 | To install dependencies: 45 | 46 | .. code-block:: bash 47 | 48 | make install 49 | 50 | Or, to install with dev-dependencies: 51 | 52 | .. 
code-block:: bash 53 | 54 | make dev 55 | 56 | To activate virtual environment: 57 | 58 | .. code-block:: bash 59 | 60 | source .venv/bin/activate -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx 2 | pydata-sphinx-theme 3 | docutils -------------------------------------------------------------------------------- /docs/update_benchmarks.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pandas as pd 3 | import re 4 | 5 | 6 | def update_markdown(content, table_md): 7 | pattern = r"(##\s*Benchmark\s*\n(?:.*?\n)*?\n)(\| Rank .*?\n\|.*?\n(?:\|.*?\n)+)" 8 | replacement = r"\1" + table_md + "\n" 9 | return re.sub(pattern, replacement, content, flags=re.DOTALL) 10 | 11 | 12 | def update_rst(content, table_rst): 13 | pattern = r"(Benchmark Results\s*-+\n.*?\n)(\s*\* - .*\n)+" 14 | return re.sub(pattern, f"\\1{table_rst}\n", content, flags=re.DOTALL) 15 | 16 | 17 | def generate_markdown_table(df): 18 | header = "| Rank | Model | Mean Similarity | Std. Dev. | Time (s) | Cost ($) |\n" 19 | sep = "| --- | --- | --- | --- | --- | --- |\n" 20 | rows = [ 21 | f"| {i+1} | {row['Model']} | {row['Mean Similarity']} | {row['Std. Dev.']} | {row['Time (s)']} | {row['Cost($)']} |" 22 | for i, row in df.iterrows() 23 | ] 24 | return header + sep + "\n".join(rows) 25 | 26 | 27 | def generate_rst_table(df): 28 | header = "\n * - Rank\n - Model\n - Mean Similarity\n - Std. Dev.\n - Time (s)\n - Cost ($)" 29 | rows = [ 30 | f" * - {i+1}\n - {row['Model']}\n - {row['Mean Similarity']}\n - {row['Std. 
Dev.']}\n - {row['Time (s)']}\n - {row['Cost($)']}" 31 | for i, row in df.iterrows() 32 | ] 33 | return header + "\n" + "\n".join(rows) 34 | 35 | 36 | def main(csv_path, md_path, rst_path): 37 | df = pd.read_csv(csv_path) 38 | df = df.sort_values(by="Mean Similarity", ascending=False).reset_index(drop=True) 39 | 40 | with open(md_path, "r", encoding="utf-8") as f: 41 | md_content = f.read() 42 | with open(rst_path, "r", encoding="utf-8") as f: 43 | rst_content = f.read() 44 | 45 | table_md = generate_markdown_table(df) 46 | table_rst = generate_rst_table(df) 47 | 48 | updated_md = update_markdown(md_content, table_md) 49 | updated_rst = update_rst(rst_content, table_rst) 50 | 51 | with open(md_path, "w", encoding="utf-8") as f: 52 | f.write(updated_md) 53 | with open(rst_path, "w", encoding="utf-8") as f: 54 | f.write(updated_rst) 55 | 56 | 57 | if __name__ == "__main__": 58 | parser = argparse.ArgumentParser( 59 | description="Update benchmark tables in README.md and benchmark.rst from CSV" 60 | ) 61 | parser.add_argument("--csv", default="benchmark.csv", help="Path to benchmark.csv") 62 | parser.add_argument("--md", default="../README.md", help="Path to README.md") 63 | parser.add_argument("--rst", default="benchmark.rst", help="Path to benchmark.rst") 64 | args = parser.parse_args() 65 | 66 | main(args.csv, args.md, args.rst) 67 | -------------------------------------------------------------------------------- /examples/inputs/bench_md.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/examples/inputs/bench_md.pdf -------------------------------------------------------------------------------- /examples/inputs/benchmark.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/examples/inputs/benchmark.pdf -------------------------------------------------------------------------------- /examples/inputs/costco_bill.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/examples/inputs/costco_bill.jpg -------------------------------------------------------------------------------- /examples/inputs/cvs_coupon.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/examples/inputs/cvs_coupon.jpg -------------------------------------------------------------------------------- /examples/inputs/grocery_bill.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/examples/inputs/grocery_bill.jpg -------------------------------------------------------------------------------- /examples/inputs/medical_invoice_sample1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/examples/inputs/medical_invoice_sample1.png -------------------------------------------------------------------------------- /examples/inputs/medical_travel_request_OWCP_957.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/examples/inputs/medical_travel_request_OWCP_957.png -------------------------------------------------------------------------------- /examples/inputs/sample.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/examples/inputs/sample.docx -------------------------------------------------------------------------------- /examples/inputs/sample.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/examples/inputs/sample.pptx -------------------------------------------------------------------------------- /examples/inputs/sample.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/examples/inputs/sample.xlsx -------------------------------------------------------------------------------- /examples/inputs/sample_test.txt: -------------------------------------------------------------------------------- 1 | Large language models (LLMs) have shown impressive performance on complex reasoning by leveraging chain-of-thought (CoT) prompting to generate intermediate reasoning chains as the rationale to infer the answer. However, existing CoT studies have primarily focused on the language modality. We propose Multimodal-CoT that incorporates language (text) and vision (images) modalities into a two-stage framework that separates rationale generation and answer inference. In this way, answer inference can leverage better generated rationales that are based on multimodal information. Experimental results on ScienceQA and A-OKVQA benchmark datasets show the effectiveness of our proposed approach. With Multimodal-CoT, our model under 1 billion parameters achieves state-of-the-art performance on the ScienceQA benchmark. Our analysis indicates that Multimodal-CoT offers the advantages of mitigating hallucination and enhancing convergence speed. 
2 | 3 | -------------------------------------------------------------------------------- /examples/inputs/sample_test_doc.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/examples/inputs/sample_test_doc.pdf -------------------------------------------------------------------------------- /examples/inputs/screenshot-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/examples/inputs/screenshot-1.png -------------------------------------------------------------------------------- /examples/inputs/stress_test/large_doc_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/examples/inputs/stress_test/large_doc_1.pdf -------------------------------------------------------------------------------- /examples/inputs/stress_test/large_doc_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/examples/inputs/stress_test/large_doc_2.pdf -------------------------------------------------------------------------------- /examples/inputs/test_1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/examples/inputs/test_1.pdf -------------------------------------------------------------------------------- /examples/inputs/test_2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/examples/inputs/test_2.pdf -------------------------------------------------------------------------------- /examples/inputs/test_3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/examples/inputs/test_3.pdf -------------------------------------------------------------------------------- /examples/inputs/test_4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/examples/inputs/test_4.jpg -------------------------------------------------------------------------------- /examples/inputs/test_5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/examples/inputs/test_5.jpg -------------------------------------------------------------------------------- /examples/inputs/test_explicit_hyperlink_n_img.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/examples/inputs/test_explicit_hyperlink_n_img.pdf -------------------------------------------------------------------------------- /examples/inputs/test_hidden_link_with_image.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/examples/inputs/test_hidden_link_with_image.pdf -------------------------------------------------------------------------------- /examples/inputs/test_with_hidden_links_no_img.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oidlabs-com/Lexoid/90f044ff29031a7def8af1ea92b4ba821e8ae8eb/examples/inputs/test_with_hidden_links_no_img.pdf -------------------------------------------------------------------------------- /examples/outputs/benchmark.md: -------------------------------------------------------------------------------- 1 | # Heading level 1 2 | First paragraph. 3 | 4 | Second paragraph. 5 | 6 | Third paragraph. 7 | 8 | ## Heading level 2 9 | This is **bold text**. 10 | 11 | This is *italic text*. 12 | 13 | This is ***bold and italic text***. 14 | 15 | This is ~~strikethrough~~. 16 | 17 | ### Heading level 3 18 | > This is a level one blockquote. 19 | 20 | > > This is a level two blockquote. 21 | 22 | > > > This is a level three blockquote. 23 | 24 | > > This is a level two blockquote. 25 | 26 | 1. First item on the ordered list 27 | 2. Second item on the ordered list 28 | 3. Third item on the ordered list 29 | 30 | - First item on the unordered list 31 | - Second item on the unordered list 32 | - Third item on the unordered list 33 | 34 | Before a horizontal line 35 | 36 | --- 37 | 38 | After horizontal line 39 | 40 | Here comes a link: [example-link](https://www.example.com). 41 | 42 | Email: 43 | 44 | Here comes Python code: 45 | 46 | ```python 47 | def add_integer(a: int, b: int) -> int: 48 | return a + b 49 | ``` 50 | 51 | And here comes a Bash command: 52 | 53 | ```bash 54 | curl -o thatpage.html http://www.example.com/ 55 | ``` 56 | 57 | Here comes a table: 58 | 59 | | **Column L** | **Column C** | **Column R** | 60 | |:-------------|:------------:|-------------:| 61 | | 11 | 12 | 13 | 62 | | 21 | 22 | 23 | 63 | | 31 | 32 | 33 | 64 | 65 | And a second table: 66 | 67 | | | **B1** | **C1** | 68 | |--------|-----------|-----------| 69 | | **A2** | _data 11_ | _data 12_ | 70 | | **A3** | _data 21_ | _data 22_ | 71 | 72 | 73 |
74 | 75 |
76 | V-February Flow 77 |
78 | 79 |
80 | Data Components: 81 |
82 | 83 |
84 | Code: 85 |
86 | 87 |
88 | The-Stack-V2 89 |
90 | 91 |
92 | CodeText: 93 |
94 | 95 |
96 | SE, whatever we've scraped 97 |
98 | 99 |
100 | WebText: 101 |
102 | 103 |
104 | HQ DCLM 105 |
106 | 107 |
108 | 109 |
110 |
111 | 112 |
113 | DATA MIXES 114 |
115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 |
~85% Source Code]Deepseek
Coder
~10% CodeText
~ 5% Webtext
125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 |
~ 85% The-stack-v2]Starcoder
2
~ 15% CodeText
~ 0% webtext
135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 |
~100% Source Code]Arctic
143 | 144 | 145 |
146 |
147 | 148 |
149 | 150 |
151 | 152 |
153 | 154 | 155 | 156 | 157 | 163 | 164 |
Summary of Care 158 | Patient Adam Everyman
159 | D.O.B October 22, 1962 160 | Sex Male
161 | Patient Detail 162 |
165 | 166 |
167 |
🚑 Reason for referral
168 | Pulmonary function tests, Dr. Penny Puffer, Tel: 555-555-1049,
169 | 1047 Healthcare Drive, Portland, OR 97005, Scheduled date:
170 | 08/17/2012 171 |
172 | 173 |
174 |
💊 Medications
175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 |
MedicationInstructionsDosageEffective Dates (start - stop)Status
Albuterol 0.09 MG/ACTUAT2 puffs every 6 hours PRN wheezingAug 10, 2012 -Active
195 |
196 | 197 |
198 |
H Immunizations
199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 |
VaccineLot NumberDateStatus
Influenza Virus Vaccine18/15/2010Completed
217 |
218 | 219 |
220 |
❤️ Vital signs
221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 |
DateTestResultDetails
15-Aug-2012Height70 in
Weight195 lb
Body Mass Index Calculated28
BP Systolic155 mm[Hg]
BP Diastolic92 mm[Hg]
259 |
260 | 261 |
-------------------------------------------------------------------------------- /examples/outputs/costco_bill.md: -------------------------------------------------------------------------------- 1 |
2 |

Costco

3 |

WHOLESALE

4 |

5 | Irvine #454
6 | 115 Technology Drive W
7 | Irvine, CA 92618
8 | (949) 453-0435 9 |

10 |

MT Member 1234

11 |
12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 |
****Bottom of Basket****
1714849SCOOP AWAY16.49A
****BOB Count 1 ****
E5536YNG COCO 3CT9.99
E57554BLUEBERRIES6.99
E370586ORG. DATES11.99
E1280655ORG CSR KIT8.99
E1280655ORG CSR KIT8.99
E161750KS UNS CASHE13.99
E1308623SUJA WELLNES15.39
E1900000000CA REDEMP VA0.50
E1308623SUJA WELLNES15.39
E1900000000CA REDEMP VA0.50
F504882TYL RR 290CT21.49A
F652782ALEVE GEL16019.99A
F1830585ABGMYS90CT19.99A
F1566153MUCINEX 56CT31.99A
0000345923/15661536.50-A
E1801060LIVSFVARIETY28.99
SUBTOTAL225.16
TAX8.02
**** TOTAL233.18
37 | -------------------------------------------------------------------------------- /examples/outputs/cvs_coupon.md: -------------------------------------------------------------------------------- 1 |

2 | CVSpharmacy 3 |

4 |

5 | $2.00 off
6 | $2 off CVS HEALTH Topical
7 | Pain products 8 |

9 |

10 | Expires 11/02/2024 (Up to $2.00 value) 11 |

12 |

13 | 14 | Barcode representing coupon with numbers 7168 4009 6700 2002 15 |

16 |

17 | ExtraCare card required. See
18 | www.cvs.com/COUPONpolicy or policy at register for details.
19 |

20 | ExtraCare Card # 21 |

22 | -------------------------------------------------------------------------------- /examples/outputs/grocery_bill.md: -------------------------------------------------------------------------------- 1 |

2 | Ralphs Logo 3 |

4 | 5 |

6 | 6300 Irvine Blvd
7 | (949) 559-1139
8 | Your Cashier was BROOKE S
9 | VERIFIED TOTAL SAVINGS $ 1.90 10 |

11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 |
DANN OIKOS P PC YT1.99F
SCRALPHS SAVED YOU0.20
DANN OIKOS P SB YT1.99F
SCRALPHS SAVED YOU0.20
COKE ZERO SUGAR RC7.49B
CRV0.50B
2.92 lb @ 0.89 /lb
WTBANANA ORGNC2.60F
2.39 lb @ 0.89 /lb
WTBANANAS ORGANIC2.13F
1 @ 3/5.00
LUNA BAR1.67F
1 @ 3/5.00
LUNA BAR1.67F
PPB RAW CASHEWS RC8.99F
SCRALPHS SAVED YOU1.50
MRCHECKOUT BAG TAX0.10
RALPHS REWARDS CUSTOMER******3844
TAX0.62
**** BALANCE29.75
135 |
136 | 137 |
138 |
139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 174 | 175 |
VISA29.75
CHANGE0.00
TOTAL NUMBER OF ITEMS SOLD =9
RALPHS rewards SAVINGS$1.90
TOTAL COUPONS$1.90
01/13/25 11:09pm 299 6 528 355
163 | ******************************
164 | ANNUAL CARD SAVINGS $29.76
165 | ******************************
166 | Fuel Points Earned Today: 29
167 | Total Jan Fuel Points: 474
168 | ******************************
169 | Next Reward: 256 points
170 | ******************************
171 | Remaining Dec Fuel Points: 365
172 | ****************************** 173 |
176 | 177 |

178 | Apply Now
179 | $100 Statement Credit
180 | When you spend $500 with your card
in the first 90 days* and
181 | get up to 5% CASH BACK
182 | on eligible purchases* with your
183 | Ralphs Rewards World Elite Mastercard 184 |

185 | 186 |

187 | APPLY TODAY!
188 | www.RalphsMastercard.com/42472 189 | 190 |

191 |

192 | *Restrictions apply, see website
for details. 193 |

194 | 195 |

196 | ******************************
197 | With Card & Coupons
198 | VERIFIED TOTAL SAVINGS $ 1. 199 |

200 |

201 | TRY OUR PHARMACY (949) 559-1739
202 | MGR: ADOLFO VERGARA (949) 559-1139
203 | THANK YOU FOR SHOPPING AT RALPHS! 204 |

205 | 206 |

207 | Fresh opportunity awaits
208 | Join our team today! 209 |

210 | 211 |

212 | QR Code - link not provided 213 |

214 |

215 | jobs.ralphs.com
216 | www.ralphs.com 217 |

-------------------------------------------------------------------------------- /examples/outputs/medical_invoice_sample1.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 12 | 40 | 41 |
MAKE CHECK PAYABLE TOIF PAYING BY CREDIT CARD FILL OUT BELOW
8 | Providence Anesthesiology Associates
9 | PO Box 371863
10 | Pittsburgh, PA 15250-7863 11 |
13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 |
MASTERCARD VISA DISCOVER AMERICAN EXPRESS
CARD NUMBEREXP. DATESECURITY CODE
NAME ON CARDSIGNATURE
39 |
42 | 43 | 44 | 45 | 50 | 76 | 77 |
46 |
Return Service Requested
47 |
For all billing questions, call (704)749-5801
48 |
Hrs. 8:00am - 6:00pm EST / M-F
49 |
51 | 52 | 53 | 57 | 61 | 65 | 66 |
54 | (A) STATEMENT DATE
55 | 08/21/2020 56 |
58 | (B) PAY THIS AMOUNT
59 | 463.30 60 |
62 | (C) ACCOUNT NO.
63 | PAA284850 64 |
67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 |
CHARGES AND CREDITS MADE AFTER STATEMENT
DATE WILL APPEAR ON NEXT STATEMENT
Show Amount
Paid Here
$(D)
75 |
78 | 79 | 80 | 81 | 93 | 101 | 102 |
82 |
SEND TO
83 |
84 | JOHN SMITH
85 | 100 S TRYON ST
86 | UNIT 001
87 | CHARLOTTE, NC 28202-3258 88 |
89 |
90 | Please check box if above address is incorrect or insurance information has changed, and indicate changes on reverse side 91 |
92 |
94 |
REMIT TO
95 |
96 | Providence Anesthesiology Associates
97 | PO Box 371863
98 | Pittsburgh, PA 15250-7863 99 |
100 |
103 | 104 |
105 | 106 | 107 | 108 | 109 | 110 | 111 |
STATEMENTPLEASE DETACH AND RETURN TOP PORTION
WITH YOUR PAYMENT
112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 |
(E) Date(F) Patient(G) Description(H) Charge(I) Insurance
Receipts
(J) Patient
Receipts
(K) Adjustments(L) Insurance
Pending
(M) Patient
Resp.
Facility : Southpark Surgery Center
03/09/2020JOHNProfessional Anesthesia Services - Physician863.00517.8054.50463.30
         
161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 |
Under 3031 - 6061 - 9091 - 120121 - 150Over 151Total
0.000.000.00463.300.000.00463.30
190 | 191 | 192 | 193 | 196 | 213 | 214 |
194 | Your payment is below the acceptable payment amount. 195 | 197 | 198 | 199 | 200 | 201 | 202 |
Amount Due(N) 463.30
203 |
204 | Providence Anesthesiology Associates
205 | PO Box 371863
206 | Pittsburgh, PA 15250-7863 207 |
208 |
209 | Billing questions? Call (704)749-5801
210 | Hrs. 8:00am - 6:00pm EST / M-F 211 |
212 |
215 | 216 |
Page : 1 of 1
-------------------------------------------------------------------------------- /examples/outputs/medical_travel_request_OWCP_957.md: -------------------------------------------------------------------------------- 1 |

Instructions – Form OWCP-957 Part A – Medical Travel Refund Request – Mileage

2 |

This is a mileage-only reimbursement form. If you need other travel expenses reimbursed, complete Form OWCP-957 Part 3 | B Medical Travel Refund Request - Expenses.

4 |
    5 |
  1. Enter claimant's full name: last name, first name, middle initial (M.I.).
  2. 6 |
  3. Enter claimant's claim/case file number.
  4. 7 |
  5. Enter payee's full name (if a person other than the claimant is to be reimbursed): last name, first name, middle 8 | initial. A payee other than the claimant must submit proof of special authorization. Not applicable to the FECA 9 | Program.
  6. 10 |
  7. Enter the Claimant's or Payee's phone number to be reached with questions about this form.
  8. 11 |
  9. Enter the street address of the person to be reimbursed, including the: Street or Rural Route (RR), City, State, 12 | and Zip Code.
  10. 13 |
14 |

Note: For the FECA program to process your request, a FECA claimant must provide the home address where the 15 | claimant resides. A Post Office (PO) Box or attorney/representative address is not an acceptable address. 16 |

17 |
    18 |
  1. Enter the Claimant's or Payee's email address to be reached with questions about this form.
  2. 19 |
  3. Complete a separate block for each medical facility, pharmacy, therapist, etc., visited as follows:
  4. 20 |
21 |

Sample: Multiple trips to a physical therapy office 31 miles from home.

22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 37 | 38 | 39 | 41 | 42 | 43 |
7a. Date(s) of Travel7b. Reason for Travel7c. From (Full name and street address)7d. To (Full name and street address)7e. One-way / Round trip7f. Total # Miles
3/2/2022
3/6/2022
3/10/2022
Hospital
Medical Appt.
34 | Therapy/Rehab
Pharmacy
Med. Supply
Other 36 |
Home
123 Oak St.
Everytown, OH 12345
Therapy and Rehab
8000 Main St
Anytown, OH 54321
One-way
Round trip 40 |
62
62
62
44 |
    45 |
  1. Enter date(s) of travel. If you made multiple trips to the same location, you may enter multiple dates in this 46 | column.
  2. 47 |
  3. Mark one box only.
  4. 48 |
  5. Enter the full name and street address of the address where your trip started.
  6. 49 |
  7. Enter the full name and street address of the address where your trip ended.
    If column c or d is a medical 50 | provider, pharmacy, therapist, etc., provide the name of the medical provider or business along with their 51 | address.
  8. 52 |
  9. Mark one box only.
  10. 53 |
  11. If it was a one-way trip, enter the number of miles. If it was a round trip, enter the total miles traveled for 54 | both legs of the trip.
  12. 55 |
56 |

8. The person claiming reimbursement must sign and enter the date here.

-------------------------------------------------------------------------------- /examples/outputs/test_1.md: -------------------------------------------------------------------------------- 1 | ## Example table 2 | This is an example of a data table. 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 |
Disability CategoryParticipantsBallots CompletedBallots Incomplete/ TerminatedResults
AccuracyTime to complete
Blind51434.5%, n=11199 sec, n=1
Low Vision52398.3% n=2
(97.7%, n=3)
1716 sec, n=3
(1934 sec, n=2)
Dexterity54198.3%, n=41672.1 sec, n=4
Mobility33095.4%, n=31416 sec, n=3
-------------------------------------------------------------------------------- /examples/outputs/test_2.md: -------------------------------------------------------------------------------- 1 |

Cumulative Total Shareholder Return (5 Years)

2 | 3 |
4 |
5 | A line chart showing the cumulative total shareholder return over 5 years. The Bank of New York Mellon Corporation is represented by a green line with circle markers. The S&P 500 Index is represented by a gray line with triangle markers. The S&P 500 Financials Index is represented by a blue line with square markers. 6 |
7 |
8 |
9 |

10 | The Bank of New York Mellon Corporation
11 | S&P 500 Index
12 | S&P 500 Financials Index 13 |

14 |
15 | 16 |
17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 |
Cumulative shareholder returns
(in dollars)
Dec. 31,
201820192020202120222023
The Bank of New York Mellon Corporation$ 100.0$ 109.5$ 95.4$ 134.0$ 108.4$ 128.4
S&P 500 Financials Index (a)100.0132.1129.9175.4156.9176.0
S&P 500 Index (a)100.0131.5155.7200.4164.1207.2
59 | 60 |

(a) Returns are weighted by market capitalization at the beginning of the measurement period.

-------------------------------------------------------------------------------- /examples/outputs/test_3.md: -------------------------------------------------------------------------------- 1 |

The Bank of New York Mellon Corporation (and its subsidiaries)

2 |

Financial Summary

3 | 4 |

(dollars in millions, except per share amounts and unless otherwise noted)

5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 |
Selected income statement information:
Fee and other revenue$ 13,157$ 12,873$ 13,313
Net interest revenue4,3453,5042,618
Total revenue17,50216,37715,931
Provision for credit losses11939(231)
Noninterest expense13,29513,01011,514
Income before income taxes4,0883,3284,648
Provision for income taxes800768877
Net income3,2882,5603,771
Net (income) loss attributable to noncontrolling interests related to consolidated investment management funds(2)13(12)
Preferred stock dividends(235)(211)(207)
Net income applicable to common shareholders of The Bank of New York Mellon Corporation$ 3,051$ 2,362$ 3,552
Earnings per share applicable to common shareholders of The Bank of New York Mellon Corporation:
Basic$ 3.89$ 2.91$ 4.17
Diluted$ 3.87$ 2.90$ 4.14
Average common shares and equivalents outstanding (in thousands):
Basic784,069811,068851,905
Diluted787,798814,795856,359
At Dec. 31
Assets under custody and/or administration ("AUC/A") (in trillions) (a)$ 47.8$ 44.3$ 46.7
Assets under management ("AUM") (in trillions) (b)2.01.82.4
Selected ratios:
Return on common equity8.5%6.5%8.9%
Return on tangible common equity - Non-GAAP (c)16.613.417.1
Pre-tax operating margin232029
Net interest margin1.250.970.68
Cash dividends per common share$ 1.58$ 1.42$ 1.30
Common dividend payout ratio41%49%32%
Common dividend yield3.0%3.1%2.2%
At Dec. 31
Closing stock price per common share$ 52.05$ 45.52$ 58.08
Market capitalization$ 39,524$ 36,800$ 46,705
Book value per common share$ 48.11$ 44.40$ 47.50
Tangible book value per common share - Non-GAAP (c)$ 25.39$ 23.11$ 24.31
Full-time employees53,40051,70049,100
Common shares outstanding (in thousands)759,344808,445804,145
Regulatory capital ratios (d)
Common Equity Tier 1 ("CET1") ratio11.5%11.2%11.2%
Tier 1 capital ratio14.214.114.0
Total capital ratio15.014.914.9
Tier 1 leverage ratio6.05.85.5
Supplementary leverage ratio ("SLR")7.36.86.6
241 | 242 |

(a) Consists of AUC/A primarily from the Asset Servicing line of business and, to a lesser extent, the Clearance and Collateral Management, Issuer Services, Pershing and Wealth Management lines of business. Includes the AUC/A of CIBC Mellon Global Securities Services Company ("CIBC Mellon"), a joint venture with the Canadian Imperial Bank of Commerce, of $1.7 trillion at Dec. 31, 2023, $1.5 trillion at Dec. 31, 2022 and $1.7 trillion at Dec. 31, 2021.

243 |

(b) Excludes assets managed outside of the Investment and Wealth Management business segment.

244 |

(c) Return on tangible common equity and tangible book value per common share, both Non-GAAP measures, exclude goodwill and intangible assets, net of deferred tax liabilities. See "Supplemental Information - Explanation of GAAP and Non-GAAP financial measures" beginning on page 111 for the reconciliation of these Non-GAAP measures.

245 |

(d) For our CET1, Tier 1 and Total capital ratios, our effective capital ratios under U.S. capital rules are the lower of the ratios as calculated under the Standardized and Advanced Approaches. For additional information on our regulatory capital ratios, see "Capital" beginning on page 39.

-------------------------------------------------------------------------------- /examples/outputs/test_4.md: -------------------------------------------------------------------------------- 1 |

The following represents analytical data compiled from smoke analyses of segments 1 and 2 and also includes the physical evaluation conducted on the filter rods.

2 | 3 |

Smoke Analysis

4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 |
No FilterWith Filter(Percent)
a. Tobacco Rod Burned, mm5151
b. Putts/Cigt.7.27.88.3
c. TPM (Wet), mg/cigt.24.016.431.7
d. Nicotine, mg/cigt.1.06.8519.8
e. FTC “Tar”, mg/cigt.20.314.230.1
-------------------------------------------------------------------------------- /examples/outputs/test_5.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 |
RPE SCALEEMOJIINTENSITY LEVEL...
9 - 10🥵MAXIMUM INTENSITY
7 - 8😬VIGOROUS INTENSITY
5 - 6😐MODERATE INTENSITY
3 - 4😉LIGHT INTENSITY
1 - 2😁VERY LIGHT INTENSITY
-------------------------------------------------------------------------------- /lexoid/api.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import re 4 | import tempfile 5 | from concurrent.futures import ProcessPoolExecutor 6 | from enum import Enum 7 | from glob import glob 8 | from time import time 9 | from typing import Union, Dict, List 10 | 11 | from loguru import logger 12 | 13 | from lexoid.core.parse_type.llm_parser import ( 14 | parse_llm_doc, 15 | create_response, 16 | convert_doc_to_base64_images, 17 | ) 18 | from lexoid.core.parse_type.static_parser import parse_static_doc 19 | from lexoid.core.utils import ( 20 | convert_to_pdf, 21 | download_file, 22 | is_supported_url_file_type, 23 | is_supported_file_type, 24 | recursive_read_html, 25 | router, 26 | split_pdf, 27 | create_sub_pdf, 28 | get_webpage_soup, 29 | ) 30 | 31 | 32 | class ParserType(Enum): 33 | LLM_PARSE = "LLM_PARSE" 34 | STATIC_PARSE = "STATIC_PARSE" 35 | AUTO = "AUTO" 36 | 37 | 38 | def parse_chunk(path: str, parser_type: ParserType, **kwargs) -> Dict: 39 | """ 40 | Parses a file using the specified parser type. 41 | 42 | Args: 43 | path (str): The file path or URL. 44 | parser_type (ParserType): The type of parser to use (LLM_PARSE, STATIC_PARSE, or AUTO). 45 | **kwargs: Additional arguments for the parser. 46 | 47 | Returns: 48 | Dict: Dictionary containing: 49 | - raw: Full markdown content as string 50 | - segments: List of dictionaries with metadata and content 51 | - title: Title of the document 52 | - url: URL if applicable 53 | - parent_title: Title of parent doc if recursively parsed 54 | - recursive_docs: List of dictionaries for recursively parsed documents 55 | - token_usage: Dictionary containing token usage statistics 56 | - parser_used: Which parser was actually used 57 | """ 58 | if parser_type == ParserType.AUTO: 59 | router_priority = kwargs.get("router_priority", "speed") 60 | parser_type = ParserType[router(path, router_priority)] 61 | logger.debug(f"Auto-detected parser type: {parser_type}") 62 | 63 | kwargs["start"] = ( 64 | int(os.path.basename(path).split("_")[1]) - 1 if kwargs.get("split") else 0 65 | ) 66 | if parser_type == ParserType.STATIC_PARSE: 67 | logger.debug("Using static parser") 68 | result = parse_static_doc(path, **kwargs) 69 | else: 70 | logger.debug("Using LLM parser") 71 | result = parse_llm_doc(path, **kwargs) 72 | 73 | result["parser_used"] = parser_type 74 | return result 75 | 76 | 77 | def parse_chunk_list( 78 | file_paths: List[str], parser_type: ParserType, kwargs: Dict 79 | ) -> Dict: 80 | """ 81 | Parses a list of files using the specified parser type. 82 | 83 | Args: 84 | file_paths (list): List of file paths. 85 | parser_type (ParserType): The type of parser to use. 86 | kwargs (dict): Additional arguments for the parser. 
87 | 88 | Returns: 89 | Dict: Dictionary containing parsed document data 90 | """ 91 | combined_segments = [] 92 | raw_texts = [] 93 | token_usage = {"input": 0, "output": 0, "llm_page_count": 0} 94 | for file_path in file_paths: 95 | result = parse_chunk(file_path, parser_type, **kwargs) 96 | combined_segments.extend(result["segments"]) 97 | raw_texts.append(result["raw"]) 98 | if ( 99 | result.get("parser_used") == ParserType.LLM_PARSE 100 | and "token_usage" in result 101 | ): 102 | token_usage["input"] += result["token_usage"]["input"] 103 | token_usage["output"] += result["token_usage"]["output"] 104 | token_usage["llm_page_count"] += len(result["segments"]) 105 | token_usage["total"] = token_usage["input"] + token_usage["output"] 106 | 107 | return { 108 | "raw": "\n\n".join(raw_texts), 109 | "segments": combined_segments, 110 | "title": kwargs.get("title", ""), 111 | "url": kwargs.get("url", ""), 112 | "parent_title": kwargs.get("parent_title", ""), 113 | "recursive_docs": [], 114 | "token_usage": token_usage, 115 | } 116 | 117 | 118 | def parse( 119 | path: str, 120 | parser_type: Union[str, ParserType] = "AUTO", 121 | pages_per_split: int = 4, 122 | max_processes: int = 4, 123 | **kwargs, 124 | ) -> Dict: 125 | """ 126 | Parses a document or URL, optionally splitting it into chunks and using multiprocessing. 127 | 128 | Args: 129 | path (str): The file path or URL. 130 | parser_type (Union[str, ParserType], optional): Parser type ("LLM_PARSE", "STATIC_PARSE", or "AUTO"). 131 | pages_per_split (int, optional): Number of pages per split for chunking. 132 | max_processes (int, optional): Maximum number of processes for parallel processing. 133 | **kwargs: Additional arguments for the parser. 134 | 135 | Returns: 136 | Dict: Dictionary containing: 137 | - raw: Full markdown content as string 138 | - segments: List of dictionaries with metadata and content 139 | - title: Title of the document 140 | - url: URL if applicable 141 | - parent_title: Title of parent doc if recursively parsed 142 | - recursive_docs: List of dictionaries for recursively parsed documents 143 | - token_usage: Dictionary containing token usage statistics 144 | """ 145 | kwargs["title"] = os.path.basename(path) 146 | kwargs["pages_per_split_"] = pages_per_split 147 | as_pdf = kwargs.get("as_pdf", False) 148 | depth = kwargs.get("depth", 1) 149 | 150 | if type(parser_type) is str: 151 | parser_type = ParserType[parser_type] 152 | if ( 153 | path.lower().endswith((".doc", ".docx")) 154 | and parser_type != ParserType.STATIC_PARSE 155 | ): 156 | as_pdf = True 157 | if path.lower().endswith(".xlsx") and parser_type == ParserType.LLM_PARSE: 158 | logger.warning("LLM_PARSE does not support .xlsx files. Using STATIC_PARSE.") 159 | parser_type = ParserType.STATIC_PARSE 160 | if path.lower().endswith(".pptx") and parser_type == ParserType.LLM_PARSE: 161 | logger.warning("LLM_PARSE does not support .pptx files. 
Using STATIC_PARSE.") 162 | parser_type = ParserType.STATIC_PARSE 163 | 164 | with tempfile.TemporaryDirectory() as temp_dir: 165 | kwargs["temp_dir"] = temp_dir 166 | if path.startswith(("http://", "https://")): 167 | kwargs["url"] = path 168 | download_dir = kwargs.get("save_dir", os.path.join(temp_dir, "downloads/")) 169 | os.makedirs(download_dir, exist_ok=True) 170 | if is_supported_url_file_type(path): 171 | path = download_file(path, download_dir) 172 | elif as_pdf: 173 | kwargs["title"] = get_webpage_soup(path).title.string.strip() 174 | pdf_filename = kwargs.get("save_filename", f"webpage_{int(time())}.pdf") 175 | if not pdf_filename.endswith(".pdf"): 176 | pdf_filename += ".pdf" 177 | pdf_path = os.path.join(download_dir, pdf_filename) 178 | path = convert_to_pdf(path, pdf_path) 179 | else: 180 | return recursive_read_html(path, depth) 181 | 182 | assert is_supported_file_type( 183 | path 184 | ), f"Unsupported file type {os.path.splitext(path)[1]}" 185 | 186 | if as_pdf and not path.lower().endswith(".pdf"): 187 | pdf_path = os.path.join(temp_dir, "converted.pdf") 188 | path = convert_to_pdf(path, pdf_path) 189 | 190 | if "page_nums" in kwargs and path.lower().endswith(".pdf"): 191 | sub_pdf_dir = os.path.join(temp_dir, "sub_pdfs") 192 | os.makedirs(sub_pdf_dir, exist_ok=True) 193 | sub_pdf_path = os.path.join(sub_pdf_dir, f"{os.path.basename(path)}") 194 | path = create_sub_pdf(path, sub_pdf_path, kwargs["page_nums"]) 195 | 196 | if not path.lower().endswith(".pdf") or parser_type == ParserType.STATIC_PARSE: 197 | kwargs["split"] = False 198 | result = parse_chunk_list([path], parser_type, kwargs) 199 | else: 200 | kwargs["split"] = True 201 | split_dir = os.path.join(temp_dir, "splits/") 202 | os.makedirs(split_dir, exist_ok=True) 203 | split_pdf(path, split_dir, pages_per_split) 204 | split_files = sorted(glob(os.path.join(split_dir, "*.pdf"))) 205 | 206 | chunk_size = max(1, len(split_files) // max_processes) 207 | file_chunks = [ 208 | split_files[i : i + chunk_size] 209 | for i in range(0, len(split_files), chunk_size) 210 | ] 211 | 212 | process_args = [(chunk, parser_type, kwargs) for chunk in file_chunks] 213 | 214 | if max_processes == 1 or len(file_chunks) == 1: 215 | chunk_results = [parse_chunk_list(*args) for args in process_args] 216 | else: 217 | with ProcessPoolExecutor(max_workers=max_processes) as executor: 218 | chunk_results = list( 219 | executor.map(parse_chunk_list, *zip(*process_args)) 220 | ) 221 | 222 | # Combine results from all chunks 223 | result = { 224 | "raw": "\n\n".join(r["raw"] for r in chunk_results), 225 | "segments": [seg for r in chunk_results for seg in r["segments"]], 226 | "title": kwargs["title"], 227 | "url": kwargs.get("url", ""), 228 | "parent_title": kwargs.get("parent_title", ""), 229 | "recursive_docs": [], 230 | "token_usage": { 231 | "input": sum(r["token_usage"]["input"] for r in chunk_results), 232 | "output": sum(r["token_usage"]["output"] for r in chunk_results), 233 | "llm_page_count": sum( 234 | r["token_usage"]["llm_page_count"] for r in chunk_results 235 | ), 236 | "total": sum(r["token_usage"]["total"] for r in chunk_results), 237 | }, 238 | } 239 | 240 | if "api_cost_mapping" in kwargs and "token_usage" in result: 241 | api_cost_mapping = kwargs["api_cost_mapping"] 242 | if isinstance(api_cost_mapping, dict): 243 | api_cost_mapping = api_cost_mapping 244 | elif isinstance(api_cost_mapping, str) and os.path.exists(api_cost_mapping): 245 | with open(api_cost_mapping, "r") as f: 246 | api_cost_mapping = json.load(f) 247 
| else: 248 | raise ValueError(f"Unsupported API cost value: {api_cost_mapping}.") 249 | 250 | api_cost = api_cost_mapping.get( 251 | kwargs.get("model", "gemini-2.0-flash"), None 252 | ) 253 | if api_cost: 254 | token_usage = result["token_usage"] 255 | token_cost = { 256 | "input": token_usage["input"] * api_cost["input"] / 1_000_000, 257 | "input-image": api_cost.get("input-image", 0) 258 | * token_usage.get("llm_page_count", 0), 259 | "output": token_usage["output"] * api_cost["output"] / 1_000_000, 260 | } 261 | token_cost["total"] = ( 262 | token_cost["input"] 263 | + token_cost["input-image"] 264 | + token_cost["output"] 265 | ) 266 | result["token_cost"] = token_cost 267 | 268 | if as_pdf: 269 | result["pdf_path"] = path 270 | 271 | if depth > 1: 272 | recursive_docs = [] 273 | for segment in result["segments"]: 274 | urls = re.findall( 275 | r'https?://[^\s<>"\']+|www\.[^\s<>"\']+(?:\.[^\s<>"\']+)*', 276 | segment["content"], 277 | ) 278 | for url in urls: 279 | if "](" in url: 280 | url = url.split("](")[-1] 281 | logger.debug(f"Reading content from {url}") 282 | if not url.startswith("http"): 283 | url = "https://" + url 284 | 285 | kwargs_cp = kwargs.copy() 286 | kwargs_cp["depth"] = depth - 1 287 | kwargs_cp["parent_title"] = result["title"] 288 | sub_doc = parse( 289 | url, 290 | parser_type=parser_type, 291 | pages_per_split=pages_per_split, 292 | max_processes=max_processes, 293 | **kwargs_cp, 294 | ) 295 | recursive_docs.append(sub_doc) 296 | 297 | result["recursive_docs"] = recursive_docs 298 | 299 | return result 300 | 301 | 302 | def parse_with_schema( 303 | path: str, schema: Dict, api: str = "openai", model: str = "gpt-4o-mini", **kwargs 304 | ) -> List[List[Dict]]: 305 | """ 306 | Parses a PDF using an LLM to generate structured output conforming to a given JSON schema. 307 | 308 | Args: 309 | path (str): Path to the PDF file. 310 | schema (Dict): JSON schema to which the parsed output should conform. 311 | api (str, optional): LLM API provider (One of "openai", "huggingface", "together", "openrouter", and "fireworks"). 312 | model (str, optional): LLM model name. 313 | **kwargs: Additional arguments for the parser (e.g.: temperature, max_tokens). 314 | 315 | Returns: 316 | List[List[Dict]]: List of dictionaries for each page, each conforming to the provided schema. 317 | """ 318 | system_prompt = f""" 319 | The output should be formatted as a JSON instance that conforms to the JSON schema below. 320 | 321 | As an example, for the schema {{ 322 | "properties": {{ 323 | "foo": {{ 324 | "title": "Foo", 325 | "description": "a list of strings", 326 | "type": "array", 327 | "items": {{"type": "string"}} 328 | }} 329 | }}, 330 | "required": ["foo"] 331 | }}, the object {{"foo": ["bar", "baz"]}} is valid. The object {{"properties": {{"foo": ["bar", "baz"]}}}} is not. 332 | 333 | Here is the output schema: 334 | {json.dumps(schema, indent=2)} 335 | 336 | """ 337 | 338 | user_prompt = "You are an AI agent that parses documents and returns them in the specified JSON format. Please parse the document and return it in the required format." 
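    # Each page image is sent to the vision model together with the schema prompt;
    # any ```json fences are stripped from the reply before json.loads, so the
    # result is one schema-conforming dict per page.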
339 | 340 | responses = [] 341 | images = convert_doc_to_base64_images(path) 342 | for i, (page_num, image) in enumerate(images): 343 | resp_dict = create_response( 344 | api=api, 345 | model=model, 346 | user_prompt=user_prompt, 347 | system_prompt=system_prompt, 348 | image_url=image, 349 | temperature=kwargs.get("temperature", 0.0), 350 | max_tokens=kwargs.get("max_tokens", 1024), 351 | ) 352 | 353 | response = resp_dict.get("response", "") 354 | response = response.split("```json")[-1].split("```")[0].strip() 355 | logger.debug(f"Processing page {page_num + 1} with response: {response}") 356 | new_dict = json.loads(response) 357 | responses.append(new_dict) 358 | 359 | return responses 360 | -------------------------------------------------------------------------------- /lexoid/core/parse_type/llm_parser.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import io 3 | import mimetypes 4 | import os 5 | import time 6 | from functools import wraps 7 | from typing import Dict, List, Optional, Tuple 8 | 9 | import pypdfium2 as pdfium 10 | import requests 11 | from huggingface_hub import InferenceClient 12 | from loguru import logger 13 | from openai import OpenAI 14 | from requests.exceptions import HTTPError 15 | from together import Together 16 | 17 | from lexoid.core.prompt_templates import ( 18 | INSTRUCTIONS_ADD_PG_BREAK, 19 | LLAMA_PARSER_PROMPT, 20 | OPENAI_USER_PROMPT, 21 | PARSER_PROMPT, 22 | ) 23 | from lexoid.core.utils import convert_image_to_pdf 24 | 25 | 26 | def retry_on_http_error(func): 27 | @wraps(func) 28 | def wrapper(*args, **kwargs): 29 | try: 30 | return func(*args, **kwargs) 31 | except HTTPError as e: 32 | logger.error(f"HTTPError encountered: {e}. Retrying in 10 seconds...") 33 | time.sleep(10) 34 | try: 35 | logger.debug(f"Retry {func.__name__}") 36 | return func(*args, **kwargs) 37 | except HTTPError as e: 38 | logger.error(f"Retry failed: {e}") 39 | return { 40 | "raw": "", 41 | "segments": [], 42 | "title": kwargs["title"], 43 | "url": kwargs.get("url", ""), 44 | "parent_title": kwargs.get("parent_title", ""), 45 | "recursive_docs": [], 46 | "error": f"HTTPError encountered on page {kwargs.get('start', 0)}: {e}", 47 | } 48 | 49 | return wrapper 50 | 51 | 52 | @retry_on_http_error 53 | def parse_llm_doc(path: str, **kwargs) -> List[Dict] | str: 54 | if "api_provider" in kwargs and kwargs["api_provider"]: 55 | return parse_with_api(path, api=kwargs["api_provider"], **kwargs) 56 | if "model" not in kwargs: 57 | kwargs["model"] = "gemini-2.0-flash" 58 | model = kwargs.get("model") 59 | if model.startswith("gemini"): 60 | return parse_with_gemini(path, **kwargs) 61 | if model.startswith("gpt"): 62 | return parse_with_api(path, api="openai", **kwargs) 63 | if model.startswith("meta-llama"): 64 | if "Turbo" in model or model == "meta-llama/Llama-Vision-Free": 65 | return parse_with_api(path, api="together", **kwargs) 66 | return parse_with_api(path, api="huggingface", **kwargs) 67 | if any(model.startswith(prefix) for prefix in ["microsoft", "google", "qwen"]): 68 | return parse_with_api(path, api="openrouter", **kwargs) 69 | if model.startswith("accounts/fireworks"): 70 | return parse_with_api(path, api="fireworks", **kwargs) 71 | raise ValueError(f"Unsupported model: {model}") 72 | 73 | 74 | def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str: 75 | logger.debug(f"Parsing with Gemini API and model {kwargs['model']}") 76 | api_key = os.environ.get("GOOGLE_API_KEY") 77 | if not api_key: 78 | raise 
ValueError("GOOGLE_API_KEY environment variable is not set") 79 | 80 | url = f"https://generativelanguage.googleapis.com/v1beta/models/{kwargs['model']}:generateContent?key={api_key}" 81 | 82 | # Check if the file is an image and convert to PDF if necessary 83 | mime_type, _ = mimetypes.guess_type(path) 84 | if mime_type and mime_type.startswith("image"): 85 | pdf_content = convert_image_to_pdf(path) 86 | mime_type = "application/pdf" 87 | base64_file = base64.b64encode(pdf_content).decode("utf-8") 88 | else: 89 | with open(path, "rb") as file: 90 | file_content = file.read() 91 | base64_file = base64.b64encode(file_content).decode("utf-8") 92 | 93 | if "system_prompt" in kwargs: 94 | prompt = kwargs["system_prompt"] 95 | else: 96 | # Ideally, we do this ourselves. But, for now this might be a good enough. 97 | custom_instruction = f"""- Total number of pages: {kwargs["pages_per_split_"]}. {INSTRUCTIONS_ADD_PG_BREAK}""" 98 | if kwargs["pages_per_split_"] == 1: 99 | custom_instruction = "" 100 | prompt = PARSER_PROMPT.format(custom_instructions=custom_instruction) 101 | 102 | payload = { 103 | "contents": [ 104 | { 105 | "parts": [ 106 | {"text": prompt}, 107 | {"inline_data": {"mime_type": mime_type, "data": base64_file}}, 108 | ] 109 | } 110 | ], 111 | "generationConfig": { 112 | "temperature": kwargs.get("temperature", 0.2), 113 | }, 114 | } 115 | 116 | headers = {"Content-Type": "application/json"} 117 | try: 118 | response = requests.post(url, json=payload, headers=headers, timeout=120) 119 | response.raise_for_status() 120 | except requests.Timeout as e: 121 | raise HTTPError(f"Timeout error occurred: {e}") 122 | 123 | result = response.json() 124 | 125 | raw_text = "".join( 126 | part["text"] 127 | for candidate in result.get("candidates", []) 128 | for part in candidate.get("content", {}).get("parts", []) 129 | if "text" in part 130 | ) 131 | 132 | combined_text = "" 133 | if "" in raw_text: 134 | combined_text = raw_text.split("")[-1].strip() 135 | if "" in result: 136 | combined_text = result.split("")[0].strip() 137 | 138 | token_usage = result["usageMetadata"] 139 | input_tokens = token_usage.get("promptTokenCount", 0) 140 | output_tokens = token_usage.get("candidatesTokenCount", 0) 141 | total_tokens = input_tokens + output_tokens 142 | 143 | return { 144 | "raw": combined_text.replace("", "\n\n"), 145 | "segments": [ 146 | {"metadata": {"page": kwargs.get("start", 0) + page_no}, "content": page} 147 | for page_no, page in enumerate(combined_text.split(""), start=1) 148 | ], 149 | "title": kwargs["title"], 150 | "url": kwargs.get("url", ""), 151 | "parent_title": kwargs.get("parent_title", ""), 152 | "recursive_docs": [], 153 | "token_usage": { 154 | "input": input_tokens, 155 | "output": output_tokens, 156 | "total": total_tokens, 157 | }, 158 | } 159 | 160 | 161 | def convert_pdf_page_to_base64( 162 | pdf_document: pdfium.PdfDocument, page_number: int 163 | ) -> str: 164 | """Convert a PDF page to a base64-encoded PNG string.""" 165 | page = pdf_document[page_number] 166 | # Render with 4x scaling for better quality 167 | pil_image = page.render(scale=4).to_pil() 168 | 169 | # Convert to base64 170 | img_byte_arr = io.BytesIO() 171 | pil_image.save(img_byte_arr, format="PNG") 172 | img_byte_arr.seek(0) 173 | return base64.b64encode(img_byte_arr.getvalue()).decode("utf-8") 174 | 175 | 176 | def get_messages( 177 | system_prompt: Optional[str], user_prompt: Optional[str], image_url: Optional[str] 178 | ) -> List[Dict]: 179 | messages = [] 180 | if system_prompt: 181 | 
messages.append( 182 | { 183 | "role": "system", 184 | "content": system_prompt, 185 | } 186 | ) 187 | base_message = ( 188 | [ 189 | {"type": "text", "text": user_prompt}, 190 | ] 191 | if user_prompt 192 | else [] 193 | ) 194 | image_message = ( 195 | [ 196 | { 197 | "type": "image_url", 198 | "image_url": {"url": image_url}, 199 | } 200 | ] 201 | if image_url 202 | else [] 203 | ) 204 | 205 | messages.append( 206 | { 207 | "role": "user", 208 | "content": base_message + image_message, 209 | } 210 | ) 211 | 212 | return messages 213 | 214 | 215 | def create_response( 216 | api: str, 217 | model: str, 218 | system_prompt: Optional[str] = None, 219 | user_prompt: Optional[str] = None, 220 | image_url: Optional[str] = None, 221 | temperature: float = 0.2, 222 | max_tokens: int = 1024, 223 | ) -> Dict: 224 | # Initialize appropriate client 225 | clients = { 226 | "openai": lambda: OpenAI(), 227 | "huggingface": lambda: InferenceClient( 228 | token=os.environ["HUGGINGFACEHUB_API_TOKEN"] 229 | ), 230 | "together": lambda: Together(), 231 | "openrouter": lambda: OpenAI( 232 | base_url="https://openrouter.ai/api/v1", 233 | api_key=os.environ["OPENROUTER_API_KEY"], 234 | ), 235 | "fireworks": lambda: OpenAI( 236 | base_url="https://api.fireworks.ai/inference/v1", 237 | api_key=os.environ["FIREWORKS_API_KEY"], 238 | ), 239 | } 240 | assert api in clients, f"Unsupported API: {api}" 241 | client = clients[api]() 242 | 243 | # Prepare messages for the API call 244 | messages = get_messages(system_prompt, user_prompt, image_url) 245 | 246 | # Common completion parameters 247 | completion_params = { 248 | "model": model, 249 | "messages": messages, 250 | "max_tokens": max_tokens, 251 | "temperature": temperature, 252 | } 253 | 254 | # Get completion from selected API 255 | response = client.chat.completions.create(**completion_params) 256 | token_usage = response.usage 257 | 258 | # Extract the response text 259 | page_text = response.choices[0].message.content 260 | 261 | return { 262 | "response": page_text, 263 | "usage": token_usage, 264 | } 265 | 266 | 267 | def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str: 268 | """ 269 | Parse documents (PDFs or images) using various vision model APIs. 270 | 271 | Args: 272 | path (str): Path to the document to parse 273 | api (str): Which API to use ("openai", "huggingface", or "together") 274 | **kwargs: Additional arguments including model, temperature, title, etc. 
275 | 276 | Returns: 277 | Dict: Dictionary containing parsed document data 278 | """ 279 | logger.debug(f"Parsing with {api} API and model {kwargs['model']}") 280 | 281 | # Handle different input types 282 | mime_type, _ = mimetypes.guess_type(path) 283 | if mime_type and mime_type.startswith("image"): 284 | # Single image processing 285 | with open(path, "rb") as img_file: 286 | image_base64 = base64.b64encode(img_file.read()).decode("utf-8") 287 | images = [(0, f"data:{mime_type};base64,{image_base64}")] 288 | else: 289 | # PDF processing 290 | pdf_document = pdfium.PdfDocument(path) 291 | images = [ 292 | ( 293 | page_num, 294 | f"data:image/png;base64,{convert_pdf_page_to_base64(pdf_document, page_num)}", 295 | ) 296 | for page_num in range(len(pdf_document)) 297 | ] 298 | 299 | # Process each page/image 300 | all_results = [] 301 | for page_num, image_url in images: 302 | if api == "openai": 303 | system_prompt = kwargs.get( 304 | "system_prompt", PARSER_PROMPT.format(custom_instructions="") 305 | ) 306 | user_prompt = kwargs.get("user_prompt", OPENAI_USER_PROMPT) 307 | else: 308 | system_prompt = kwargs.get("system_prompt", None) 309 | user_prompt = kwargs.get("user_prompt", LLAMA_PARSER_PROMPT) 310 | 311 | response = create_response( 312 | api=api, 313 | model=kwargs["model"], 314 | system_prompt=system_prompt, 315 | user_prompt=user_prompt, 316 | image_url=image_url, 317 | temperature=kwargs.get("temperature", 0.2), 318 | max_tokens=kwargs.get("max_tokens", 1024), 319 | ) 320 | 321 | # Get completion from selected API 322 | page_text = response["response"] 323 | token_usage = response["usage"] 324 | 325 | if kwargs.get("verbose", None): 326 | logger.debug(f"Page {page_num + 1} response: {page_text}") 327 | 328 | # Extract content between output tags if present 329 | result = page_text 330 | if "" in page_text: 331 | result = page_text.split("")[-1].strip() 332 | if "" in result: 333 | result = result.split("")[0].strip() 334 | all_results.append( 335 | ( 336 | page_num, 337 | result, 338 | token_usage.prompt_tokens, 339 | token_usage.completion_tokens, 340 | token_usage.total_tokens, 341 | ) 342 | ) 343 | 344 | # Sort results by page number and combine 345 | all_results.sort(key=lambda x: x[0]) 346 | all_texts = [text for _, text, _, _, _ in all_results] 347 | combined_text = "\n\n".join(all_texts) 348 | 349 | return { 350 | "raw": combined_text, 351 | "segments": [ 352 | { 353 | "metadata": { 354 | "page": kwargs.get("start", 0) + page_no + 1, 355 | "token_usage": { 356 | "input": input_tokens, 357 | "output": output_tokens, 358 | "total": total_tokens, 359 | }, 360 | }, 361 | "content": page, 362 | } 363 | for page_no, page, input_tokens, output_tokens, total_tokens in all_results 364 | ], 365 | "title": kwargs["title"], 366 | "url": kwargs.get("url", ""), 367 | "parent_title": kwargs.get("parent_title", ""), 368 | "recursive_docs": [], 369 | "token_usage": { 370 | "input": sum(input_tokens for _, _, input_tokens, _, _ in all_results), 371 | "output": sum(output_tokens for _, _, _, output_tokens, _ in all_results), 372 | "total": sum(total_tokens for _, _, _, _, total_tokens in all_results), 373 | }, 374 | } 375 | 376 | 377 | def convert_doc_to_base64_images(path: str) -> List[Tuple[int, str]]: 378 | """ 379 | Converts a document (PDF or image) to a base64 encoded string. 380 | 381 | Args: 382 | path (str): Path to the PDF file. 383 | 384 | Returns: 385 | str: Base64 encoded string of the PDF content. 
386 | """ 387 | if path.endswith(".pdf"): 388 | pdf_document = pdfium.PdfDocument(path) 389 | return [ 390 | ( 391 | page_num, 392 | f"data:image/png;base64,{convert_pdf_page_to_base64(pdf_document, page_num)}", 393 | ) 394 | for page_num in range(len(pdf_document)) 395 | ] 396 | elif mimetypes.guess_type(path)[0].startswith("image"): 397 | with open(path, "rb") as img_file: 398 | image_base64 = base64.b64encode(img_file.read()).decode("utf-8") 399 | return [(0, f"data:image/png;base64,{image_base64}")] 400 | -------------------------------------------------------------------------------- /lexoid/core/prompt_templates.py: -------------------------------------------------------------------------------- 1 | # Initial prompt, 2 | # This might go through further changes as the library evolves. 3 | PARSER_PROMPT = """\ 4 | You are a specialized document parsing (including OCR) and conversion agent. 5 | Your primary task is to analyze various types of documents and reproduce their content in a format that, when rendered, visually replicates the original input as closely as possible. 6 | Your output should use a combination of Markdown and HTML to achieve this goal. 7 | Think step-by-step. 8 | 9 | **Instructions:** 10 | - Analyze the given document thoroughly, identify formatting patterns, choose optimal markup, implement conversion and verify quality. 11 | - Your primary goal is to ensure structural fidelity of the input is replicated. Preserve all content without loss. 12 | - Use a combination of Markdown and HTML in your output. HTML can be used anywhere in the document, not just for complex structures. Choose the format that best replicates the original structural appearance. However, keep the font colors black and the background colors white. 13 | - When reproducing tables, use HTML tables (, ,
) if they better represent the original layout. Utilize `colspan` and `rowspan` attributes as necessary to accurately represent merged cells. 14 | - Preserve all formatting elements such as bold, italic, underline, strikethrough text, font sizes, and colors using appropriate HTML tags and inline styles if needed. 15 | - Maintain the hierarchy (h1-h6) and styling of headings and subheadings using appropriate HTML tags or Markdown. 16 | - Visual Elements: 17 | * Images: If there is text within the image, try to recreate the structure within the image. If there is no text, describe the image content and position, and use placeholder `` tags to represent their location in the document. Capture the image meaning in the alt attribute. Don't specify src if not known. 18 | * Emojis: Use Unicode characters instead of images. 19 | * Charts/Diagrams: For content that cannot be accurately represented in text format, provide a detailed textual description within an HTML element that visually represents its position in the document. 20 | * Complex visuals: Mark with [?] and make a note for ambiguities or uncertain interpretations in the document. Use HTML comments for conversion notes. Only output notes with comment tags. 21 | - Special Characters: 22 | * Letters with ascenders are usually: b, d, f, h, k, l, t 23 | * Letters with descenders are usually: g, j, p, q, y. Lowercase f and z also have descenders in many typefaces. 24 | * Pay special attention to these commonly confused character pairs, 25 | Letter 'l' vs number '1' vs exclamation mark '!' 26 | Number '2' vs letter 'Z' 27 | Number '5' vs letter 'S' 28 | Number '51' vs number '±1' 29 | Number '6' vs letter 'G' vs letter 'b' 30 | Number '0' vs letter 'O' 31 | Number '8' vs letter 'B' 32 | Letter 'f' vs letter 't' 33 | * Contextual clues to differentiate: 34 | - If in a numeric column, interpret 'O' as '0' 35 | - If preceded/followed by numbers, interpret 'l' as '1' 36 | - Consider font characteristics, e.g. 37 | '1' typically has no serif 38 | '2' has a curved bottom vs 'Z's straight line 39 | '5' has more rounded features than 'S' 40 | '6' has a closed loop vs 'G's open curve 41 | '0' is typically more oval than 'O' 42 | '8' has a more angular top than 'B' 43 | {custom_instructions} 44 | - Return only the correct markdown without additional text or explanations. 45 | - DO NOT use code blocks such as "```html" or "```markdown" in the output unless there is a code block in the content. 46 | - Think before generating the output in tags. 47 | 48 | Remember, your primary objective is to create an output that, when rendered, structurally replicates the original document's content as closely as possible without losing any textual details. 49 | Prioritize replicating structure above all else. 50 | Use tables without borders to represent column-like structures. 51 | Keep the font color black (#000000) and the background white (#ffffff). 52 | 53 | OUTPUT FORMAT: 54 | Enclose the response within XML tags as follows: 55 | 56 | [Step-by-step analysis and generation strategy] 57 | 58 | 59 | "Your converted document content here in markdown format" 60 | 61 | 62 | Quality Checks: 63 | 1. Verify structural and layout accuracy 64 | 2. Verify content completeness 65 | 3. Visual element handling 66 | 4. Hierarchy preservation 67 | 5. Confirm table alignment and cell merging accuracy 68 | 6. Spacing fidelity 69 | 7. Verify that numbers fall within expected ranges for their column 70 | 8. Flag any suspicious characters that could be OCR errors 71 | 9. 
Validate markdown syntax 72 | """ 73 | 74 | OPENAI_USER_PROMPT = """\ 75 | Convert the following document to markdown. 76 | Ensure accurate representation of all content, including tables and visual elements, per your instructions. 77 | """ 78 | 79 | INSTRUCTIONS_ADD_PG_BREAK = "Insert a `` tag between the content of each page to maintain the original page structure." 80 | 81 | LLAMA_PARSER_PROMPT = """\ 82 | You are a document conversion assistant. Your task is to accurately reproduce the content of an image in Markdown and HTML format, maintaining the visual structure and layout of the original document as closely as possible. 83 | 84 | Instructions: 85 | 1. Use a combination of Markdown and HTML to replicate the document's layout and formatting. 86 | 2. Reproduce all text content exactly as it appears, including preserving capitalization, punctuation, and any apparent errors or inconsistencies in the original. 87 | 3. Use appropriate Markdown syntax for headings, emphasis (bold, italic), and lists where applicable. 88 | 4. Always use HTML (``, ``, `
`) to represent tabular data. Include `colspan` and `rowspan` attributes if needed. 89 | 5. For figures, graphs, or diagrams, represent them using `` tags and use appropriate `alt` text. 90 | 6. For handwritten documents, reproduce the content as typed text, maintaining the original structure and layout. 91 | 7. Do not include any descriptions of the document's appearance, paper type, or writing implements used. 92 | 8. Do not add any explanatory notes, comments, or additional information outside of the converted content. 93 | 9. Ensure all special characters, symbols, and equations are accurately represented. 94 | 10. Provide the output only once, without any duplication. 95 | 11. Enclose the entire output within and tags. 96 | 97 | Output the converted content directly in Markdown and HTML without any additional explanations, descriptions, or notes. 98 | """ 99 | -------------------------------------------------------------------------------- /lexoid/core/utils.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import io 3 | import mimetypes 4 | import os 5 | import re 6 | import sys 7 | from difflib import SequenceMatcher 8 | from hashlib import md5 9 | from typing import Dict, List, Optional 10 | from urllib.parse import urlparse 11 | 12 | import nest_asyncio 13 | import pikepdf 14 | import pypdfium2 15 | import requests 16 | from bs4 import BeautifulSoup 17 | from docx2pdf import convert 18 | from loguru import logger 19 | from markdown import markdown 20 | from markdownify import markdownify as md 21 | from PIL import Image 22 | from PyQt5.QtCore import QMarginsF, QUrl 23 | from PyQt5.QtGui import QPageLayout, QPageSize 24 | from PyQt5.QtPrintSupport import QPrinter 25 | from PyQt5.QtWebEngineWidgets import QWebEngineView 26 | from PyQt5.QtWidgets import QApplication 27 | 28 | # Source: https://stackoverflow.com/a/12982689 29 | HTML_TAG_PATTERN = re.compile("<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});") 30 | 31 | 32 | def split_pdf(input_path: str, output_dir: str, pages_per_split: int): 33 | paths = [] 34 | with pikepdf.open(input_path) as pdf: 35 | total_pages = len(pdf.pages) 36 | for start in range(0, total_pages, pages_per_split): 37 | end = min(start + pages_per_split, total_pages) 38 | output_path = os.path.join( 39 | output_dir, f"split_{str(start + 1).zfill(4)}_{end}.pdf" 40 | ) 41 | with pikepdf.new() as new_pdf: 42 | new_pdf.pages.extend(pdf.pages[start:end]) 43 | new_pdf.save(output_path) 44 | paths.append(output_path) 45 | return paths 46 | 47 | 48 | def create_sub_pdf( 49 | input_path: str, output_path: str, page_nums: Optional[tuple[int, ...] 
| int] = None 50 | ) -> str: 51 | if isinstance(page_nums, int): 52 | page_nums = (page_nums,) 53 | page_nums = tuple(sorted(set(page_nums))) 54 | with pikepdf.open(input_path) as pdf: 55 | indices = page_nums if page_nums else range(len(pdf.pages)) 56 | with pikepdf.new() as new_pdf: 57 | new_pdf.pages.extend([pdf.pages[i - 1] for i in indices]) 58 | new_pdf.save(output_path) 59 | return output_path 60 | 61 | 62 | def convert_image_to_pdf(image_path: str) -> bytes: 63 | with Image.open(image_path) as img: 64 | img_rgb = img.convert("RGB") 65 | pdf_buffer = io.BytesIO() 66 | img_rgb.save(pdf_buffer, format="PDF") 67 | return pdf_buffer.getvalue() 68 | 69 | 70 | def remove_html_tags(text: str): 71 | html = markdown(text, extensions=["tables"]) 72 | return re.sub(HTML_TAG_PATTERN, "", html) 73 | 74 | 75 | def calculate_similarity(text1: str, text2: str, ignore_html=True) -> float: 76 | """Calculate similarity ratio between two texts using SequenceMatcher.""" 77 | if ignore_html: 78 | text1 = remove_html_tags(text1) 79 | text2 = remove_html_tags(text2) 80 | return SequenceMatcher(None, text1, text2).ratio() 81 | 82 | 83 | def convert_pdf_page_to_image( 84 | pdf_document: pypdfium2.PdfDocument, page_number: int 85 | ) -> bytes: 86 | """Convert a PDF page to an image.""" 87 | page = pdf_document[page_number] 88 | # Render with 4x scaling for better quality 89 | pil_image = page.render(scale=4).to_pil() 90 | 91 | # Convert to bytes 92 | img_byte_arr = io.BytesIO() 93 | pil_image.save(img_byte_arr, format="PNG") 94 | img_byte_arr.seek(0) 95 | return img_byte_arr.getvalue() 96 | 97 | 98 | def get_file_type(path: str) -> str: 99 | """Get the file type of a file based on its extension.""" 100 | return mimetypes.guess_type(path)[0] 101 | 102 | 103 | def is_supported_file_type(path: str) -> bool: 104 | """Check if the file type is supported for parsing.""" 105 | file_type = get_file_type(path) 106 | if ( 107 | file_type == "application/pdf" 108 | or "wordprocessing" in file_type 109 | or "spreadsheet" in file_type 110 | or "presentation" in file_type 111 | or file_type.startswith("image/") 112 | or file_type.startswith("text") 113 | ): 114 | return True 115 | return False 116 | 117 | 118 | def is_supported_url_file_type(url: str) -> bool: 119 | """ 120 | Check if the file type from the URL is supported. 121 | 122 | Args: 123 | url (str): The URL of the file. 124 | 125 | Returns: 126 | bool: True if the file type is supported, False otherwise. 127 | """ 128 | supported_extensions = [".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".gif"] 129 | parsed_url = urlparse(url) 130 | ext = os.path.splitext(parsed_url.path)[1].lower() 131 | 132 | if ext in supported_extensions: 133 | return True 134 | 135 | # If no extension in URL, try to get content type from headers 136 | try: 137 | response = requests.head(url) 138 | except requests.exceptions.ConnectionError: 139 | return False 140 | content_type = response.headers.get("Content-Type", "") 141 | ext = mimetypes.guess_extension(content_type) 142 | 143 | return ext in supported_extensions 144 | 145 | 146 | def download_file(url: str, temp_dir: str) -> str: 147 | """ 148 | Downloads a file from the given URL and saves it to a temporary directory. 149 | 150 | Args: 151 | url (str): The URL of the file to download. 152 | temp_dir (str): The temporary directory to save the file. 153 | 154 | Returns: 155 | str: The path to the downloaded file. 
156 | """ 157 | response = requests.get(url) 158 | file_name = os.path.basename(urlparse(url).path) 159 | if not file_name: 160 | content_type = response.headers.get("Content-Type", "") 161 | ext = mimetypes.guess_extension(content_type) 162 | file_name = f"downloaded_file{ext}" if ext else "downloaded_file" 163 | 164 | file_path = os.path.join(temp_dir, file_name) 165 | with open(file_path, "wb") as f: 166 | f.write(response.content) 167 | return file_path 168 | 169 | 170 | def find_dominant_heading_level(markdown_content: str) -> str: 171 | """ 172 | Finds the most common heading level that occurs more than once. 173 | Also checks for underline style headings (---). 174 | 175 | Args: 176 | markdown_content (str): The markdown content to analyze 177 | 178 | Returns: 179 | str: The dominant heading pattern (e.g., '##' or 'underline') 180 | """ 181 | # Check for underline style headings first 182 | underline_pattern = r"^[^\n]+\n-+$" 183 | underline_matches = re.findall(underline_pattern, markdown_content, re.MULTILINE) 184 | if len(underline_matches) > 1: 185 | return "underline" 186 | 187 | # Find all hash-style headings in the markdown content 188 | heading_patterns = ["#####", "####", "###", "##", "#"] 189 | heading_counts = {} 190 | 191 | for pattern in heading_patterns: 192 | # Look for headings at the start of a line 193 | regex = f"^{pattern} .*$" 194 | matches = re.findall(regex, markdown_content, re.MULTILINE) 195 | if len(matches) > 1: # Only consider headings that appear more than once 196 | heading_counts[pattern] = len(matches) 197 | 198 | if not heading_counts: 199 | return "#" # Default to h1 if no repeated headings found 200 | 201 | return min(heading_counts.keys(), key=len) 202 | 203 | 204 | def split_md_by_headings(markdown_content: str, heading_pattern: str) -> List[Dict]: 205 | """ 206 | Splits markdown content by the specified heading pattern and structures it. 
207 | 208 | Args: 209 | markdown_content (str): The markdown content to split 210 | heading_pattern (str): The heading pattern to split on (e.g., '##' or 'underline') 211 | 212 | Returns: 213 | List[Dict]: List of dictionaries containing metadata and content 214 | """ 215 | structured_content = [] 216 | 217 | if heading_pattern == "underline": 218 | # Split by underline headings 219 | pattern = r"^([^\n]+)\n-+$" 220 | sections = re.split(pattern, markdown_content, flags=re.MULTILINE) 221 | # Remove empty sections and strip whitespace 222 | sections = [section.strip() for section in sections] 223 | 224 | # Handle content before first heading if it exists 225 | if sections and not re.match(r"^[^\n]+\n-+$", sections[0], re.MULTILINE): 226 | structured_content.append( 227 | { 228 | "metadata": {"page": "Introduction"}, 229 | "content": sections.pop(0), 230 | } 231 | ) 232 | 233 | # Process sections pairwise (heading, content) 234 | for i in range(0, len(sections), 2): 235 | if i + 1 < len(sections): 236 | structured_content.append( 237 | { 238 | "metadata": {"page": sections[i]}, 239 | "content": sections[i + 1], 240 | } 241 | ) 242 | else: 243 | # Split by hash headings 244 | regex = f"^{heading_pattern} .*$" 245 | sections = re.split(regex, markdown_content, flags=re.MULTILINE) 246 | headings = re.findall(regex, markdown_content, flags=re.MULTILINE) 247 | 248 | # Remove empty sections and strip whitespace 249 | sections = [section.strip() for section in sections] 250 | 251 | # Handle content before first heading if it exists 252 | if len(sections) > len(headings): 253 | structured_content.append( 254 | { 255 | "metadata": {"page": "Introduction"}, 256 | "content": sections.pop(0), 257 | } 258 | ) 259 | 260 | # Process remaining sections 261 | for heading, content in zip(headings, sections): 262 | clean_heading = heading.replace(heading_pattern, "").strip() 263 | structured_content.append( 264 | { 265 | "metadata": {"page": clean_heading}, 266 | "content": content, 267 | } 268 | ) 269 | 270 | return structured_content 271 | 272 | 273 | def html_to_markdown(html: str, title: str, url: str) -> str: 274 | """ 275 | Converts HTML content to markdown. 276 | 277 | Args: 278 | html (str): The HTML content to convert. 
279 | title (str): The title of the HTML page 280 | url (str): The URL of the HTML page 281 | 282 | Returns: 283 | Dict: Dictionary containing parsed document data 284 | """ 285 | markdown_content = md(html) 286 | 287 | # Find the dominant heading level 288 | heading_pattern = find_dominant_heading_level(markdown_content) 289 | 290 | # Split content by headings and structure it 291 | split_md = split_md_by_headings(markdown_content, heading_pattern) 292 | 293 | content = { 294 | "raw": markdown_content, 295 | "segments": split_md, 296 | "title": title, 297 | "url": url, 298 | "parent_title": "", 299 | "recursive_docs": [], 300 | } 301 | 302 | return content 303 | 304 | 305 | def get_webpage_soup(url: str) -> BeautifulSoup: 306 | try: 307 | from playwright.async_api import async_playwright 308 | 309 | nest_asyncio.apply() 310 | 311 | async def fetch_page(): 312 | async with async_playwright() as p: 313 | browser = await p.chromium.launch( 314 | headless=True, 315 | args=[ 316 | "--disable-blink-features=AutomationControlled", 317 | "--no-sandbox", 318 | "--window-size=1920,1080", 319 | ], 320 | ) 321 | context = await browser.new_context( 322 | viewport={"width": 1920, "height": 1080}, 323 | user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", 324 | bypass_csp=True, 325 | ) 326 | page = await context.new_page() 327 | 328 | # Add headers to appear more like a real browser 329 | await page.set_extra_http_headers( 330 | { 331 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 332 | "Accept-Language": "en-US,en;q=0.5", 333 | "Sec-Fetch-Dest": "document", 334 | "Sec-Fetch-Mode": "navigate", 335 | "Sec-Fetch-Site": "none", 336 | "Sec-Fetch-User": "?1", 337 | } 338 | ) 339 | 340 | await page.goto(url) 341 | 342 | # Wait for Cloudflare check to complete 343 | await page.wait_for_load_state("networkidle") 344 | 345 | # Additional wait for any dynamic content 346 | try: 347 | await page.wait_for_selector("body", timeout=30000) 348 | except Exception: 349 | pass 350 | 351 | html = await page.content() 352 | await browser.close() 353 | return html 354 | 355 | loop = asyncio.get_event_loop() 356 | html = loop.run_until_complete(fetch_page()) 357 | soup = BeautifulSoup(html, "html.parser") 358 | except Exception as e: 359 | logger.debug( 360 | f"Error reading HTML content from URL, attempting with default https request: {str(e)}" 361 | ) 362 | response = requests.get(url) 363 | soup = BeautifulSoup( 364 | response.content, "html.parser", from_encoding="iso-8859-1" 365 | ) 366 | return soup 367 | 368 | 369 | def read_html_content(url: str) -> Dict: 370 | """ 371 | Reads the content of an HTML page from the given URL and converts it to markdown or structured content. 372 | 373 | Args: 374 | url (str): The URL of the HTML page. 375 | 376 | Returns: 377 | Dict: Dictionary containing parsed document data 378 | """ 379 | 380 | soup = get_webpage_soup(url) 381 | title = soup.title.string.strip() if soup.title else "No title" 382 | url_hash = md5(url.encode("utf-8")).hexdigest()[:8] 383 | full_title = f"{title} - {url_hash}" 384 | return html_to_markdown(str(soup), title=full_title, url=url) 385 | 386 | 387 | def extract_urls_from_markdown(content: str) -> List[str]: 388 | """ 389 | Extracts URLs from markdown content using regex. 390 | Matches both [text](url) and bare http(s):// URLs. 
391 | 392 | Args: 393 | content (str): Markdown content to search for URLs 394 | 395 | Returns: 396 | List[str]: List of unique URLs found 397 | """ 398 | # Match markdown links [text](url) and bare URLs 399 | markdown_pattern = r"\[([^\]]+)\]\((https?://[^\s\)]+)\)" 400 | bare_url_pattern = r"(? Dict: 412 | """ 413 | Recursively reads HTML content from URLs up to specified depth. 414 | 415 | Args: 416 | url (str): The URL to parse 417 | depth (int): How many levels deep to recursively parse 418 | visited_urls (set): Set of already visited URLs to prevent cycles 419 | 420 | Returns: 421 | Dict: Dictionary containing parsed document data 422 | """ 423 | if visited_urls is None: 424 | visited_urls = set() 425 | 426 | if url in visited_urls: 427 | return { 428 | "raw": "", 429 | "segments": [], 430 | "title": "", 431 | "url": url, 432 | "parent_title": "", 433 | "recursive_docs": [], 434 | } 435 | 436 | visited_urls.add(url) 437 | 438 | try: 439 | content = read_html_content(url) 440 | except Exception as e: 441 | print(f"Error processing URL {url}: {str(e)}") 442 | return { 443 | "raw": "", 444 | "segments": [], 445 | "title": "", 446 | "url": url, 447 | "parent_title": "", 448 | "recursive_docs": [], 449 | } 450 | 451 | if depth <= 1: 452 | return content 453 | 454 | # Extract URLs from all content sections 455 | urls = extract_urls_from_markdown(content["raw"]) 456 | 457 | # Recursively process each URL 458 | recursive_docs = [] 459 | for sub_url in urls: 460 | if sub_url not in visited_urls: 461 | sub_content = recursive_read_html(sub_url, depth - 1, visited_urls) 462 | recursive_docs.append(sub_content) 463 | 464 | content["recursive_docs"] = recursive_docs 465 | return content 466 | 467 | 468 | def save_webpage_as_pdf(url: str, output_path: str) -> str: 469 | """ 470 | Saves a webpage as a PDF file using PyQt5. 471 | 472 | Args: 473 | url (str): The URL of the webpage. 474 | output_path (str): The path to save the PDF file. 475 | 476 | Returns: 477 | str: The path to the saved PDF file. 478 | """ 479 | if not QApplication.instance(): 480 | app = QApplication(sys.argv) 481 | else: 482 | app = QApplication.instance() 483 | web = QWebEngineView() 484 | web.load(QUrl(url)) 485 | 486 | def handle_print_finished(filename, status): 487 | print(f"PDF saved to: {filename}") 488 | app.quit() 489 | 490 | def handle_load_finished(status): 491 | if status: 492 | printer = QPrinter(QPrinter.HighResolution) 493 | printer.setOutputFormat(QPrinter.PdfFormat) 494 | printer.setOutputFileName(output_path) 495 | 496 | page_layout = QPageLayout( 497 | QPageSize(QPageSize.A4), QPageLayout.Portrait, QMarginsF(15, 15, 15, 15) 498 | ) 499 | printer.setPageLayout(page_layout) 500 | 501 | web.page().printToPdf(output_path) 502 | web.page().pdfPrintingFinished.connect(handle_print_finished) 503 | 504 | web.loadFinished.connect(handle_load_finished) 505 | app.exec_() 506 | 507 | return output_path 508 | 509 | 510 | def convert_to_pdf(input_path: str, output_path: str) -> str: 511 | """ 512 | Converts a file or webpage to PDF. 513 | 514 | Args: 515 | input_path (str): The path to the input file or URL. 516 | output_path (str): The path to save the output PDF file. 517 | 518 | Returns: 519 | str: The path to the saved PDF file. 
520 | """ 521 | if input_path.startswith(("http://", "https://")): 522 | return save_webpage_as_pdf(input_path, output_path) 523 | file_type = get_file_type(input_path) 524 | if file_type.startswith("image/"): 525 | img_data = convert_image_to_pdf(input_path) 526 | with open(output_path, "wb") as f: 527 | f.write(img_data) 528 | elif "word" in file_type: 529 | return convert_doc_to_pdf(input_path, os.path.dirname(output_path)) 530 | else: 531 | # Assume it's already a PDF, just copy it 532 | with open(input_path, "rb") as src, open(output_path, "wb") as dst: 533 | dst.write(src.read()) 534 | 535 | return output_path 536 | 537 | 538 | def has_image_in_pdf(path: str): 539 | with open(path, "rb") as fp: 540 | content = fp.read() 541 | return "Image".lower() in list( 542 | map(lambda x: x.strip(), (str(content).lower().split("/"))) 543 | ) 544 | 545 | 546 | def has_hyperlink_in_pdf(path: str): 547 | with open(path, "rb") as fp: 548 | content = fp.read() 549 | # URI tag is used if Links are hidden. 550 | return "URI".lower() in list( 551 | map(lambda x: x.strip(), (str(content).lower().split("/"))) 552 | ) 553 | 554 | 555 | def router(path: str, priority: str = "speed") -> str: 556 | """ 557 | Routes the file path to the appropriate parser based on the file type. 558 | 559 | Args: 560 | path (str): The file path to route. 561 | priority (str): The priority for routing: "accuracy" (preference to LLM_PARSE) or "speed" (preference to STATIC_PARSE). 562 | """ 563 | file_type = get_file_type(path) 564 | if ( 565 | file_type.startswith("text/") 566 | or "spreadsheet" in file_type 567 | or "presentation" in file_type 568 | ): 569 | return "STATIC_PARSE" 570 | 571 | if priority == "accuracy": 572 | # If the file is a PDF without images but has hyperlinks, use STATIC_PARSE 573 | # Otherwise, use LLM_PARSE 574 | has_image = has_image_in_pdf(path) 575 | has_hyperlink = has_hyperlink_in_pdf(path) 576 | if file_type == "application/pdf" and not has_image and has_hyperlink: 577 | logger.debug("Using STATIC_PARSE for PDF with hyperlinks and no images.") 578 | return "STATIC_PARSE" 579 | logger.debug( 580 | f"Using LLM_PARSE because PDF has image ({has_image}) or has no hyperlink ({has_hyperlink})." 581 | ) 582 | return "LLM_PARSE" 583 | else: 584 | # If the file is a PDF without images, use STATIC_PARSE 585 | # Otherwise, use LLM_PARSE 586 | if file_type == "application/pdf" and not has_image_in_pdf(path): 587 | logger.debug("Using STATIC_PARSE for PDF without images.") 588 | return "STATIC_PARSE" 589 | logger.debug("Using LLM_PARSE because PDF has images") 590 | return "LLM_PARSE" 591 | 592 | 593 | def convert_doc_to_pdf(input_path: str, temp_dir: str) -> str: 594 | temp_path = os.path.join( 595 | temp_dir, os.path.splitext(os.path.basename(input_path))[0] + ".pdf" 596 | ) 597 | 598 | # Convert the document to PDF 599 | # docx2pdf is not supported in linux. Use LibreOffice in linux instead. 600 | # May need to install LibreOffice if not already installed. 
601 | if "linux" in sys.platform.lower(): 602 | os.system( 603 | f'lowriter --headless --convert-to pdf --outdir {temp_dir} "{input_path}"' 604 | ) 605 | else: 606 | convert(input_path, temp_path) 607 | 608 | # Return the path of the converted PDF 609 | return temp_path 610 | 611 | 612 | def get_uri_rect(path): 613 | with open(path, "rb") as fp: 614 | byte_str = str(fp.read()) 615 | pattern = r"\((https?://[^\s)]+)\)" 616 | uris = re.findall(pattern, byte_str) 617 | rect_splits = byte_str.split("/Rect [")[1:] 618 | rects = [ 619 | list(map(float, rect_split.split("]")[0].split())) for rect_split in rect_splits 620 | ] 621 | return {uri: rect for uri, rect in zip(uris, rects)} 622 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "lexoid" 3 | version = "0.1.14" 4 | description = "" 5 | authors = [] 6 | readme = "README.md" 7 | 8 | [tool.poetry.dependencies] 9 | python = "^3.10" 10 | google-generativeai = "^0.8.1" 11 | openai = "^1.47.0" 12 | pikepdf = "^9.3.0" 13 | pdfplumber = "^0.11.4" 14 | pandas = "^2.2.3" 15 | tabulate = "^0.9.0" 16 | bs4 = "^0.0.2" 17 | markdownify = "^0.13.1" 18 | opencv-python = "^4.10.0.84" 19 | pypdfium2 = "^4.30.0" 20 | markdown = "^3.7" 21 | python-dotenv = "^1.0.0" 22 | loguru = "^0.7.2" 23 | playwright = "^1.49.0" 24 | docx2pdf = "^0.1.8" 25 | python-docx = "^1.1.2" 26 | nest-asyncio ="^1.6.0" 27 | pyqt5 = {version = "^5.15.11", markers = "platform_system != 'debian'"} 28 | pyqtwebengine = {version = "^5.15.7", markers = "platform_system != 'debian'"} 29 | huggingface-hub = "^0.27.0" 30 | together = "^1.4.0" 31 | openpyxl = "^3.1.5" 32 | pptx2md = "^2.0.6" 33 | 34 | [tool.poetry.group.dev.dependencies] 35 | ipykernel = "^6.29.5" 36 | pytest-asyncio = "^0.23.8" 37 | pytest = "^8.3.2" 38 | 39 | 40 | [tool.poetry.group.docs.dependencies] 41 | sphinx = "^8.1.3" 42 | pydata-sphinx-theme = "^0.16.1" 43 | docutils = "^0.21.2" 44 | 45 | [build-system] 46 | requires = ["poetry-core", "wheel"] 47 | build-backend = "poetry.core.masonry.api" 48 | -------------------------------------------------------------------------------- /tests/api_cost_mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "gemini-2.5-flash-preview-04-17": { 3 | "input": 0.15, 4 | "output": 0.6 5 | }, 6 | "gemini-2.5-pro-preview-03-25": { 7 | "input": 1.25, 8 | "output": 10 9 | }, 10 | "gemini-2.0-flash": { 11 | "input": 0.1, 12 | "output": 0.4 13 | }, 14 | "gemini-2.0-pro-exp": {}, 15 | "gemini-2.0-flash-thinking-exp": {}, 16 | "gemini-2.0-flash-001": { 17 | "input": 0.1, 18 | "output": 0.4 19 | }, 20 | "gemini-1.5-flash-8b": { 21 | "input": 0.0375, 22 | "output": 0.15 23 | }, 24 | "gemini-1.5-flash": { 25 | "input": 0.075, 26 | "output": 0.3 27 | }, 28 | "gemini-1.5-pro": { 29 | "input": 1.25, 30 | "output": 5 31 | }, 32 | "gpt-4o": { 33 | "input": 2.5, 34 | "output": 10 35 | }, 36 | "gpt-4o-mini": { 37 | "input": 0.15, 38 | "output": 0.6 39 | }, 40 | "meta-llama/Llama-3.2-11B-Vision-Instruct": { 41 | "input": 0, 42 | "output": 0 43 | }, 44 | "meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo": { 45 | "input": 0.18, 46 | "output": 0.18 47 | }, 48 | "meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo": { 49 | "input": 1.2, 50 | "output": 1.2 51 | }, 52 | "meta-llama/Llama-Vision-Free": { 53 | "input": 0, 54 | "output": 0 55 | }, 56 | "google/gemma-3-27b-it": { 57 | "input": 0.1, 58 | "input-image": 
0.0000256, 59 | "output": 0.2 60 | }, 61 | "qwen/qwen-2.5-vl-7b-instruct": { 62 | "input": 0.2, 63 | "input-image": 0.0001445, 64 | "output": 0.2 65 | }, 66 | "microsoft/phi-4-multimodal-instruct": { 67 | "input": 0.05, 68 | "input-image": 0.0001769, 69 | "output": 0.1 70 | }, 71 | "accounts/fireworks/models/llama4-maverick-instruct-basic": { 72 | "input": 0.22, 73 | "output": 0.88 74 | }, 75 | "accounts/fireworks/models/llama4-scout-instruct-basic": { 76 | "input": 0.15, 77 | "output": 0.6 78 | } 79 | } -------------------------------------------------------------------------------- /tests/benchmark.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from dataclasses import dataclass 4 | from glob import glob 5 | from pathlib import Path 6 | from statistics import mean, stdev 7 | from typing import Dict, List, Optional, Tuple 8 | 9 | import pandas as pd 10 | from dotenv import load_dotenv 11 | 12 | from lexoid.api import parse 13 | from lexoid.core.utils import calculate_similarity 14 | 15 | load_dotenv() 16 | 17 | 18 | @dataclass 19 | class BenchmarkResult: 20 | config: Dict 21 | similarity: List[float] # Store all similarity scores for iterations 22 | execution_time: List[float] # Store all execution times for iterations 23 | cost: Optional[List[float]] = None 24 | error: Optional[str] = None 25 | 26 | 27 | def get_input_output_pairs(input_path: str, output_dir: str) -> List[Tuple[str, str]]: 28 | """Get matching input and ground truth file pairs.""" 29 | if os.path.isfile(input_path): 30 | # Single file mode 31 | base_name = Path(input_path).stem 32 | ground_truth_path = os.path.join(output_dir, f"{base_name}.md") 33 | if os.path.exists(ground_truth_path): 34 | return [(input_path, ground_truth_path)] 35 | return [] 36 | 37 | # Directory mode 38 | input_files = sorted(glob(os.path.join(input_path, "*"))) 39 | pairs = [] 40 | 41 | for input_file in input_files: 42 | base_name = Path(input_file).stem 43 | ground_truth_path = os.path.join(output_dir, f"{base_name}.md") 44 | 45 | if os.path.exists(ground_truth_path): 46 | pairs.append((input_file, ground_truth_path)) 47 | 48 | return pairs 49 | 50 | 51 | def run_benchmark_config( 52 | input_path: str, 53 | ground_truth: str, 54 | config: Dict, 55 | output_save_dir: str = None, 56 | iterations: int = 1, 57 | ) -> BenchmarkResult: 58 | """Run a single benchmark configuration for a specified number of iterations.""" 59 | similarities = [] 60 | execution_times = [] 61 | costs = [] 62 | error = None 63 | 64 | for _ in range(iterations): 65 | try: 66 | start_time = time.time() 67 | config["parser_type"] = config.get( 68 | "parser_type", 69 | ( 70 | "LLM_PARSE" 71 | if "model" in config 72 | else ("STATIC_PARSE" if "framework" in config else "AUTO") 73 | ), 74 | ) 75 | result = parse( 76 | input_path, 77 | pages_per_split=1, 78 | api_cost_mapping="tests/api_cost_mapping.json", 79 | **config, 80 | ) 81 | execution_time = time.time() - start_time 82 | 83 | if output_save_dir: 84 | filename = ( 85 | f"{Path(input_path).stem}_" 86 | + ", ".join( 87 | [ 88 | f"{key}={str(value).replace('/', '_')}" 89 | for key, value in config.items() 90 | ] 91 | ) 92 | + f"{int(start_time)}.md" 93 | ) 94 | with open(os.path.join(output_save_dir, filename), "w") as fp: 95 | fp.write(result["raw"]) 96 | 97 | similarity = calculate_similarity(result["raw"], ground_truth) 98 | similarities.append(similarity) 99 | execution_times.append(execution_time) 100 | costs.append( 101 | 
result["token_cost"]["output"] if "token_cost" in result else 0.0 102 | ) 103 | except Exception as e: 104 | print(f"Error running benchmark for config: {config}\n{e}") 105 | error = str(e) 106 | break # Stop further iterations if an error occurs 107 | 108 | return BenchmarkResult( 109 | config=config, 110 | similarity=similarities, 111 | execution_time=execution_times, 112 | cost=costs, 113 | error=error, 114 | ) 115 | 116 | 117 | def aggregate_results(results: List[BenchmarkResult]) -> BenchmarkResult: 118 | """Aggregate multiple benchmark results into a single result.""" 119 | if not results: 120 | return None 121 | 122 | valid_results = [r for r in results if r.error is None] 123 | if valid_results: 124 | all_similarities = [s for r in valid_results for s in r.similarity] 125 | all_execution_times = [t for r in valid_results for t in r.execution_time] 126 | all_costs = [c for r in valid_results for c in r.cost] 127 | avg_similarity = mean(all_similarities) 128 | std_similarity = stdev(all_similarities) if len(all_similarities) > 1 else 0.0 129 | avg_execution_time = mean(all_execution_times) 130 | avg_cost = mean(all_costs) 131 | error = ( 132 | None 133 | if len(valid_results) == len(results) 134 | else f"Failed: {len(results) - len(valid_results)}/{len(results)}" 135 | ) 136 | else: 137 | avg_similarity = 0.0 138 | std_similarity = 0.0 139 | avg_execution_time = 0.0 140 | avg_cost = 0.0 141 | error = f"Failed: {len(results)}/{len(results)}" 142 | 143 | return BenchmarkResult( 144 | config=results[0].config, 145 | similarity=[avg_similarity, std_similarity], # Store mean and std dev 146 | execution_time=[avg_execution_time], 147 | cost=[avg_cost], 148 | error=error, 149 | ) 150 | 151 | 152 | def generate_test_configs(input_path: str, test_attributes: List[str]) -> List[Dict]: 153 | """ 154 | Generate different configuration combinations to test based on specified attributes. 
155 | """ 156 | config_options = { 157 | "parser_type": ["LLM_PARSE", "STATIC_PARSE", "AUTO"], 158 | "model": [ 159 | # # Google models 160 | "gemini-2.5-flash-preview-04-17", 161 | # "gemini-2.5-pro-preview-03-25", 162 | # "gemini-2.0-pro-exp", 163 | "gemini-2.0-flash", 164 | # "gemini-2.0-flash-thinking-exp", 165 | # "gemini-2.0-flash-001", 166 | # "gemini-1.5-flash-8b", 167 | # "gemini-1.5-flash", 168 | # "gemini-1.5-pro", 169 | # # OpenAI models 170 | "gpt-4o", 171 | "gpt-4o-mini", 172 | # # Meta-LLAMA models through HF Hub 173 | # "meta-llama/Llama-3.2-11B-Vision-Instruct", 174 | # # Meta-LLAMA models through Together AI 175 | # "meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo", 176 | "meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo", 177 | # "meta-llama/Llama-Vision-Free", 178 | # # Model through OpenRouter 179 | "google/gemma-3-27b-it", 180 | "qwen/qwen-2.5-vl-7b-instruct", 181 | # "microsoft/phi-4-multimodal-instruct", 182 | # # Model through fireworks 183 | "accounts/fireworks/models/llama4-maverick-instruct-basic", 184 | # "accounts/fireworks/models/llama4-scout-instruct-basic", 185 | ], 186 | "framework": ["pdfminer", "pdfplumber"], 187 | "pages_per_split": [1, 2, 4, 8], 188 | "max_threads": [1, 2, 4, 8], 189 | "as_pdf": [True, False], 190 | "temperature": [0.2, 0.7], 191 | } 192 | 193 | # Only test as_pdf if input is not a PDF 194 | is_pdf = input_path.lower().endswith(".pdf") 195 | if is_pdf and "as_pdf" in test_attributes: 196 | test_attributes.remove("as_pdf") 197 | 198 | configs = [{}] 199 | 200 | for attr in test_attributes: 201 | new_configs = [] 202 | for config in configs: 203 | if attr == "parser_type" or attr == "temperature": 204 | for value in config_options[attr]: 205 | new_config = config.copy() 206 | new_config[attr] = value 207 | new_configs.append(new_config) 208 | elif attr == "model" and ( 209 | "parser_type" not in config or config.get("parser_type") == "LLM_PARSE" 210 | ): 211 | for value in config_options[attr]: 212 | new_config = config.copy() 213 | new_config[attr] = value 214 | new_configs.append(new_config) 215 | elif attr == "framework" and ( 216 | "parser_type" not in config 217 | or config.get("parser_type") == "STATIC_PARSE" 218 | ): 219 | for value in config_options[attr]: 220 | new_config = config.copy() 221 | new_config[attr] = value 222 | new_configs.append(new_config) 223 | elif attr in ("pages_per_split", "max_threads"): 224 | for value in config_options[attr]: 225 | new_config = config.copy() 226 | new_config[attr] = value 227 | new_configs.append(new_config) 228 | elif attr == "as_pdf" and not is_pdf: 229 | for value in config_options[attr]: 230 | new_config = config.copy() 231 | new_config[attr] = value 232 | new_configs.append(new_config) 233 | else: 234 | new_configs.append(config) 235 | configs = new_configs 236 | 237 | return configs 238 | 239 | 240 | def format_results(results: List[BenchmarkResult], test_attributes: List[str]) -> str: 241 | """Format benchmark results as a markdown table, including only tested attributes.""" 242 | sorted_results = sorted(results, key=lambda x: x.similarity[0], reverse=True) 243 | 244 | # Dynamically generate table headers based on test_attributes 245 | headers = ["Rank"] 246 | for attr in test_attributes: 247 | headers.append(attr.replace("_", " ").title()) 248 | headers.extend(["Mean Similarity", "Std. 
Dev.", "Time (s)", "Cost ($)", "Error"]) 249 | 250 | md_lines = [ 251 | "# Parser Benchmark Results\n", 252 | "| " + " | ".join(headers) + " |", 253 | "|" + "|".join(["---"] * len(headers)) + "|", 254 | ] 255 | 256 | for i, result in enumerate(sorted_results, 1): 257 | config = result.config 258 | error_msg = result.error if result.error else "-" 259 | 260 | row = [str(i)] 261 | for attr in test_attributes: 262 | row.append(str(config.get(attr, "-"))) 263 | row.extend( 264 | [ 265 | f"{result.similarity[0]:.3f}", 266 | f"{result.similarity[1]:.3f}", 267 | f"{result.execution_time[0]:.2f}", 268 | f"{result.cost[0]}", 269 | error_msg, 270 | ] 271 | ) 272 | md_lines.append("| " + " | ".join(row) + " |") 273 | 274 | return "\n".join(md_lines) 275 | 276 | 277 | def run_benchmarks( 278 | input_path: str, 279 | output_dir: str, 280 | test_attributes: List[str], 281 | benchmark_output_dir: str, 282 | iterations: int = 3, 283 | ) -> List[BenchmarkResult]: 284 | """Run all benchmarks for given input(s) and return results.""" 285 | # Get input/output file pairs 286 | file_pairs = get_input_output_pairs(input_path, output_dir) 287 | if not file_pairs: 288 | print("No matching input/output file pairs found!") 289 | return [] 290 | 291 | # Generate test configurations based on first input file 292 | configs = generate_test_configs(file_pairs[0][0], test_attributes) 293 | 294 | # Run benchmarks 295 | results = [] 296 | total_configs = len(configs) 297 | total_files = len(file_pairs) 298 | 299 | print( 300 | f"Running {total_configs} configurations across {total_files} file(s) for {iterations} iterations..." 301 | ) 302 | 303 | all_results = [] 304 | for i, config in enumerate(configs, 1): 305 | print(f"Progress: {i}/{total_configs} - Testing config: {config}") 306 | 307 | # Run benchmark for each file 308 | file_results = [] 309 | for input_file, ground_truth_path in file_pairs: 310 | print(f"Running benchmark for file: {input_file}") 311 | with open(ground_truth_path, "r", encoding="utf-8") as f: 312 | ground_truth = f.read() 313 | result = run_benchmark_config( 314 | input_file, ground_truth, config, benchmark_output_dir, iterations 315 | ) 316 | file_results.append(result) 317 | all_results.append((input_file, result)) 318 | 319 | result = aggregate_results(file_results) 320 | 321 | results.append(result) 322 | 323 | # Format and save results 324 | save_format = "csv" 325 | if save_format == "markdown": 326 | markdown_report = format_results(results, test_attributes) 327 | result_path = os.path.join(benchmark_output_dir, "results.md") 328 | with open(result_path, "w", encoding="utf-8") as f: 329 | f.write(markdown_report) 330 | elif save_format == "csv": 331 | df = pd.DataFrame( 332 | [ 333 | { 334 | "Model": result.config.get("model", "-"), 335 | "Mean Similarity": result.similarity[0], 336 | "Std. Dev.": result.similarity[1], 337 | "Time (s)": result.execution_time[0], 338 | "Cost($)": result.cost[0], 339 | } 340 | for result in results 341 | ] 342 | ) 343 | result_path = os.path.join(benchmark_output_dir, "results.csv") 344 | df.to_csv(result_path, index=False) 345 | 346 | print(f"\nBenchmark complete! 
Results saved to {result_path}") 347 | 348 | # Save document-wise results to CSV 349 | doc_results = [] 350 | for input_file, result in all_results: 351 | doc_result = { 352 | "Input File": os.path.basename(input_file), 353 | "Mean Similarity": result.similarity[0], 354 | "Time (s)": result.execution_time[0], 355 | "Cost($)": result.cost[0], 356 | } 357 | for key, value in result.config.items(): 358 | doc_result[key] = value 359 | doc_results.append(doc_result) 360 | doc_df = pd.DataFrame(doc_results) 361 | doc_result_path = os.path.join(benchmark_output_dir, "document_results.csv") 362 | doc_df.to_csv(doc_result_path, index=False) 363 | print(f"Document-wise results saved to {doc_result_path}") 364 | 365 | return results 366 | 367 | 368 | def main(): 369 | # Can be either a single file or directory 370 | input_path = "examples/inputs" 371 | output_dir = "examples/outputs" 372 | 373 | benchmark_output_dir = f"tests/outputs/benchmark_{int(time.time())}/" 374 | os.makedirs(benchmark_output_dir, exist_ok=True) 375 | 376 | # Specify which attributes to test 377 | test_attributes = [ 378 | # "parser_type", 379 | "model", 380 | # "framework", 381 | # "pages_per_split", 382 | # "max_threads", 383 | # "as_pdf", 384 | # "temperature", 385 | ] 386 | 387 | # Number of iterations for each benchmark 388 | iterations = 5 389 | 390 | results = run_benchmarks( 391 | input_path, output_dir, test_attributes, benchmark_output_dir, iterations 392 | ) 393 | 394 | # Print top 3 configurations 395 | top_results = sorted(results, key=lambda x: x.similarity[0], reverse=True)[:3] 396 | print("\nTop 3 Configurations:") 397 | for i, result in enumerate(top_results, 1): 398 | print( 399 | f"{i}. Similarity: {result.similarity[0]:.3f} (±{result.similarity[1]:.3f}), Time: {result.execution_time[0]:.2f}s" 400 | ) 401 | print(f" Config: {result.config}") 402 | 403 | 404 | if __name__ == "__main__": 405 | main() 406 | -------------------------------------------------------------------------------- /tests/env_template: -------------------------------------------------------------------------------- 1 | GOOGLE_API_KEY = "" 2 | OPENAI_API_KEY = "" 3 | HUGGINGFACEHUB_API_TOKEN = "" 4 | TOGETHER_API_KEY = "" -------------------------------------------------------------------------------- /tests/test_parser.py: -------------------------------------------------------------------------------- 1 | # python3 -m pytest tests/test_parser.py -v 2 | # With logs: python3 -m pytest tests/test_parser.py -v -s 3 | 4 | import os 5 | 6 | import pytest 7 | from dotenv import load_dotenv 8 | from loguru import logger 9 | 10 | from lexoid.api import parse 11 | from lexoid.core.utils import calculate_similarity 12 | 13 | load_dotenv() 14 | output_dir = "tests/outputs" 15 | os.makedirs(output_dir, exist_ok=True) 16 | models = [ 17 | # Google models 18 | "gemini-2.0-pro-exp", 19 | "gemini-2.0-flash", 20 | "gemini-1.5-flash", 21 | "gemini-1.5-flash-8b", 22 | "gemini-1.5-pro", 23 | # OpenAI models 24 | "gpt-4o", 25 | "gpt-4o-mini", 26 | # Meta-LLAMA models through HF Hub 27 | "meta-llama/Llama-3.2-11B-Vision-Instruct", 28 | # Meta-LLAMA models through Together AI 29 | "meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo", 30 | "meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo", 31 | "meta-llama/Llama-Vision-Free", 32 | ] 33 | 34 | 35 | @pytest.mark.asyncio 36 | @pytest.mark.parametrize("model", models) 37 | async def test_llm_parse(model): 38 | input_data = "examples/inputs/test_1.pdf" 39 | expected_ouput_path = "examples/outputs/test_1.md" 40 | config = 
{"parser_type": "LLM_PARSE", "model": model, "verbose": True} 41 | result = parse(input_data, **config)["raw"] 42 | assert isinstance(result, str) 43 | 44 | # Compare the result with the expected output 45 | expected_ouput = open(expected_ouput_path, "r").read() 46 | # save the result to a file 47 | with open(f"{output_dir}/input_table_{model.replace('/', '_')}.md", "w") as f: 48 | f.write(result) 49 | score = calculate_similarity(result, expected_ouput) 50 | assert round(score, 3) > 0.75 51 | 52 | 53 | @pytest.mark.asyncio 54 | @pytest.mark.parametrize("model", models) 55 | async def test_jpg_parse(model): 56 | input_data = "examples/inputs/test_4.jpg" 57 | expected_ouput_path = "examples/outputs/test_4.md" 58 | config = {"parser_type": "LLM_PARSE", "model": model} 59 | result = parse(input_data, **config)["raw"] 60 | assert isinstance(result, str) 61 | 62 | # Compare the result with the expected output 63 | expected_ouput = open(expected_ouput_path, "r").read() 64 | # save the result to a file 65 | m_name = model.replace("/", "_") 66 | with open(f"{output_dir}/input_image_{m_name}.md", "w") as f: 67 | f.write(result) 68 | score = calculate_similarity(result, expected_ouput) 69 | assert round(score, 3) > 0.8 70 | 71 | 72 | @pytest.mark.asyncio 73 | @pytest.mark.parametrize( 74 | "sample", 75 | [ 76 | "examples/inputs/test_explicit_hyperlink_n_img.pdf", 77 | "examples/inputs/test_hidden_link_with_image.pdf", # currently fails 78 | "examples/inputs/test_with_hidden_links_no_img.pdf", 79 | ], 80 | ) 81 | async def test_url_detection_auto_routing(sample): 82 | patterns = ["http", "https", "www"] 83 | model_type = "gemini-1.5-pro" 84 | config = {"parser_type": "AUTO", "model": model_type, "verbose": True} 85 | result = parse(sample, **config)["raw"] 86 | assert isinstance(result, str) 87 | found = [True if p in result else False for p in patterns] 88 | assert any(found) 89 | 90 | 91 | @pytest.mark.asyncio 92 | @pytest.mark.parametrize( 93 | "sample", 94 | [ 95 | "examples/inputs/test_explicit_hyperlink_n_img.pdf", 96 | "examples/inputs/test_hidden_link_with_image.pdf", 97 | "examples/inputs/test_with_hidden_links_no_img.pdf", 98 | ], 99 | ) 100 | async def test_url_detection_pdfplumber(sample): 101 | patterns = ["http", "https", "www"] 102 | framework = "pdfplumber" 103 | config = {"parser_type": "STATIC_PARSE", "framework": framework} 104 | result = parse(sample, **config)["raw"] 105 | assert isinstance(result, str) 106 | found = [True if p in result else False for p in patterns] 107 | assert any(found) 108 | 109 | 110 | @pytest.mark.parametrize("model", models) 111 | @pytest.mark.asyncio 112 | async def test_url_detection_multi_page_auto_routing(model): 113 | sample = "examples/inputs/sample_test_doc.pdf" 114 | patterns = ["http", "https", "www"] 115 | config = {"parser_type": "AUTO", "model": model, "verbose": True} 116 | results = parse(sample, pages_per_split=1, **config)["segments"] 117 | 118 | assert len(results) == 6 119 | for res in results: 120 | content = res["content"] 121 | if res["metadata"]["page"] == 1: 122 | # Page 1: Fails to detect the URL 123 | found = [p in content for p in patterns] 124 | assert not any(found) 125 | elif res["metadata"]["page"] == 2: 126 | # Page 2: Detects the URL 127 | found = [p in content for p in patterns] 128 | assert any(found) 129 | elif res["metadata"]["page"] == 3: 130 | # Page 3: Does not contain any URL 131 | found = [p in content for p in patterns] 132 | assert not any(found) 133 | elif res["metadata"]["page"] == 4: 134 | # Page 4: Detects 
the URL 135 | found = [p in content for p in patterns] 136 | assert any(found) 137 | elif res["metadata"]["page"] == 5: 138 | # Page 5: Detects all the URLs 139 | found = [p in content for p in patterns] 140 | assert all(found) 141 | elif res["metadata"]["page"] == 6: 142 | # Page 6: Detects the URL 143 | found = "https://github" in content 144 | assert found 145 | 146 | 147 | @pytest.mark.asyncio 148 | @pytest.mark.parametrize("depth", [1, 2]) 149 | async def test_recursive_url_parsing(depth): 150 | results = parse("https://example.com/", depth=depth)["segments"] 151 | 152 | # Not necessarily always the case. Just the case for "example.com". 153 | assert len(results) == depth 154 | 155 | 156 | @pytest.mark.asyncio 157 | async def test_recursive_url_parsing_in_pdf(): 158 | sample = "examples/inputs/sample_test_doc.pdf" 159 | parser_type = "AUTO" 160 | results = parse(sample, parser_type, pages_per_split=1, depth=2) 161 | assert len(results["recursive_docs"]) >= 7, results 162 | 163 | 164 | @pytest.mark.asyncio 165 | async def test_parsing_txt_type(): 166 | sample = "examples/inputs/sample_test.txt" 167 | parser_type = "AUTO" 168 | results = parse(sample, parser_type)["segments"] 169 | assert len(results) == 1 170 | assert results[0]["content"] is not None 171 | 172 | 173 | @pytest.mark.asyncio 174 | async def test_parsing_docx_type(): 175 | sample = "examples/inputs/sample.docx" 176 | parser_type = "STATIC_PARSE" 177 | results = parse(sample, parser_type)["segments"] 178 | assert len(results) >= 1 179 | assert results[0]["content"] is not None 180 | 181 | parser_type = "LLM_PARSE" 182 | results = parse(sample, parser_type)["segments"] 183 | assert len(results) > 1 184 | assert results[0]["content"] is not None 185 | 186 | 187 | @pytest.mark.asyncio 188 | async def test_parsing_xlsx_type(): 189 | sample = "examples/inputs/sample.xlsx" 190 | parser_type = "STATIC_PARSE" 191 | results = parse(sample, parser_type)["segments"] 192 | assert len(results) >= 1 193 | assert results[0]["content"] is not None 194 | 195 | 196 | @pytest.mark.asyncio 197 | async def test_parsing_pptx_type(): 198 | sample = "examples/inputs/sample.pptx" 199 | parser_type = "STATIC_PARSE" 200 | results = parse(sample, parser_type)["segments"] 201 | assert len(results) >= 1 202 | assert results[0]["content"] is not None 203 | 204 | 205 | @pytest.mark.asyncio 206 | async def test_dynamic_js_parsing(): 207 | test_url = "https://go.contentsquare.com/ab-testing-playbook" 208 | results = parse(test_url, parser_type="AUTO")["raw"] 209 | # Check if the content contains the expected information 210 | should_contain_info = "6 Types of experimentation" 211 | assert should_contain_info.lower() in results.strip().lower() 212 | 213 | 214 | @pytest.mark.asyncio 215 | async def test_pdfplumber_table_parsing(): 216 | sample = "examples/inputs/test_1.pdf" 217 | parser_type = "STATIC_PARSE" 218 | results = parse(sample, parser_type, framework="pdfplumber")["raw"] 219 | assert [token in results for token in ["|", "Results", "Accuracy"]] 220 | 221 | 222 | @pytest.mark.asyncio 223 | @pytest.mark.parametrize( 224 | "sample", 225 | [ 226 | ("examples/inputs/stress_test/large_doc_1.pdf", 527), 227 | ("examples/inputs/stress_test/large_doc_2.pdf", 117), 228 | ], 229 | ) 230 | async def test_large_pdf_parsing(sample): 231 | parser_type = "AUTO" 232 | file_name = sample[0] 233 | n_pages = sample[1] 234 | results = parse(file_name, parser_type, pages_per_split=1)["segments"] 235 | assert len(results) == n_pages 236 | assert results[0]["content"] is 
not None 237 | 238 | 239 | token_usage_models = [ 240 | # Google models 241 | "gemini-2.0-flash-001", 242 | # OpenAI models 243 | "gpt-4o", 244 | # Meta-LLAMA models through HF Hub 245 | "meta-llama/Llama-3.2-11B-Vision-Instruct", 246 | # Meta-LLAMA models through Together AI 247 | "meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo", 248 | ] 249 | 250 | 251 | @pytest.mark.parametrize("model", token_usage_models) 252 | @pytest.mark.asyncio 253 | async def test_token_usage_api(model): 254 | sample = "examples/inputs/test_1.pdf" 255 | parser_type = "LLM_PARSE" 256 | config = {"parser_type": parser_type, "model": model} 257 | token_usage = parse(sample, **config)["token_usage"] 258 | logger.info(f"Token usage: {token_usage}") 259 | assert token_usage["input"] > 0 260 | assert token_usage["output"] > 0 261 | assert token_usage["total"] > 0 262 | 263 | 264 | @pytest.mark.asyncio 265 | async def test_pdf_save_path(): 266 | sample = "https://example.com/" 267 | parser_type = "LLM_PARSE" 268 | result = parse( 269 | sample, 270 | parser_type, 271 | as_pdf=True, 272 | save_dir="tests/outputs/temp", 273 | save_filename="test_output.pdf", 274 | ) 275 | assert "pdf_path" in result 276 | assert result["pdf_path"].endswith(".pdf") 277 | assert os.path.exists(result["pdf_path"]) 278 | 279 | # Clean up 280 | os.remove(result["pdf_path"]) 281 | os.rmdir("tests/outputs/temp") 282 | 283 | 284 | @pytest.mark.asyncio 285 | async def test_page_nums(): 286 | sample = "examples/inputs/sample_test_doc.pdf" 287 | result = parse(sample, "LLM_PARSE", page_nums=(3, 4), pages_per_split=1) 288 | assert len(result["segments"]) == 2 289 | assert all(keyword in result["raw"] for keyword in ["Table 24", "apple"]) 290 | assert all(keyword not in result["raw"] for keyword in ["Aenean", "Lexoid"]) 291 | 292 | result = parse(sample, "LLM_PARSE", page_nums=(3, 3), pages_per_split=1) 293 | assert len(result["segments"]) == 1 294 | assert "Table 24" in result["raw"] 295 | 296 | sample = "https://www.dca.ca.gov/acp/pdf_files/lemonlaw_qa.pdf" 297 | result = parse(sample, "STATIC_PARSE", page_nums=2, pages_per_split=1) 298 | assert len(result["segments"]) == 1 299 | assert "ATTEMPTS" in result["raw"] 300 | assert "acp@dca.ca.gov" not in result["raw"] 301 | 302 | 303 | @pytest.mark.parametrize( 304 | "model", 305 | [ 306 | "gemini-2.0-flash", 307 | "gpt-4o", 308 | "meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo", 309 | ], 310 | ) 311 | @pytest.mark.asyncio 312 | async def test_token_cost(model): 313 | sample = "examples/inputs/test_1.pdf" 314 | parser_type = "LLM_PARSE" 315 | api_cost_path = os.path.join(os.path.dirname(__file__), "api_cost_mapping.json") 316 | config = { 317 | "parser_type": parser_type, 318 | "model": model, 319 | "api_cost_mapping": api_cost_path, 320 | } 321 | result = parse(sample, **config) 322 | assert "token_cost" in result 323 | assert result["token_cost"]["input"] > 0 324 | assert result["token_cost"]["output"] > 0 325 | assert result["token_cost"]["total"] > 0 326 | 327 | 328 | @pytest.mark.asyncio 329 | async def test_blockquote(): 330 | sample = "examples/inputs/bench_md.pdf" 331 | parser_type = "STATIC_PARSE" 332 | results = parse(sample, parser_type, framework="pdfplumber")["raw"] 333 | # Assert that there is at least one fenced code block 334 | assert " " * 3 in results 335 | 336 | 337 | @pytest.mark.asyncio 338 | async def test_monospace_code_block(): 339 | sample = "examples/inputs/bench_md.pdf" 340 | parser_type = "STATIC_PARSE" 341 | results = parse(sample, parser_type, framework="pdfplumber")["raw"] 
342 | # Assert that there is at least one fenced code block 343 | assert "```" in results 344 | 345 | 346 | @pytest.mark.asyncio 347 | async def test_pdf_headings(): 348 | sample_path = "examples/inputs/bench_md.pdf" 349 | parser_type = "STATIC_PARSE" 350 | results = parse(sample_path, parser_type, framework="pdfplumber")["raw"] 351 | 352 | # Test for h1 (should have # in markdown) 353 | assert "#" in results 354 | assert "##" in results 355 | 356 | 357 | @pytest.mark.asyncio 358 | async def test_email_address(): 359 | sample = "examples/inputs/bench_md.pdf" 360 | parser_type = "STATIC_PARSE" 361 | results = parse(sample, parser_type, framework="pdfplumber")["raw"] 362 | assert "" in results 363 | 364 | 365 | @pytest.mark.asyncio 366 | async def test_horizontal_lines(): 367 | sample = "examples/inputs/bench_md.pdf" 368 | parser_type = "STATIC_PARSE" 369 | results = parse(sample, parser_type, framework="pdfplumber")["raw"] 370 | assert "\n---\n" in results, "Markdown horizontal rule not found" 371 | 372 | 373 | @pytest.mark.asyncio 374 | async def test_strikethrough_words(): 375 | sample = "examples/inputs/bench_md.pdf" 376 | parser_type = "STATIC_PARSE" 377 | results = parse(sample, parser_type, framework="pdfplumber")["raw"] 378 | assert "~~" in results, "Markdown strikethrough text not found" 379 | --------------------------------------------------------------------------------