├── .github ├── dependabot.yml └── workflows │ ├── ci.yml │ └── create_issue.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CHANGELOG.md ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.md ├── examples └── ocr │ ├── engine.py │ ├── output │ └── .gitignore │ ├── requirements.txt │ └── validate_ocr_performance.py ├── img └── unstructured_logo.png ├── logger_config.yaml ├── pyproject.toml ├── pytest.ini ├── requirements ├── base.in ├── base.txt ├── constraints.in ├── dev.in ├── dev.txt ├── test.in └── test.txt ├── sample-docs ├── 2023-Jan-economic-outlook.pdf ├── IRS-form-1987.pdf ├── RGBA_image.png ├── Silent-Giant.pdf ├── design-thinking.pdf ├── easy_table.jpg ├── embedded-images.pdf ├── empty-document.pdf ├── example_table.jpg ├── ilpa-example-1.jpg ├── layout-parser-paper-fast.jpg ├── layout-parser-paper-fast.pdf ├── layout-parser-paper.pdf ├── loremipsum-flat.pdf ├── loremipsum.jpg ├── loremipsum.pdf ├── loremipsum.png ├── loremipsum.tiff ├── loremipsum_multipage.pdf ├── non-embedded.pdf ├── password.pdf ├── patent-1p.pdf ├── patent.pdf ├── pdf2image-memory-error-test-400p.pdf ├── recalibrating-risk-report.pdf ├── receipt-sample.jpg ├── table-multi-row-column-cells.png └── test-image.jpg ├── scripts ├── docker-build.sh ├── shellcheck.sh ├── test-unstructured-ingest-helper.sh └── version-sync.sh ├── setup.cfg ├── setup.py ├── test_unstructured_inference ├── conftest.py ├── inference │ ├── test_layout.py │ └── test_layout_element.py ├── models │ ├── test_detectron2onnx.py │ ├── test_eval.py │ ├── test_model.py │ ├── test_tables.py │ └── test_yolox.py ├── test_config.py ├── test_elements.py ├── test_logger.py ├── test_math.py ├── test_utils.py └── test_visualization.py └── unstructured_inference ├── __init__.py ├── __version__.py ├── config.py ├── constants.py ├── inference ├── __init__.py ├── elements.py ├── layout.py └── layoutelement.py ├── logger.py ├── math.py ├── models ├── __init__.py ├── base.py ├── detectron2onnx.py ├── eval.py ├── table_postprocess.py ├── tables.py ├── unstructuredmodel.py └── yolox.py ├── utils.py └── visualize.py /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "pip" 4 | directory: "/requirements" 5 | schedule: 6 | interval: "monthly" 7 | 8 | - package-ecosystem: "github-actions" 9 | # NOTE(robinson) - Workflow files stored in the 10 | # default location of `.github/workflows` 11 | directory: "/" 12 | schedule: 13 | interval: "monthly" 14 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [ main, robinson/initial-repo-setup ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | jobs: 10 | setup: 11 | strategy: 12 | matrix: 13 | python-version: ["3.10","3.11", "3.12"] 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: actions/checkout@v4 17 | - uses: actions/cache@v4 18 | id: virtualenv-cache 19 | with: 20 | path: | 21 | .venv 22 | key: ${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('requirements/*.txt') }} 23 | lookup-only: true 24 | - name: Set up Python ${{ matrix.python-version }} 25 | uses: actions/setup-python@v5 26 | with: 27 | python-version: ${{ matrix.python-version }} 28 | - name: Install Poppler 29 | run: | 30 | sudo apt-get update 31 | sudo apt-get -y install poppler-utils 32 | - name: Setup virtual environment (no cache 
hit) 33 | if: steps.virtualenv-cache.outputs.cache-hit != 'true' 34 | run: | 35 | python${{ matrix.python-version }} -m venv .venv 36 | source .venv/bin/activate 37 | make install-ci 38 | 39 | lint: 40 | strategy: 41 | matrix: 42 | python-version: ["3.10","3.11", "3.12"] 43 | runs-on: ubuntu-latest 44 | needs: setup 45 | steps: 46 | - uses: actions/checkout@v4 47 | - uses: actions/cache/restore@v4 48 | id: virtualenv-cache 49 | with: 50 | path: .venv 51 | key: ${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('requirements/*.txt') }} 52 | # NOTE(robinson) - This is a fallback in case the lint job does not find the cache. 53 | # We can take this out when we implement the fix in CORE-99 54 | - name: Setup virtual environment (no cache hit) 55 | if: steps.virtualenv-cache.outputs.cache-hit != 'true' 56 | run: | 57 | python${{ matrix.python-version }} -m venv .venv 58 | - name: Lint 59 | run: | 60 | source .venv/bin/activate 61 | make install-ci 62 | make check 63 | 64 | shellcheck: 65 | runs-on: ubuntu-latest 66 | steps: 67 | - uses: actions/checkout@v4 68 | - name: ShellCheck 69 | uses: ludeeus/action-shellcheck@master 70 | 71 | test: 72 | strategy: 73 | matrix: 74 | python-version: ["3.10","3.11", "3.12"] 75 | runs-on: ubuntu-latest 76 | needs: [setup, lint] 77 | steps: 78 | - uses: actions/checkout@v4 79 | - uses: actions/cache/restore@v4 80 | id: virtualenv-cache 81 | with: 82 | path: | 83 | .venv 84 | key: ${{ runner.os }}-${{ matrix.python-version }}-${{ hashFiles('requirements/*.txt') }} 85 | # NOTE(robinson) - This is a fallback in case the lint job does not find the cache. 86 | # We can take this out when we implement the fix in CORE-99 87 | - name: Setup virtual environment (no cache hit) 88 | if: steps.virtualenv-cache.outputs.cache-hit != 'true' 89 | run: | 90 | python${{ matrix.python-version }} -m venv .venv 91 | - name: Install Poppler 92 | run: | 93 | sudo apt-get update 94 | sudo apt-get -y install poppler-utils tesseract-ocr 95 | - name: Configure AWS credentials 96 | uses: aws-actions/configure-aws-credentials@v4 97 | with: 98 | aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }} 99 | aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 100 | aws-region: us-east-2 101 | - name: Test 102 | env: 103 | UNSTRUCTURED_HF_TOKEN: ${{ secrets.HF_TOKEN }} 104 | run: | 105 | source .venv/bin/activate 106 | make install-ci 107 | aws s3 cp s3://utic-dev-models/ci_test_model/test_ci_model.onnx test_unstructured_inference/models/ 108 | CI=true make test 109 | make check-coverage 110 | 111 | # NOTE(robinson) - disabling ingest tests for now, as of 5/22/2024 they seem to have been 112 | # broken for the past six months 113 | # test_ingest: 114 | # strategy: 115 | # matrix: 116 | # python-version: ["3.9","3.10"] 117 | # runs-on: ubuntu-latest 118 | # env: 119 | # NLTK_DATA: ${{ github.workspace }}/nltk_data 120 | # needs: lint 121 | # steps: 122 | # - name: Checkout unstructured repo for integration testing 123 | # uses: actions/checkout@v4 124 | # with: 125 | # repository: 'Unstructured-IO/unstructured' 126 | # - name: Checkout this repo 127 | # uses: actions/checkout@v4 128 | # with: 129 | # path: inference 130 | # - name: Set up Python ${{ matrix.python-version }} 131 | # uses: actions/setup-python@v4 132 | # with: 133 | # python-version: ${{ matrix.python-version }} 134 | # - name: Test 135 | # env: 136 | # GH_READ_ONLY_ACCESS_TOKEN: ${{ secrets.GH_READ_ONLY_ACCESS_TOKEN }} 137 | # SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }} 138 | # DISCORD_TOKEN: ${{ secrets.DISCORD_TOKEN 
}} 139 | # run: | 140 | # python${{ matrix.python-version }} -m venv .venv 141 | # source .venv/bin/activate 142 | # [ ! -d "$NLTK_DATA" ] && mkdir "$NLTK_DATA" 143 | # make install-ci 144 | # pip install -e inference/ 145 | # sudo apt-get update 146 | # sudo apt-get install -y libmagic-dev poppler-utils libreoffice pandoc 147 | # sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5 148 | # sudo apt-get install -y tesseract-ocr 149 | # sudo apt-get install -y tesseract-ocr-kor 150 | # sudo apt-get install -y diffstat 151 | # tesseract --version 152 | # make install-all-ingest 153 | # # only run ingest tests that check expected output diffs. 154 | # bash inference/scripts/test-unstructured-ingest-helper.sh 155 | 156 | changelog: 157 | runs-on: ubuntu-latest 158 | steps: 159 | - uses: actions/checkout@v4 160 | - if: github.ref != 'refs/heads/main' 161 | uses: dorny/paths-filter@v2 162 | id: changes 163 | with: 164 | filters: | 165 | src: 166 | - 'unstructured_inference/**' 167 | 168 | - if: steps.changes.outputs.src == 'true' && github.ref != 'refs/heads/main' 169 | uses: dangoslen/changelog-enforcer@v3 -------------------------------------------------------------------------------- /.github/workflows/create_issue.yml: -------------------------------------------------------------------------------- 1 | name: create_jira_issue 2 | 3 | on: 4 | issues: 5 | types: 6 | - opened 7 | 8 | jobs: 9 | create: 10 | runs-on: ubuntu-latest 11 | name: Create JIRA Issue 12 | steps: 13 | 14 | - name: Login to Jira 15 | uses: atlassian/gajira-login@v3 16 | env: 17 | JIRA_BASE_URL: ${{ secrets.JIRA_BASE_URL }} 18 | JIRA_USER_EMAIL: ${{ secrets.JIRA_USER_EMAIL }} 19 | JIRA_API_TOKEN: ${{ secrets.JIRA_API_TOKEN }} 20 | 21 | - name: Create Jira issue 22 | id: create 23 | uses: atlassian/gajira-create@v3 24 | with: 25 | project: CORE 26 | issuetype: Task 27 | summary: ${{ github.event.issue.title }} 28 | description: | 29 | Created from github issue: ${{ github.event.issue.html_url }} 30 | ---- 31 | ${{ github.event.issue.body }} 32 | fields: '{ "labels": ["github-issue"] }' 33 | 34 | - name: Log created issue 35 | run: echo "Issue ${{ steps.create.outputs.issue }} was created" 36 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | nbs/ 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 93 | #Pipfile.lock 94 | 95 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 96 | __pypackages__/ 97 | 98 | # Celery stuff 99 | celerybeat-schedule 100 | celerybeat.pid 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # Environments 106 | .env 107 | .venv 108 | env/ 109 | venv/ 110 | ENV/ 111 | env.bak/ 112 | venv.bak/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # Pycharm 122 | .idea/ 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | 135 | # Model artifacts 136 | .models/* 137 | !.models/.gitkeep 138 | 139 | # Mac stuff 140 | .DS_Store 141 | 142 | # VSCode 143 | .vscode/ 144 | 145 | sample-docs/*_images 146 | examples/**/output 147 | figures 148 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: "v4.3.0" 4 | hooks: 5 | - id: check-added-large-files 6 | - id: check-toml 7 | - id: check-yaml 8 | - id: check-json 9 | - id: check-xml 10 | - id: end-of-file-fixer 11 | exclude: \.json$ 12 | files: \.py$ 13 | - id: trailing-whitespace 14 | - id: mixed-line-ending 15 | 16 | - repo: https://github.com/psf/black 17 | rev: 22.10.0 18 | hooks: 19 | - id: black 20 | args: ["--line-length=100"] 21 | language_version: python3 22 | 23 | - repo: https://github.com/charliermarsh/ruff-pre-commit 24 | rev: "v0.0.230" 25 | hooks: 26 | - id: ruff 27 | args: 28 | [ 29 | "--fix", 30 | "--select=I,UP015,UP032,UP034,UP018,COM,C4,PT,SIM,PLR0402", 31 | "--ignore=PT011,PT012,SIM117", 32 | ] 33 | 34 | - repo: https://github.com/pycqa/flake8 35 | rev: 4.0.1 36 | hooks: 37 | - id: flake8 38 | language_version: python3 39 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # syntax=docker/dockerfile:experimental 2 | FROM quay.io/unstructured-io/base-images:rocky8.7-3 as base 3 | 4 | ARG PIP_VERSION 5 | 6 | # Set up environment 7 | ENV HOME /home/ 8 | WORKDIR ${HOME} 9 | RUN 
mkdir ${HOME}/.ssh && chmod go-rwx ${HOME}/.ssh \ 10 | && ssh-keyscan -t rsa github.com >> /home/.ssh/known_hosts 11 | ENV PYTHONPATH="${PYTHONPATH}:${HOME}" 12 | ENV PATH="/home/usr/.local/bin:${PATH}" 13 | 14 | FROM base as deps 15 | # Copy and install Unstructured 16 | COPY requirements requirements 17 | 18 | RUN python3.8 -m pip install pip==${PIP_VERSION} && \ 19 | dnf -y groupinstall "Development Tools" && \ 20 | pip install --no-cache -r requirements/base.txt && \ 21 | pip install --no-cache -r requirements/test.txt && \ 22 | pip install --no-cache -r requirements/dev.txt && \ 23 | dnf -y groupremove "Development Tools" && \ 24 | dnf clean all 25 | 26 | FROM deps as code 27 | ARG PACKAGE_NAME=unstructured_inference 28 | COPY unstructured_inference unstructured_inference 29 | 30 | #CMD ["pytest -m \"not slow\" test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing"] 31 | CMD ["/bin/bash"] 32 | #CMD ["bash -c pytest test_unstructured_inference"] 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements/base.in 2 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PACKAGE_NAME := unstructured_inference 2 | PIP_VERSION := 23.2.1 3 | CURRENT_DIR := $(shell pwd) 4 | 5 | 6 | .PHONY: help 7 | help: Makefile 8 | @sed -n 's/^\(## \)\([a-zA-Z]\)/\2/p' $< 9 | 10 | 11 | ########### 12 | # Install # 13 | ########### 14 | 15 | ## install-base: installs core requirements needed for text processing bricks 16 | .PHONY: install-base 17 | install-base: install-base-pip-packages 18 | python3 -m pip install -r requirements/base.txt 19 | 20 | ## install: installs all test, dev, and experimental requirements 21 | .PHONY: install 22 | install: install-base-pip-packages install-dev 23 | 24 | .PHONY: install-ci 25 | install-ci: install-base-pip-packages install-test 26 | 27 | .PHONY: install-base-pip-packages 28 | install-base-pip-packages: 29 | python3 -m pip install pip==${PIP_VERSION} 30 | 31 | .PHONY: install-test 32 | install-test: install-base 33 | python3 -m pip install -r requirements/test.txt 34 | 35 | .PHONY: install-dev 36 | install-dev: install-test 37 | python3 -m pip install -r requirements/dev.txt 38 | 39 | ## pip-compile: compiles all base/dev/test requirements 40 | .PHONY: pip-compile 41 | pip-compile: 42 | pip-compile --upgrade requirements/base.in 43 | pip-compile --upgrade requirements/test.in 44 | pip-compile --upgrade requirements/dev.in 45 | 46 | ################# 47 | # Test and Lint # 48 | ################# 49 | 50 | export CI ?= false 51 | 52 | ## test: runs all unittests 53 | .PHONY: test 54 | test: 55 | PYTHONPATH=. CI=$(CI) pytest -m "not slow" test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing 56 | 57 | .PHONY: test-slow 58 | test-slow: 59 | PYTHONPATH=. 
CI=$(CI) pytest test_${PACKAGE_NAME} --cov=${PACKAGE_NAME} --cov-report term-missing 60 | 61 | ## check: runs linters (includes tests) 62 | .PHONY: check 63 | check: check-src check-tests check-version 64 | 65 | ## check-src: runs linters (source only, no tests) 66 | .PHONY: check-src 67 | check-src: 68 | ruff check ${PACKAGE_NAME} --line-length 100 --select C4,COM,E,F,I,PLR0402,PT,SIM,UP015,UP018,UP032,UP034 --ignore COM812,PT011,PT012,SIM117 69 | python -m black --line-length 100 ${PACKAGE_NAME} --check 70 | python -m flake8 ${PACKAGE_NAME} 71 | python -m mypy ${PACKAGE_NAME} --ignore-missing-imports 72 | 73 | .PHONY: check-tests 74 | check-tests: 75 | python -m black --line-length 100 test_${PACKAGE_NAME} --check 76 | python -m flake8 test_${PACKAGE_NAME} 77 | 78 | ## check-scripts: run shellcheck 79 | .PHONY: check-scripts 80 | check-scripts: 81 | # Fail if any of these files have warnings 82 | scripts/shellcheck.sh 83 | 84 | ## check-version: run check to ensure version in CHANGELOG.md matches version in package 85 | .PHONY: check-version 86 | check-version: 87 | # Fail if syncing version would produce changes 88 | scripts/version-sync.sh -c \ 89 | -s CHANGELOG.md \ 90 | -f unstructured_inference/__version__.py semver 91 | 92 | ## tidy: run ruff and black to auto-format source and tests 93 | .PHONY: tidy 94 | tidy: 95 | ruff check ${PACKAGE_NAME} --fix --line-length 100 --select C4,COM,E,F,I,PLR0402,PT,SIM,UP015,UP018,UP032,UP034 --ignore COM812,PT011,PT012,SIM117 96 | black --line-length 100 ${PACKAGE_NAME} 97 | black --line-length 100 test_${PACKAGE_NAME} 98 | 99 | ## version-sync: update __version__.py with most recent version from CHANGELOG.md 100 | .PHONY: version-sync 101 | version-sync: 102 | scripts/version-sync.sh \ 103 | -s CHANGELOG.md \ 104 | -f unstructured_inference/__version__.py semver 105 | 106 | .PHONY: check-coverage 107 | check-coverage: 108 | python -m coverage report --fail-under=95 109 | 110 | ########## 111 | # Docker # 112 | ########## 113 | 114 | # Docker targets are provided for convenience only and are not required in a standard development environment 115 | 116 | DOCKER_IMAGE ?= unstructured-inference:dev 117 | 118 | .PHONY: docker-build 119 | docker-build: 120 | PIP_VERSION=${PIP_VERSION} DOCKER_IMAGE_NAME=${DOCKER_IMAGE} ./scripts/docker-build.sh 121 | 122 | .PHONY: docker-test 123 | docker-test: docker-build 124 | docker run --rm \ 125 | -v ${CURRENT_DIR}/test_unstructured_inference:/home/test_unstructured_inference \ 126 | -v ${CURRENT_DIR}/sample-docs:/home/sample-docs \ 127 | $(DOCKER_IMAGE) \ 128 | bash -c "pytest $(if $(TEST_NAME),-k $(TEST_NAME),) test_unstructured_inference" 129 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 
<h3 align="center"> <img src="img/unstructured_logo.png" height="200"> </h3> <h3 align="center"> <p>Open-Source Pre-Processing Tools for Unstructured Data</p> </h3>
11 | 12 | 13 | The `unstructured-inference` repo contains hosted model inference code for layout parsing models. 14 | These models are invoked via API as part of the partitioning bricks in the `unstructured` package. 15 | 16 | ## Installation 17 | 18 | ### Package 19 | 20 | Run `pip install unstructured-inference`. 21 | 22 | ### Detectron2 23 | 24 | [Detectron2](https://github.com/facebookresearch/detectron2) is required for using models from the [layoutparser model zoo](#using-models-from-the-layoutparser-model-zoo) 25 | but is not automatically installed with this package. 26 | For macOS and Linux, build from source with: 27 | ```shell 28 | pip install 'git+https://github.com/facebookresearch/detectron2.git@57bdb21249d5418c130d54e2ebdc94dda7a4c01a' 29 | ``` 30 | Other install options can be found in the 31 | [Detectron2 installation guide](https://detectron2.readthedocs.io/en/latest/tutorials/install.html). 32 | 33 | Windows is not officially supported by Detectron2, but some users are able to install it anyway. 34 | See discussion [here](https://layout-parser.github.io/tutorials/installation#for-windows-users) for 35 | tips on installing Detectron2 on Windows. 36 | 37 | ### Repository 38 | 39 | To install the repository for development, clone the repo and run `make install` to install dependencies. 40 | Run `make help` for a full list of install options. 41 | 42 | ## Getting Started 43 | 44 | To get started with the layout parsing model, use the following commands: 45 | 46 | ```python 47 | from unstructured_inference.inference.layout import DocumentLayout 48 | 49 | layout = DocumentLayout.from_file("sample-docs/loremipsum.pdf") 50 | 51 | print(layout.pages[0].elements) 52 | ``` 53 | 54 | Once the model has detected the layout and OCR'd the document, the text extracted from the first 55 | page of the sample document will be displayed. 56 | You can convert a given element to a `dict` by running the `.to_dict()` method. 57 | 58 | ## Models 59 | 60 | The inference pipeline operates by finding text elements in a document page using a detection model, then extracting the contents of the elements using direct extraction (if available), OCR, and optionally table inference models. 61 | 62 | We offer several detection models including [Detectron2](https://github.com/facebookresearch/detectron2) and [YOLOX](https://github.com/Megvii-BaseDetection/YOLOX). 63 | 64 | ### Using a non-default model 65 | 66 | When doing inference, an alternate model can be used by passing the model object to the ingestion method via the `detection_model` parameter. The `get_model` function can be used to construct one of our out-of-the-box models from a keyword, e.g.: 67 | ```python 68 | from unstructured_inference.models.base import get_model 69 | from unstructured_inference.inference.layout import DocumentLayout 70 | 71 | model = get_model("yolox") 72 | layout = DocumentLayout.from_file("sample-docs/layout-parser-paper.pdf", detection_model=model) 73 | ``` 74 | 75 | ### Using your own model 76 | 77 | Any detection model can be used in the `unstructured_inference` pipeline by wrapping the model in the `UnstructuredObjectDetectionModel` class. To integrate with the `DocumentLayout` class, a subclass of `UnstructuredObjectDetectionModel` must have a `predict` method that accepts a `PIL.Image.Image` and returns a list of `LayoutElement`s, and an `initialize` method, which loads the model and prepares it for inference. 
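For example, a minimal wrapper might look like the sketch below. The base class and the `predict`/`initialize` contract come from the description above; the `LayoutElement.from_coords` call and the stand-in detector are illustrative assumptions rather than documented API:

```python
from typing import List

from PIL import Image

from unstructured_inference.inference.layoutelement import LayoutElement
from unstructured_inference.models.unstructuredmodel import UnstructuredObjectDetectionModel


class MyDetectionModel(UnstructuredObjectDetectionModel):
    def initialize(self, *args, **kwargs):
        """Load weights and otherwise prepare the model for inference."""
        # A real implementation would load model weights here; this sketch
        # uses a stand-in that always "detects" a single text block.
        self.model = lambda image: [(10, 10, 100, 40, "Text")]

    def predict(self, image: Image.Image) -> List[LayoutElement]:
        """Run detection on a page image and return layout elements."""
        # Convert raw (x1, y1, x2, y2, label) detections into LayoutElements.
        return [
            LayoutElement.from_coords(x1, y1, x2, y2, type=label)
            for (x1, y1, x2, y2, label) in self.model(image)
        ]
```

An instance of such a class can then be passed to `DocumentLayout.from_file` via the `detection_model` parameter, just like the built-in models shown above.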
78 | 79 | ## Security Policy 80 | 81 | See our [security policy](https://github.com/Unstructured-IO/unstructured-inference/security/policy) for 82 | information on how to report security vulnerabilities. 83 | 84 | ## Learn more 85 | 86 | | Section | Description | 87 | |-|-| 88 | | [Unstructured Community Github](https://github.com/Unstructured-IO/community) | Information about Unstructured.io community projects | 89 | | [Unstructured Github](https://github.com/Unstructured-IO) | Unstructured.io open source repositories | 90 | | [Company Website](https://unstructured.io) | Unstructured.io product and company info | 91 | -------------------------------------------------------------------------------- /examples/ocr/engine.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import time 4 | from typing import List, cast 5 | 6 | import cv2 7 | import numpy as np 8 | import pytesseract 9 | from pytesseract import Output 10 | 11 | from unstructured_inference.inference import layout 12 | from unstructured_inference.inference.elements import Rectangle, TextRegion 13 | 14 | 15 | def remove_non_printable(s): 16 | dst_str = re.sub(r'[^\x20-\x7E]', ' ', s) 17 | return ' '.join(dst_str.split()) 18 | 19 | 20 | def run_ocr_with_layout_detection( 21 | images, 22 | detection_model=None, 23 | element_extraction_model=None, 24 | mode="individual_blocks", 25 | output_dir="", 26 | drawable=True, 27 | printable=True, 28 | ): 29 | total_text_extraction_infer_time = 0 30 | total_extracted_text = {} 31 | for i, image in enumerate(images): 32 | page_num = i + 1 33 | page_num_str = f"page{page_num}" 34 | 35 | page = layout.PageLayout( 36 | number=i+1, 37 | image=image, 38 | layout=None, 39 | detection_model=detection_model, 40 | element_extraction_model=element_extraction_model, 41 | ) 42 | 43 | inferred_layout: List[TextRegion] = cast(List[TextRegion], page.detection_model(page.image)) 44 | 45 | cv_img = np.array(image) 46 | 47 | if mode == "individual_blocks": 48 | # OCR'ing individual blocks (current approach) 49 | text_extraction_start_time = time.time() 50 | 51 | elements = page.get_elements_from_layout(inferred_layout) 52 | 53 | text_extraction_infer_time = time.time() - text_extraction_start_time 54 | 55 | total_text_extraction_infer_time += text_extraction_infer_time 56 | 57 | page_text = "" 58 | for el in elements: 59 | page_text += el.text 60 | filtered_page_text = remove_non_printable(page_text) 61 | total_extracted_text[page_num_str] = filtered_page_text 62 | elif mode == "entire_page": 63 | # OCR'ing entire page (new approach to implement) 64 | text_extraction_start_time = time.time() 65 | 66 | ocr_data = pytesseract.image_to_data(image, lang='eng', output_type=Output.DICT) 67 | boxes = ocr_data['level'] 68 | extracted_text_list = [] 69 | for k in range(len(boxes)): 70 | (x, y, w, h) = ocr_data['left'][k], ocr_data['top'][k], ocr_data['width'][k], ocr_data['height'][k] 71 | extracted_text = ocr_data['text'][k] 72 | if not extracted_text: 73 | continue 74 | 75 | extracted_region = Rectangle(x1=x, y1=y, x2=x+w, y2=y+h) 76 | 77 | extracted_is_subregion_of_inferred = False 78 | for inferred_region in inferred_layout: 79 | extracted_is_subregion_of_inferred = extracted_region.is_almost_subregion_of( 80 | inferred_region.pad(12), 81 | subregion_threshold=0.75, 82 | ) 83 | if extracted_is_subregion_of_inferred: 84 | break 85 | 86 | if extracted_is_subregion_of_inferred: 87 | extracted_text_list.append(extracted_text) 88 | 89 | if drawable: 90 | if 
extracted_is_subregion_of_inferred: 91 | cv2.rectangle(cv_img, (x, y), (x + w, y + h), (0, 255, 0), 2, cv2.LINE_8) 92 | else: 93 | cv2.rectangle(cv_img, (x, y), (x + w, y + h), (255, 0, 0), 2, cv2.LINE_8) 94 | 95 | text_extraction_infer_time = time.time() - text_extraction_start_time 96 | total_text_extraction_infer_time += text_extraction_infer_time 97 | 98 | page_text = " ".join(extracted_text_list) 99 | filtered_page_text = remove_non_printable(page_text) 100 | total_extracted_text[page_num_str] = filtered_page_text 101 | else: 102 | raise ValueError("Invalid mode") 103 | 104 | if drawable: 105 | for el in inferred_layout: 106 | pt1 = [int(el.x1), int(el.y1)] 107 | pt2 = [int(el.x2), int(el.y2)] 108 | cv2.rectangle( 109 | img=cv_img, 110 | pt1=pt1, pt2=pt2, 111 | color=(0, 0, 255), 112 | thickness=4, 113 | lineType=cv2.LINE_8, 114 | ) 115 | 116 | f_path = os.path.join(output_dir, f"ocr_{mode}_{page_num_str}.jpg") 117 | cv2.imwrite(f_path, cv_img) 118 | 119 | if printable: 120 | print(f"page: {i + 1} - n_layout_elements: {len(inferred_layout)} - " 121 | f"text_extraction_infer_time: {text_extraction_infer_time}") 122 | 123 | return total_text_extraction_infer_time, total_extracted_text 124 | 125 | 126 | def run_ocr( 127 | images, 128 | printable=True, 129 | ): 130 | total_text_extraction_infer_time = 0 131 | total_text = "" 132 | for i, image in enumerate(images): 133 | text_extraction_start_time = time.time() 134 | 135 | page_text = pytesseract.image_to_string(image) 136 | 137 | text_extraction_infer_time = time.time() - text_extraction_start_time 138 | 139 | if printable: 140 | print(f"page: {i + 1} - text_extraction_infer_time: {text_extraction_infer_time}") 141 | 142 | total_text_extraction_infer_time += text_extraction_infer_time 143 | total_text += page_text 144 | 145 | return total_text_extraction_infer_time, total_text 146 | -------------------------------------------------------------------------------- /examples/ocr/output/.gitignore: -------------------------------------------------------------------------------- 1 | * -------------------------------------------------------------------------------- /examples/ocr/requirements.txt: -------------------------------------------------------------------------------- 1 | unstructured[local-inference] 2 | nltk -------------------------------------------------------------------------------- /examples/ocr/validate_ocr_performance.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import time 4 | from datetime import datetime 5 | from difflib import SequenceMatcher 6 | 7 | import nltk 8 | import pdf2image 9 | 10 | from unstructured_inference.inference.layout import ( 11 | DocumentLayout, 12 | create_image_output_dir, 13 | process_file_with_model, 14 | ) 15 | 16 | # Download the required resources (run this once) 17 | nltk.download('punkt') 18 | 19 | 20 | def validate_performance( 21 | f_name, 22 | validation_mode, 23 | is_image_file=False, 24 | ): 25 | print(f">>> Start performance comparison - filename: {f_name} - validation_mode: {validation_mode}" 26 | f" - is_image_file: {is_image_file}") 27 | 28 | now_dt = datetime.utcnow() 29 | now_str = now_dt.strftime("%Y_%m_%d-%H_%M_%S") 30 | 31 | f_path = os.path.join(example_docs_dir, f_name) 32 | 33 | image_f_paths = [] 34 | if validation_mode == "pdf": 35 | pdf_info = pdf2image.pdfinfo_from_path(f_path) 36 | n_pages = pdf_info["Pages"] 37 | elif validation_mode == "image": 38 | if is_image_file: 39 | image_f_paths.append(f_path) 40 | else: 41 
| image_output_dir = create_image_output_dir(f_path) 42 | images = pdf2image.convert_from_path(f_path, output_folder=image_output_dir) 43 | image_f_paths = [image.filename for image in images] 44 | n_pages = len(image_f_paths) 45 | else: 46 | n_pages = 0 47 | 48 | processing_result = {} 49 | for ocr_mode in ["individual_blocks", "entire_page"]: 50 | start_time = time.time() 51 | 52 | if validation_mode == "pdf": 53 | layout = process_file_with_model( 54 | f_path, 55 | model_name=None, 56 | ocr_mode=ocr_mode, 57 | ) 58 | elif validation_mode == "image": 59 | pages = [] 60 | for image_f_path in image_f_paths: 61 | _layout = process_file_with_model( 62 | image_f_path, 63 | model_name=None, 64 | ocr_mode=ocr_mode, 65 | is_image=True, 66 | ) 67 | pages += _layout.pages 68 | for i, page in enumerate(pages): 69 | page.number = i + 1 70 | layout = DocumentLayout.from_pages(pages) 71 | else: 72 | layout = None 73 | 74 | infer_time = time.time() - start_time 75 | 76 | if layout is None: 77 | print("Layout is None") 78 | return 79 | 80 | full_text = str(layout) 81 | page_text = {} 82 | for page in layout.pages: 83 | page_text[page.number] = str(page) 84 | 85 | processing_result[ocr_mode] = { 86 | "infer_time": infer_time, 87 | "full_text": full_text, 88 | "page_text": page_text, 89 | } 90 | 91 | individual_mode_page_text = processing_result["individual_blocks"]["page_text"] 92 | entire_mode_page_text = processing_result["entire_page"]["page_text"] 93 | individual_mode_full_text = processing_result["individual_blocks"]["full_text"] 94 | entire_mode_full_text = processing_result["entire_page"]["full_text"] 95 | 96 | compare_result = compare_processed_text(individual_mode_full_text, entire_mode_full_text) 97 | 98 | report = { 99 | "validation_mode": validation_mode, 100 | "file_info": { 101 | "filename": f_name, 102 | "n_pages": n_pages, 103 | }, 104 | "processing_time": { 105 | "individual_blocks": processing_result["individual_blocks"]["infer_time"], 106 | "entire_page": processing_result["entire_page"]["infer_time"], 107 | }, 108 | "text_similarity": compare_result, 109 | "extracted_text": { 110 | "individual_blocks": { 111 | "page_text": individual_mode_page_text, 112 | "full_text": individual_mode_full_text, 113 | }, 114 | "entire_page": { 115 | "page_text": entire_mode_page_text, 116 | "full_text": entire_mode_full_text, 117 | }, 118 | }, 119 | } 120 | 121 | write_report(report, now_str, validation_mode) 122 | 123 | print("<<< End performance comparison", f_name) 124 | 125 | 126 | def compare_processed_text(individual_mode_full_text, entire_mode_full_text, delimiter=" "): 127 | # Calculate similarity ratio 128 | similarity_ratio = SequenceMatcher(None, individual_mode_full_text, entire_mode_full_text).ratio() 129 | 130 | print(f"similarity_ratio: {similarity_ratio}") 131 | 132 | # Tokenize the text into words 133 | word_list_individual = nltk.word_tokenize(individual_mode_full_text) 134 | n_word_list_individual = len(word_list_individual) 135 | print("n_word_list_in_text_individual:", n_word_list_individual) 136 | word_sets_individual = set(word_list_individual) 137 | n_word_sets_individual = len(word_sets_individual) 138 | print(f"n_word_sets_in_text_individual: {n_word_sets_individual}") 139 | # print("word_sets_merged:", word_sets_merged) 140 | 141 | word_list_entire = nltk.word_tokenize(entire_mode_full_text) 142 | n_word_list_entire = len(word_list_entire) 143 | print("n_word_list_in_text_entire:", n_word_list_entire) 144 | word_sets_entire = set(word_list_entire) 145 | n_word_sets_entire = 
len(word_sets_entire) 146 | print(f"n_word_sets_in_text_entire: {n_word_sets_entire}") 147 | # print("word_sets_individual:", word_sets_individual) 148 | 149 | # Find unique elements using difference 150 | print("diff_elements:") 151 | unique_words_individual = word_sets_individual - word_sets_entire 152 | unique_words_entire = word_sets_entire - word_sets_individual 153 | print(f"unique_words_in_text_individual: {unique_words_individual}\n") 154 | print(f"unique_words_in_text_entire: {unique_words_entire}") 155 | 156 | return { 157 | "similarity_ratio": similarity_ratio, 158 | "individual_blocks": { 159 | "n_word_list": n_word_list_individual, 160 | "n_word_sets": n_word_sets_individual, 161 | "unique_words": delimiter.join(list(unique_words_individual)), 162 | }, 163 | "entire_page": { 164 | "n_word_list": n_word_list_entire, 165 | "n_word_sets": n_word_sets_entire, 166 | "unique_words": delimiter.join(list(unique_words_entire)), 167 | }, 168 | } 169 | 170 | 171 | def write_report(report, now_str, validation_mode): 172 | report_f_name = f"validate-ocr-{validation_mode}-{now_str}.json" 173 | report_f_path = os.path.join(output_dir, report_f_name) 174 | with open(report_f_path, "w", encoding="utf-8-sig") as f: 175 | json.dump(report, f, indent=4) 176 | 177 | 178 | def run(): 179 | test_files = [ 180 | {"name": "layout-parser-paper-fast.pdf", "mode": "image", "is_image_file": False}, 181 | {"name": "loremipsum_multipage.pdf", "mode": "image", "is_image_file": False}, 182 | {"name": "2023-Jan-economic-outlook.pdf", "mode": "image", "is_image_file": False}, 183 | {"name": "recalibrating-risk-report.pdf", "mode": "image", "is_image_file": False}, 184 | {"name": "Silent-Giant.pdf", "mode": "image", "is_image_file": False}, 185 | ] 186 | 187 | for test_file in test_files: 188 | f_name = test_file["name"] 189 | validation_mode = test_file["mode"] 190 | is_image_file = test_file["is_image_file"] 191 | 192 | validate_performance(f_name, validation_mode, is_image_file) 193 | 194 | 195 | if __name__ == '__main__': 196 | cur_dir = os.getcwd() 197 | base_dir = os.path.join(cur_dir, os.pardir, os.pardir) 198 | example_docs_dir = os.path.join(base_dir, "sample-docs") 199 | 200 | # folder path to save temporary outputs 201 | output_dir = os.path.join(cur_dir, "output") 202 | os.makedirs(output_dir, exist_ok=True) 203 | 204 | run() 205 | -------------------------------------------------------------------------------- /img/unstructured_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-inference/a0df407c3f7143fa66a9ad8bb40a6ee06907ce5a/img/unstructured_logo.png -------------------------------------------------------------------------------- /logger_config.yaml: -------------------------------------------------------------------------------- 1 | version: 1 2 | disable_existing_loggers: False 3 | formatters: 4 | default_format: 5 | "()": uvicorn.logging.DefaultFormatter 6 | format: '%(asctime)s %(name)s %(levelname)s %(message)s' 7 | access: 8 | "()": uvicorn.logging.AccessFormatter 9 | format: '%(asctime)s %(client_addr)s %(request_line)s - %(status_code)s' 10 | handlers: 11 | access_handler: 12 | formatter: access 13 | class: logging.StreamHandler 14 | stream: ext://sys.stderr 15 | standard_handler: 16 | formatter: default_format 17 | class: logging.StreamHandler 18 | stream: ext://sys.stderr 19 | loggers: 20 | uvicorn.error: 21 | level: INFO 22 | handlers: 23 | - standard_handler 24 | propagate: no 25 | # 
disable logging for uvicorn.error by not having a handler 26 | uvicorn.access: 27 | level: INFO 28 | handlers: 29 | - access_handler 30 | propagate: no 31 | # disable logging for uvicorn.access by not having a handler 32 | unstructured: 33 | level: INFO 34 | handlers: 35 | - standard_handler 36 | propagate: no 37 | unstructured_inference: 38 | level: DEBUG 39 | handlers: 40 | - standard_handler 41 | propagate: no 42 | 43 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 100 3 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | markers = 3 | slow: marks tests as slow (deselect with '-m "not slow"') 4 | -------------------------------------------------------------------------------- /requirements/base.in: -------------------------------------------------------------------------------- 1 | -c constraints.in 2 | python-multipart 3 | huggingface-hub 4 | numpy 5 | opencv-python!=4.7.0.68 6 | onnx 7 | onnxruntime>=1.18.0 8 | matplotlib 9 | torch 10 | timm 11 | # NOTE(alan): Pinned because this is when the most recent module we import appeared 12 | transformers>=4.25.1 13 | accelerate 14 | rapidfuzz 15 | pandas 16 | scipy 17 | pypdfium2 18 | pdfminer-six 19 | -------------------------------------------------------------------------------- /requirements/base.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.12 3 | # by the following command: 4 | # 5 | # pip-compile requirements/base.in 6 | # 7 | accelerate==1.7.0 8 | # via -r requirements/base.in 9 | certifi==2025.4.26 10 | # via requests 11 | cffi==1.17.1 12 | # via cryptography 13 | charset-normalizer==3.4.2 14 | # via 15 | # pdfminer-six 16 | # requests 17 | coloredlogs==15.0.1 18 | # via onnxruntime 19 | contourpy==1.3.2 20 | # via matplotlib 21 | cryptography==44.0.3 22 | # via pdfminer-six 23 | cycler==0.12.1 24 | # via matplotlib 25 | filelock==3.18.0 26 | # via 27 | # huggingface-hub 28 | # torch 29 | # transformers 30 | flatbuffers==25.2.10 31 | # via onnxruntime 32 | fonttools==4.58.0 33 | # via matplotlib 34 | fsspec==2025.3.2 35 | # via 36 | # huggingface-hub 37 | # torch 38 | huggingface-hub==0.31.2 39 | # via 40 | # -r requirements/base.in 41 | # accelerate 42 | # timm 43 | # tokenizers 44 | # transformers 45 | humanfriendly==10.0 46 | # via coloredlogs 47 | idna==3.10 48 | # via requests 49 | jinja2==3.1.6 50 | # via torch 51 | kiwisolver==1.4.8 52 | # via matplotlib 53 | markupsafe==3.0.2 54 | # via jinja2 55 | matplotlib==3.10.3 56 | # via -r requirements/base.in 57 | mpmath==1.3.0 58 | # via sympy 59 | networkx==3.4.2 60 | # via torch 61 | numpy==2.2.5 62 | # via 63 | # -r requirements/base.in 64 | # accelerate 65 | # contourpy 66 | # matplotlib 67 | # onnx 68 | # onnxruntime 69 | # opencv-python 70 | # pandas 71 | # scipy 72 | # torchvision 73 | # transformers 74 | onnx==1.18.0 75 | # via -r requirements/base.in 76 | onnxruntime==1.22.0 77 | # via -r requirements/base.in 78 | opencv-python==4.11.0.86 79 | # via -r requirements/base.in 80 | packaging==25.0 81 | # via 82 | # accelerate 83 | # huggingface-hub 84 | # matplotlib 85 | # onnxruntime 86 | # transformers 87 | pandas==2.2.3 88 | # via -r requirements/base.in 89 | 
pdfminer-six==20250506 90 | # via -r requirements/base.in 91 | pillow==11.2.1 92 | # via 93 | # matplotlib 94 | # torchvision 95 | protobuf==6.31.0 96 | # via 97 | # onnx 98 | # onnxruntime 99 | psutil==7.0.0 100 | # via accelerate 101 | pycparser==2.22 102 | # via cffi 103 | pyparsing==3.2.3 104 | # via matplotlib 105 | pypdfium2==4.30.1 106 | # via -r requirements/base.in 107 | python-dateutil==2.9.0.post0 108 | # via 109 | # matplotlib 110 | # pandas 111 | python-multipart==0.0.20 112 | # via -r requirements/base.in 113 | pytz==2025.2 114 | # via pandas 115 | pyyaml==6.0.2 116 | # via 117 | # accelerate 118 | # huggingface-hub 119 | # timm 120 | # transformers 121 | rapidfuzz==3.13.0 122 | # via -r requirements/base.in 123 | regex==2024.11.6 124 | # via transformers 125 | requests==2.32.3 126 | # via 127 | # huggingface-hub 128 | # transformers 129 | safetensors==0.5.3 130 | # via 131 | # accelerate 132 | # timm 133 | # transformers 134 | scipy==1.15.3 135 | # via -r requirements/base.in 136 | six==1.17.0 137 | # via python-dateutil 138 | sympy==1.14.0 139 | # via 140 | # onnxruntime 141 | # torch 142 | timm==1.0.15 143 | # via -r requirements/base.in 144 | tokenizers==0.21.1 145 | # via transformers 146 | torch==2.7.0 147 | # via 148 | # -r requirements/base.in 149 | # accelerate 150 | # timm 151 | # torchvision 152 | torchvision==0.22.0 153 | # via timm 154 | tqdm==4.67.1 155 | # via 156 | # huggingface-hub 157 | # transformers 158 | transformers==4.51.3 159 | # via -r requirements/base.in 160 | typing-extensions==4.13.2 161 | # via 162 | # huggingface-hub 163 | # onnx 164 | # torch 165 | tzdata==2025.2 166 | # via pandas 167 | urllib3==2.4.0 168 | # via requests 169 | 170 | # The following packages are considered to be unsafe in a requirements file: 171 | # setuptools 172 | -------------------------------------------------------------------------------- /requirements/constraints.in: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-inference/a0df407c3f7143fa66a9ad8bb40a6ee06907ce5a/requirements/constraints.in -------------------------------------------------------------------------------- /requirements/dev.in: -------------------------------------------------------------------------------- 1 | -c constraints.in 2 | -c base.txt 3 | -c test.txt 4 | jupyter 5 | ipython 6 | pip-tools 7 | matplotlib -------------------------------------------------------------------------------- /requirements/dev.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.12 3 | # by the following command: 4 | # 5 | # pip-compile requirements/dev.in 6 | # 7 | anyio==4.9.0 8 | # via 9 | # -c requirements/test.txt 10 | # httpx 11 | # jupyter-server 12 | appnope==0.1.4 13 | # via ipykernel 14 | argon2-cffi==23.1.0 15 | # via jupyter-server 16 | argon2-cffi-bindings==21.2.0 17 | # via argon2-cffi 18 | arrow==1.3.0 19 | # via isoduration 20 | asttokens==3.0.0 21 | # via stack-data 22 | async-lru==2.0.5 23 | # via jupyterlab 24 | attrs==25.3.0 25 | # via 26 | # jsonschema 27 | # referencing 28 | babel==2.17.0 29 | # via jupyterlab-server 30 | beautifulsoup4==4.13.4 31 | # via nbconvert 32 | bleach[css]==6.2.0 33 | # via nbconvert 34 | build==1.2.2.post1 35 | # via pip-tools 36 | certifi==2025.4.26 37 | # via 38 | # -c requirements/base.txt 39 | # -c requirements/test.txt 40 | # httpcore 41 | # httpx 42 | # requests 43 | 
cffi==1.17.1 44 | # via 45 | # -c requirements/base.txt 46 | # argon2-cffi-bindings 47 | charset-normalizer==3.4.2 48 | # via 49 | # -c requirements/base.txt 50 | # -c requirements/test.txt 51 | # requests 52 | click==8.2.0 53 | # via 54 | # -c requirements/test.txt 55 | # pip-tools 56 | comm==0.2.2 57 | # via 58 | # ipykernel 59 | # ipywidgets 60 | contourpy==1.3.2 61 | # via 62 | # -c requirements/base.txt 63 | # matplotlib 64 | cycler==0.12.1 65 | # via 66 | # -c requirements/base.txt 67 | # matplotlib 68 | debugpy==1.8.14 69 | # via ipykernel 70 | decorator==5.2.1 71 | # via ipython 72 | defusedxml==0.7.1 73 | # via nbconvert 74 | executing==2.2.0 75 | # via stack-data 76 | fastjsonschema==2.21.1 77 | # via nbformat 78 | fonttools==4.58.0 79 | # via 80 | # -c requirements/base.txt 81 | # matplotlib 82 | fqdn==1.5.1 83 | # via jsonschema 84 | h11==0.16.0 85 | # via 86 | # -c requirements/test.txt 87 | # httpcore 88 | httpcore==1.0.9 89 | # via 90 | # -c requirements/test.txt 91 | # httpx 92 | httpx==0.28.1 93 | # via 94 | # -c requirements/test.txt 95 | # jupyterlab 96 | idna==3.10 97 | # via 98 | # -c requirements/base.txt 99 | # -c requirements/test.txt 100 | # anyio 101 | # httpx 102 | # jsonschema 103 | # requests 104 | ipykernel==6.29.5 105 | # via 106 | # jupyter 107 | # jupyter-console 108 | # jupyterlab 109 | ipython==9.2.0 110 | # via 111 | # -r requirements/dev.in 112 | # ipykernel 113 | # ipywidgets 114 | # jupyter-console 115 | ipython-pygments-lexers==1.1.1 116 | # via ipython 117 | ipywidgets==8.1.7 118 | # via jupyter 119 | isoduration==20.11.0 120 | # via jsonschema 121 | jedi==0.19.2 122 | # via ipython 123 | jinja2==3.1.6 124 | # via 125 | # -c requirements/base.txt 126 | # jupyter-server 127 | # jupyterlab 128 | # jupyterlab-server 129 | # nbconvert 130 | json5==0.12.0 131 | # via jupyterlab-server 132 | jsonpointer==3.0.0 133 | # via jsonschema 134 | jsonschema[format-nongpl]==4.23.0 135 | # via 136 | # jupyter-events 137 | # jupyterlab-server 138 | # nbformat 139 | jsonschema-specifications==2025.4.1 140 | # via jsonschema 141 | jupyter==1.1.1 142 | # via -r requirements/dev.in 143 | jupyter-client==8.6.3 144 | # via 145 | # ipykernel 146 | # jupyter-console 147 | # jupyter-server 148 | # nbclient 149 | jupyter-console==6.6.3 150 | # via jupyter 151 | jupyter-core==5.7.2 152 | # via 153 | # ipykernel 154 | # jupyter-client 155 | # jupyter-console 156 | # jupyter-server 157 | # jupyterlab 158 | # nbclient 159 | # nbconvert 160 | # nbformat 161 | jupyter-events==0.12.0 162 | # via jupyter-server 163 | jupyter-lsp==2.2.5 164 | # via jupyterlab 165 | jupyter-server==2.16.0 166 | # via 167 | # jupyter-lsp 168 | # jupyterlab 169 | # jupyterlab-server 170 | # notebook 171 | # notebook-shim 172 | jupyter-server-terminals==0.5.3 173 | # via jupyter-server 174 | jupyterlab==4.4.2 175 | # via 176 | # jupyter 177 | # notebook 178 | jupyterlab-pygments==0.3.0 179 | # via nbconvert 180 | jupyterlab-server==2.27.3 181 | # via 182 | # jupyterlab 183 | # notebook 184 | jupyterlab-widgets==3.0.15 185 | # via ipywidgets 186 | kiwisolver==1.4.8 187 | # via 188 | # -c requirements/base.txt 189 | # matplotlib 190 | markupsafe==3.0.2 191 | # via 192 | # -c requirements/base.txt 193 | # jinja2 194 | # nbconvert 195 | matplotlib==3.10.3 196 | # via 197 | # -c requirements/base.txt 198 | # -r requirements/dev.in 199 | matplotlib-inline==0.1.7 200 | # via 201 | # ipykernel 202 | # ipython 203 | mistune==3.1.3 204 | # via nbconvert 205 | nbclient==0.10.2 206 | # via nbconvert 207 | 
nbconvert==7.16.6 208 | # via 209 | # jupyter 210 | # jupyter-server 211 | nbformat==5.10.4 212 | # via 213 | # jupyter-server 214 | # nbclient 215 | # nbconvert 216 | nest-asyncio==1.6.0 217 | # via ipykernel 218 | notebook==7.4.2 219 | # via jupyter 220 | notebook-shim==0.2.4 221 | # via 222 | # jupyterlab 223 | # notebook 224 | numpy==2.2.5 225 | # via 226 | # -c requirements/base.txt 227 | # contourpy 228 | # matplotlib 229 | overrides==7.7.0 230 | # via jupyter-server 231 | packaging==25.0 232 | # via 233 | # -c requirements/base.txt 234 | # -c requirements/test.txt 235 | # build 236 | # ipykernel 237 | # jupyter-events 238 | # jupyter-server 239 | # jupyterlab 240 | # jupyterlab-server 241 | # matplotlib 242 | # nbconvert 243 | pandocfilters==1.5.1 244 | # via nbconvert 245 | parso==0.8.4 246 | # via jedi 247 | pexpect==4.9.0 248 | # via ipython 249 | pillow==11.2.1 250 | # via 251 | # -c requirements/base.txt 252 | # -c requirements/test.txt 253 | # matplotlib 254 | pip-tools==7.4.1 255 | # via -r requirements/dev.in 256 | platformdirs==4.3.8 257 | # via 258 | # -c requirements/test.txt 259 | # jupyter-core 260 | prometheus-client==0.21.1 261 | # via jupyter-server 262 | prompt-toolkit==3.0.51 263 | # via 264 | # ipython 265 | # jupyter-console 266 | psutil==7.0.0 267 | # via 268 | # -c requirements/base.txt 269 | # ipykernel 270 | ptyprocess==0.7.0 271 | # via 272 | # pexpect 273 | # terminado 274 | pure-eval==0.2.3 275 | # via stack-data 276 | pycparser==2.22 277 | # via 278 | # -c requirements/base.txt 279 | # cffi 280 | pygments==2.19.1 281 | # via 282 | # ipython 283 | # ipython-pygments-lexers 284 | # jupyter-console 285 | # nbconvert 286 | pyparsing==3.2.3 287 | # via 288 | # -c requirements/base.txt 289 | # matplotlib 290 | pyproject-hooks==1.2.0 291 | # via 292 | # build 293 | # pip-tools 294 | python-dateutil==2.9.0.post0 295 | # via 296 | # -c requirements/base.txt 297 | # arrow 298 | # jupyter-client 299 | # matplotlib 300 | python-json-logger==3.3.0 301 | # via jupyter-events 302 | pyyaml==6.0.2 303 | # via 304 | # -c requirements/base.txt 305 | # -c requirements/test.txt 306 | # jupyter-events 307 | pyzmq==26.4.0 308 | # via 309 | # ipykernel 310 | # jupyter-client 311 | # jupyter-console 312 | # jupyter-server 313 | referencing==0.36.2 314 | # via 315 | # jsonschema 316 | # jsonschema-specifications 317 | # jupyter-events 318 | requests==2.32.3 319 | # via 320 | # -c requirements/base.txt 321 | # -c requirements/test.txt 322 | # jupyterlab-server 323 | rfc3339-validator==0.1.4 324 | # via 325 | # jsonschema 326 | # jupyter-events 327 | rfc3986-validator==0.1.1 328 | # via 329 | # jsonschema 330 | # jupyter-events 331 | rpds-py==0.25.0 332 | # via 333 | # jsonschema 334 | # referencing 335 | send2trash==1.8.3 336 | # via jupyter-server 337 | six==1.17.0 338 | # via 339 | # -c requirements/base.txt 340 | # python-dateutil 341 | # rfc3339-validator 342 | sniffio==1.3.1 343 | # via 344 | # -c requirements/test.txt 345 | # anyio 346 | soupsieve==2.7 347 | # via beautifulsoup4 348 | stack-data==0.6.3 349 | # via ipython 350 | terminado==0.18.1 351 | # via 352 | # jupyter-server 353 | # jupyter-server-terminals 354 | tinycss2==1.4.0 355 | # via bleach 356 | tornado==6.5 357 | # via 358 | # ipykernel 359 | # jupyter-client 360 | # jupyter-server 361 | # jupyterlab 362 | # notebook 363 | # terminado 364 | traitlets==5.14.3 365 | # via 366 | # comm 367 | # ipykernel 368 | # ipython 369 | # ipywidgets 370 | # jupyter-client 371 | # jupyter-console 372 | # jupyter-core 373 | # 
jupyter-events 374 | # jupyter-server 375 | # jupyterlab 376 | # matplotlib-inline 377 | # nbclient 378 | # nbconvert 379 | # nbformat 380 | types-python-dateutil==2.9.0.20241206 381 | # via arrow 382 | typing-extensions==4.13.2 383 | # via 384 | # -c requirements/base.txt 385 | # -c requirements/test.txt 386 | # anyio 387 | # beautifulsoup4 388 | # referencing 389 | uri-template==1.3.0 390 | # via jsonschema 391 | urllib3==2.4.0 392 | # via 393 | # -c requirements/base.txt 394 | # -c requirements/test.txt 395 | # requests 396 | wcwidth==0.2.13 397 | # via prompt-toolkit 398 | webcolors==24.11.1 399 | # via jsonschema 400 | webencodings==0.5.1 401 | # via 402 | # bleach 403 | # tinycss2 404 | websocket-client==1.8.0 405 | # via jupyter-server 406 | wheel==0.45.1 407 | # via pip-tools 408 | widgetsnbextension==4.0.14 409 | # via ipywidgets 410 | 411 | # The following packages are considered to be unsafe in a requirements file: 412 | # pip 413 | # setuptools 414 | -------------------------------------------------------------------------------- /requirements/test.in: -------------------------------------------------------------------------------- 1 | -c constraints.in 2 | -c base.txt 3 | black>=22.3.0 4 | coverage 5 | # NOTE(mrobinson) - Pinning click due to a unicode issue in black 6 | # can remove after black drops support for Python 3.6 7 | # ref: https://github.com/psf/black/issues/2964 8 | click>=8.1 9 | # NOTE(alan) - Added to cover the fact that it isn't specified in 10 | # starlette even though it's required for TestClient 11 | httpx 12 | flake8 13 | flake8-docstrings 14 | mypy 15 | pytest-cov 16 | pytest-mock 17 | pdf2image>=1.16.2 18 | huggingface_hub>=0.11.1 19 | ruff 20 | types-pyyaml 21 | -------------------------------------------------------------------------------- /requirements/test.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.12 3 | # by the following command: 4 | # 5 | # pip-compile requirements/test.in 6 | # 7 | anyio==4.9.0 8 | # via httpx 9 | black==25.1.0 10 | # via -r requirements/test.in 11 | certifi==2025.4.26 12 | # via 13 | # -c requirements/base.txt 14 | # httpcore 15 | # httpx 16 | # requests 17 | charset-normalizer==3.4.2 18 | # via 19 | # -c requirements/base.txt 20 | # requests 21 | click==8.2.0 22 | # via 23 | # -r requirements/test.in 24 | # black 25 | coverage[toml]==7.8.0 26 | # via 27 | # -r requirements/test.in 28 | # pytest-cov 29 | filelock==3.18.0 30 | # via 31 | # -c requirements/base.txt 32 | # huggingface-hub 33 | flake8==7.2.0 34 | # via 35 | # -r requirements/test.in 36 | # flake8-docstrings 37 | flake8-docstrings==1.7.0 38 | # via -r requirements/test.in 39 | fsspec==2025.3.2 40 | # via 41 | # -c requirements/base.txt 42 | # huggingface-hub 43 | h11==0.16.0 44 | # via httpcore 45 | httpcore==1.0.9 46 | # via httpx 47 | httpx==0.28.1 48 | # via -r requirements/test.in 49 | huggingface-hub==0.31.2 50 | # via 51 | # -c requirements/base.txt 52 | # -r requirements/test.in 53 | idna==3.10 54 | # via 55 | # -c requirements/base.txt 56 | # anyio 57 | # httpx 58 | # requests 59 | iniconfig==2.1.0 60 | # via pytest 61 | mccabe==0.7.0 62 | # via flake8 63 | mypy==1.15.0 64 | # via -r requirements/test.in 65 | mypy-extensions==1.1.0 66 | # via 67 | # black 68 | # mypy 69 | packaging==25.0 70 | # via 71 | # -c requirements/base.txt 72 | # black 73 | # huggingface-hub 74 | # pytest 75 | pathspec==0.12.1 76 | # via black 77 | pdf2image==1.17.0 78 | #
via -r requirements/test.in 79 | pillow==11.2.1 80 | # via 81 | # -c requirements/base.txt 82 | # pdf2image 83 | platformdirs==4.3.8 84 | # via black 85 | pluggy==1.6.0 86 | # via pytest 87 | pycodestyle==2.13.0 88 | # via flake8 89 | pydocstyle==6.3.0 90 | # via flake8-docstrings 91 | pyflakes==3.3.2 92 | # via flake8 93 | pytest==8.3.5 94 | # via 95 | # pytest-cov 96 | # pytest-mock 97 | pytest-cov==6.1.1 98 | # via -r requirements/test.in 99 | pytest-mock==3.14.0 100 | # via -r requirements/test.in 101 | pyyaml==6.0.2 102 | # via 103 | # -c requirements/base.txt 104 | # huggingface-hub 105 | requests==2.32.3 106 | # via 107 | # -c requirements/base.txt 108 | # huggingface-hub 109 | ruff==0.11.10 110 | # via -r requirements/test.in 111 | sniffio==1.3.1 112 | # via anyio 113 | snowballstemmer==3.0.1 114 | # via pydocstyle 115 | tqdm==4.67.1 116 | # via 117 | # -c requirements/base.txt 118 | # huggingface-hub 119 | types-pyyaml==6.0.12.20250402 120 | # via -r requirements/test.in 121 | typing-extensions==4.13.2 122 | # via 123 | # -c requirements/base.txt 124 | # anyio 125 | # huggingface-hub 126 | # mypy 127 | urllib3==2.4.0 128 | # via 129 | # -c requirements/base.txt 130 | # requests 131 | -------------------------------------------------------------------------------- /sample-docs/2023-Jan-economic-outlook.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-inference/a0df407c3f7143fa66a9ad8bb40a6ee06907ce5a/sample-docs/2023-Jan-economic-outlook.pdf -------------------------------------------------------------------------------- /sample-docs/IRS-form-1987.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-inference/a0df407c3f7143fa66a9ad8bb40a6ee06907ce5a/sample-docs/IRS-form-1987.pdf -------------------------------------------------------------------------------- /sample-docs/RGBA_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-inference/a0df407c3f7143fa66a9ad8bb40a6ee06907ce5a/sample-docs/RGBA_image.png -------------------------------------------------------------------------------- /sample-docs/Silent-Giant.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-inference/a0df407c3f7143fa66a9ad8bb40a6ee06907ce5a/sample-docs/Silent-Giant.pdf -------------------------------------------------------------------------------- /sample-docs/design-thinking.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-inference/a0df407c3f7143fa66a9ad8bb40a6ee06907ce5a/sample-docs/design-thinking.pdf -------------------------------------------------------------------------------- /sample-docs/easy_table.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-inference/a0df407c3f7143fa66a9ad8bb40a6ee06907ce5a/sample-docs/easy_table.jpg -------------------------------------------------------------------------------- /sample-docs/embedded-images.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Unstructured-IO/unstructured-inference/a0df407c3f7143fa66a9ad8bb40a6ee06907ce5a/sample-docs/embedded-images.pdf -------------------------------------------------------------------------------- /sample-docs/empty-document.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-inference/a0df407c3f7143fa66a9ad8bb40a6ee06907ce5a/sample-docs/empty-document.pdf -------------------------------------------------------------------------------- /sample-docs/example_table.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-inference/a0df407c3f7143fa66a9ad8bb40a6ee06907ce5a/sample-docs/example_table.jpg -------------------------------------------------------------------------------- /sample-docs/ilpa-example-1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-inference/a0df407c3f7143fa66a9ad8bb40a6ee06907ce5a/sample-docs/ilpa-example-1.jpg -------------------------------------------------------------------------------- /sample-docs/layout-parser-paper-fast.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-inference/a0df407c3f7143fa66a9ad8bb40a6ee06907ce5a/sample-docs/layout-parser-paper-fast.jpg -------------------------------------------------------------------------------- /sample-docs/layout-parser-paper-fast.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-inference/a0df407c3f7143fa66a9ad8bb40a6ee06907ce5a/sample-docs/layout-parser-paper-fast.pdf -------------------------------------------------------------------------------- /sample-docs/layout-parser-paper.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-inference/a0df407c3f7143fa66a9ad8bb40a6ee06907ce5a/sample-docs/layout-parser-paper.pdf -------------------------------------------------------------------------------- /sample-docs/loremipsum-flat.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-inference/a0df407c3f7143fa66a9ad8bb40a6ee06907ce5a/sample-docs/loremipsum-flat.pdf -------------------------------------------------------------------------------- /sample-docs/loremipsum.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-inference/a0df407c3f7143fa66a9ad8bb40a6ee06907ce5a/sample-docs/loremipsum.jpg -------------------------------------------------------------------------------- /sample-docs/loremipsum.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-inference/a0df407c3f7143fa66a9ad8bb40a6ee06907ce5a/sample-docs/loremipsum.pdf -------------------------------------------------------------------------------- /sample-docs/loremipsum.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Unstructured-IO/unstructured-inference/a0df407c3f7143fa66a9ad8bb40a6ee06907ce5a/sample-docs/loremipsum.png -------------------------------------------------------------------------------- /sample-docs/loremipsum.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-inference/a0df407c3f7143fa66a9ad8bb40a6ee06907ce5a/sample-docs/loremipsum.tiff -------------------------------------------------------------------------------- /sample-docs/loremipsum_multipage.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-inference/a0df407c3f7143fa66a9ad8bb40a6ee06907ce5a/sample-docs/loremipsum_multipage.pdf -------------------------------------------------------------------------------- /sample-docs/non-embedded.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-inference/a0df407c3f7143fa66a9ad8bb40a6ee06907ce5a/sample-docs/non-embedded.pdf -------------------------------------------------------------------------------- /sample-docs/password.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-inference/a0df407c3f7143fa66a9ad8bb40a6ee06907ce5a/sample-docs/password.pdf -------------------------------------------------------------------------------- /sample-docs/patent-1p.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-inference/a0df407c3f7143fa66a9ad8bb40a6ee06907ce5a/sample-docs/patent-1p.pdf -------------------------------------------------------------------------------- /sample-docs/patent.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-inference/a0df407c3f7143fa66a9ad8bb40a6ee06907ce5a/sample-docs/patent.pdf -------------------------------------------------------------------------------- /sample-docs/pdf2image-memory-error-test-400p.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-inference/a0df407c3f7143fa66a9ad8bb40a6ee06907ce5a/sample-docs/pdf2image-memory-error-test-400p.pdf -------------------------------------------------------------------------------- /sample-docs/recalibrating-risk-report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-inference/a0df407c3f7143fa66a9ad8bb40a6ee06907ce5a/sample-docs/recalibrating-risk-report.pdf -------------------------------------------------------------------------------- /sample-docs/receipt-sample.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-inference/a0df407c3f7143fa66a9ad8bb40a6ee06907ce5a/sample-docs/receipt-sample.jpg -------------------------------------------------------------------------------- /sample-docs/table-multi-row-column-cells.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Unstructured-IO/unstructured-inference/a0df407c3f7143fa66a9ad8bb40a6ee06907ce5a/sample-docs/table-multi-row-column-cells.png -------------------------------------------------------------------------------- /sample-docs/test-image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/unstructured-inference/a0df407c3f7143fa66a9ad8bb40a6ee06907ce5a/sample-docs/test-image.jpg -------------------------------------------------------------------------------- /scripts/docker-build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -euo pipefail 4 | PIP_VERSION="${PIP_VERSION:-23.1.2}" 5 | DOCKER_IMAGE="unstructured-inference:dev" 6 | 7 | DOCKER_BUILD_CMD=(docker buildx build --load -f Dockerfile \ 8 | --build-arg PIP_VERSION="$PIP_VERSION" \ 9 | --build-arg BUILDKIT_INLINE_CACHE=1 \ 10 | --progress plain \ 11 | -t "$DOCKER_IMAGE" .) 12 | 13 | DOCKER_BUILDKIT=1 "${DOCKER_BUILD_CMD[@]}" -------------------------------------------------------------------------------- /scripts/shellcheck.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | find scripts -name "*.sh" -exec shellcheck {} + 4 | 5 | -------------------------------------------------------------------------------- /scripts/test-unstructured-ingest-helper.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This is intended to be run from an unstructured checkout, not in this repo 4 | # The goal here is to see what changes the current branch would introduce to unstructured 5 | # fixtures 6 | 7 | INGEST_COMMANDS=( 8 | test_unstructured_ingest/src/azure.sh 9 | test_unstructured_ingest/src/biomed-api.sh 10 | test_unstructured_ingest/src/biomed-path.sh 11 | test_unstructured_ingest/src/box.sh 12 | test_unstructured_ingest/src/dropbox.sh 13 | test_unstructured_ingest/src/gcs.sh 14 | test_unstructured_ingest/src/onedrive.sh 15 | test_unstructured_ingest/src/s3.sh 16 | ) 17 | 18 | EXIT_STATUSES=() 19 | 20 | # Run each command and capture its exit status 21 | for INGEST_COMMAND in "${INGEST_COMMANDS[@]}"; do 22 | $INGEST_COMMAND 23 | EXIT_STATUSES+=($?) 24 | done 25 | 26 | # Check for failures 27 | for STATUS in "${EXIT_STATUSES[@]}"; do 28 | if [[ $STATUS -ne 0 ]]; then 29 | echo "At least one ingest command failed! 
Scroll up to see which" 29 | exit 1 30 | fi 31 | done 32 | 33 | echo "No diffs resulted from any ingest commands" 34 | 35 | -------------------------------------------------------------------------------- /scripts/version-sync.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | function usage { 3 | echo "Usage: $(basename "$0") [-c] -f FILE_TO_CHANGE REPLACEMENT_FORMAT [-f FILE_TO_CHANGE REPLACEMENT_FORMAT ...]" 2>&1 4 | echo 'Synchronize files to latest version in source file' 5 | echo ' -s Specifies source file for version (default is CHANGELOG.md)' 6 | echo ' -f Specifies a file to change and the format for searching and replacing versions' 7 | echo ' FILE_TO_CHANGE is the file to be updated/checked for updates' 8 | echo ' REPLACEMENT_FORMAT is one of (semver, release, api-release)' 9 | echo ' semver indicates to look for a full semver version and replace with the latest full version' 10 | echo ' release indicates to look for a release semver version (x.x.x) and replace with the latest release version' 11 | echo ' api-release indicates to look for a release semver version in the context of an api route and replace with the latest release version' 12 | echo ' -c Compare versions and output proposed changes without changing anything.' 13 | } 14 | 15 | function getopts-extra () { 16 | declare -i i=1 17 | # if the next argument is not an option, then append it to array OPTARG 18 | while [[ ${OPTIND} -le $# && ${!OPTIND:0:1} != '-' ]]; do 19 | OPTARG[i]=${!OPTIND} 20 | ((i += 1)) 21 | ((OPTIND += 1)) 22 | done 23 | } 24 | 25 | # Parse input options 26 | declare CHECK=0 27 | declare SOURCE_FILE="CHANGELOG.md" 28 | declare -a FILES_TO_CHECK=() 29 | declare -a REPLACEMENT_FORMATS=() 30 | declare args 31 | declare OPTIND OPTARG opt 32 | while getopts ":hcs:f:" opt; do 33 | case $opt in 34 | h) 35 | usage 36 | exit 0 37 | ;; 38 | c) 39 | CHECK=1 40 | ;; 41 | s) 42 | SOURCE_FILE="$OPTARG" 43 | ;; 44 | f) 45 | getopts-extra "$@" 46 | args=( "${OPTARG[@]}" ) 47 | # validate length of args, should be 2 48 | if [ ${#args[@]} -eq 2 ]; then 49 | FILES_TO_CHECK+=( "${args[0]}" ) 50 | REPLACEMENT_FORMATS+=( "${args[1]}" ) 51 | else 52 | echo "Exactly 2 arguments must follow -f option." >&2 53 | exit 1 54 | fi 55 | ;; 56 | \?) 57 | echo "Invalid option: -$OPTARG." >&2 58 | usage 59 | exit 1 60 | ;; 61 | esac 62 | done 63 | 64 | # Parse REPLACEMENT_FORMATS 65 | RE_SEMVER_FULL="(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)(-((0|[1-9][0-9]*|[0-9]*[a-zA-Z-][0-9a-zA-Z-]*)(\.(0|[1-9][0-9]*|[0-9]*[a-zA-Z-][0-9a-zA-Z-]*))*))?(\+([0-9a-zA-Z-]+(\.[0-9a-zA-Z-]+)*))?" 66 | RE_RELEASE="(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)" 67 | RE_API_RELEASE="v(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)" 68 | # Pull out semver appearing earliest in SOURCE_FILE.
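# NOTE: illustrative example, not part of the original script. With the default
# SOURCE_FILE (CHANGELOG.md), a first entry like "## 1.2.3-dev4" matches the
# full-semver grep below (LAST_VERSION="1.2.3-dev4"), while the release grep
# rejects it because of the trailing "-dev4" and keeps scanning for the first
# plain x.y.z version. A typical check-only invocation might look like:
#   scripts/version-sync.sh -c -f unstructured_inference/__version__.py semver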
69 | LAST_VERSION=$(grep -o -m 1 -E "${RE_SEMVER_FULL}" "$SOURCE_FILE") 70 | LAST_RELEASE=$(grep -o -m 1 -E "${RE_RELEASE}($|[^-+])" "$SOURCE_FILE" | grep -o -m 1 -E "${RE_RELEASE}") 71 | LAST_API_RELEASE="v$(grep -o -m 1 -E "${RE_RELEASE}($|[^-+])$" "$SOURCE_FILE" | grep -o -m 1 -E "${RE_RELEASE}")" 72 | declare -a RE_SEMVERS=() 73 | declare -a UPDATED_VERSIONS=() 74 | for i in "${!REPLACEMENT_FORMATS[@]}"; do 75 | REPLACEMENT_FORMAT=${REPLACEMENT_FORMATS[$i]} 76 | case $REPLACEMENT_FORMAT in 77 | semver) 78 | RE_SEMVERS+=( "$RE_SEMVER_FULL" ) 79 | UPDATED_VERSIONS+=( "$LAST_VERSION" ) 80 | ;; 81 | release) 82 | RE_SEMVERS+=( "$RE_RELEASE" ) 83 | UPDATED_VERSIONS+=( "$LAST_RELEASE" ) 84 | ;; 85 | api-release) 86 | RE_SEMVERS+=( "$RE_API_RELEASE" ) 87 | UPDATED_VERSIONS+=( "$LAST_API_RELEASE" ) 88 | ;; 89 | *) 90 | echo "Invalid replacement format: \"${REPLACEMENT_FORMAT}\". Use semver, release, or api-release" >&2 91 | exit 1 92 | ;; 93 | esac 94 | done 95 | 96 | if [ -z "$LAST_VERSION" ]; 97 | then 98 | # No match to semver regex in SOURCE_FILE, so no version to go from. 99 | printf "Error: Unable to find latest version from %s.\n" "$SOURCE_FILE" 100 | exit 1 101 | fi 102 | 103 | # Search files in FILES_TO_CHECK and change (or get diffs) 104 | declare FAILED_CHECK=0 105 | 106 | for i in "${!FILES_TO_CHECK[@]}"; do 107 | FILE_TO_CHANGE=${FILES_TO_CHECK[$i]} 108 | RE_SEMVER=${RE_SEMVERS[$i]} 109 | UPDATED_VERSION=${UPDATED_VERSIONS[$i]} 110 | FILE_VERSION=$(grep -o -m 1 -E "${RE_SEMVER}" "$FILE_TO_CHANGE") 111 | if [ -z "$FILE_VERSION" ]; 112 | then 113 | # No match to semver regex in VERSIONFILE, so nothing to replace 114 | printf "Error: No semver version found in file %s.\n" "$FILE_TO_CHANGE" 115 | exit 1 116 | else 117 | # Replace semver in VERSIONFILE with semver obtained from SOURCE_FILE 118 | TMPFILE=$(mktemp /tmp/new_version.XXXXXX) 119 | # Check sed version, exit if version < 4.3 120 | if ! sed --version > /dev/null 2>&1; then 121 | CURRENT_VERSION=1.archaic 122 | else 123 | CURRENT_VERSION=$(sed --version | head -n1 | cut -d" " -f4) 124 | fi 125 | REQUIRED_VERSION="4.3" 126 | if [ "$(printf '%s\n' "$REQUIRED_VERSION" "$CURRENT_VERSION" | sort -V | head -n1)" != "$REQUIRED_VERSION" ]; then 127 | echo "sed version must be >= ${REQUIRED_VERSION}" && exit 1 128 | fi 129 | sed -E -r "s/$RE_SEMVER/$UPDATED_VERSION/" "$FILE_TO_CHANGE" > "$TMPFILE" 130 | if [ $CHECK == 1 ]; 131 | then 132 | DIFF=$(diff "$FILE_TO_CHANGE" "$TMPFILE" ) 133 | if [ -z "$DIFF" ]; 134 | then 135 | printf "version sync would make no changes to %s.\n" "$FILE_TO_CHANGE" 136 | rm "$TMPFILE" 137 | else 138 | FAILED_CHECK=1 139 | printf "version sync would make the following changes to %s:\n%s\n" "$FILE_TO_CHANGE" "$DIFF" 140 | rm "$TMPFILE" 141 | fi 142 | else 143 | cp "$TMPFILE" "$FILE_TO_CHANGE" 144 | rm "$TMPFILE" 145 | fi 146 | fi 147 | done 148 | 149 | # Exit with code determined by whether changes were needed in a check. 
150 | if [ ${FAILED_CHECK} -ne 0 ]; then 151 | exit 1 152 | else 153 | exit 0 154 | fi 155 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | license_files = LICENSE.md 3 | 4 | [flake8] 5 | max-line-length = 100 6 | extend-ignore = D100, D101, D104, D105, D107, D2, D4 7 | per-file-ignores = 8 | test_*/**: D 9 | 10 | [tool:pytest] 11 | filterwarnings = 12 | ignore::DeprecationWarning 13 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | setup.py 3 | 4 | unstructured_inference - Tools to utilize trained models 5 | 6 | Copyright 2022 Unstructured Technologies, Inc. 7 | 8 | Licensed under the Apache License, Version 2.0 (the "License"); 9 | you may not use this file except in compliance with the License. 10 | You may obtain a copy of the License at 11 | 12 | http://www.apache.org/licenses/LICENSE-2.0 13 | 14 | Unless required by applicable law or agreed to in writing, software 15 | distributed under the License is distributed on an "AS IS" BASIS, 16 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | See the License for the specific language governing permissions and 18 | limitations under the License. 19 | """ 20 | from typing import List, Optional, Union 21 | 22 | from setuptools import find_packages, setup 23 | 24 | from unstructured_inference.__version__ import __version__ 25 | 26 | 27 | def load_requirements(file_list: Optional[Union[str, List[str]]] = None): 28 | """Loads the requirements from a .in file or list of .in files.""" 29 | if file_list is None: 30 | file_list = ["requirements/base.in"] 31 | if isinstance(file_list, str): 32 | file_list = [file_list] 33 | requirements: List[str] = [] 34 | for file in file_list: 35 | with open(file, encoding="utf-8") as f: 36 | requirements.extend(f.readlines()) 37 | requirements = [ 38 | req for req in requirements if not req.startswith("#") and not req.startswith("-") 39 | ] 40 | return requirements 41 | 42 | 43 | def load_text_from_file(filename: str): 44 | """Retrieves text from a file.""" 45 | with open(filename, encoding="utf-8") as fp: 46 | description = fp.read() 47 | return description 48 | 49 | 50 | setup( 51 | name="unstructured_inference", 52 | description="A library for performing inference using trained models.", 53 | long_description=load_text_from_file("README.md"), 54 | long_description_content_type="text/markdown", 55 | keywords="NLP PDF HTML CV XML parsing preprocessing", 56 | url="https://github.com/Unstructured-IO/unstructured-inference", 57 | python_requires=">=3.7.0", 58 | classifiers=[ 59 | "Development Status :: 4 - Beta", 60 | "Intended Audience :: Developers", 61 | "Intended Audience :: Education", 62 | "Intended Audience :: Science/Research", 63 | "License :: OSI Approved :: Apache Software License", 64 | "Operating System :: OS Independent", 65 | "Programming Language :: Python :: 3", 66 | "Programming Language :: Python :: 3.8", 67 | "Programming Language :: Python :: 3.9", 68 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 69 | ], 70 | author="Unstructured Technologies", 71 | author_email="devops@unstructuredai.io", 72 | license="Apache-2.0", 73 | packages=find_packages(), 74 | version=__version__, 75 | entry_points={}, 76 | install_requires=load_requirements(), 77 | ) 78 | 
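A minimal sketch (not from the repo) of the filtering that load_requirements above performs; the input lines here are invented for illustration, but the list comprehension mirrors the one in setup.py:

# Hypothetical lines, as readlines() would return them from a requirements .in file
lines = ["-c constraints.in\n", "# a pinned-version comment\n", "pdfminer-six\n", "torch\n"]
# Same filter as load_requirements(): comments ("#") and pip directives ("-c", "-r") are dropped
requirements = [req for req in lines if not req.startswith("#") and not req.startswith("-")]
assert requirements == ["pdfminer-six\n", "torch\n"]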
-------------------------------------------------------------------------------- /test_unstructured_inference/conftest.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | from PIL import Image 4 | 5 | from unstructured_inference.inference.elements import ( 6 | EmbeddedTextRegion, 7 | Rectangle, 8 | TextRegion, 9 | ) 10 | from unstructured_inference.inference.layoutelement import LayoutElement 11 | 12 | 13 | @pytest.fixture() 14 | def mock_pil_image(): 15 | return Image.new("RGB", (50, 50)) 16 | 17 | 18 | @pytest.fixture() 19 | def mock_numpy_image(): 20 | return np.zeros((50, 50, 3), np.uint8) 21 | 22 | 23 | @pytest.fixture() 24 | def mock_rectangle(): 25 | return Rectangle(100, 100, 300, 300) 26 | 27 | 28 | @pytest.fixture() 29 | def mock_text_region(): 30 | return TextRegion.from_coords(100, 100, 300, 300, text="Sample text") 31 | 32 | 33 | @pytest.fixture() 34 | def mock_layout_element(): 35 | return LayoutElement.from_coords( 36 | 100, 37 | 100, 38 | 300, 39 | 300, 40 | text="Sample text", 41 | source=None, 42 | type="Text", 43 | ) 44 | 45 | 46 | @pytest.fixture() 47 | def mock_embedded_text_regions(): 48 | return [ 49 | EmbeddedTextRegion.from_coords( 50 | x1=453.00277777777774, 51 | y1=317.319341111111, 52 | x2=711.5338541666665, 53 | y2=358.28571222222206, 54 | text="LayoutParser:", 55 | ), 56 | EmbeddedTextRegion.from_coords( 57 | x1=726.4778125, 58 | y1=317.319341111111, 59 | x2=760.3308594444444, 60 | y2=357.1698966666667, 61 | text="A", 62 | ), 63 | EmbeddedTextRegion.from_coords( 64 | x1=775.2748177777777, 65 | y1=317.319341111111, 66 | x2=917.3579885555555, 67 | y2=357.1698966666667, 68 | text="Unified", 69 | ), 70 | EmbeddedTextRegion.from_coords( 71 | x1=932.3019468888888, 72 | y1=317.319341111111, 73 | x2=1071.8426522222221, 74 | y2=357.1698966666667, 75 | text="Toolkit", 76 | ), 77 | EmbeddedTextRegion.from_coords( 78 | x1=1086.7866105555556, 79 | y1=317.319341111111, 80 | x2=1141.2105142777777, 81 | y2=357.1698966666667, 82 | text="for", 83 | ), 84 | EmbeddedTextRegion.from_coords( 85 | x1=1156.154472611111, 86 | y1=317.319341111111, 87 | x2=1256.334784222222, 88 | y2=357.1698966666667, 89 | text="Deep", 90 | ), 91 | EmbeddedTextRegion.from_coords( 92 | x1=437.83888888888885, 93 | y1=367.13322999999986, 94 | x2=610.0171992222222, 95 | y2=406.9837855555556, 96 | text="Learning", 97 | ), 98 | EmbeddedTextRegion.from_coords( 99 | x1=624.9611575555555, 100 | y1=367.13322999999986, 101 | x2=741.6754646666665, 102 | y2=406.9837855555556, 103 | text="Based", 104 | ), 105 | EmbeddedTextRegion.from_coords( 106 | x1=756.619423, 107 | y1=367.13322999999986, 108 | x2=958.3867708333332, 109 | y2=406.9837855555556, 110 | text="Document", 111 | ), 112 | EmbeddedTextRegion.from_coords( 113 | x1=973.3307291666665, 114 | y1=367.13322999999986, 115 | x2=1092.0535042777776, 116 | y2=406.9837855555556, 117 | text="Image", 118 | ), 119 | ] 120 | 121 | 122 | # TODO(alan): Make a better test layout 123 | @pytest.fixture() 124 | def mock_layout(mock_embedded_text_regions): 125 | return [ 126 | LayoutElement(text=r.text, type="UncategorizedText", bbox=r.bbox) 127 | for r in mock_embedded_text_regions 128 | ] 129 | 130 | 131 | @pytest.fixture() 132 | def example_table_cells(): 133 | cells = [ 134 | {"cell text": "Disability Category", "row_nums": [0, 1], "column_nums": [0]}, 135 | {"cell text": "Participants", "row_nums": [0, 1], "column_nums": [1]}, 136 | {"cell text": "Ballots Completed", "row_nums": [0, 1], 
"column_nums": [2]}, 137 | {"cell text": "Ballots Incomplete/Terminated", "row_nums": [0, 1], "column_nums": [3]}, 138 | {"cell text": "Results", "row_nums": [0], "column_nums": [4, 5]}, 139 | {"cell text": "Accuracy", "row_nums": [1], "column_nums": [4]}, 140 | {"cell text": "Time to complete", "row_nums": [1], "column_nums": [5]}, 141 | {"cell text": "Blind", "row_nums": [2], "column_nums": [0]}, 142 | {"cell text": "Low Vision", "row_nums": [3], "column_nums": [0]}, 143 | {"cell text": "Dexterity", "row_nums": [4], "column_nums": [0]}, 144 | {"cell text": "Mobility", "row_nums": [5], "column_nums": [0]}, 145 | {"cell text": "5", "row_nums": [2], "column_nums": [1]}, 146 | {"cell text": "5", "row_nums": [3], "column_nums": [1]}, 147 | {"cell text": "5", "row_nums": [4], "column_nums": [1]}, 148 | {"cell text": "3", "row_nums": [5], "column_nums": [1]}, 149 | {"cell text": "1", "row_nums": [2], "column_nums": [2]}, 150 | {"cell text": "2", "row_nums": [3], "column_nums": [2]}, 151 | {"cell text": "4", "row_nums": [4], "column_nums": [2]}, 152 | {"cell text": "3", "row_nums": [5], "column_nums": [2]}, 153 | {"cell text": "4", "row_nums": [2], "column_nums": [3]}, 154 | {"cell text": "3", "row_nums": [3], "column_nums": [3]}, 155 | {"cell text": "1", "row_nums": [4], "column_nums": [3]}, 156 | {"cell text": "0", "row_nums": [5], "column_nums": [3]}, 157 | {"cell text": "34.5%, n=1", "row_nums": [2], "column_nums": [4]}, 158 | {"cell text": "98.3% n=2 (97.7%, n=3)", "row_nums": [3], "column_nums": [4]}, 159 | {"cell text": "98.3%, n=4", "row_nums": [4], "column_nums": [4]}, 160 | {"cell text": "95.4%, n=3", "row_nums": [5], "column_nums": [4]}, 161 | {"cell text": "1199 sec, n=1", "row_nums": [2], "column_nums": [5]}, 162 | {"cell text": "1716 sec, n=3 (1934 sec, n=2)", "row_nums": [3], "column_nums": [5]}, 163 | {"cell text": "1672.1 sec, n=4", "row_nums": [4], "column_nums": [5]}, 164 | {"cell text": "1416 sec, n=3", "row_nums": [5], "column_nums": [5]}, 165 | ] 166 | for i in range(len(cells)): 167 | cells[i]["column header"] = False 168 | return [cells] 169 | -------------------------------------------------------------------------------- /test_unstructured_inference/inference/test_layout_element.py: -------------------------------------------------------------------------------- 1 | from unstructured_inference.inference.layoutelement import LayoutElement, TextRegion 2 | 3 | 4 | def test_layout_element_do_dict(mock_layout_element): 5 | expected = { 6 | "coordinates": ((100, 100), (100, 300), (300, 300), (300, 100)), 7 | "text": "Sample text", 8 | "type": "Text", 9 | "prob": None, 10 | "source": None, 11 | } 12 | 13 | assert mock_layout_element.to_dict() == expected 14 | 15 | 16 | def test_layout_element_from_region(mock_rectangle): 17 | expected = LayoutElement.from_coords(100, 100, 300, 300) 18 | region = TextRegion(bbox=mock_rectangle) 19 | 20 | assert LayoutElement.from_region(region) == expected 21 | -------------------------------------------------------------------------------- /test_unstructured_inference/models/test_detectron2onnx.py: -------------------------------------------------------------------------------- 1 | import os 2 | from unittest.mock import patch 3 | 4 | import pytest 5 | from PIL import Image 6 | 7 | import unstructured_inference.models.base as models 8 | import unstructured_inference.models.detectron2onnx as detectron2 9 | 10 | 11 | class MockDetectron2ONNXLayoutModel: 12 | def __init__(self, *args, **kwargs): 13 | self.args = args 14 | self.kwargs = kwargs 15 
| 16 | def run(self, *args): 17 | return ([(1, 2, 3, 4)], [0], [(4, 5)], [0.818]) 18 | 19 | def get_inputs(self): 20 | class input_thing: 21 | name = "Bernard" 22 | 23 | return [input_thing()] 24 | 25 | 26 | def test_load_default_model(monkeypatch): 27 | monkeypatch.setattr(models, "models", {}) 28 | with patch.object( 29 | detectron2.onnxruntime, 30 | "InferenceSession", 31 | new=MockDetectron2ONNXLayoutModel, 32 | ): 33 | model = models.get_model("detectron2_mask_rcnn") 34 | 35 | assert isinstance(model.model, MockDetectron2ONNXLayoutModel) 36 | 37 | 38 | @pytest.mark.parametrize(("model_path", "label_map"), [("asdf", "diufs"), ("dfaw", "hfhfhfh")]) 39 | def test_load_model(model_path, label_map): 40 | with patch.object(detectron2.onnxruntime, "InferenceSession", return_value=True): 41 | model = detectron2.UnstructuredDetectronONNXModel() 42 | model.initialize(model_path=model_path, label_map=label_map) 43 | args, _ = detectron2.onnxruntime.InferenceSession.call_args 44 | assert args == (model_path,) 45 | assert label_map == model.label_map 46 | 47 | 48 | def test_unstructured_detectron_model(): 49 | model = detectron2.UnstructuredDetectronONNXModel() 50 | model.model = 1 51 | with patch.object(detectron2.UnstructuredDetectronONNXModel, "predict", return_value=[]): 52 | result = model(None) 53 | assert isinstance(result, list) 54 | assert len(result) == 0 55 | 56 | 57 | def test_inference(): 58 | with patch.object( 59 | detectron2.onnxruntime, 60 | "InferenceSession", 61 | return_value=MockDetectron2ONNXLayoutModel(), 62 | ): 63 | model = detectron2.UnstructuredDetectronONNXModel() 64 | model.initialize(model_path="test_path", label_map={0: "test_class"}) 65 | assert isinstance(model.model, MockDetectron2ONNXLayoutModel) 66 | with open(os.path.join("sample-docs", "receipt-sample.jpg"), mode="rb") as fp: 67 | image = Image.open(fp) 68 | image.load() 69 | elements = model(image) 70 | assert len(elements) == 1 71 | element = elements[0] 72 | (x1, y1), _, (x2, y2), _ = element.bbox.coordinates 73 | assert hasattr( 74 | element, 75 | "prob", 76 | ) # NOTE(pravin) New Assertion to Make Sure element has probabilities 77 | assert isinstance( 78 | element.prob, 79 | float, 80 | ) # NOTE(pravin) New Assertion to Make Sure Populated Probability is Float 81 | # NOTE(alan): The bbox coordinates get resized, so check their relative proportions 82 | assert x2 / x1 == pytest.approx(3.0) # x1 == 1, x2 == 3 before scaling 83 | assert y2 / y1 == pytest.approx(2.0) # y1 == 2, y2 == 4 before scaling 84 | assert element.type == "test_class" 85 | -------------------------------------------------------------------------------- /test_unstructured_inference/models/test_eval.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from unstructured_inference.inference.layoutelement import table_cells_to_dataframe 4 | from unstructured_inference.models.eval import compare_contents_as_df, default_tokenizer 5 | 6 | 7 | @pytest.fixture() 8 | def actual_cells(): 9 | return [ 10 | { 11 | "column_nums": [0], 12 | "row_nums": [0, 1], 13 | "column header": True, 14 | "cell text": "Disability Category", 15 | }, 16 | { 17 | "column_nums": [1], 18 | "row_nums": [0, 1], 19 | "column header": True, 20 | "cell text": "Participants", 21 | }, 22 | { 23 | "column_nums": [2], 24 | "row_nums": [0, 1], 25 | "column header": True, 26 | "cell text": "Ballots Completed", 27 | }, 28 | { 29 | "column_nums": [3], 30 | "row_nums": [0, 1], 31 | "column header": True, 32 | "cell text": 
"Ballots Incomplete/Terminated", 33 | }, 34 | {"column_nums": [4, 5], "row_nums": [0], "column header": True, "cell text": "Results"}, 35 | {"column_nums": [4], "row_nums": [1], "column header": False, "cell text": "Accuracy"}, 36 | { 37 | "column_nums": [5], 38 | "row_nums": [1], 39 | "column header": False, 40 | "cell text": "Time to complete", 41 | }, 42 | {"column_nums": [0], "row_nums": [2], "column header": False, "cell text": "Blind"}, 43 | {"column_nums": [0], "row_nums": [3], "column header": False, "cell text": "Low Vision"}, 44 | {"column_nums": [0], "row_nums": [4], "column header": False, "cell text": "Dexterity"}, 45 | {"column_nums": [0], "row_nums": [5], "column header": False, "cell text": "Mobility"}, 46 | {"column_nums": [1], "row_nums": [2], "column header": False, "cell text": "5"}, 47 | {"column_nums": [1], "row_nums": [3], "column header": False, "cell text": "5"}, 48 | {"column_nums": [1], "row_nums": [4], "column header": False, "cell text": "5"}, 49 | {"column_nums": [1], "row_nums": [5], "column header": False, "cell text": "3"}, 50 | {"column_nums": [2], "row_nums": [2], "column header": False, "cell text": "1"}, 51 | {"column_nums": [2], "row_nums": [3], "column header": False, "cell text": "2"}, 52 | {"column_nums": [2], "row_nums": [4], "column header": False, "cell text": "4"}, 53 | {"column_nums": [2], "row_nums": [5], "column header": False, "cell text": "3"}, 54 | {"column_nums": [3], "row_nums": [2], "column header": False, "cell text": "4"}, 55 | {"column_nums": [3], "row_nums": [3], "column header": False, "cell text": "3"}, 56 | {"column_nums": [3], "row_nums": [4], "column header": False, "cell text": "1"}, 57 | {"column_nums": [3], "row_nums": [5], "column header": False, "cell text": "0"}, 58 | {"column_nums": [4], "row_nums": [2], "column header": False, "cell text": "34.5%, n=1"}, 59 | { 60 | "column_nums": [4], 61 | "row_nums": [3], 62 | "column header": False, 63 | "cell text": "98.3% n=2 (97.7%, n=3)", 64 | }, 65 | {"column_nums": [4], "row_nums": [4], "column header": False, "cell text": "98.3%, n=4"}, 66 | {"column_nums": [4], "row_nums": [5], "column header": False, "cell text": "95.4%, n=3"}, 67 | {"column_nums": [5], "row_nums": [2], "column header": False, "cell text": "1199 sec, n=1"}, 68 | { 69 | "column_nums": [5], 70 | "row_nums": [3], 71 | "column header": False, 72 | "cell text": "1716 sec, n=3 (1934 sec, n=2)", 73 | }, 74 | { 75 | "column_nums": [5], 76 | "row_nums": [4], 77 | "column header": False, 78 | "cell text": "1672.1 sec, n=4", 79 | }, 80 | {"column_nums": [5], "row_nums": [5], "column header": False, "cell text": "1416 sec, n=3"}, 81 | ] 82 | 83 | 84 | @pytest.fixture() 85 | def pred_cells(): 86 | return [ 87 | {"column_nums": [0], "row_nums": [2], "column header": False, "cell text": "Blind"}, 88 | {"column_nums": [0], "row_nums": [3], "column header": False, "cell text": "Low Vision"}, 89 | {"column_nums": [0], "row_nums": [4], "column header": False, "cell text": "Dexterity"}, 90 | {"column_nums": [0], "row_nums": [5], "column header": False, "cell text": "Mobility"}, 91 | {"column_nums": [1], "row_nums": [2], "column header": False, "cell text": "5"}, 92 | {"column_nums": [1], "row_nums": [3], "column header": False, "cell text": "5"}, 93 | {"column_nums": [1], "row_nums": [4], "column header": False, "cell text": "5"}, 94 | {"column_nums": [1], "row_nums": [5], "column header": False, "cell text": "3"}, 95 | {"column_nums": [2], "row_nums": [2], "column header": False, "cell text": "1"}, 96 | {"column_nums": [2], 
"row_nums": [3], "column header": False, "cell text": "2"}, 97 | {"column_nums": [2], "row_nums": [4], "column header": False, "cell text": "4"}, 98 | {"column_nums": [2], "row_nums": [5], "column header": False, "cell text": "3"}, 99 | {"column_nums": [3], "row_nums": [2], "column header": False, "cell text": "4"}, 100 | {"column_nums": [3], "row_nums": [3], "column header": False, "cell text": "3"}, 101 | {"column_nums": [3], "row_nums": [4], "column header": False, "cell text": "1"}, 102 | {"column_nums": [3], "row_nums": [5], "column header": False, "cell text": "0"}, 103 | {"column_nums": [4], "row_nums": [1], "column header": False, "cell text": "Accuracy"}, 104 | {"column_nums": [4], "row_nums": [2], "column header": False, "cell text": "34.5%, n=1"}, 105 | { 106 | "column_nums": [4], 107 | "row_nums": [3], 108 | "column header": False, 109 | "cell text": "98.3% n=2 (97.7%, n=3)", 110 | }, 111 | {"column_nums": [4], "row_nums": [4], "column header": False, "cell text": "98.3%, n=4"}, 112 | {"column_nums": [4], "row_nums": [5], "column header": False, "cell text": "95.4%, n=3"}, 113 | { 114 | "column_nums": [5], 115 | "row_nums": [1], 116 | "column header": False, 117 | "cell text": "Time to complete", 118 | }, 119 | {"column_nums": [5], "row_nums": [2], "column header": False, "cell text": "1199 sec, n=1"}, 120 | { 121 | "column_nums": [5], 122 | "row_nums": [3], 123 | "column header": False, 124 | "cell text": "1716 sec, n=3 | (1934 sec, n=2)", 125 | }, 126 | { 127 | "column_nums": [5], 128 | "row_nums": [4], 129 | "column header": False, 130 | "cell text": "1672.1 sec, n=4", 131 | }, 132 | {"column_nums": [5], "row_nums": [5], "column header": False, "cell text": "1416 sec, n=3"}, 133 | { 134 | "column_nums": [0], 135 | "row_nums": [0, 1], 136 | "column header": True, 137 | "cell text": "soa etealeiliay Category", 138 | }, 139 | {"column_nums": [4, 5], "row_nums": [0], "column header": True, "cell text": "Results"}, 140 | { 141 | "column_nums": [1], 142 | "row_nums": [0, 1], 143 | "column header": True, 144 | "cell text": "Participants P", 145 | }, 146 | { 147 | "column_nums": [2], 148 | "row_nums": [0, 1], 149 | "column header": True, 150 | "cell text": "pallets Completed", 151 | }, 152 | { 153 | "column_nums": [3], 154 | "row_nums": [0, 1], 155 | "column header": True, 156 | "cell text": "Ballot: incom lete/ Ne Terminated", 157 | }, 158 | ] 159 | 160 | 161 | @pytest.fixture() 162 | def actual_df(actual_cells): 163 | return table_cells_to_dataframe(actual_cells).fillna("") 164 | 165 | 166 | @pytest.fixture() 167 | def pred_df(pred_cells): 168 | return table_cells_to_dataframe(pred_cells).fillna("") 169 | 170 | 171 | @pytest.mark.parametrize( 172 | ("eval_func", "processor"), 173 | [ 174 | ("token_ratio", default_tokenizer), 175 | ("token_ratio", None), 176 | ("partial_token_ratio", default_tokenizer), 177 | ("ratio", None), 178 | ("ratio", default_tokenizer), 179 | ("partial_ratio", default_tokenizer), 180 | ], 181 | ) 182 | def test_compare_content_as_df(actual_df, pred_df, eval_func, processor): 183 | results = compare_contents_as_df(actual_df, pred_df, eval_func=eval_func, processor=processor) 184 | assert 0 < results.get(f"by_col_{eval_func}") < 100 185 | 186 | 187 | def test_compare_content_as_df_with_invalid_input(actual_df, pred_df): 188 | with pytest.raises(ValueError, match="eval_func must be one of"): 189 | compare_contents_as_df(actual_df, pred_df, eval_func="foo") 190 | -------------------------------------------------------------------------------- 
/test_unstructured_inference/models/test_model.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Any 3 | from unittest import mock 4 | 5 | import numpy as np 6 | import pytest 7 | 8 | import unstructured_inference.models.base as models 9 | from unstructured_inference.inference.layoutelement import LayoutElement, LayoutElements 10 | from unstructured_inference.models.unstructuredmodel import ( 11 | ModelNotInitializedError, 12 | UnstructuredObjectDetectionModel, 13 | ) 14 | 15 | 16 | class MockModel(UnstructuredObjectDetectionModel): 17 | call_count = 0 18 | 19 | def __init__(self): 20 | self.initializer = mock.MagicMock() 21 | super().__init__() 22 | 23 | def initialize(self, *args, **kwargs): 24 | return self.initializer(self, *args, **kwargs) 25 | 26 | def predict(self, x: Any) -> Any: 27 | return LayoutElements(element_coords=np.array([])) 28 | 29 | 30 | MOCK_MODEL_TYPES = { 31 | "foo": { 32 | "input_shape": (640, 640), 33 | }, 34 | } 35 | 36 | 37 | def test_get_model(monkeypatch): 38 | monkeypatch.setattr(models, "models", {}) 39 | with mock.patch.dict(models.model_class_map, {"yolox": MockModel}): 40 | assert isinstance(models.get_model("yolox"), MockModel) 41 | 42 | 43 | def test_register_new_model(): 44 | assert "foo" not in models.model_class_map 45 | assert "foo" not in models.model_config_map 46 | models.register_new_model(MOCK_MODEL_TYPES, MockModel) 47 | assert "foo" in models.model_class_map 48 | assert "foo" in models.model_config_map 49 | model = models.get_model("foo") 50 | assert len(model.initializer.mock_calls) == 1 51 | assert model.initializer.mock_calls[0][-1] == MOCK_MODEL_TYPES["foo"] 52 | assert isinstance(model, MockModel) 53 | # unregister the new model by reset to default 54 | models.model_class_map, models.model_config_map = models.get_default_model_mappings() 55 | assert "foo" not in models.model_class_map 56 | assert "foo" not in models.model_config_map 57 | 58 | 59 | def test_raises_invalid_model(): 60 | with pytest.raises(models.UnknownModelException): 61 | models.get_model("fake_model") 62 | 63 | 64 | def test_raises_uninitialized(): 65 | with pytest.raises(ModelNotInitializedError): 66 | models.UnstructuredDetectronONNXModel().predict(None) 67 | 68 | 69 | def test_model_initializes_once(): 70 | from unstructured_inference.inference import layout 71 | 72 | with mock.patch.dict(models.model_class_map, {"yolox": MockModel}), mock.patch.object( 73 | models, 74 | "models", 75 | {}, 76 | ): 77 | doc = layout.DocumentLayout.from_file("sample-docs/loremipsum.pdf") 78 | doc.pages[0].detection_model.initializer.assert_called_once() 79 | 80 | 81 | def test_deduplicate_detected_elements(): 82 | import numpy as np 83 | 84 | from unstructured_inference.inference.elements import intersections 85 | from unstructured_inference.inference.layout import DocumentLayout 86 | from unstructured_inference.models.base import get_model 87 | 88 | model = get_model("yolox_quantized") 89 | # model.confidence_threshold=0.5 90 | file = "sample-docs/example_table.jpg" 91 | doc = DocumentLayout.from_image_file( 92 | file, 93 | model, 94 | ) 95 | known_elements = [e.bbox for e in doc.pages[0].elements if e.type != "UncategorizedText"] 96 | # Compute intersection matrix 97 | intersections_mtx = intersections(*known_elements) 98 | # Get rid off diagonal (cause an element will always intersect itself) 99 | np.fill_diagonal(intersections_mtx, False) 100 | # Now all the elements should be False, because any intersection 
remains 101 | assert not intersections_mtx.any() 102 | 103 | 104 | def test_enhance_regions(): 105 | from unstructured_inference.inference.elements import Rectangle 106 | from unstructured_inference.models.base import get_model 107 | 108 | elements = [ 109 | LayoutElement(bbox=Rectangle(0, 0, 1, 1)), 110 | LayoutElement(bbox=Rectangle(0.01, 0.01, 1.01, 1.01)), 111 | LayoutElement(bbox=Rectangle(0.02, 0.02, 1.02, 1.02)), 112 | LayoutElement(bbox=Rectangle(0.03, 0.03, 1.03, 1.03)), 113 | LayoutElement(bbox=Rectangle(0.04, 0.04, 1.04, 1.04)), 114 | LayoutElement(bbox=Rectangle(0.05, 0.05, 1.05, 1.05)), 115 | LayoutElement(bbox=Rectangle(0.06, 0.06, 1.06, 1.06)), 116 | LayoutElement(bbox=Rectangle(0.07, 0.07, 1.07, 1.07)), 117 | LayoutElement(bbox=Rectangle(0.08, 0.08, 1.08, 1.08)), 118 | LayoutElement(bbox=Rectangle(0.09, 0.09, 1.09, 1.09)), 119 | LayoutElement(bbox=Rectangle(0.10, 0.10, 1.10, 1.10)), 120 | ] 121 | model = get_model("yolox_tiny") 122 | elements = model.enhance_regions(elements, 0.5) 123 | assert len(elements) == 1 124 | assert ( 125 | elements[0].bbox.x1, 126 | elements[0].bbox.y1, 127 | elements[0].bbox.x2, 128 | elements[0].bbox.y2, 129 | ) == ( 130 | 0, 131 | 0, 132 | 1.10, 133 | 1.10, 134 | ) 135 | 136 | 137 | def test_clean_type(): 138 | from unstructured_inference.inference.layout import LayoutElement 139 | from unstructured_inference.models.base import get_model 140 | 141 | elements = [ 142 | LayoutElement.from_coords( 143 | 0.6, 144 | 0.6, 145 | 0.65, 146 | 0.65, 147 | type="Table", 148 | ), # One little table nested inside all the others 149 | LayoutElement.from_coords(0.5, 0.5, 0.7, 0.7, type="Table"), # One nested table 150 | LayoutElement.from_coords(0, 0, 1, 1, type="Table"), # Big table 151 | LayoutElement.from_coords(0.01, 0.01, 1.01, 1.01), 152 | LayoutElement.from_coords(0.02, 0.02, 1.02, 1.02), 153 | LayoutElement.from_coords(0.03, 0.03, 1.03, 1.03), 154 | LayoutElement.from_coords(0.04, 0.04, 1.04, 1.04), 155 | LayoutElement.from_coords(0.05, 0.05, 1.05, 1.05), 156 | ] 157 | model = get_model("yolox_tiny") 158 | elements = model.clean_type(elements, type_to_clean="Table") 159 | assert len(elements) == 1 160 | assert ( 161 | elements[0].bbox.x1, 162 | elements[0].bbox.y1, 163 | elements[0].bbox.x2, 164 | elements[0].bbox.y2, 165 | ) == (0, 0, 1, 1) 166 | 167 | 168 | def test_env_variables_override_default_model(monkeypatch): 169 | # When an environment variable specifies a different default model and we call get_model with no 170 | # args, we should get back the model the env var calls for 171 | monkeypatch.setattr(models, "models", {}) 172 | with mock.patch.dict( 173 | models.os.environ, 174 | {"UNSTRUCTURED_DEFAULT_MODEL_NAME": "yolox"}, 175 | ), mock.patch.dict(models.model_class_map, {"yolox": MockModel}): 176 | model = models.get_model() 177 | assert isinstance(model, MockModel) 178 | 179 | 180 | def test_env_variables_override_initialization_params(monkeypatch): 181 | # When initialization params are specified in an environment variable, and we call get_model, we 182 | # should see that the model was initialized with those params 183 | monkeypatch.setattr(models, "models", {}) 184 | fake_label_map = {"1": "label1", "2": "label2"} 185 | with mock.patch.dict( 186 | models.os.environ, 187 | {"UNSTRUCTURED_DEFAULT_MODEL_INITIALIZE_PARAMS_JSON_PATH": "fake_json.json"}, 188 | ), mock.patch.object(models, "DEFAULT_MODEL", "fake"), mock.patch.dict( 189 | models.model_class_map, 190 | {"fake": mock.MagicMock()}, 191 | ), mock.patch( 192 | "builtins.open", 193 |
mock.mock_open( 194 | read_data='{"model_path": "fakepath", "label_map": ' + json.dumps(fake_label_map) + "}", 195 | ), 196 | ): 197 | model = models.get_model() 198 | model.initialize.assert_called_once_with( 199 | model_path="fakepath", 200 | label_map={1: "label1", 2: "label2"}, 201 | ) 202 | -------------------------------------------------------------------------------- /test_unstructured_inference/models/test_yolox.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import pytest 4 | 5 | from unstructured_inference.inference.layout import process_file_with_model 6 | 7 | 8 | @pytest.mark.slow() 9 | def test_layout_yolox_local_parsing_image(): 10 | filename = os.path.join("sample-docs", "test-image.jpg") 11 | # NOTE(benjamin) keep_output = True create a file for each image in 12 | # localstorage for visualization of the result 13 | document_layout = process_file_with_model(filename, model_name="yolox", is_image=True) 14 | # NOTE(benjamin) The example image should result in one page result 15 | assert len(document_layout.pages) == 1 16 | # NOTE(benjamin) The example sent to the test contains 13 detections 17 | types_known = ["Text", "Section-header", "Page-header"] 18 | elements = document_layout.pages[0].elements_array 19 | known_regions = [ 20 | e for e in elements.element_class_ids if elements.element_class_id_map[e] in types_known 21 | ] 22 | assert len(known_regions) == 13 23 | # NOTE(pravin) New Assertion to Make Sure LayoutElement has probabilities 24 | assert hasattr(elements, "element_probs") 25 | assert isinstance( 26 | elements.element_probs[0], 27 | float, 28 | ) # NOTE(pravin) New Assertion to Make Sure Populated Probability is Float 29 | 30 | 31 | @pytest.mark.slow() 32 | def test_layout_yolox_local_parsing_pdf(): 33 | filename = os.path.join("sample-docs", "loremipsum.pdf") 34 | document_layout = process_file_with_model(filename, model_name="yolox") 35 | assert len(document_layout.pages) == 1 36 | # NOTE(benjamin) The example sent to the test contains 5 text detections 37 | text_elements = [e for e in document_layout.pages[0].elements if e.type == "Text"] 38 | assert len(text_elements) == 5 39 | assert hasattr( 40 | document_layout.pages[0].elements[0], 41 | "prob", 42 | ) # NOTE(pravin) New Assertion to Make Sure LayoutElement has probabilities 43 | assert isinstance( 44 | document_layout.pages[0].elements[0].prob, 45 | float, 46 | ) # NOTE(pravin) New Assertion to Make Sure Populated Probability is Float 47 | 48 | 49 | @pytest.mark.slow() 50 | def test_layout_yolox_local_parsing_empty_pdf(): 51 | filename = os.path.join("sample-docs", "empty-document.pdf") 52 | document_layout = process_file_with_model(filename, model_name="yolox") 53 | assert len(document_layout.pages) == 1 54 | # NOTE(benjamin) The example sent to the test contains 0 detections 55 | assert len(document_layout.pages[0].elements) == 0 56 | 57 | 58 | ######################## 59 | # ONLY SHORT TESTS BELOW 60 | ######################## 61 | 62 | 63 | def test_layout_yolox_local_parsing_image_soft(): 64 | filename = os.path.join("sample-docs", "example_table.jpg") 65 | # NOTE(benjamin) keep_output = True create a file for each image in 66 | # localstorage for visualization of the result 67 | document_layout = process_file_with_model(filename, model_name="yolox_quantized", is_image=True) 68 | # NOTE(benjamin) The example image should result in one page result 69 | assert len(document_layout.pages) == 1 70 | # NOTE(benjamin) Soft version of the test, run 
--------------------------------------------------------------------------------
/test_unstructured_inference/test_config.py:
--------------------------------------------------------------------------------
1 | def test_default_config():
2 |     from unstructured_inference.config import inference_config
3 | 
4 |     assert inference_config.TT_TABLE_CONF == 0.5
5 | 
6 | 
7 | def test_env_override(monkeypatch):
8 |     monkeypatch.setenv("TT_TABLE_CONF", "1")  # environment variable values must be strings
9 |     from unstructured_inference.config import inference_config
10 | 
11 |     assert inference_config.TT_TABLE_CONF == 1
--------------------------------------------------------------------------------
/test_unstructured_inference/test_logger.py:
--------------------------------------------------------------------------------
1 | import logging
2 | 
3 | import pytest
4 | 
5 | from unstructured_inference import logger
6 | 
7 | 
8 | @pytest.mark.parametrize("level", range(50))
9 | def test_translate_log_level(level):
10 |     level_name = logging.getLevelName(level)
11 |     if level_name in ["WARNING", "INFO", "DEBUG", "NOTSET", "WARN"]:
12 |         expected = 4
13 |     elif level_name in ["ERROR", "CRITICAL"]:
14 |         expected = 3
15 |     else:
16 |         expected = 0
17 |     assert logger.translate_log_level(level) == expected
--------------------------------------------------------------------------------
/test_unstructured_inference/test_math.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pytest
3 | 
4 | from unstructured_inference.math import FLOAT_EPSILON, safe_division
5 | 
6 | 
7 | @pytest.mark.parametrize(
8 |     ("a", "b", "expected"),
9 |     [(0, 0, 0), (0, 1, 0), (1, 0, np.round(1 / FLOAT_EPSILON, 1)), (2, 3, 0.7)],
10 | )
11 | def test_safe_division(a, b, expected):
12 |     assert np.round(safe_division(a, b), 1) == expected
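13 | 
14 | 
15 | # NOTE: a minimal reference sketch of `safe_division`, inferred from the cases
16 | # above; the real implementation lives in unstructured_inference/math.py and
17 | # may differ. Clamping the denominator to FLOAT_EPSILON (a tiny positive
18 | # float) keeps b == 0 from raising ZeroDivisionError while leaving ordinary
19 | # divisions effectively unchanged.
20 | def _reference_safe_division(a, b):
21 |     return a / max(b, FLOAT_EPSILON)
22 | 
23 | 
24 | @pytest.mark.parametrize(("a", "b"), [(0, 0), (0, 1), (1, 0), (2, 3)])
25 | def test_reference_matches_safe_division(a, b):
26 |     # Only the four cases exercised above are assumed to agree.
27 |     assert np.round(_reference_safe_division(a, b), 1) == np.round(safe_division(a, b), 1)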
--------------------------------------------------------------------------------
/test_unstructured_inference/test_utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pytest
3 | 
4 | from unstructured_inference.inference.layout import DocumentLayout
5 | from unstructured_inference.utils import (
6 |     LazyDict,
7 |     LazyEvaluateInfo,
8 |     pad_image_with_background_color,
9 |     strip_tags,
10 | )
11 | 
12 | 
13 | # Mock the DocumentLayout and PageLayout classes
14 | class MockPageLayout:
15 |     def annotate(self, annotation_data):
16 |         return "mock_image"
17 | 
18 | 
19 | class MockDocumentLayout(DocumentLayout):
20 |     @property
21 |     def pages(self):
22 |         return [MockPageLayout(), MockPageLayout()]
23 | 
24 | 
25 | def test_dict_same():
26 |     d = {"a": 1, "b": 2, "c": 3}
27 |     ld = LazyDict(**d)
28 |     assert all(kd == kld for kd, kld in zip(d, ld))
29 |     assert all(d[k] == ld[k] for k in d)
30 |     assert len(ld) == len(d)
31 | 
32 | 
33 | def test_lazy_evaluate():
34 |     called = 0
35 | 
36 |     def func(x):
37 |         nonlocal called
38 |         called += 1
39 |         return x
40 | 
41 |     lei = LazyEvaluateInfo(func, 3)
42 |     assert called == 0
43 |     ld = LazyDict(a=lei)
44 |     assert called == 0
45 |     assert ld["a"] == 3
46 |     assert called == 1
47 | 
48 | 
49 | @pytest.mark.parametrize(("cache", "expected"), [(True, 1), (False, 2)])
50 | def test_caches(cache, expected):
51 |     called = 0
52 | 
53 |     def func(x):
54 |         nonlocal called
55 |         called += 1
56 |         return x
57 | 
58 |     lei = LazyEvaluateInfo(func, 3)
59 |     assert called == 0
60 |     ld = LazyDict(cache=cache, a=lei)
61 |     assert called == 0
62 |     assert ld["a"] == 3
63 |     assert ld["a"] == 3
64 |     assert called == expected
65 | 
66 | 
67 | def test_pad_image_with_background_color(mock_pil_image):
68 |     pad = 10
69 |     width, height = mock_pil_image.size  # PIL sizes are (width, height)
70 |     padded = pad_image_with_background_color(mock_pil_image, pad, "black")
71 |     assert padded.size == (width + 2 * pad, height + 2 * pad)
72 |     np.testing.assert_array_almost_equal(
73 |         np.array(padded.crop((pad, pad, width + pad, height + pad))),
74 |         np.array(mock_pil_image),
75 |     )
76 |     assert padded.getpixel((1, 1)) == (0, 0, 0)
77 | 
78 | 
79 | def test_pad_image_with_invalid_input(mock_pil_image):
80 |     with pytest.raises(ValueError, match="Can not pad an image with negative space!"):
81 |         pad_image_with_background_color(mock_pil_image, -1)
82 | 
83 | 
84 | @pytest.mark.parametrize(
85 |     ("html", "text"),
86 |     [
87 |         ("