├── .github ├── dependabot.yml └── workflows │ ├── ci.yml │ └── codeql-analysis.yml ├── .gitignore ├── CHANGELOG.md ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── exploration-notebooks └── .gitkeep ├── img ├── 0.png └── unstructured_logo.png ├── lib └── libstdc++.so.6 ├── logger_config.yaml ├── pipeline-notebooks ├── .gitkeep └── pipeline-paddleocr.ipynb ├── prepline_paddleocr ├── __init__.py └── api │ ├── __init__.py │ ├── app.py │ └── paddleocr.py ├── preprocessing-pipeline-family.yaml ├── requirements ├── base.in ├── base.txt ├── dev.in ├── dev.txt ├── test.in └── test.txt ├── sample-docs ├── .gitkeep └── sample-receipt.jpg ├── scripts ├── check-and-format-notebooks.py ├── docker-build.sh ├── shellcheck.sh ├── test-doc-pipeline-apis-consistent.sh └── version-sync.sh ├── setup.cfg └── test_paddleocr └── api ├── .gitkeep └── test_paddleocr.py /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "pip" 4 | directory: "/requirements" 5 | schedule: 6 | interval: "weekly" 7 | 8 | - package-ecosystem: "github-actions" 9 | # NOTE(robinson) - Workflow files stored in the 10 | # default location of `.github/workflows` 11 | directory: "/" 12 | schedule: 13 | interval: "weekly" 14 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | env: 10 | PYTHON_VERSION: "3.8" 11 | PIPELINE_FAMILY: "paddleocr" 12 | 13 | jobs: 14 | setup: 15 | runs-on: ubuntu-latest 16 | steps: 17 | - uses: actions/checkout@v3 18 | - uses: actions/cache@v3 19 | id: virtualenv-cache 20 | with: 21 | path: | 22 | .venv 23 | key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/*.txt') }} 24 | - name: Set up Python ${{ env.PYTHON_VERSION }} 25 | uses: actions/setup-python@v4 26 | with: 27 | python-version: ${{ env.PYTHON_VERSION }} 28 | - name: Setup virtual environment (no cache hit) 29 | if: steps.virtualenv-cache.outputs.cache-hit != 'true' 30 | run: | 31 | python${{ env.PYTHON_VERSION }} -m venv .venv 32 | source .venv/bin/activate 33 | make install 34 | 35 | lint: 36 | runs-on: ubuntu-latest 37 | needs: setup 38 | steps: 39 | - uses: actions/checkout@v3 40 | - uses: actions/cache@v3 41 | id: virtualenv-cache 42 | with: 43 | path: | 44 | .venv 45 | key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/*.txt') }} 46 | - name: Lint 47 | run: | 48 | source .venv/bin/activate 49 | make check 50 | 51 | shellcheck: 52 | runs-on: ubuntu-latest 53 | steps: 54 | - uses: actions/checkout@v3 55 | - name: ShellCheck 56 | uses: ludeeus/action-shellcheck@master 57 | 58 | test: 59 | runs-on: ubuntu-latest 60 | needs: [setup, lint] 61 | steps: 62 | - uses: actions/checkout@v3 63 | - uses: actions/cache@v3 64 | id: virtualenv-cache 65 | with: 66 | path: | 67 | .venv 68 | key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/*.txt') }} 69 | - name: Run core tests 70 | run: | 71 | source .venv/bin/activate 72 | sudo apt-get install --yes poppler-utils 73 | make test 74 | make check-coverage 75 | make check-notebooks 76 | 77 | changelog: 78 | runs-on: ubuntu-latest 79 | steps: 80 | - uses: actions/checkout@v3 81 | - if: github.ref != 'refs/heads/main' 82 | uses: dorny/paths-filter@v2 83 | id: changes 84 | with: 85 | filters: | 86 | src: 87 | - 'doc_recipe/**' 88 | - 'recipe-notebooks/**' 89 | 90 | - if: steps.changes.outputs.src == 'true' && github.ref != 'refs/heads/main' 91 | uses: dangoslen/changelog-enforcer@v3 92 | 93 | api_consistency: 94 | runs-on: ubuntu-latest 95 | needs: setup 96 | steps: 97 | - uses: actions/checkout@v3 98 | - uses: actions/cache@v3 99 | id: virtualenv-cache 100 | with: 101 | path: | 102 | .venv 103 | key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/*.txt') }} 104 | - name: API Consistency 105 | run: | 106 | source .venv/bin/activate 107 | make api-check 108 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ "main" ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ "main" ] 20 | schedule: 21 | - cron: '21 21 * * 5' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | permissions: 28 | actions: read 29 | contents: read 30 | security-events: write 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | language: [ 'python' ] 36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] 37 | # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support 38 | 39 | steps: 40 | - name: Checkout repository 41 | uses: actions/checkout@v3 42 | 43 | # Initializes the CodeQL tools for scanning. 44 | - name: Initialize CodeQL 45 | uses: github/codeql-action/init@v2 46 | with: 47 | languages: ${{ matrix.language }} 48 | # If you wish to specify custom queries, you can do so here or in a config file. 49 | # By default, queries listed here will override any specified in a config file. 50 | # Prefix the list here with "+" to use these queries and those in the config file. 51 | 52 | # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs 53 | # queries: security-extended,security-and-quality 54 | 55 | 56 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 57 | # If this step fails, then you should remove it and run the build manually (see below) 58 | - name: Autobuild 59 | uses: github/codeql-action/autobuild@v2 60 | 61 | # ℹ️ Command-line programs to run using the OS shell. 62 | # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun 63 | 64 | # If the Autobuild fails above, remove it and uncomment the following three lines. 65 | # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. 66 | 67 | # - run: | 68 | # echo "Run, Build Application using script" 69 | # ./location_of_script_within_repo/buildscript.sh 70 | 71 | - name: Perform CodeQL Analysis 72 | uses: github/codeql-action/analyze@v2 73 | with: 74 | category: "/language:${{matrix.language}}" 75 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # VSCode 132 | .vscode/ 133 | 134 | # Mac 135 | .DS_Store 136 | 137 | nbs/ 138 | 139 | # Celery files that are created when the mercury dashboard is run 140 | celery.sqlite 141 | celerybeat-schedule.db 142 | 143 | # temporarily generated files by project-specific Makefile 144 | tmp* 145 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## 0.0.1 2 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # syntax=docker/dockerfile:experimental 2 | 3 | FROM centos:centos7.9.2009 4 | 5 | # NOTE(crag): NB_USER ARG for mybinder.org compat: 6 | # https://mybinder.readthedocs.io/en/latest/tutorials/dockerfile.html 7 | ARG NB_USER=notebook-user 8 | ARG NB_UID=1000 9 | ARG PIP_VERSION 10 | ARG PIPELINE_PACKAGE 11 | 12 | ENV DEBIAN_FRONTEND=noninteractive 13 | 14 | RUN yum update -y 15 | RUN yum upgrade -y 16 | 17 | RUN yum install -y make build-essential libssl-dev zlib1g-dev libbz2-dev \ 18 | libreadline-dev libsqlite3-dev wget curl libncurses5-dev libncursesw5-dev \ 19 | xz-utils tk-dev libffi-dev liblzma-dev git mesa-libGL 20 | 21 | RUN yum -y install gcc openssl-devel bzip2-devel libffi-devel make git sqlite-devel && \ 22 | curl -O https://www.python.org/ftp/python/3.8.15/Python-3.8.15.tgz && tar -xzf Python-3.8.15.tgz && \ 23 | cd Python-3.8.15/ && ./configure --enable-shared --enable-optimizations && make altinstall && \ 24 | cd .. && rm -rf Python-3.8.15* && \ 25 | ln -s /usr/local/bin/python3.8 /usr/local/bin/python3 26 | 27 | COPY lib/libstdc++.so.6 /usr/lib64 28 | 29 | # create user with a home directory 30 | ENV USER ${NB_USER} 31 | ENV HOME /home/${NB_USER} 32 | 33 | RUN groupadd --gid ${NB_UID} ${NB_USER} 34 | RUN useradd --uid ${NB_UID} --gid ${NB_UID} ${NB_USER} 35 | USER ${NB_USER} 36 | WORKDIR ${HOME} 37 | ENV PYTHONPATH="${PYTHONPATH}:${HOME}" 38 | ENV PATH="/home/${NB_USER}/.local/bin:${PATH}" 39 | ENV LD_LIBRARY_PATH=/usr/local/lib 40 | ENV LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH 41 | 42 | COPY logger_config.yaml logger_config.yaml 43 | COPY requirements/dev.txt requirements-dev.txt 44 | COPY requirements/base.txt requirements-base.txt 45 | COPY prepline_${PIPELINE_PACKAGE}/ prepline_${PIPELINE_PACKAGE}/ 46 | COPY exploration-notebooks exploration-notebooks 47 | COPY pipeline-notebooks pipeline-notebooks 48 | COPY img/ img/ 49 | 50 | #RUN echo 'export LD_LIBRARY_PATH=/usr/local/lib' >> ~/.bashrc 51 | 52 | RUN python3.8 -m pip install pip==${PIP_VERSION} \ 53 | && pip3.8 install --no-cache -r requirements-base.txt \ 54 | && pip3.8 install --no-cache -r requirements-dev.txt 55 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PIPELINE_FAMILY := paddleocr 2 | PIPELINE_PACKAGE := paddleocr 3 | PACKAGE_NAME := prepline_${PIPELINE_PACKAGE} 4 | PIP_VERSION := 23.1.2 5 | 6 | .PHONY: help 7 | help: Makefile 8 | @sed -n 's/^\(## \)\([a-zA-Z]\)/\2/p' $< 9 | 10 | 11 | ########### 12 | # Install # 13 | ########### 14 | 15 | ## install-base: installs minimum requirements to run the API 16 | .PHONY: install-base 17 | install-base: install-base-pip-packages 18 | 19 | ## install: installs all test and dev requirements 20 | .PHONY: install 21 | install: install-base install-test install-dev 22 | 23 | .PHONY: install-base-pip-packages 24 | install-base-pip-packages: 25 | python3 -m pip install pip==${PIP_VERSION} 26 | pip install -r requirements/base.txt 27 | 28 | .PHONY: install-test 29 | install-test: 30 | pip install -r requirements/test.txt 31 | 32 | .PHONY: install-dev 33 | install-dev: 34 | pip install -r requirements/dev.txt 35 | 36 | .PHONY: install-ci 37 | install-ci: install-base install-test 38 | 39 | ## pip-compile: compiles all base/dev/test requirements 40 | .PHONY: pip-compile 41 | pip-compile: 42 | pip-compile requirements/base.in 43 | pip-compile requirements/dev.in 44 | pip-compile requirements/test.in 45 | 46 | 47 | ######### 48 | # Build # 49 | ######### 50 | 51 | ## generate-api: generates the FastAPI python APIs from notebooks 52 | .PHONY: generate-api 53 | generate-api: 54 | PYTHONPATH=. unstructured_api_tools convert-pipeline-notebooks \ 55 | --input-directory ./pipeline-notebooks \ 56 | --output-directory ./${PACKAGE_NAME}/api 57 | 58 | ########## 59 | # Docker # 60 | ########## 61 | 62 | # Docker targets are provided for convenience only and are not required in a standard development environment 63 | 64 | # Note that the image has notebooks baked in, however the current working directory 65 | # is mounted under /home/notebook-user/local/ when the image is started with 66 | # docker-start-api or docker-start-jupyter 67 | 68 | .PHONY: docker-build 69 | docker-build: 70 | PIP_VERSION=${PIP_VERSION} PIPELINE_FAMILY=${PIPELINE_FAMILY} PIPELINE_PACKAGE=${PIPELINE_PACKAGE} ./scripts/docker-build.sh 71 | 72 | .PHONY: docker-start-api 73 | docker-start-api: 74 | docker run -p 8000:8000 --mount type=bind,source=$(realpath .),target=/home/notebook-user/local -t --rm pipeline-family-${PIPELINE_FAMILY}-dev:latest uvicorn ${PACKAGE_NAME}.api.app:app --log-config logger_config.yaml --host 0.0.0.0 --port 8000 75 | 76 | .PHONY: docker-start-jupyter 77 | docker-start-jupyter: 78 | docker run -p 8888:8888 --mount type=bind,source=$(realpath .),target=/home/notebook-user/local -t --rm pipeline-family-${PIPELINE_FAMILY}-dev:latest jupyter-notebook --port 8888 --ip 0.0.0.0 --no-browser --NotebookApp.token='' --NotebookApp.password='' 79 | 80 | 81 | ######### 82 | # Local # 83 | ######### 84 | 85 | ## run-jupyter: starts jupyter notebook 86 | .PHONY: run-jupyter 87 | run-jupyter: 88 | PYTHONPATH=$(realpath .) JUPYTER_PATH=$(realpath .) jupyter-notebook --NotebookApp.token='' --NotebookApp.password='' 89 | 90 | ## run-web-app: runs the FastAPI api with hot reloading 91 | .PHONY: run-web-app 92 | run-web-app: 93 | PYTHONPATH=$(realpath .) uvicorn ${PACKAGE_NAME}.api.app:app --log-config logger_config.yaml --reload 94 | 95 | 96 | ################# 97 | # Test and Lint # 98 | ################# 99 | 100 | ## test: runs core tests 101 | .PHONY: test 102 | test: 103 | PYTHONPATH=. pytest test_${PIPELINE_PACKAGE} --cov=${PACKAGE_NAME} --cov-report term-missing 104 | 105 | .PHONY: check-coverage 106 | check-coverage: 107 | coverage report --fail-under=90 108 | 109 | ## test-integration: runs integration tests 110 | .PHONY: test-integration 111 | test-integration: 112 | PYTHONPATH=. pytest test_${PIPELINE_PACKAGE}_integration 113 | 114 | ## api-check: verifies auto-generated pipeline APIs match the existing ones 115 | .PHONY: api-check 116 | api-check: 117 | PYTHONPATH=. PACKAGE_NAME=${PACKAGE_NAME} ./scripts/test-doc-pipeline-apis-consistent.sh 118 | 119 | ## check: runs linters (includes tests) 120 | .PHONY: check 121 | check: check-src check-tests check-version 122 | 123 | ## check-src: runs linters (source only, no tests) 124 | .PHONY: check-src 125 | check-src: 126 | black --line-length 100 ${PACKAGE_NAME} --check --exclude ${PACKAGE_NAME}/api 127 | flake8 ${PACKAGE_NAME} 128 | mypy ${PACKAGE_NAME} --ignore-missing-imports --install-types --non-interactive --implicit-optional 129 | 130 | .PHONY: check-tests 131 | check-tests: 132 | black --line-length 100 test_${PIPELINE_PACKAGE} --check 133 | flake8 test_${PIPELINE_PACKAGE} 134 | 135 | ## tidy: run black 136 | .PHONY: tidy 137 | tidy: 138 | black --line-length 100 ${PACKAGE_NAME} 139 | black --line-length 100 test_${PIPELINE_PACKAGE} 140 | 141 | ## check-scripts: run shellcheck 142 | .PHONY: check-scripts 143 | check-scripts: 144 | # Fail if any of these files have warnings 145 | scripts/shellcheck.sh 146 | 147 | ## check-version: run check to ensure version in CHANGELOG.md matches references in files 148 | .PHONY: check-version 149 | check-version: 150 | # Fail if syncing version would produce changes 151 | scripts/version-sync.sh -c \ 152 | -s CHANGELOG.md \ 153 | -f README.md api-release \ 154 | -f preprocessing-pipeline-family.yaml release 155 | 156 | ## check-notebooks: check that executing and cleaning notebooks doesn't produce changes 157 | .PHONY: check-notebooks 158 | check-notebooks: 159 | scripts/check-and-format-notebooks.py --check 160 | 161 | ## tidy-notebooks: execute notebooks and remove metadata 162 | .PHONY: tidy-notebooks 163 | tidy-notebooks: 164 | scripts/check-and-format-notebooks.py 165 | 166 | ## version-sync: update references to version with most recent version from CHANGELOG.md 167 | .PHONY: version-sync 168 | version-sync: 169 | scripts/version-sync.sh \ 170 | -s CHANGELOG.md \ 171 | -f README.md api-release \ 172 | -f preprocessing-pipeline-family.yaml release 173 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 |

4 | 5 |

6 |

Pre-Processing OCR Pipeline for PaddleOCR

7 |

8 | 9 |
10 | 11 | ![https://pypi.python.org/pypi/unstructured/](https://img.shields.io/pypi/l/unstructured.svg) 12 | ![https://pypi.python.org/pypi/unstructured/](https://img.shields.io/pypi/pyversions/unstructured.svg) 13 | ![https://GitHub.com/unstructured-io/unstructured.js/graphs/contributors](https://img.shields.io/github/contributors/unstructured-io/unstructured) 14 | ![code_of_conduct.md](https://img.shields.io/badge/Contributor%20Covenant-2.1-4baaaa.svg) 15 | ![https://github.com/Naereen/badges/](https://badgen.net/badge/Open%20Source%20%3F/Yes%21/blue?icon=github) 16 | 17 |
18 | 19 | 20 | This pipeline processes input image documents in the English language using [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR). 21 | The pipeline works on `x86_64` cpus. 22 | 23 | ## Developer Quick Start 24 | 25 | * Using `pyenv` to manage virtualenvs is recommended 26 | * Mac install instructions. See [here](https://github.com/Unstructured-IO/community#mac--homebrew) for more detailed instructions. 27 | * `brew install pyenv-virtualenv` 28 | * `pyenv install 3.8.15` 29 | * Linux instructions are available [here](https://github.com/Unstructured-IO/community#linux). 30 | 31 | * Create a virtualenv to work in and activate it, e.g. for one named `paddleocr`: 32 | 33 | `pyenv virtualenv 3.8.15 paddleocr`
34 | `pyenv activate paddleocr` 35 | 36 | * If you are on a Mac with an M1 chip, run `brew install mupdf swig freetype` to install 37 | required non-Python dependencies. 38 | * Run `make install` 39 | * Start a local jupyter notebook server with `make run-jupyter`
40 | **OR**
41 | just start the fast-API locally with `make run-web-app` 42 | 43 | ### Performing OCR on a JPG image 44 | 45 | To run OCR on a JPG image, run `make run-web-app` and run the following `curl` command, 46 | replacing `sample-docs/sample-receipt.jpg` with your filename: 47 | 48 | ``` 49 | curl -X 'POST' \ 50 | 'http://localhost:8000/paddleocr/v0.0.1/paddleocr' \ 51 | -H 'accept: application/json' \ 52 | -H 'Content-Type: multipart/form-data' \ 53 | -F 'files=@sample-docs/sample-receipt.jpg' | jq -C . | less -R 54 | ``` 55 | 56 | The result should look like the following. 57 | 58 | ``` 59 | "{\"result\": [[[[162.0, 111.0], [429.0, 110.0], [429.0, 138.0], [162.0, 139.0]], [\"PETRON BKT 60 | LANJAN SB\", 0.918]], [[[162.0, 142.0], [418.0, 141.0], [418.0, 170.0], [162.0, 171.0]], [\"ALSERKAM 61 | ENTERPRISE\", 0.9785]], [[[44.0, 178.0], [562.0, 175.0], [562.0, 199.0], [44.0, 202.0]], [\"Te1 62 | 03-6156 8757 Co No 001083069-M\", 0.9282]], [[[121.0, 209.0], [467.0, 209.0], [467.0, 232.0], 63 | [121.0, 232.0]], [\"KM 458.4 BKT LANJAN UTARA,\", 0.9205]], [[[95.0, 239.0], [484.0, 237.0], [484.0, 64 | 264.0], [95.0, 267.0]], [\"L/RAYA UTARA SELATAN,SG BULOH\", 0.9525]], [[[188.0, 270.0], [403.0, 65 | 270.0], [403.0, 298.0], [188.0, 298.0]], [\"47000 SUNGAI BUL\", 0.9704]], [[[139.0, 335.0], [443.0, 66 | 335.0], [443.0, 359.0], [139.0, 359.0]], [\"GST ID No001210736640\", 0.9619]], [[[217.0, 397.0], 67 | [366.0, 397.0], [366.0, 424.0], [217.0, 424.0]], [\"TAX INVOICE\", 0.9886]], [[[29.0, 491.0], 68 | [351.0, 490.0], [351.0, 518.0], [29.0, 519.0]], [\"TAX INVOICE NO 19729058\", 0.963]], [[[28.0, 69 | 523.0], [129.0, 523.0], [129.0, 552.0], [28.0, 552.0]], [\"POS1\", 0.9617]], [[[29.0, 554.0], 70 | [272.0, 552.0], [272.0, 582.0], [29.0, 583.0]], [\"Store No.:129077\", 0.9439]], [[[492.0, 552.0], 71 | [553.0, 552.0], [553.0, 584.0], [492.0, 584.0]], [\"Babu\", 0.9968]], [[[28.0, 586.0], [169.0, 72 | 589.0], [169.0, 618.0], [27.0, 615.0]], [\"01/02/2018\", 0.9972]], [[[162.0, 587.0], [340.0, 587.0], 73 | [340.0, 615.0], [162.0, 615.0]], [\"4:43:17PM\", 0.8981]], [[[28.0, 683.0], [311.0, 683.0], [311.0, 74 | 711.0], [28.0, 711.0]], [\"A 2 doublemint te\", 0.9652]], [[[506.0, 679.0], [566.0, 679.0], [566.0, 75 | 710.0], [506.0, 710.0]], [\"3.00\", 0.9931]], [[[25.0, 714.0], [313.0, 712.0], [314.0, 742.0], 76 | [25.0, 743.0]], [\"A1sandwich vanill\", 0.9318]], [[[507.0, 711.0], [566.0, 711.0], [566.0, 743.0], 77 | [507.0, 743.0]], [\"1.90\", 0.9937]], [[[69.0, 778.0], [165.0, 778.0], [165.0, 807.0], [69.0, 78 | 807.0]], [\"GST RM\", 0.9119]], [[[505.0, 775.0], [566.0, 775.0], [566.0, 807.0], [505.0, 807.0]], 79 | [\"0.28\", 0.9929]], [[[70.0, 811.0], [296.0, 811.0], [296.0, 839.0], [70.0, 839.0]], [\"Total RM 80 | inc.GST:\", 0.9176]], [[[506.0, 807.0], [566.0, 807.0], [566.0, 839.0], [506.0, 839.0]], [\"4.90\", 81 | 0.9949]], [[[67.0, 873.0], [128.0, 873.0], [128.0, 905.0], [67.0, 905.0]], [\"Cash\", 0.9938]], 82 | [[[505.0, 868.0], [568.0, 868.0], [568.0, 905.0], [505.0, 905.0]], [\"5.00\", 0.992]], [[[67.0, 83 | 904.0], [154.0, 908.0], [153.0, 938.0], [66.0, 935.0]], [\"Change\", 0.9971]], [[[506.0, 903.0], 84 | [566.0, 903.0], [566.0, 935.0], [506.0, 935.0]], [\"0.10\", 0.9981]], [[[29.0, 968.0], [179.0, 85 | 973.0], [178.0, 1002.0], [29.0, 998.0]], [\"GsT Summary\", 0.8839]], [[[242.0, 969.0], [387.0, 86 | 966.0], [388.0, 996.0], [242.0, 999.0]], [\"AnountRM\", 0.895]], [[[454.0, 969.0], [562.0, 969.0], 87 | [562.0, 998.0], [454.0, 998.0]], [\"Tax (RM)\", 0.8915]], [[[29.0, 1002.0], [128.0, 1002.0], [128.0, 88 | 1033.0], [29.0, 1033.0]], [\"A=6.00%\", 0.9756]], [[[241.0, 1001.0], [301.0, 1001.0], [301.0, 89 | 1033.0], [241.0, 1033.0]], [\"4.62\", 0.9949]], [[[452.0, 999.0], [513.0, 999.0], [513.0, 1031.0], 90 | [452.0, 1031.0]], [\"0.28\", 0.9955]], [[[29.0, 1070.0], [47.0, 1070.0], [47.0, 1092.0], [29.0, 91 | 1092.0]], [\"A\", 0.9864]], [[[106.0, 1066.0], [418.0, 1066.0], [418.0, 1094.0], [106.0, 1094.0]], 92 | [\"ITAL INCLUDES 6.00%GST\", 0.9485]], [[[151.0, 1166.0], [429.0, 1166.0], [429.0, 1190.0], [151.0, 93 | 1190.0]], [\"Use 3000 Petron Miles\", 0.9395]], [[[176.0, 1197.0], [403.0, 1194.0], [403.0, 1223.0], 94 | [176.0, 1226.0]], [\"points to pay for\", 0.9474]], [[[228.0, 1227.0], [351.0, 1227.0], [351.0, 95 | 1257.0], [228.0, 1257.0]], [\"RM45 Fue1\", 0.932]]]} 96 | ``` 97 | 98 | You can also run OCR through the Python API using the following commands: 99 | 100 | ```python 101 | from prepline_paddleocr.api.paddleocr import pipeline_api 102 | 103 | filename = "sample-docs/sample-receipt.jpg" 104 | 105 | with open(filename, "rb") as f: 106 | pipeline_api(file=f) 107 | ``` 108 | 109 | 110 | ### Generating Python files from the pipeline notebooks 111 | 112 | You can generate the FastAPI APIs from your pipeline notebooks by running `make generate-api`. 113 | 114 | ## Security Policy 115 | 116 | See our [security policy](https://github.com/Unstructured-IO/pipeline-paddleocr/security/policy) for 117 | information on how to report security vulnerabilities. 118 | 119 | ## Learn more 120 | 121 | | Section | Description | 122 | |-|-| 123 | | [Unstructured Community Github](https://github.com/Unstructured-IO/community) | Information about Unstructured.io community projects | 124 | | [Unstructured Github](https://github.com/Unstructured-IO) | Unstructured.io open source repositories | 125 | | [Company Website](https://unstructured.io) | Unstructured.io product and company info | 126 | -------------------------------------------------------------------------------- /exploration-notebooks/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/pipeline-paddleocr/7142d2bd80b1687424fef0437f18de6a16124b1b/exploration-notebooks/.gitkeep -------------------------------------------------------------------------------- /img/0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/pipeline-paddleocr/7142d2bd80b1687424fef0437f18de6a16124b1b/img/0.png -------------------------------------------------------------------------------- /img/unstructured_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/pipeline-paddleocr/7142d2bd80b1687424fef0437f18de6a16124b1b/img/unstructured_logo.png -------------------------------------------------------------------------------- /lib/libstdc++.so.6: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/pipeline-paddleocr/7142d2bd80b1687424fef0437f18de6a16124b1b/lib/libstdc++.so.6 -------------------------------------------------------------------------------- /logger_config.yaml: -------------------------------------------------------------------------------- 1 | version: 1 2 | disable_existing_loggers: False 3 | formatters: 4 | default_format: 5 | "()": uvicorn.logging.DefaultFormatter 6 | format: '%(asctime)s %(name)s %(levelname)s %(message)s' 7 | access: 8 | "()": uvicorn.logging.AccessFormatter 9 | format: '%(asctime)s %(client_addr)s %(request_line)s - %(status_code)s' 10 | handlers: 11 | access_handler: 12 | formatter: access 13 | class: logging.StreamHandler 14 | stream: ext://sys.stderr 15 | standard_handler: 16 | formatter: default_format 17 | class: logging.StreamHandler 18 | stream: ext://sys.stderr 19 | loggers: 20 | uvicorn.error: 21 | level: INFO 22 | handlers: 23 | - standard_handler 24 | propagate: no 25 | # disable logging for uvicorn.error by not having a handler 26 | uvicorn.access: 27 | level: INFO 28 | handlers: 29 | - access_handler 30 | propagate: no 31 | # disable logging for uvicorn.access by not having a handler 32 | unstructured: 33 | level: INFO 34 | handlers: 35 | - standard_handler 36 | propagate: no 37 | 38 | -------------------------------------------------------------------------------- /pipeline-notebooks/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/pipeline-paddleocr/7142d2bd80b1687424fef0437f18de6a16124b1b/pipeline-notebooks/.gitkeep -------------------------------------------------------------------------------- /pipeline-notebooks/pipeline-paddleocr.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "3931743a", 6 | "metadata": {}, 7 | "source": [ 8 | "# PaddleOCR Pipeline" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "757bd7cd", 14 | "metadata": {}, 15 | "source": [ 16 | "## Section 1: Introduction\n", 17 | "\n", 18 | "The goal of this notebook is to setup a pipeline for PaddleOCR" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "id": "7db1e471", 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "import json\n", 29 | "import os\n", 30 | "\n", 31 | "def get_filename(directory, filename):\n", 32 | " cwd = os.getcwd()\n", 33 | " local_directory = os.path.join(os.path.split(cwd)[0], directory)\n", 34 | " ci_directory = os.path.join(cwd, directory)\n", 35 | "\n", 36 | " if os.path.exists(local_directory) and filename in os.listdir(local_directory):\n", 37 | " return os.path.join(local_directory, filename)\n", 38 | " elif os.path.exists(ci_directory) and filename in os.listdir(ci_directory):\n", 39 | " return os.path.join(ci_directory, filename)\n", 40 | " else:\n", 41 | " raise FileNotFoundError" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "id": "48daac01", 47 | "metadata": {}, 48 | "source": [ 49 | "## Show example image" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "id": "18bbc559", 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "data": { 60 | "image/png": "\n", 61 | "text/plain": [ 62 | "" 63 | ] 64 | }, 65 | "execution_count": null, 66 | "metadata": {}, 67 | "output_type": "execute_result" 68 | } 69 | ], 70 | "source": [ 71 | "from PIL import Image\n", 72 | "\n", 73 | "filename = get_filename(\"img\", \"0.png\")\n", 74 | "\n", 75 | "Image.open(filename)" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "id": "e21660e2", 81 | "metadata": {}, 82 | "source": [ 83 | "## Section 2: Pipeline API" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "id": "ef0b7cb5", 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "# pipeline-api\n", 94 | "from paddleocr import PaddleOCR\n", 95 | "\n", 96 | "import logging\n", 97 | "logging.disable()" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "id": "7cb5e00b", 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "# pipeline-api\n", 108 | "from PIL import Image\n", 109 | "import numpy as np\n", 110 | "\n", 111 | "def pipeline_api(\n", 112 | " file,\n", 113 | " file_content_type=None,\n", 114 | " m_some_parameters=[],\n", 115 | "):\n", 116 | " ocr = PaddleOCR(lang=\"en\", use_gpu = False, show_log = False) \n", 117 | " result = ocr.ocr(img=np.array(Image.open(file)))\n", 118 | " \n", 119 | " result =[(p1[0],tuple((p1[1][0],round(p1[1][1],4)))) for p in result for p1 in p]\n", 120 | "\n", 121 | " return json.dumps({ \"result\" : result})\n" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "id": "0400f975", 128 | "metadata": {}, 129 | "outputs": [ 130 | { 131 | "name": "stdout", 132 | "output_type": "stream", 133 | "text": [ 134 | "{'result': [[[[48.0, 24.0], [112.0, 24.0], [112.0, 35.0], [48.0, 35.0]], ['AK Transport', 0.9921]], [[[462.0, 26.0], [576.0, 26.0], [576.0, 50.0], [462.0, 50.0]], ['INVOICE', 0.998]], [[[46.0, 36.0], [126.0, 36.0], [126.0, 49.0], [46.0, 49.0]], ['352 Palmer Road', 0.981]], [[[47.0, 46.0], [76.0, 49.0], [75.0, 61.0], [46.0, 58.0]], ['Ware', 0.9973]], [[[47.0, 60.0], [116.0, 60.0], [116.0, 71.0], [47.0, 71.0]], ['MA, 1082, USA', 0.9995]], [[[520.0, 53.0], [573.0, 56.0], [572.0, 70.0], [519.0, 68.0]], ['#659950', 0.9963]], [[[437.0, 107.0], [466.0, 107.0], [466.0, 122.0], [437.0, 122.0]], ['Date:', 0.9989]], [[[522.0, 105.0], [569.0, 107.0], [568.0, 121.0], [521.0, 119.0]], ['4/11/2020', 1.0]], [[[47.0, 121.0], [79.0, 123.0], [78.0, 134.0], [46.0, 131.0]], ['Bill To:', 0.9944]], [[[45.0, 136.0], [156.0, 138.0], [156.0, 152.0], [45.0, 149.0]], ['Quadrant Lite Planning', 0.9752]], [[[392.0, 130.0], [464.0, 130.0], [464.0, 144.0], [392.0, 144.0]], ['Balance Due:', 0.9757]], [[[525.0, 131.0], [570.0, 131.0], [570.0, 145.0], [525.0, 145.0]], ['$198.30', 0.9956]], [[[46.0, 149.0], [141.0, 149.0], [141.0, 162.0], [46.0, 162.0]], ['3371 S Alabama Ave', 0.9998]], [[[46.0, 161.0], [99.0, 162.0], [99.0, 173.0], [46.0, 172.0]], ['Monroeville', 0.9992]], [[[47.0, 174.0], [119.0, 174.0], [119.0, 185.0], [47.0, 185.0]], ['AL, 36460, USA', 0.9779]], [[[42.0, 225.0], [65.0, 225.0], [65.0, 236.0], [42.0, 236.0]], ['Item', 0.9908]], [[[370.0, 223.0], [411.0, 223.0], [411.0, 237.0], [370.0, 237.0]], ['Quantity', 0.9994]], [[[471.0, 224.0], [494.0, 224.0], [494.0, 236.0], [471.0, 236.0]], ['Rate', 0.9999]], [[[532.0, 225.0], [570.0, 225.0], [570.0, 236.0], [532.0, 236.0]], ['Amount', 0.9999]], [[[43.0, 248.0], [198.0, 248.0], [198.0, 261.0], [43.0, 261.0]], ['Reviva Oatmeal Soap Bar 4.20 oz', 0.9934]], [[[370.0, 249.0], [379.0, 249.0], [379.0, 260.0], [370.0, 260.0]], ['3', 0.9993]], [[[461.0, 248.0], [494.0, 248.0], [494.0, 261.0], [461.0, 261.0]], ['$66.10', 0.9974]], [[[530.0, 248.0], [571.0, 248.0], [571.0, 261.0], [530.0, 261.0]], ['$198.30', 0.997]], [[[438.0, 314.0], [466.0, 314.0], [466.0, 327.0], [438.0, 327.0]], ['Total:', 0.9996]], [[[530.0, 313.0], [570.0, 313.0], [570.0, 327.0], [530.0, 327.0]], ['$198.30', 0.9974]]]}\n" 135 | ] 136 | } 137 | ], 138 | "source": [ 139 | "with open(filename, 'rb') as f:\n", 140 | " result = pipeline_api(f)\n", 141 | "print(json.loads(result))" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "id": "fcb6a317", 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [] 151 | } 152 | ], 153 | "metadata": { 154 | "kernelspec": { 155 | "display_name": "python3", 156 | "language": "python", 157 | "name": "python3" 158 | } 159 | }, 160 | "nbformat": 4, 161 | "nbformat_minor": 5 162 | } 163 | -------------------------------------------------------------------------------- /prepline_paddleocr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/pipeline-paddleocr/7142d2bd80b1687424fef0437f18de6a16124b1b/prepline_paddleocr/__init__.py -------------------------------------------------------------------------------- /prepline_paddleocr/api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/pipeline-paddleocr/7142d2bd80b1687424fef0437f18de6a16124b1b/prepline_paddleocr/api/__init__.py -------------------------------------------------------------------------------- /prepline_paddleocr/api/app.py: -------------------------------------------------------------------------------- 1 | ##################################################################### 2 | # THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS. 3 | # DO NOT MODIFY DIRECTLY 4 | ##################################################################### 5 | 6 | 7 | from fastapi import FastAPI, Request, status 8 | 9 | from slowapi import Limiter, _rate_limit_exceeded_handler 10 | from slowapi.errors import RateLimitExceeded 11 | from slowapi.util import get_remote_address 12 | 13 | from .paddleocr import router as paddleocr_router 14 | 15 | 16 | limiter = Limiter(key_func=get_remote_address) 17 | app = FastAPI() 18 | app.state.limiter = limiter 19 | app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler) 20 | 21 | app.include_router(paddleocr_router) 22 | 23 | 24 | @app.get("/healthcheck", status_code=status.HTTP_200_OK) 25 | async def healthcheck(request: Request): 26 | return {"healthcheck": "HEALTHCHECK STATUS: EVERYTHING OK!"} 27 | -------------------------------------------------------------------------------- /prepline_paddleocr/api/paddleocr.py: -------------------------------------------------------------------------------- 1 | ##################################################################### 2 | # THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS. 3 | # DO NOT MODIFY DIRECTLY 4 | ##################################################################### 5 | 6 | import os 7 | from typing import List, Union 8 | 9 | from fastapi import status, FastAPI, File, Form, Request, UploadFile, APIRouter 10 | from slowapi.errors import RateLimitExceeded 11 | from slowapi import Limiter, _rate_limit_exceeded_handler 12 | from slowapi.util import get_remote_address 13 | from fastapi.responses import PlainTextResponse 14 | 15 | limiter = Limiter(key_func=get_remote_address) 16 | app = FastAPI() 17 | app.state.limiter = limiter 18 | app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler) 19 | router = APIRouter() 20 | 21 | RATE_LIMIT = os.environ.get("PIPELINE_API_RATE_LIMIT", "1/second") 22 | 23 | 24 | # pipeline-api 25 | from paddleocr import PaddleOCR 26 | 27 | import logging 28 | 29 | logging.disable() 30 | from PIL import Image 31 | import numpy as np 32 | 33 | 34 | def pipeline_api( 35 | file, 36 | file_content_type=None, 37 | m_some_parameters=[], 38 | ): 39 | ocr = PaddleOCR(lang="en", use_gpu=False, show_log=False) 40 | result = ocr.ocr(img=np.array(Image.open(file))) 41 | 42 | result = [ 43 | (p1[0], tuple((p1[1][0], round(p1[1][1], 4)))) for p in result for p1 in p 44 | ] 45 | 46 | return json.dumps({"result": result}) 47 | 48 | 49 | import json 50 | from fastapi.responses import StreamingResponse 51 | from starlette.types import Send 52 | from base64 import b64encode 53 | from typing import Optional, Mapping, Iterator, Tuple 54 | import secrets 55 | 56 | 57 | class MultipartMixedResponse(StreamingResponse): 58 | CRLF = b"\r\n" 59 | 60 | def __init__(self, *args, content_type: str = None, **kwargs): 61 | super().__init__(*args, **kwargs) 62 | self.content_type = content_type 63 | 64 | def init_headers(self, headers: Optional[Mapping[str, str]] = None) -> None: 65 | super().init_headers(headers) 66 | self.boundary_value = secrets.token_hex(16) 67 | content_type = f'multipart/mixed; boundary="{self.boundary_value}"' 68 | self.raw_headers.append((b"content-type", content_type.encode("latin-1"))) 69 | 70 | @property 71 | def boundary(self): 72 | return b"--" + self.boundary_value.encode() 73 | 74 | def _build_part_headers(self, headers: dict) -> bytes: 75 | header_bytes = b"" 76 | for header, value in headers.items(): 77 | header_bytes += f"{header}: {value}".encode() + self.CRLF 78 | return header_bytes 79 | 80 | def build_part(self, chunk: bytes) -> bytes: 81 | part = self.boundary + self.CRLF 82 | part_headers = { 83 | "Content-Length": len(chunk), 84 | "Content-Transfer-Encoding": "base64", 85 | } 86 | if self.content_type is not None: 87 | part_headers["Content-Type"] = self.content_type 88 | part += self._build_part_headers(part_headers) 89 | part += self.CRLF + chunk + self.CRLF 90 | return part 91 | 92 | async def stream_response(self, send: Send) -> None: 93 | await send( 94 | { 95 | "type": "http.response.start", 96 | "status": self.status_code, 97 | "headers": self.raw_headers, 98 | } 99 | ) 100 | async for chunk in self.body_iterator: 101 | if not isinstance(chunk, bytes): 102 | chunk = chunk.encode(self.charset) 103 | chunk = b64encode(chunk) 104 | await send( 105 | { 106 | "type": "http.response.body", 107 | "body": self.build_part(chunk), 108 | "more_body": True, 109 | } 110 | ) 111 | 112 | await send({"type": "http.response.body", "body": b"", "more_body": False}) 113 | 114 | 115 | @router.post("/paddleocr/v0.0.1/paddleocr") 116 | @limiter.limit(RATE_LIMIT) 117 | async def pipeline_1( 118 | request: Request, 119 | files: Union[List[UploadFile], None] = File(default=None), 120 | some_parameters: List[str] = Form(default=[]), 121 | ): 122 | content_type = request.headers.get("Accept") 123 | 124 | if isinstance(files, list) and len(files): 125 | if len(files) > 1: 126 | if content_type and content_type not in ["*/*", "multipart/mixed"]: 127 | return PlainTextResponse( 128 | content=( 129 | f"Conflict in media type {content_type}" 130 | ' with response type "multipart/mixed".\n' 131 | ), 132 | status_code=status.HTTP_406_NOT_ACCEPTABLE, 133 | ) 134 | 135 | def response_generator(): 136 | for file in files: 137 | _file = file.file 138 | 139 | response = pipeline_api( 140 | _file, 141 | m_some_parameters=some_parameters, 142 | file_content_type=file.content_type, 143 | ) 144 | if type(response) not in [str, bytes]: 145 | response = json.dumps(response) 146 | yield response 147 | 148 | return MultipartMixedResponse( 149 | response_generator(), 150 | ) 151 | else: 152 | file = files[0] 153 | _file = file.file 154 | 155 | response = pipeline_api( 156 | _file, 157 | m_some_parameters=some_parameters, 158 | file_content_type=file.content_type, 159 | ) 160 | 161 | return response 162 | 163 | else: 164 | return PlainTextResponse( 165 | content='Request parameter "files" is required.\n', 166 | status_code=status.HTTP_400_BAD_REQUEST, 167 | ) 168 | 169 | 170 | @app.get("/healthcheck", status_code=status.HTTP_200_OK) 171 | async def healthcheck(request: Request): 172 | return {"healthcheck": "HEALTHCHECK STATUS: EVERYTHING OK!"} 173 | 174 | 175 | app.include_router(router) 176 | -------------------------------------------------------------------------------- /preprocessing-pipeline-family.yaml: -------------------------------------------------------------------------------- 1 | name: paddleocr 2 | version: 0.0.1 3 | -------------------------------------------------------------------------------- /requirements/base.in: -------------------------------------------------------------------------------- 1 | unstructured>=0.2.4 2 | unstructured-api-tools>=0.4.4 3 | 4 | opencv-python==4.5.5.64 5 | pip-tools>=6.11.0 6 | ipython>=8.7.0 7 | ratelimit 8 | 9 | paddlepaddle 10 | paddleocr 11 | werkzeug>=2.2.3 12 | future>=0.18.3 13 | jupyter-core>=4.11.2 14 | nbdev>=2.3.12 15 | #protobuf>=3.20.2 16 | #starlette>=0.25.0 17 | IPython>=8.10 18 | wheel>=0.38.1 19 | pytest>=7.2.0 20 | -------------------------------------------------------------------------------- /requirements/base.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.9 3 | # by the following command: 4 | # 5 | # pip-compile requirements/base.in 6 | # 7 | anyio==3.6.1 8 | # via 9 | # starlette 10 | # watchfiles 11 | astor==0.8.1 12 | # via paddlepaddle 13 | asttokens==2.2.1 14 | # via 15 | # nbdev 16 | # stack-data 17 | astunparse==1.6.3 18 | # via nbdev 19 | attrdict==2.0.1 20 | # via paddleocr 21 | attrs==22.1.0 22 | # via 23 | # jsonschema 24 | # pytest 25 | babel==2.11.0 26 | # via flask-babel 27 | backcall==0.2.0 28 | # via ipython 29 | bce-python-sdk==0.8.74 30 | # via visualdl 31 | beautifulsoup4==4.11.1 32 | # via 33 | # nbconvert 34 | # paddleocr 35 | bleach==5.0.1 36 | # via nbconvert 37 | build==0.9.0 38 | # via pip-tools 39 | cachetools==5.2.0 40 | # via premailer 41 | certifi==2022.12.7 42 | # via requests 43 | charset-normalizer==2.1.1 44 | # via requests 45 | click==8.1.3 46 | # via 47 | # flask 48 | # nltk 49 | # pip-tools 50 | # unstructured-api-tools 51 | # uvicorn 52 | contourpy==1.0.6 53 | # via matplotlib 54 | cssselect==1.2.0 55 | # via premailer 56 | cssutils==2.6.0 57 | # via premailer 58 | cycler==0.11.0 59 | # via matplotlib 60 | cython==0.29.32 61 | # via paddleocr 62 | decorator==5.1.1 63 | # via 64 | # ipython 65 | # paddlepaddle 66 | defusedxml==0.7.1 67 | # via nbconvert 68 | dill==0.3.6 69 | # via multiprocess 70 | entrypoints==0.4 71 | # via jupyter-client 72 | et-xmlfile==1.1.0 73 | # via openpyxl 74 | exceptiongroup==1.1.0 75 | # via pytest 76 | execnb==0.1.5 77 | # via nbdev 78 | executing==1.2.0 79 | # via stack-data 80 | fastapi==0.85.0 81 | # via unstructured-api-tools 82 | fastcore==1.5.28 83 | # via 84 | # execnb 85 | # ghapi 86 | # nbdev 87 | fastjsonschema==2.16.2 88 | # via nbformat 89 | fire==0.4.0 90 | # via 91 | # paddleocr 92 | # pdf2docx 93 | flask==2.2.2 94 | # via 95 | # flask-babel 96 | # visualdl 97 | flask-babel==2.0.0 98 | # via visualdl 99 | fonttools==4.38.0 100 | # via 101 | # matplotlib 102 | # paddleocr 103 | # pdf2docx 104 | future==0.18.3 105 | # via 106 | # -r requirements/base.in 107 | # bce-python-sdk 108 | ghapi==1.0.3 109 | # via nbdev 110 | h11==0.13.0 111 | # via uvicorn 112 | httptools==0.5.0 113 | # via uvicorn 114 | idna==3.4 115 | # via 116 | # anyio 117 | # requests 118 | imageio==2.22.4 119 | # via 120 | # imgaug 121 | # scikit-image 122 | imgaug==0.4.0 123 | # via paddleocr 124 | importlib-metadata==5.0.0 125 | # via 126 | # flask 127 | # nbconvert 128 | iniconfig==2.0.0 129 | # via pytest 130 | ipython==8.10.0 131 | # via 132 | # -r requirements/base.in 133 | # execnb 134 | itsdangerous==2.1.2 135 | # via flask 136 | jedi==0.18.2 137 | # via ipython 138 | jinja2==3.1.2 139 | # via 140 | # flask 141 | # flask-babel 142 | # nbconvert 143 | # unstructured-api-tools 144 | joblib==1.2.0 145 | # via nltk 146 | jsonschema==4.16.0 147 | # via nbformat 148 | jupyter-client==7.3.5 149 | # via nbclient 150 | jupyter-core==5.2.0 151 | # via 152 | # -r requirements/base.in 153 | # jupyter-client 154 | # nbconvert 155 | # nbformat 156 | jupyterlab-pygments==0.2.2 157 | # via nbconvert 158 | kiwisolver==1.4.4 159 | # via matplotlib 160 | limits==1.6 161 | # via slowapi 162 | lmdb==1.4.0 163 | # via paddleocr 164 | lxml==4.9.1 165 | # via 166 | # nbconvert 167 | # paddleocr 168 | # premailer 169 | # python-docx 170 | # unstructured 171 | markupsafe==2.1.1 172 | # via 173 | # jinja2 174 | # nbconvert 175 | # werkzeug 176 | matplotlib==3.6.2 177 | # via 178 | # imgaug 179 | # visualdl 180 | matplotlib-inline==0.1.6 181 | # via ipython 182 | mistune==2.0.4 183 | # via nbconvert 184 | multiprocess==0.70.14 185 | # via visualdl 186 | mypy==0.991 187 | # via unstructured-api-tools 188 | mypy-extensions==0.4.3 189 | # via mypy 190 | nbclient==0.6.8 191 | # via nbconvert 192 | nbconvert==7.0.0 193 | # via unstructured-api-tools 194 | nbdev==2.3.12 195 | # via -r requirements/base.in 196 | nbformat==5.6.0 197 | # via 198 | # nbclient 199 | # nbconvert 200 | nest-asyncio==1.5.5 201 | # via 202 | # jupyter-client 203 | # nbclient 204 | networkx==2.8.8 205 | # via scikit-image 206 | nltk==3.7 207 | # via unstructured 208 | numpy==1.23.5 209 | # via 210 | # contourpy 211 | # imageio 212 | # imgaug 213 | # matplotlib 214 | # opencv-contrib-python 215 | # opencv-python 216 | # opt-einsum 217 | # paddleocr 218 | # paddlepaddle 219 | # pandas 220 | # pdf2docx 221 | # pywavelets 222 | # scikit-image 223 | # scipy 224 | # tifffile 225 | # visualdl 226 | opencv-contrib-python==4.6.0.66 227 | # via paddleocr 228 | opencv-python==4.5.5.64 229 | # via 230 | # -r requirements/base.in 231 | # imgaug 232 | # paddleocr 233 | # pdf2docx 234 | openpyxl==3.0.10 235 | # via paddleocr 236 | opt-einsum==3.3.0 237 | # via paddlepaddle 238 | packaging==21.3 239 | # via 240 | # build 241 | # fastcore 242 | # ghapi 243 | # matplotlib 244 | # nbconvert 245 | # pytest 246 | # scikit-image 247 | # visualdl 248 | paddle-bfloat==0.1.7 249 | # via paddlepaddle 250 | paddleocr==2.6.1.1 251 | # via -r requirements/base.in 252 | paddlepaddle==2.4.0 253 | # via -r requirements/base.in 254 | pandas==1.5.2 255 | # via visualdl 256 | pandocfilters==1.5.0 257 | # via nbconvert 258 | parso==0.8.3 259 | # via jedi 260 | pdf2docx==0.5.6 261 | # via paddleocr 262 | pep517==0.13.0 263 | # via build 264 | pexpect==4.8.0 265 | # via ipython 266 | pickleshare==0.7.5 267 | # via ipython 268 | pillow==9.3.0 269 | # via 270 | # imageio 271 | # imgaug 272 | # matplotlib 273 | # paddlepaddle 274 | # scikit-image 275 | # visualdl 276 | pip-tools==6.11.0 277 | # via -r requirements/base.in 278 | platformdirs==3.0.0 279 | # via jupyter-core 280 | pluggy==1.0.0 281 | # via pytest 282 | premailer==3.10.0 283 | # via paddleocr 284 | prompt-toolkit==3.0.36 285 | # via ipython 286 | protobuf==3.20.0 287 | # via 288 | # paddlepaddle 289 | # visualdl 290 | ptyprocess==0.7.0 291 | # via pexpect 292 | pure-eval==0.2.2 293 | # via stack-data 294 | pyclipper==1.3.0.post4 295 | # via paddleocr 296 | pycryptodome==3.16.0 297 | # via bce-python-sdk 298 | pydantic==1.10.2 299 | # via fastapi 300 | pygments==2.13.0 301 | # via 302 | # ipython 303 | # nbconvert 304 | pymupdf==1.20.2 305 | # via 306 | # paddleocr 307 | # pdf2docx 308 | pyparsing==3.0.9 309 | # via 310 | # matplotlib 311 | # packaging 312 | pyrsistent==0.18.1 313 | # via jsonschema 314 | pytest==7.2.1 315 | # via -r requirements/base.in 316 | python-dateutil==2.8.2 317 | # via 318 | # jupyter-client 319 | # matplotlib 320 | # pandas 321 | python-docx==0.8.11 322 | # via 323 | # paddleocr 324 | # pdf2docx 325 | python-dotenv==0.21.0 326 | # via uvicorn 327 | python-multipart==0.0.5 328 | # via unstructured-api-tools 329 | pytz==2022.6 330 | # via 331 | # babel 332 | # flask-babel 333 | # pandas 334 | pywavelets==1.4.1 335 | # via scikit-image 336 | pyyaml==6.0 337 | # via 338 | # nbdev 339 | # uvicorn 340 | pyzmq==24.0.1 341 | # via jupyter-client 342 | rapidfuzz==2.13.3 343 | # via paddleocr 344 | ratelimit==2.2.1 345 | # via -r requirements/base.in 346 | regex==2022.10.31 347 | # via nltk 348 | requests==2.28.1 349 | # via 350 | # paddlepaddle 351 | # premailer 352 | # visualdl 353 | scikit-image==0.19.3 354 | # via 355 | # imgaug 356 | # paddleocr 357 | scipy==1.9.3 358 | # via 359 | # imgaug 360 | # scikit-image 361 | shapely==1.8.5.post1 362 | # via 363 | # imgaug 364 | # paddleocr 365 | six==1.16.0 366 | # via 367 | # asttokens 368 | # astunparse 369 | # attrdict 370 | # bce-python-sdk 371 | # bleach 372 | # fire 373 | # imgaug 374 | # limits 375 | # paddlepaddle 376 | # python-dateutil 377 | # python-multipart 378 | # visualdl 379 | slowapi==0.1.6 380 | # via unstructured-api-tools 381 | sniffio==1.3.0 382 | # via anyio 383 | soupsieve==2.3.2.post1 384 | # via beautifulsoup4 385 | stack-data==0.6.2 386 | # via ipython 387 | starlette==0.20.4 388 | # via fastapi 389 | termcolor==2.1.1 390 | # via fire 391 | tifffile==2022.10.10 392 | # via scikit-image 393 | tinycss2==1.1.1 394 | # via nbconvert 395 | tomli==2.0.1 396 | # via 397 | # build 398 | # mypy 399 | # pep517 400 | # pytest 401 | tornado==6.2 402 | # via jupyter-client 403 | tqdm==4.64.1 404 | # via 405 | # nltk 406 | # paddleocr 407 | traitlets==5.4.0 408 | # via 409 | # ipython 410 | # jupyter-client 411 | # jupyter-core 412 | # matplotlib-inline 413 | # nbclient 414 | # nbconvert 415 | # nbformat 416 | types-requests==2.28.11 417 | # via unstructured-api-tools 418 | types-ujson==5.5.0 419 | # via unstructured-api-tools 420 | types-urllib3==1.26.24 421 | # via types-requests 422 | typing-extensions==4.3.0 423 | # via 424 | # mypy 425 | # pydantic 426 | # starlette 427 | unstructured==0.2.5 428 | # via -r requirements/base.in 429 | unstructured-api-tools==0.4.6 430 | # via -r requirements/base.in 431 | urllib3==1.26.13 432 | # via requests 433 | uvicorn[standard]==0.18.3 434 | # via unstructured-api-tools 435 | uvloop==0.17.0 436 | # via uvicorn 437 | visualdl==2.4.1 438 | # via paddleocr 439 | watchdog==2.2.1 440 | # via nbdev 441 | watchfiles==0.17.0 442 | # via uvicorn 443 | wcwidth==0.2.5 444 | # via prompt-toolkit 445 | webencodings==0.5.1 446 | # via 447 | # bleach 448 | # tinycss2 449 | websockets==10.3 450 | # via uvicorn 451 | werkzeug==2.2.3 452 | # via 453 | # -r requirements/base.in 454 | # flask 455 | wheel==0.38.4 456 | # via 457 | # -r requirements/base.in 458 | # astunparse 459 | # pip-tools 460 | zipp==3.10.0 461 | # via importlib-metadata 462 | 463 | # The following packages are considered to be unsafe in a requirements file: 464 | # pip 465 | # setuptools 466 | -------------------------------------------------------------------------------- /requirements/dev.in: -------------------------------------------------------------------------------- 1 | black 2 | flake8 3 | jupyter 4 | mypy 5 | nbdev 6 | pip-tools 7 | # NOTE(crag): consistency with unstructured-api-tools. pinned for a reason, see there. 8 | ipython==8.7.0 9 | -------------------------------------------------------------------------------- /requirements/dev.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.9 3 | # by the following command: 4 | # 5 | # pip-compile requirements/dev.in 6 | # 7 | argon2-cffi==21.3.0 8 | # via notebook 9 | argon2-cffi-bindings==21.2.0 10 | # via argon2-cffi 11 | asttokens==2.0.8 12 | # via 13 | # nbdev 14 | # stack-data 15 | astunparse==1.6.3 16 | # via nbdev 17 | attrs==22.1.0 18 | # via jsonschema 19 | backcall==0.2.0 20 | # via ipython 21 | beautifulsoup4==4.11.1 22 | # via nbconvert 23 | black==23.1.0 24 | # via -r requirements/dev.in 25 | bleach==5.0.1 26 | # via nbconvert 27 | build==0.8.0 28 | # via pip-tools 29 | cffi==1.15.1 30 | # via argon2-cffi-bindings 31 | click==8.1.3 32 | # via 33 | # black 34 | # pip-tools 35 | debugpy==1.6.3 36 | # via ipykernel 37 | decorator==5.1.1 38 | # via ipython 39 | defusedxml==0.7.1 40 | # via nbconvert 41 | entrypoints==0.4 42 | # via jupyter-client 43 | execnb==0.1.4 44 | # via nbdev 45 | executing==1.0.0 46 | # via stack-data 47 | fastcore==1.5.27 48 | # via 49 | # execnb 50 | # ghapi 51 | # nbdev 52 | fastjsonschema==2.16.2 53 | # via nbformat 54 | flake8==6.0.0 55 | # via -r requirements/dev.in 56 | ghapi==1.0.3 57 | # via nbdev 58 | importlib-metadata==6.0.0 59 | # via nbconvert 60 | ipykernel==6.15.3 61 | # via 62 | # ipywidgets 63 | # jupyter 64 | # jupyter-console 65 | # notebook 66 | # qtconsole 67 | ipython==8.7.0 68 | # via 69 | # -r requirements/dev.in 70 | # execnb 71 | # ipykernel 72 | # ipywidgets 73 | # jupyter-console 74 | ipython-genutils==0.2.0 75 | # via 76 | # notebook 77 | # qtconsole 78 | ipywidgets==8.0.2 79 | # via jupyter 80 | jedi==0.18.1 81 | # via ipython 82 | jinja2==3.1.2 83 | # via 84 | # nbconvert 85 | # notebook 86 | jsonschema==4.16.0 87 | # via nbformat 88 | jupyter==1.0.0 89 | # via -r requirements/dev.in 90 | jupyter-client==7.3.5 91 | # via 92 | # ipykernel 93 | # jupyter-console 94 | # nbclient 95 | # notebook 96 | # qtconsole 97 | jupyter-console==6.4.4 98 | # via jupyter 99 | jupyter-core==4.11.1 100 | # via 101 | # jupyter-client 102 | # nbconvert 103 | # nbformat 104 | # notebook 105 | # qtconsole 106 | jupyterlab-pygments==0.2.2 107 | # via nbconvert 108 | jupyterlab-widgets==3.0.3 109 | # via ipywidgets 110 | lxml==4.9.1 111 | # via nbconvert 112 | markupsafe==2.1.1 113 | # via 114 | # jinja2 115 | # nbconvert 116 | matplotlib-inline==0.1.6 117 | # via 118 | # ipykernel 119 | # ipython 120 | mccabe==0.7.0 121 | # via flake8 122 | mistune==2.0.4 123 | # via nbconvert 124 | mypy==0.991 125 | # via -r requirements/dev.in 126 | mypy-extensions==0.4.3 127 | # via 128 | # black 129 | # mypy 130 | nbclient==0.6.8 131 | # via nbconvert 132 | nbconvert==7.0.0 133 | # via 134 | # jupyter 135 | # notebook 136 | nbdev==2.3.11 137 | # via -r requirements/dev.in 138 | nbformat==5.6.0 139 | # via 140 | # nbclient 141 | # nbconvert 142 | # notebook 143 | nest-asyncio==1.5.5 144 | # via 145 | # ipykernel 146 | # jupyter-client 147 | # nbclient 148 | # notebook 149 | notebook==6.4.12 150 | # via jupyter 151 | packaging==23.0 152 | # via 153 | # black 154 | # build 155 | # fastcore 156 | # ghapi 157 | # ipykernel 158 | # nbconvert 159 | # qtpy 160 | pandocfilters==1.5.0 161 | # via nbconvert 162 | parso==0.8.3 163 | # via jedi 164 | pathspec==0.10.1 165 | # via black 166 | pep517==0.13.0 167 | # via build 168 | pexpect==4.8.0 169 | # via ipython 170 | pickleshare==0.7.5 171 | # via ipython 172 | pip-tools==6.11.0 173 | # via -r requirements/dev.in 174 | platformdirs==2.5.2 175 | # via black 176 | prometheus-client==0.14.1 177 | # via notebook 178 | prompt-toolkit==3.0.31 179 | # via 180 | # ipython 181 | # jupyter-console 182 | psutil==5.9.2 183 | # via ipykernel 184 | ptyprocess==0.7.0 185 | # via 186 | # pexpect 187 | # terminado 188 | pure-eval==0.2.2 189 | # via stack-data 190 | pycodestyle==2.10.0 191 | # via flake8 192 | pycparser==2.21 193 | # via cffi 194 | pyflakes==3.0.1 195 | # via flake8 196 | pygments==2.13.0 197 | # via 198 | # ipython 199 | # jupyter-console 200 | # nbconvert 201 | # qtconsole 202 | pyrsistent==0.18.1 203 | # via jsonschema 204 | python-dateutil==2.8.2 205 | # via jupyter-client 206 | pyyaml==6.0 207 | # via nbdev 208 | pyzmq==24.0.1 209 | # via 210 | # ipykernel 211 | # jupyter-client 212 | # notebook 213 | # qtconsole 214 | qtconsole==5.3.2 215 | # via jupyter 216 | qtpy==2.2.0 217 | # via qtconsole 218 | send2trash==1.8.0 219 | # via notebook 220 | six==1.16.0 221 | # via 222 | # asttokens 223 | # astunparse 224 | # bleach 225 | # python-dateutil 226 | soupsieve==2.3.2.post1 227 | # via beautifulsoup4 228 | stack-data==0.5.0 229 | # via ipython 230 | terminado==0.15.0 231 | # via notebook 232 | tinycss2==1.1.1 233 | # via nbconvert 234 | tomli==2.0.1 235 | # via 236 | # black 237 | # build 238 | # mypy 239 | # pep517 240 | tornado==6.2 241 | # via 242 | # ipykernel 243 | # jupyter-client 244 | # notebook 245 | # terminado 246 | traitlets==5.4.0 247 | # via 248 | # ipykernel 249 | # ipython 250 | # ipywidgets 251 | # jupyter-client 252 | # jupyter-core 253 | # matplotlib-inline 254 | # nbclient 255 | # nbconvert 256 | # nbformat 257 | # notebook 258 | # qtconsole 259 | typing-extensions==4.3.0 260 | # via 261 | # black 262 | # mypy 263 | watchdog==2.1.9 264 | # via nbdev 265 | wcwidth==0.2.5 266 | # via prompt-toolkit 267 | webencodings==0.5.1 268 | # via 269 | # bleach 270 | # tinycss2 271 | wheel==0.37.1 272 | # via 273 | # astunparse 274 | # pip-tools 275 | widgetsnbextension==4.0.3 276 | # via ipywidgets 277 | zipp==3.12.1 278 | # via importlib-metadata 279 | 280 | # The following packages are considered to be unsafe in a requirements file: 281 | # pip 282 | # setuptools 283 | -------------------------------------------------------------------------------- /requirements/test.in: -------------------------------------------------------------------------------- 1 | black 2 | # NOTE(mrobinson) - Pinning click due to a unicode issue in black 3 | # can remove after black drops support for Python 3.6 4 | # ref: https://github.com/psf/black/issues/2964 5 | click==8.1.3 6 | flake8 7 | mypy 8 | pytest-cov 9 | -------------------------------------------------------------------------------- /requirements/test.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.9 3 | # by the following command: 4 | # 5 | # pip-compile requirements/test.in 6 | # 7 | attrs==22.1.0 8 | # via pytest 9 | black==23.1.0 10 | # via -r requirements/test.in 11 | click==8.1.3 12 | # via 13 | # -r requirements/test.in 14 | # black 15 | coverage[toml]==6.4.4 16 | # via pytest-cov 17 | flake8==6.0.0 18 | # via -r requirements/test.in 19 | iniconfig==1.1.1 20 | # via pytest 21 | mccabe==0.7.0 22 | # via flake8 23 | mypy==0.991 24 | # via -r requirements/test.in 25 | mypy-extensions==0.4.3 26 | # via 27 | # black 28 | # mypy 29 | packaging==23.0 30 | # via 31 | # black 32 | # pytest 33 | pathspec==0.10.1 34 | # via black 35 | platformdirs==2.5.2 36 | # via black 37 | pluggy==1.0.0 38 | # via pytest 39 | py==1.11.0 40 | # via pytest 41 | pycodestyle==2.10.0 42 | # via flake8 43 | pyflakes==3.0.1 44 | # via flake8 45 | pytest==7.1.3 46 | # via pytest-cov 47 | pytest-cov==4.0.0 48 | # via -r requirements/test.in 49 | tomli==2.0.1 50 | # via 51 | # black 52 | # coverage 53 | # mypy 54 | # pytest 55 | typing-extensions==4.3.0 56 | # via 57 | # black 58 | # mypy 59 | -------------------------------------------------------------------------------- /sample-docs/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/pipeline-paddleocr/7142d2bd80b1687424fef0437f18de6a16124b1b/sample-docs/.gitkeep -------------------------------------------------------------------------------- /sample-docs/sample-receipt.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/pipeline-paddleocr/7142d2bd80b1687424fef0437f18de6a16124b1b/sample-docs/sample-receipt.jpg -------------------------------------------------------------------------------- /scripts/check-and-format-notebooks.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import argparse 4 | from copy import deepcopy 5 | import difflib 6 | import json 7 | from pathlib import Path 8 | import sys 9 | from typing import List, Tuple, Union 10 | 11 | from nbdev import clean 12 | from nbconvert.preprocessors import ExecutePreprocessor 13 | import nbformat 14 | from unstructured_api_tools.pipelines.convert import read_notebook 15 | 16 | 17 | def process_nb(nb: nbformat.NotebookNode, working_dir: Union[str, Path]) -> nbformat.NotebookNode: 18 | """Execute cells in nb using working_dir as the working directory for imports, modifying the 19 | notebook in place (in memory).""" 20 | # Clear existing outputs before executing the notebook 21 | for cell in nb.cells: 22 | if cell.cell_type == "code": 23 | cell.outputs = [] 24 | ep = ExecutePreprocessor(timeout=600) 25 | ep.preprocess(nb, {"metadata": {"path": working_dir}}) 26 | # Merge adjacent text outputs after executing the notebook 27 | for cell in nb.cells: 28 | merge_adjacent_text_outputs(cell) 29 | return nb 30 | 31 | 32 | def merge_adjacent_text_outputs(cell: nbformat.NotebookNode) -> nbformat.NotebookNode: 33 | """Merges adjacent text stream outputs to avoid non-deterministic splits in output.""" 34 | if cell.cell_type != "code": 35 | return cell 36 | 37 | new_outputs = [] 38 | current_output = None 39 | 40 | for output in cell.outputs: 41 | if output.output_type == "stream": 42 | if current_output is None: 43 | current_output = output 44 | elif current_output.name == output.name: 45 | current_output.text += output.text 46 | else: 47 | new_outputs.append(current_output) 48 | current_output = output 49 | else: 50 | if current_output is not None: 51 | new_outputs.append(current_output) 52 | current_output = None 53 | new_outputs.append(output) 54 | 55 | if current_output is not None: 56 | new_outputs.append(current_output) 57 | 58 | cell.outputs = new_outputs 59 | return cell 60 | 61 | 62 | def nb_paths(root_path: Union[str, Path]) -> List[Path]: 63 | """Fetches all .ipynb filenames that belong to subdirectories of root_path (1 level deep) with 64 | 'notebooks' in the name.""" 65 | root_path = Path(root_path) 66 | return [ 67 | fn 68 | for dir in root_path.iterdir() 69 | # NOTE(alan): Search only in paths with 'notebooks' in the title such as pipeline-notebooks 70 | # and exploration-notebooks 71 | if "notebooks" in dir.stem and dir.is_dir() 72 | for fn in dir.iterdir() 73 | if fn.suffix == ".ipynb" 74 | ] 75 | 76 | 77 | def to_results_str(fns: List[Path], nonmatching_nbs: List[Path]) -> Tuple[str, str]: 78 | """Given files that were checked and list of files that would be changed, produces a summary of 79 | changes as well as a list of files to be changed""" 80 | unchanged = len(fns) - len(nonmatching_nbs) 81 | results = [] 82 | if nonmatching_nbs: 83 | results.append( 84 | f"{len(nonmatching_nbs)} " 85 | f"{'file' if len(nonmatching_nbs) == 1 else 'files'} " 86 | f"{'would be ' if check else ''}changed" 87 | ) 88 | if unchanged: 89 | results.append( 90 | f"{unchanged} " 91 | f"{'file' if unchanged == 1 else 'files'} " 92 | f"{'would be ' if check else ''}left unchanged" 93 | ) 94 | summary_str = ", ".join(results) + ".\n" 95 | if nonmatching_nbs: 96 | details_str = ( 97 | f"The following notebooks {'would have been' if check else 'were'} " 98 | "changed when executed and cleaned:\n* " + "\n* ".join(nonmatching_nbs) + "\n" 99 | ) 100 | else: 101 | details_str = "" 102 | 103 | return summary_str, details_str 104 | 105 | 106 | if __name__ == "__main__": 107 | parser = argparse.ArgumentParser() 108 | parser.add_argument( 109 | "--check", 110 | default=False, 111 | action="store_true", 112 | help="Check notebook format without making changes. Return code 0 means formatting would " 113 | "produce no changes. Return code 1 means some files would be changed.", 114 | ) 115 | parser.add_argument( 116 | "notebooks", 117 | metavar="notebook", 118 | nargs="*", 119 | help="Path(s) to notebook(s) to format (or check). If you don't pass any paths, " 120 | "notebooks in any subfolders with 'notebooks' in the name will be processed.", 121 | default=[], 122 | ) 123 | args = parser.parse_args() 124 | check = args.check 125 | notebooks = args.notebooks 126 | 127 | root_path = Path(__file__).parent.parent 128 | nonmatching_nbs = [] 129 | fns = notebooks if notebooks else nb_paths(root_path) 130 | for fn in fns: 131 | print(f"{'checking' if check else 'processing'} {fn}") 132 | nb = read_notebook(fn) 133 | modified_nb = deepcopy(nb) 134 | process_nb(modified_nb, root_path) 135 | clean.clean_nb(modified_nb, allowed_cell_metadata_keys=["tags"]) 136 | if nb != modified_nb: 137 | nonmatching_nbs.append(str(fn)) 138 | nb_json = json.dumps(nb.dict(), indent=2, sort_keys=True) 139 | modified_nb_json = json.dumps(modified_nb.dict(), indent=2, sort_keys=True) 140 | sys.stderr.write(f"The following diff shows the modifications made to {fn}\n") 141 | sys.stderr.writelines( 142 | ( 143 | difflib.unified_diff( 144 | nb_json.splitlines(keepends=True), 145 | modified_nb_json.splitlines(keepends=True), 146 | ) 147 | ) 148 | ) 149 | if not check: 150 | nbformat.write(modified_nb, fn) 151 | 152 | summary_str, details_str = to_results_str(fns, nonmatching_nbs) 153 | print(summary_str) 154 | if check: 155 | sys.stderr.write(details_str) 156 | if nonmatching_nbs: 157 | sys.exit(1) 158 | else: 159 | print(details_str) 160 | -------------------------------------------------------------------------------- /scripts/docker-build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -euo pipefail 4 | 5 | DOCKER_BUILDKIT=1 docker buildx build --load --platform=linux/amd64 -f Dockerfile \ 6 | --build-arg PIP_VERSION="$PIP_VERSION" \ 7 | --build-arg PIPELINE_PACKAGE="$PIPELINE_PACKAGE" \ 8 | --progress plain \ 9 | -t pipeline-family-"$PIPELINE_FAMILY"-dev:latest . 10 | -------------------------------------------------------------------------------- /scripts/shellcheck.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | find scripts -name "*.sh" -exec shellcheck {} + 4 | 5 | -------------------------------------------------------------------------------- /scripts/test-doc-pipeline-apis-consistent.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -eu -o pipefail 4 | 5 | SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 6 | cd "$SCRIPT_DIR"/.. 7 | 8 | PIPELINE_OUTPUT_DIR=tmp-api-check-output-$RANDOM 9 | FILE_INDICTATING_FAILURE="$PIPELINE_OUTPUT_DIR"-has-failures 10 | mkdir -p $PIPELINE_OUTPUT_DIR 11 | touch $PIPELINE_OUTPUT_DIR/__init__.py 12 | 13 | function tmp_pipeline_comp_cleanup () { 14 | cd "$SCRIPT_DIR"/.. 15 | rm -f "$FILE_INDICTATING_FAILURE" 16 | if [[ "$1" -eq 0 ]]; then 17 | rm -rf $PIPELINE_OUTPUT_DIR 18 | fi 19 | exit "$1" 20 | } 21 | 22 | unstructured_api_tools convert-pipeline-notebooks \ 23 | --input-directory ./pipeline-notebooks \ 24 | --output-directory "$PIPELINE_OUTPUT_DIR" 25 | 26 | NUM_PIPELINE_API_FILES_GENERATED=$(find "$PIPELINE_OUTPUT_DIR" -name "*.py" | wc -l) 27 | 28 | if [[ "$NUM_PIPELINE_API_FILES_GENERATED" -eq 0 ]]; then 29 | echo "No pipelines where created by unstructured_api_tools convert-pipeline-notebooks" 30 | tmp_pipeline_comp_cleanup 1 31 | fi 32 | 33 | NUM_EXISTING_PIPELINE_API_FILES=$(find "$PACKAGE_NAME"/api -name "*.py" | wc -l) 34 | 35 | if [[ "$NUM_PIPELINE_API_FILES_GENERATED" -gt "$NUM_EXISTING_PIPELINE_API_FILES" ]]; then 36 | echo "More pipeline api files were autogenerated than appear in the ${PACKAGE_NAME}/api" 37 | tmp_pipeline_comp_cleanup 1 38 | elif [[ "$NUM_PIPELINE_API_FILES_GENERATED" -lt "$NUM_EXISTING_PIPELINE_API_FILES" ]]; then 39 | echo "Fewer pipeline api files were autogenerated than appear in the ${PACKAGE_NAME}/api" 40 | tmp_pipeline_comp_cleanup 1 41 | fi 42 | 43 | cd "$PACKAGE_NAME"/api 44 | find . -name "*.py" -print0 | while IFS= read -r -d '' pipeline_file; do 45 | set +o pipefail 46 | if ! diff -u "$pipeline_file" ../../"$PIPELINE_OUTPUT_DIR/$pipeline_file"; then 47 | touch "../../$FILE_INDICTATING_FAILURE" 48 | fi 49 | set -o pipefail 50 | done 51 | cd - 52 | 53 | if [ -r "$FILE_INDICTATING_FAILURE" ]; then 54 | echo 55 | echo "Autogenerated pipeline api file(s) do not match existing versions, see above for diff's" 56 | echo " or run: diff -ru ${PACKAGE_NAME}/api/ ${PIPELINE_OUTPUT_DIR}/" 57 | tmp_pipeline_comp_cleanup 1 58 | fi 59 | tmp_pipeline_comp_cleanup 0 60 | -------------------------------------------------------------------------------- /scripts/version-sync.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | function usage { 3 | echo "Usage: $(basename "$0") [-c] -f FILE_TO_CHANGE REPLACEMENT_FORMAT [-f FILE_TO_CHANGE REPLACEMENT_FORMAT ...]" 2>&1 4 | echo 'Synchronize files to latest version in source file' 5 | echo ' -s Specifies source file for version (default is CHANGELOG.md)' 6 | echo ' -f Specifies a file to change and the format for searching and replacing versions' 7 | echo ' FILE_TO_CHANGE is the file to be updated/checked for updates' 8 | echo ' REPLACEMENT_FORMAT is one of (semver, release, api-release)' 9 | echo ' semver indicates to look for a full semver version and replace with the latest full version' 10 | echo ' release indicates to look for a release semver version (x.x.x) and replace with the latest release version' 11 | echo ' api-release indicates to look for a release semver version in the context of an api route and replace with the latest release version' 12 | echo ' -c Compare versions and output proposed changes without changing anything.' 13 | } 14 | 15 | function getopts-extra () { 16 | declare i=1 17 | # if the next argument is not an option, then append it to array OPTARG 18 | while [[ ${OPTIND} -le $# && ${!OPTIND:0:1} != '-' ]]; do 19 | OPTARG[i]=${!OPTIND} 20 | i+=1 21 | OPTIND+=1 22 | done 23 | } 24 | 25 | # Parse input options 26 | declare CHECK=0 27 | declare SOURCE_FILE="CHANGELOG.md" 28 | declare -a FILES_TO_CHECK=() 29 | declare -a REPLACEMENT_FORMATS=() 30 | declare args 31 | declare OPTIND OPTARG opt 32 | while getopts ":hcs:f:" opt; do 33 | case $opt in 34 | h) 35 | usage 36 | exit 0 37 | ;; 38 | c) 39 | CHECK=1 40 | ;; 41 | s) 42 | SOURCE_FILE="$OPTARG" 43 | ;; 44 | f) 45 | getopts-extra "$@" 46 | args=( "${OPTARG[@]}" ) 47 | # validate length of args, should be 2 48 | if [ ${#args[@]} -eq 2 ]; then 49 | FILES_TO_CHECK+=( "${args[0]}" ) 50 | REPLACEMENT_FORMATS+=( "${args[1]}" ) 51 | else 52 | echo "Exactly 2 arguments must follow -f option." >&2 53 | exit 1 54 | fi 55 | ;; 56 | \?) 57 | echo "Invalid option: -$OPTARG." >&2 58 | usage 59 | exit 1 60 | ;; 61 | esac 62 | done 63 | 64 | # Parse REPLACEMENT_FORMATS 65 | RE_SEMVER_FULL="(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)(-((0|[1-9][0-9]*|[0-9]*[a-zA-Z-][0-9a-zA-Z-]*)(\.(0|[1-9][0-9]*|[0-9]*[a-zA-Z-][0-9a-zA-Z-]*))*))?(\+([0-9a-zA-Z-]+(\.[0-9a-zA-Z-]+)*))?" 66 | RE_RELEASE="(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)" 67 | RE_API_RELEASE="v(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)" 68 | # Pull out semver appearing earliest in SOURCE_FILE. 69 | LAST_VERSION=$(grep -o -m 1 -E "${RE_SEMVER_FULL}" "$SOURCE_FILE") 70 | LAST_RELEASE=$(grep -o -m 1 -E "${RE_RELEASE}($|[^-+])" "$SOURCE_FILE" | grep -o -m 1 -E "${RE_RELEASE}") 71 | LAST_API_RELEASE="v$(grep -o -m 1 -E "${RE_RELEASE}($|[^-+])$" "$SOURCE_FILE" | grep -o -m 1 -E "${RE_RELEASE}")" 72 | declare -a RE_SEMVERS=() 73 | declare -a UPDATED_VERSIONS=() 74 | for i in "${!REPLACEMENT_FORMATS[@]}"; do 75 | REPLACEMENT_FORMAT=${REPLACEMENT_FORMATS[$i]} 76 | case $REPLACEMENT_FORMAT in 77 | semver) 78 | RE_SEMVERS+=( "$RE_SEMVER_FULL" ) 79 | UPDATED_VERSIONS+=( "$LAST_VERSION" ) 80 | ;; 81 | release) 82 | RE_SEMVERS+=( "$RE_RELEASE" ) 83 | UPDATED_VERSIONS+=( "$LAST_RELEASE" ) 84 | ;; 85 | api-release) 86 | RE_SEMVERS+=( "$RE_API_RELEASE" ) 87 | UPDATED_VERSIONS+=( "$LAST_API_RELEASE" ) 88 | ;; 89 | *) 90 | echo "Invalid replacement format: \"${REPLACEMENT_FORMAT}\". Use semver, release, or api-release" >&2 91 | exit 1 92 | ;; 93 | esac 94 | done 95 | 96 | if [ -z "$LAST_VERSION" ]; 97 | then 98 | # No match to semver regex in SOURCE_FILE, so no version to go from. 99 | printf "Error: Unable to find latest version from %s.\n" "$SOURCE_FILE" 100 | exit 1 101 | fi 102 | 103 | # Search files in FILES_TO_CHECK and change (or get diffs) 104 | declare FAILED_CHECK=0 105 | 106 | for i in "${!FILES_TO_CHECK[@]}"; do 107 | FILE_TO_CHANGE=${FILES_TO_CHECK[$i]} 108 | RE_SEMVER=${RE_SEMVERS[$i]} 109 | UPDATED_VERSION=${UPDATED_VERSIONS[$i]} 110 | FILE_VERSION=$(grep -o -m 1 -E "${RE_SEMVER}" "$FILE_TO_CHANGE") 111 | if [ -z "$FILE_VERSION" ]; 112 | then 113 | # No match to semver regex in VERSIONFILE, so nothing to replace 114 | printf "Error: No semver version found in file %s.\n" "$FILE_TO_CHANGE" 115 | exit 1 116 | else 117 | # Replace semver in VERSIONFILE with semver obtained from SOURCE_FILE 118 | TMPFILE=$(mktemp /tmp/new_version.XXXXXX) 119 | # Check sed version, exit if version < 4.3 120 | if ! sed --version > /dev/null 2>&1; then 121 | CURRENT_VERSION=1.archaic 122 | else 123 | CURRENT_VERSION=$(sed --version | head -n1 | cut -d" " -f4) 124 | fi 125 | REQUIRED_VERSION="4.3" 126 | if [ "$(printf '%s\n' "$REQUIRED_VERSION" "$CURRENT_VERSION" | sort -V | head -n1)" != "$REQUIRED_VERSION" ]; then 127 | echo "sed version must be >= ${REQUIRED_VERSION}" && exit 1 128 | fi 129 | sed -E -r "s/$RE_SEMVER/$UPDATED_VERSION/" "$FILE_TO_CHANGE" > "$TMPFILE" 130 | if [ $CHECK == 1 ]; 131 | then 132 | DIFF=$(diff "$FILE_TO_CHANGE" "$TMPFILE" ) 133 | if [ -z "$DIFF" ]; 134 | then 135 | printf "version sync would make no changes to %s.\n" "$FILE_TO_CHANGE" 136 | rm "$TMPFILE" 137 | else 138 | FAILED_CHECK=1 139 | printf "version sync would make the following changes to %s:\n%s\n" "$FILE_TO_CHANGE" "$DIFF" 140 | rm "$TMPFILE" 141 | fi 142 | else 143 | cp "$TMPFILE" "$FILE_TO_CHANGE" 144 | rm "$TMPFILE" 145 | fi 146 | fi 147 | done 148 | 149 | # Exit with code determined by whether changes were needed in a check. 150 | if [ ${FAILED_CHECK} -ne 0 ]; then 151 | exit 1 152 | else 153 | exit 0 154 | fi 155 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 100 3 | exclude = 4 | prepline_*/api 5 | -------------------------------------------------------------------------------- /test_paddleocr/api/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Unstructured-IO/pipeline-paddleocr/7142d2bd80b1687424fef0437f18de6a16124b1b/test_paddleocr/api/.gitkeep -------------------------------------------------------------------------------- /test_paddleocr/api/test_paddleocr.py: -------------------------------------------------------------------------------- 1 | from fastapi.testclient import TestClient 2 | 3 | 4 | from prepline_paddleocr.api.app import app 5 | 6 | 7 | def test_api_health_check(): 8 | client = TestClient(app) 9 | response = client.get("/healthcheck") 10 | 11 | assert response.status_code == 200 12 | 13 | 14 | def test_api_call(): 15 | client = TestClient(app) 16 | with open("img/0.png", "rb") as f: 17 | response = client.post( 18 | "/paddleocr/v0.0.1/paddleocr", files={"files": ("filename", f, "image/jpeg")} 19 | ) 20 | 21 | assert response.status_code == 200 22 | 23 | 24 | def test_api_call_files(): 25 | client = TestClient(app) 26 | 27 | files = [("files", open("img/0.png", "rb")), ("files", open("img/0.png", "rb"))] 28 | 29 | response = client.post("/paddleocr/v0.0.1/paddleocr", files=files) 30 | 31 | assert response.status_code == 200 32 | --------------------------------------------------------------------------------