├── .github
├── dependabot.yml
└── workflows
│ ├── ci.yml
│ └── codeql-analysis.yml
├── .gitignore
├── CHANGELOG.md
├── Dockerfile
├── LICENSE
├── Makefile
├── README.md
├── exploration-notebooks
└── .gitkeep
├── img
├── 0.png
└── unstructured_logo.png
├── lib
└── libstdc++.so.6
├── logger_config.yaml
├── pipeline-notebooks
├── .gitkeep
└── pipeline-paddleocr.ipynb
├── prepline_paddleocr
├── __init__.py
└── api
│ ├── __init__.py
│ ├── app.py
│ └── paddleocr.py
├── preprocessing-pipeline-family.yaml
├── requirements
├── base.in
├── base.txt
├── dev.in
├── dev.txt
├── test.in
└── test.txt
├── sample-docs
├── .gitkeep
└── sample-receipt.jpg
├── scripts
├── check-and-format-notebooks.py
├── docker-build.sh
├── shellcheck.sh
├── test-doc-pipeline-apis-consistent.sh
└── version-sync.sh
├── setup.cfg
└── test_paddleocr
└── api
├── .gitkeep
└── test_paddleocr.py
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | updates:
3 |   - package-ecosystem: 'pip'
4 |     directory: '/requirements'
5 |     schedule:
6 |       interval: 'weekly'
7 |
8 |   - package-ecosystem: 'github-actions'
9 |     # NOTE(robinson) - Workflow files live in the
10 |     # default location, `.github/workflows`
11 |     directory: '/'
12 |     schedule:
13 |       interval: 'weekly'
14 |
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: CI
2 |
3 | on:
4 |   push:
5 |     branches: [main]
6 |   pull_request:
7 |     branches: [main]
8 |
9 | env:
10 |   PYTHON_VERSION: '3.8'  # quoted so it stays a string rather than a float
11 |   PIPELINE_FAMILY: 'paddleocr'
12 |
13 | jobs:
14 |   setup:  # populates the .venv cache entry that the other jobs restore by key
15 |     runs-on: ubuntu-latest
16 |     steps:
17 |       - uses: actions/checkout@v3
18 |       - id: virtualenv-cache
19 |         uses: actions/cache@v3
20 |         with:
21 |           path: |
22 |             .venv
23 |           key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/*.txt') }}
24 |       - name: Set up Python ${{ env.PYTHON_VERSION }}
25 |         uses: actions/setup-python@v4
26 |         with:
27 |           python-version: ${{ env.PYTHON_VERSION }}
28 |       - name: Setup virtual environment (no cache hit)
29 |         if: steps.virtualenv-cache.outputs.cache-hit != 'true'  # skip the rebuild on a cache hit
30 |         run: |
31 |           python${{ env.PYTHON_VERSION }} -m venv .venv
32 |           source .venv/bin/activate
33 |           make install
34 |
35 |   lint:  # runs `make check` inside the cached virtualenv
36 |     needs: setup
37 |     runs-on: ubuntu-latest
38 |     steps:
39 |       - uses: actions/checkout@v3
40 |       - id: virtualenv-cache
41 |         uses: actions/cache@v3
42 |         with:
43 |           path: |
44 |             .venv
45 |           key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/*.txt') }}
46 |       - name: Lint
47 |         run: |
48 |           source .venv/bin/activate
49 |           make check
50 |
51 |   shellcheck:  # lints the repository's shell scripts; needs no virtualenv
52 |     runs-on: ubuntu-latest
53 |     steps:
54 |       - uses: actions/checkout@v3
55 |       - name: ShellCheck
56 |         uses: ludeeus/action-shellcheck@2.0.0  # pinned release tag; `@master` is a mutable ref
57 |
58 |   test:  # runs tests, the coverage gate and the notebook check in the cached virtualenv
59 |     runs-on: ubuntu-latest
60 |     needs: [setup, lint]
61 |     steps:
62 |       - uses: actions/checkout@v3
63 |       - uses: actions/cache@v3
64 |         id: virtualenv-cache
65 |         with:
66 |           path: |
67 |             .venv
68 |           key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/*.txt') }}
69 |       - name: Run core tests
70 |         run: |
71 |           source .venv/bin/activate
72 |           sudo apt-get update && sudo apt-get install --yes poppler-utils
73 |           make test
74 |           make check-coverage
75 |           make check-notebooks
76 |
77 |   changelog:  # requires a CHANGELOG.md update when package or notebook sources change
78 |     runs-on: ubuntu-latest
79 |     steps:
80 |       - uses: actions/checkout@v3
81 |       - if: github.ref != 'refs/heads/main'
82 |         uses: dorny/paths-filter@v2
83 |         id: changes  # exposes `outputs.src`, read by the enforcer step below
84 |         with:
85 |           filters: |
86 |             src:
87 |               - 'prepline_paddleocr/**'
88 |               - 'pipeline-notebooks/**'
89 |
90 |       - if: steps.changes.outputs.src == 'true' && github.ref != 'refs/heads/main'
91 |         uses: dangoslen/changelog-enforcer@v3
92 |
93 |   api_consistency:  # runs `make api-check` inside the cached virtualenv
94 |     needs: setup
95 |     runs-on: ubuntu-latest
96 |     steps:
97 |       - uses: actions/checkout@v3
98 |       - id: virtualenv-cache
99 |         uses: actions/cache@v3
100 |         with:
101 |           path: |
102 |             .venv
103 |           key: ci-venv-${{ env.PIPELINE_FAMILY }}-${{ hashFiles('requirements/*.txt') }}
104 |       - name: API Consistency
105 |         run: |
106 |           source .venv/bin/activate
107 |           make api-check
108 |
--------------------------------------------------------------------------------
/.github/workflows/codeql-analysis.yml:
--------------------------------------------------------------------------------
1 | # For most projects, this workflow file will not need changing; you simply need
2 | # to commit it to your repository.
3 | #
4 | # You may wish to alter this file to override the set of languages analyzed,
5 | # or to provide custom queries or build logic.
6 | #
7 | # ******** NOTE ********
8 | # We have attempted to detect the languages in your repository. Please check
9 | # the `language` matrix defined below to confirm you have the correct set of
10 | # supported CodeQL languages.
11 | #
12 | name: 'CodeQL'
13 |
14 | on:
15 |   push:
16 |     branches: ['main']
17 |   pull_request:
18 |     # The branches below must be a subset of the branches above
19 |     branches: ['main']
20 |   schedule:
21 |     - cron: '21 21 * * 5'
22 |
23 | jobs:
24 |   analyze:
25 |     name: Analyze
26 |     runs-on: ubuntu-latest
27 |     permissions:
28 |       actions: read
29 |       contents: read
30 |       security-events: write
31 |
32 |     strategy:
33 |       fail-fast: false
34 |       matrix:
35 |         language: ['python']
36 |         # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
37 |         # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support
38 |
39 |     steps:
40 |       - name: Checkout repository
41 |         uses: actions/checkout@v3
42 |
43 |       # Initializes the CodeQL tools for scanning.
44 |       - name: Initialize CodeQL
45 |         uses: github/codeql-action/init@v2
46 |         with:
47 |           languages: ${{ matrix.language }}
48 |           # If you wish to specify custom queries, you can do so here or in a config file.
49 |           # By default, queries listed here will override any specified in a config file.
50 |           # Prefix the list here with "+" to use these queries and those in the config file.
51 |
52 |           # For details on CodeQL's query packs, see: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
53 |           # queries: security-extended,security-and-quality
54 |
55 |
56 |       # Autobuild attempts to build any compiled languages (C/C++, C#, or Java).
57 |       # If this step fails, then you should remove it and run the build manually (see below)
58 |       - name: Autobuild
59 |         uses: github/codeql-action/autobuild@v2
60 |
61 |       # ℹ️ Command-line programs to run using the OS shell.
62 |       # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
63 |
64 |       # If Autobuild fails above, remove it and uncomment the following three lines,
65 |       # then modify them (or add more) to build your project; see the EXAMPLE below for guidance.
66 |
67 |       # - run: |
68 |       #   echo "Run, Build Application using script"
69 |       #   ./location_of_script_within_repo/buildscript.sh
70 |
71 |       - name: Perform CodeQL Analysis
72 |         uses: github/codeql-action/analyze@v2
73 |         with:
74 |           category: '/language:${{matrix.language}}'
75 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
131 | # VSCode
132 | .vscode/
133 |
134 | # Mac
135 | .DS_Store
136 |
137 | nbs/
138 |
139 | # Celery files that are created when the mercury dashboard is run
140 | celery.sqlite
141 | celerybeat-schedule.db
142 |
143 | # temporarily generated files by project-specific Makefile
144 | tmp*
145 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | ## 0.0.1
2 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | # syntax=docker/dockerfile:1
2 |
3 | FROM centos:centos7.9.2009
4 |
5 | # NOTE(crag): NB_USER ARG for mybinder.org compat:
6 | # https://mybinder.readthedocs.io/en/latest/tutorials/dockerfile.html
7 | ARG NB_USER=notebook-user
8 | ARG NB_UID=1000
9 | ARG PIP_VERSION
10 | ARG PIPELINE_PACKAGE
11 | # NOTE(review): DEBIAN_FRONTEND is an apt/debconf setting; presumably a no-op on this CentOS base - confirm before removing
12 | ENV DEBIAN_FRONTEND=noninteractive
13 |
14 | RUN yum update -y
15 | RUN yum upgrade -y
16 | # NOTE(review): build-essential, the lib*-dev names, xz-utils and tk-dev look like Debian package names - verify yum resolves them
17 | RUN yum install -y make build-essential libssl-dev zlib1g-dev libbz2-dev \
18 |     libreadline-dev libsqlite3-dev wget curl libncurses5-dev libncursesw5-dev \
19 |     xz-utils tk-dev libffi-dev liblzma-dev git mesa-libGL
20 |
21 | RUN yum -y install gcc openssl-devel bzip2-devel libffi-devel make git sqlite-devel && \
22 |     curl -O https://www.python.org/ftp/python/3.8.15/Python-3.8.15.tgz && tar -xzf Python-3.8.15.tgz && \
23 |     cd Python-3.8.15/ && ./configure --enable-shared --enable-optimizations && make altinstall && \
24 |     cd .. && rm -rf Python-3.8.15* && \
25 |     ln -s /usr/local/bin/python3.8 /usr/local/bin/python3
26 |
27 | COPY lib/libstdc++.so.6 /usr/lib64
28 |
29 | # create user with a home directory
30 | ENV USER=${NB_USER}
31 | ENV HOME=/home/${NB_USER}
32 |
33 | RUN groupadd --gid ${NB_UID} ${NB_USER}
34 | RUN useradd --uid ${NB_UID} --gid ${NB_UID} ${NB_USER}
35 | USER ${NB_USER}
36 | WORKDIR ${HOME}
37 | ENV PYTHONPATH="${PYTHONPATH}:${HOME}"
38 | ENV PATH="/home/${NB_USER}/.local/bin:${PATH}"
39 | ENV LD_LIBRARY_PATH=/usr/local/lib
40 | ENV LD_LIBRARY_PATH=/usr/lib64:$LD_LIBRARY_PATH
41 |
42 | COPY logger_config.yaml logger_config.yaml
43 | COPY requirements/dev.txt requirements-dev.txt
44 | COPY requirements/base.txt requirements-base.txt
45 | COPY prepline_${PIPELINE_PACKAGE}/ prepline_${PIPELINE_PACKAGE}/
46 | COPY exploration-notebooks exploration-notebooks
47 | COPY pipeline-notebooks pipeline-notebooks
48 | COPY img/ img/
49 |
50 | #RUN echo 'export LD_LIBRARY_PATH=/usr/local/lib' >> ~/.bashrc
51 |
52 | RUN python3.8 -m pip install pip==${PIP_VERSION} \
53 |     && pip3.8 install --no-cache -r requirements-base.txt \
54 |     && pip3.8 install --no-cache -r requirements-dev.txt
55 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | PIPELINE_FAMILY := paddleocr
2 | PIPELINE_PACKAGE := paddleocr
3 | PACKAGE_NAME := prepline_$(PIPELINE_PACKAGE)
4 | PIP_VERSION := 23.1.2
5 |
6 | .PHONY: help
7 | help: Makefile
8 | 	@sed -n 's/^\(## \)\([a-zA-Z]\)/\2/p' $<
9 |
10 |
11 | ###########
12 | # Install #
13 | ###########
14 |
15 | ## install-base: installs minimum requirements to run the API
16 | .PHONY: install-base
17 | install-base: install-base-pip-packages
18 |
19 | ## install: installs all test and dev requirements
20 | .PHONY: install
21 | install: install-base install-test install-dev
22 | # All installs go through `python3 -m pip` so they use the interpreter whose pip is pinned below.
23 | .PHONY: install-base-pip-packages
24 | install-base-pip-packages:
25 | 	python3 -m pip install pip==${PIP_VERSION}
26 | 	python3 -m pip install -r requirements/base.txt
27 |
28 | .PHONY: install-test
29 | install-test:
30 | 	python3 -m pip install -r requirements/test.txt
31 |
32 | .PHONY: install-dev
33 | install-dev:
34 | 	python3 -m pip install -r requirements/dev.txt
35 |
36 | .PHONY: install-ci
37 | install-ci: install-base install-test
38 |
39 | ## pip-compile: compiles all base/dev/test requirements
40 | .PHONY: pip-compile
41 | pip-compile:
42 | 	pip-compile requirements/base.in
43 | 	pip-compile requirements/dev.in
44 | 	pip-compile requirements/test.in
45 |
46 |
47 | #########
48 | # Build #
49 | #########
50 |
51 | ## generate-api: generates the FastAPI python APIs from notebooks
52 | .PHONY: generate-api
53 | generate-api:
54 | 	PYTHONPATH=. unstructured_api_tools convert-pipeline-notebooks \
55 | 		--input-directory ./pipeline-notebooks \
56 | 		--output-directory ./$(PACKAGE_NAME)/api
57 |
58 | ##########
59 | # Docker #
60 | ##########
61 |
62 | # Docker targets are provided for convenience only and are not required in a standard development environment
63 |
64 | # Note that the image has notebooks baked in, however the current working directory
65 | # is mounted under /home/notebook-user/local/ when the image is started with
66 | # docker-start-api or docker-start-jupyter
67 |
68 | .PHONY: docker-build
69 | docker-build:
70 | 	PIP_VERSION=$(PIP_VERSION) PIPELINE_FAMILY=$(PIPELINE_FAMILY) PIPELINE_PACKAGE=$(PIPELINE_PACKAGE) ./scripts/docker-build.sh
71 |
72 | .PHONY: docker-start-api
73 | docker-start-api:
74 | 	docker run -p 8000:8000 --mount type=bind,source=$(realpath .),target=/home/notebook-user/local -t --rm pipeline-family-$(PIPELINE_FAMILY)-dev:latest uvicorn $(PACKAGE_NAME).api.app:app --log-config logger_config.yaml --host 0.0.0.0 --port 8000
75 |
76 | .PHONY: docker-start-jupyter
77 | docker-start-jupyter:
78 | 	docker run -p 8888:8888 --mount type=bind,source=$(realpath .),target=/home/notebook-user/local -t --rm pipeline-family-$(PIPELINE_FAMILY)-dev:latest jupyter-notebook --port 8888 --ip 0.0.0.0 --no-browser --NotebookApp.token='' --NotebookApp.password=''
79 |
80 |
81 | #########
82 | # Local #
83 | #########
84 |
85 | ## run-jupyter: starts jupyter notebook
86 | .PHONY: run-jupyter
87 | run-jupyter:
88 | 	PYTHONPATH=$(realpath .) JUPYTER_PATH=$(realpath .) jupyter-notebook --NotebookApp.token='' --NotebookApp.password=''
89 |
90 | ## run-web-app: runs the FastAPI api with hot reloading
91 | .PHONY: run-web-app
92 | run-web-app:
93 | 	PYTHONPATH=$(realpath .) uvicorn $(PACKAGE_NAME).api.app:app --log-config logger_config.yaml --reload
94 |
95 |
96 | #################
97 | # Test and Lint #
98 | #################
99 |
100 | ## test: runs core tests
101 | .PHONY: test
102 | test:
103 | 	PYTHONPATH=. pytest test_$(PIPELINE_PACKAGE) --cov=$(PACKAGE_NAME) --cov-report term-missing
104 |
105 | .PHONY: check-coverage
106 | check-coverage:
107 | 	coverage report --fail-under=90
108 |
109 | ## test-integration: runs integration tests
110 | .PHONY: test-integration
111 | test-integration:
112 | 	PYTHONPATH=. pytest test_$(PIPELINE_PACKAGE)_integration
113 |
114 | ## api-check: verifies auto-generated pipeline APIs match the existing ones
115 | .PHONY: api-check
116 | api-check:
117 | 	PYTHONPATH=. PACKAGE_NAME=$(PACKAGE_NAME) ./scripts/test-doc-pipeline-apis-consistent.sh
118 |
119 | ## check: runs linters (includes tests)
120 | .PHONY: check
121 | check: check-src check-tests check-version
122 |
123 | ## check-src: runs linters (source only, no tests)
124 | .PHONY: check-src
125 | check-src:
126 | 	black --line-length 100 $(PACKAGE_NAME) --check --exclude $(PACKAGE_NAME)/api
127 | 	flake8 $(PACKAGE_NAME)
128 | 	mypy $(PACKAGE_NAME) --ignore-missing-imports --install-types --non-interactive --implicit-optional
129 |
130 | .PHONY: check-tests
131 | check-tests:
132 | 	black --line-length 100 test_$(PIPELINE_PACKAGE) --check
133 | 	flake8 test_$(PIPELINE_PACKAGE)
134 |
135 | ## tidy: run black
136 | .PHONY: tidy
137 | tidy:
138 | 	black --line-length 100 $(PACKAGE_NAME)
139 | 	black --line-length 100 test_$(PIPELINE_PACKAGE)
140 |
141 | ## check-scripts: run shellcheck
142 | .PHONY: check-scripts
143 | check-scripts:
144 | 	# Fail if any of these files have warnings
145 | 	scripts/shellcheck.sh
146 |
147 | ## check-version: run check to ensure version in CHANGELOG.md matches references in files
148 | .PHONY: check-version
149 | check-version:
150 | 	# Fail if syncing version would produce changes
151 | 	scripts/version-sync.sh -c \
152 | 		-s CHANGELOG.md \
153 | 		-f README.md api-release \
154 | 		-f preprocessing-pipeline-family.yaml release
155 |
156 | ## check-notebooks: check that executing and cleaning notebooks doesn't produce changes
157 | .PHONY: check-notebooks
158 | check-notebooks:
159 | 	scripts/check-and-format-notebooks.py --check
160 |
161 | ## tidy-notebooks: execute notebooks and remove metadata
162 | .PHONY: tidy-notebooks
163 | tidy-notebooks:
164 | 	scripts/check-and-format-notebooks.py
165 |
166 | ## version-sync: update references to version with most recent version from CHANGELOG.md
167 | .PHONY: version-sync
168 | version-sync:
169 | 	scripts/version-sync.sh \
170 | 		-s CHANGELOG.md \
171 | 		-f README.md api-release \
172 | 		-f preprocessing-pipeline-family.yaml release
173 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
Pre-Processing OCR Pipeline for PaddleOCR
7 |
8 |
9 |
18 |
19 |
20 | This pipeline processes input image documents in the English language using [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR).
21 | The pipeline runs on `x86_64` CPUs.
22 |
23 | ## Developer Quick Start
24 |
25 | * Using `pyenv` to manage virtualenvs is recommended
26 | * Mac install instructions. See [here](https://github.com/Unstructured-IO/community#mac--homebrew) for more detailed instructions.
27 | * `brew install pyenv-virtualenv`
28 | * `pyenv install 3.8.15`
29 | * Linux instructions are available [here](https://github.com/Unstructured-IO/community#linux).
30 |
31 | * Create a virtualenv to work in and activate it, e.g. for one named `paddleocr`:
32 |
33 | `pyenv virtualenv 3.8.15 paddleocr`
34 | `pyenv activate paddleocr`
35 |
36 | * If you are on a Mac with an M1 chip, run `brew install mupdf swig freetype` to install
37 | required non-Python dependencies.
38 | * Run `make install`
39 | * Start a local jupyter notebook server with `make run-jupyter`
40 | **OR**
41 | just start the FastAPI app locally with `make run-web-app`
42 |
43 | ### Performing OCR on a JPG image
44 |
45 | To run OCR on a JPG image, run `make run-web-app` and run the following `curl` command,
46 | replacing `sample-docs/sample-receipt.jpg` with your filename:
47 |
48 | ```
49 | curl -X 'POST' \
50 | 'http://localhost:8000/paddleocr/v0.0.1/paddleocr' \
51 | -H 'accept: application/json' \
52 | -H 'Content-Type: multipart/form-data' \
53 | -F 'files=@sample-docs/sample-receipt.jpg' | jq -C . | less -R
54 | ```
55 |
56 | The result should look like the following.
57 |
58 | ```
59 | "{\"result\": [[[[162.0, 111.0], [429.0, 110.0], [429.0, 138.0], [162.0, 139.0]], [\"PETRON BKT
60 | LANJAN SB\", 0.918]], [[[162.0, 142.0], [418.0, 141.0], [418.0, 170.0], [162.0, 171.0]], [\"ALSERKAM
61 | ENTERPRISE\", 0.9785]], [[[44.0, 178.0], [562.0, 175.0], [562.0, 199.0], [44.0, 202.0]], [\"Te1
62 | 03-6156 8757 Co No 001083069-M\", 0.9282]], [[[121.0, 209.0], [467.0, 209.0], [467.0, 232.0],
63 | [121.0, 232.0]], [\"KM 458.4 BKT LANJAN UTARA,\", 0.9205]], [[[95.0, 239.0], [484.0, 237.0], [484.0,
64 | 264.0], [95.0, 267.0]], [\"L/RAYA UTARA SELATAN,SG BULOH\", 0.9525]], [[[188.0, 270.0], [403.0,
65 | 270.0], [403.0, 298.0], [188.0, 298.0]], [\"47000 SUNGAI BUL\", 0.9704]], [[[139.0, 335.0], [443.0,
66 | 335.0], [443.0, 359.0], [139.0, 359.0]], [\"GST ID No001210736640\", 0.9619]], [[[217.0, 397.0],
67 | [366.0, 397.0], [366.0, 424.0], [217.0, 424.0]], [\"TAX INVOICE\", 0.9886]], [[[29.0, 491.0],
68 | [351.0, 490.0], [351.0, 518.0], [29.0, 519.0]], [\"TAX INVOICE NO 19729058\", 0.963]], [[[28.0,
69 | 523.0], [129.0, 523.0], [129.0, 552.0], [28.0, 552.0]], [\"POS1\", 0.9617]], [[[29.0, 554.0],
70 | [272.0, 552.0], [272.0, 582.0], [29.0, 583.0]], [\"Store No.:129077\", 0.9439]], [[[492.0, 552.0],
71 | [553.0, 552.0], [553.0, 584.0], [492.0, 584.0]], [\"Babu\", 0.9968]], [[[28.0, 586.0], [169.0,
72 | 589.0], [169.0, 618.0], [27.0, 615.0]], [\"01/02/2018\", 0.9972]], [[[162.0, 587.0], [340.0, 587.0],
73 | [340.0, 615.0], [162.0, 615.0]], [\"4:43:17PM\", 0.8981]], [[[28.0, 683.0], [311.0, 683.0], [311.0,
74 | 711.0], [28.0, 711.0]], [\"A 2 doublemint te\", 0.9652]], [[[506.0, 679.0], [566.0, 679.0], [566.0,
75 | 710.0], [506.0, 710.0]], [\"3.00\", 0.9931]], [[[25.0, 714.0], [313.0, 712.0], [314.0, 742.0],
76 | [25.0, 743.0]], [\"A1sandwich vanill\", 0.9318]], [[[507.0, 711.0], [566.0, 711.0], [566.0, 743.0],
77 | [507.0, 743.0]], [\"1.90\", 0.9937]], [[[69.0, 778.0], [165.0, 778.0], [165.0, 807.0], [69.0,
78 | 807.0]], [\"GST RM\", 0.9119]], [[[505.0, 775.0], [566.0, 775.0], [566.0, 807.0], [505.0, 807.0]],
79 | [\"0.28\", 0.9929]], [[[70.0, 811.0], [296.0, 811.0], [296.0, 839.0], [70.0, 839.0]], [\"Total RM
80 | inc.GST:\", 0.9176]], [[[506.0, 807.0], [566.0, 807.0], [566.0, 839.0], [506.0, 839.0]], [\"4.90\",
81 | 0.9949]], [[[67.0, 873.0], [128.0, 873.0], [128.0, 905.0], [67.0, 905.0]], [\"Cash\", 0.9938]],
82 | [[[505.0, 868.0], [568.0, 868.0], [568.0, 905.0], [505.0, 905.0]], [\"5.00\", 0.992]], [[[67.0,
83 | 904.0], [154.0, 908.0], [153.0, 938.0], [66.0, 935.0]], [\"Change\", 0.9971]], [[[506.0, 903.0],
84 | [566.0, 903.0], [566.0, 935.0], [506.0, 935.0]], [\"0.10\", 0.9981]], [[[29.0, 968.0], [179.0,
85 | 973.0], [178.0, 1002.0], [29.0, 998.0]], [\"GsT Summary\", 0.8839]], [[[242.0, 969.0], [387.0,
86 | 966.0], [388.0, 996.0], [242.0, 999.0]], [\"AnountRM\", 0.895]], [[[454.0, 969.0], [562.0, 969.0],
87 | [562.0, 998.0], [454.0, 998.0]], [\"Tax (RM)\", 0.8915]], [[[29.0, 1002.0], [128.0, 1002.0], [128.0,
88 | 1033.0], [29.0, 1033.0]], [\"A=6.00%\", 0.9756]], [[[241.0, 1001.0], [301.0, 1001.0], [301.0,
89 | 1033.0], [241.0, 1033.0]], [\"4.62\", 0.9949]], [[[452.0, 999.0], [513.0, 999.0], [513.0, 1031.0],
90 | [452.0, 1031.0]], [\"0.28\", 0.9955]], [[[29.0, 1070.0], [47.0, 1070.0], [47.0, 1092.0], [29.0,
91 | 1092.0]], [\"A\", 0.9864]], [[[106.0, 1066.0], [418.0, 1066.0], [418.0, 1094.0], [106.0, 1094.0]],
92 | [\"ITAL INCLUDES 6.00%GST\", 0.9485]], [[[151.0, 1166.0], [429.0, 1166.0], [429.0, 1190.0], [151.0,
93 | 1190.0]], [\"Use 3000 Petron Miles\", 0.9395]], [[[176.0, 1197.0], [403.0, 1194.0], [403.0, 1223.0],
94 | [176.0, 1226.0]], [\"points to pay for\", 0.9474]], [[[228.0, 1227.0], [351.0, 1227.0], [351.0,
95 | 1257.0], [228.0, 1257.0]], [\"RM45 Fue1\", 0.932]]]}"
96 | ```
97 |
98 | You can also run OCR through the Python API using the following commands:
99 |
100 | ```python
101 | from prepline_paddleocr.api.paddleocr import pipeline_api
102 |
103 | filename = "sample-docs/sample-receipt.jpg"
104 |
105 | with open(filename, "rb") as f:
106 | pipeline_api(file=f)
107 | ```
108 |
109 |
110 | ### Generating Python files from the pipeline notebooks
111 |
112 | You can generate the FastAPI APIs from your pipeline notebooks by running `make generate-api`.
113 |
114 | ## Security Policy
115 |
116 | See our [security policy](https://github.com/Unstructured-IO/pipeline-paddleocr/security/policy) for
117 | information on how to report security vulnerabilities.
118 |
119 | ## Learn more
120 |
121 | | Section | Description |
122 | |-|-|
123 | | [Unstructured Community Github](https://github.com/Unstructured-IO/community) | Information about Unstructured.io community projects |
124 | | [Unstructured Github](https://github.com/Unstructured-IO) | Unstructured.io open source repositories |
125 | | [Company Website](https://unstructured.io) | Unstructured.io product and company info |
126 |
--------------------------------------------------------------------------------
/exploration-notebooks/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Unstructured-IO/pipeline-paddleocr/7142d2bd80b1687424fef0437f18de6a16124b1b/exploration-notebooks/.gitkeep
--------------------------------------------------------------------------------
/img/0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Unstructured-IO/pipeline-paddleocr/7142d2bd80b1687424fef0437f18de6a16124b1b/img/0.png
--------------------------------------------------------------------------------
/img/unstructured_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Unstructured-IO/pipeline-paddleocr/7142d2bd80b1687424fef0437f18de6a16124b1b/img/unstructured_logo.png
--------------------------------------------------------------------------------
/lib/libstdc++.so.6:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Unstructured-IO/pipeline-paddleocr/7142d2bd80b1687424fef0437f18de6a16124b1b/lib/libstdc++.so.6
--------------------------------------------------------------------------------
/logger_config.yaml:
--------------------------------------------------------------------------------
# Logging configuration in the `logging.config.dictConfig` schema.
# Three loggers are configured; all of them write to stderr and none
# of them propagates records to its parent logger.
version: 1
disable_existing_loggers: false
formatters:
  default_format:
    "()": uvicorn.logging.DefaultFormatter
    format: '%(asctime)s %(name)s %(levelname)s %(message)s'
  access:
    "()": uvicorn.logging.AccessFormatter
    format: '%(asctime)s %(client_addr)s %(request_line)s - %(status_code)s'
handlers:
  access_handler:
    formatter: access
    class: logging.StreamHandler
    stream: ext://sys.stderr
  standard_handler:
    formatter: default_format
    class: logging.StreamHandler
    stream: ext://sys.stderr
loggers:
  uvicorn.error:
    level: INFO
    # To disable logging for uvicorn.error, remove its handlers.
    handlers:
      - standard_handler
    propagate: false
  uvicorn.access:
    level: INFO
    # To disable logging for uvicorn.access, remove its handlers.
    handlers:
      - access_handler
    propagate: false
  unstructured:
    level: INFO
    handlers:
      - standard_handler
    propagate: false
37 |
38 |
--------------------------------------------------------------------------------
/pipeline-notebooks/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Unstructured-IO/pipeline-paddleocr/7142d2bd80b1687424fef0437f18de6a16124b1b/pipeline-notebooks/.gitkeep
--------------------------------------------------------------------------------
/pipeline-notebooks/pipeline-paddleocr.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "3931743a",
6 | "metadata": {},
7 | "source": [
8 | "# PaddleOCR Pipeline"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "id": "757bd7cd",
14 | "metadata": {},
15 | "source": [
16 | "## Section 1: Introduction\n",
17 | "\n",
18 | "The goal of this notebook is to setup a pipeline for PaddleOCR"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "id": "7db1e471",
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "import json\n",
29 | "import os\n",
30 | "\n",
31 | "def get_filename(directory, filename):\n",
32 | " cwd = os.getcwd()\n",
33 | " local_directory = os.path.join(os.path.split(cwd)[0], directory)\n",
34 | " ci_directory = os.path.join(cwd, directory)\n",
35 | "\n",
36 | " if os.path.exists(local_directory) and filename in os.listdir(local_directory):\n",
37 | " return os.path.join(local_directory, filename)\n",
38 | " elif os.path.exists(ci_directory) and filename in os.listdir(ci_directory):\n",
39 | " return os.path.join(ci_directory, filename)\n",
40 | " else:\n",
41 | " raise FileNotFoundError"
42 | ]
43 | },
44 | {
45 | "cell_type": "markdown",
46 | "id": "48daac01",
47 | "metadata": {},
48 | "source": [
49 | "## Show example image"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": null,
55 | "id": "18bbc559",
56 | "metadata": {},
57 | "outputs": [
58 | {
59 | "data": {
60 | "image/png": "\n",
61 | "text/plain": [
62 | ""
63 | ]
64 | },
65 | "execution_count": null,
66 | "metadata": {},
67 | "output_type": "execute_result"
68 | }
69 | ],
70 | "source": [
71 | "from PIL import Image\n",
72 | "\n",
73 | "filename = get_filename(\"img\", \"0.png\")\n",
74 | "\n",
75 | "Image.open(filename)"
76 | ]
77 | },
78 | {
79 | "cell_type": "markdown",
80 | "id": "e21660e2",
81 | "metadata": {},
82 | "source": [
83 | "## Section 2: Pipeline API"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": null,
89 | "id": "ef0b7cb5",
90 | "metadata": {},
91 | "outputs": [],
92 | "source": [
93 | "# pipeline-api\n",
94 | "from paddleocr import PaddleOCR\n",
95 | "\n",
96 | "import logging\n",
97 | "logging.disable()"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": null,
103 | "id": "7cb5e00b",
104 | "metadata": {},
105 | "outputs": [],
106 | "source": [
107 | "# pipeline-api\n",
108 | "from PIL import Image\n",
109 | "import numpy as np\n",
110 | "\n",
111 | "def pipeline_api(\n",
112 | " file,\n",
113 | " file_content_type=None,\n",
114 | " m_some_parameters=[],\n",
115 | "):\n",
116 | " ocr = PaddleOCR(lang=\"en\", use_gpu = False, show_log = False) \n",
117 | " result = ocr.ocr(img=np.array(Image.open(file)))\n",
118 | " \n",
119 | " result =[(p1[0],tuple((p1[1][0],round(p1[1][1],4)))) for p in result for p1 in p]\n",
120 | "\n",
121 | " return json.dumps({ \"result\" : result})\n"
122 | ]
123 | },
124 | {
125 | "cell_type": "code",
126 | "execution_count": null,
127 | "id": "0400f975",
128 | "metadata": {},
129 | "outputs": [
130 | {
131 | "name": "stdout",
132 | "output_type": "stream",
133 | "text": [
134 | "{'result': [[[[48.0, 24.0], [112.0, 24.0], [112.0, 35.0], [48.0, 35.0]], ['AK Transport', 0.9921]], [[[462.0, 26.0], [576.0, 26.0], [576.0, 50.0], [462.0, 50.0]], ['INVOICE', 0.998]], [[[46.0, 36.0], [126.0, 36.0], [126.0, 49.0], [46.0, 49.0]], ['352 Palmer Road', 0.981]], [[[47.0, 46.0], [76.0, 49.0], [75.0, 61.0], [46.0, 58.0]], ['Ware', 0.9973]], [[[47.0, 60.0], [116.0, 60.0], [116.0, 71.0], [47.0, 71.0]], ['MA, 1082, USA', 0.9995]], [[[520.0, 53.0], [573.0, 56.0], [572.0, 70.0], [519.0, 68.0]], ['#659950', 0.9963]], [[[437.0, 107.0], [466.0, 107.0], [466.0, 122.0], [437.0, 122.0]], ['Date:', 0.9989]], [[[522.0, 105.0], [569.0, 107.0], [568.0, 121.0], [521.0, 119.0]], ['4/11/2020', 1.0]], [[[47.0, 121.0], [79.0, 123.0], [78.0, 134.0], [46.0, 131.0]], ['Bill To:', 0.9944]], [[[45.0, 136.0], [156.0, 138.0], [156.0, 152.0], [45.0, 149.0]], ['Quadrant Lite Planning', 0.9752]], [[[392.0, 130.0], [464.0, 130.0], [464.0, 144.0], [392.0, 144.0]], ['Balance Due:', 0.9757]], [[[525.0, 131.0], [570.0, 131.0], [570.0, 145.0], [525.0, 145.0]], ['$198.30', 0.9956]], [[[46.0, 149.0], [141.0, 149.0], [141.0, 162.0], [46.0, 162.0]], ['3371 S Alabama Ave', 0.9998]], [[[46.0, 161.0], [99.0, 162.0], [99.0, 173.0], [46.0, 172.0]], ['Monroeville', 0.9992]], [[[47.0, 174.0], [119.0, 174.0], [119.0, 185.0], [47.0, 185.0]], ['AL, 36460, USA', 0.9779]], [[[42.0, 225.0], [65.0, 225.0], [65.0, 236.0], [42.0, 236.0]], ['Item', 0.9908]], [[[370.0, 223.0], [411.0, 223.0], [411.0, 237.0], [370.0, 237.0]], ['Quantity', 0.9994]], [[[471.0, 224.0], [494.0, 224.0], [494.0, 236.0], [471.0, 236.0]], ['Rate', 0.9999]], [[[532.0, 225.0], [570.0, 225.0], [570.0, 236.0], [532.0, 236.0]], ['Amount', 0.9999]], [[[43.0, 248.0], [198.0, 248.0], [198.0, 261.0], [43.0, 261.0]], ['Reviva Oatmeal Soap Bar 4.20 oz', 0.9934]], [[[370.0, 249.0], [379.0, 249.0], [379.0, 260.0], [370.0, 260.0]], ['3', 0.9993]], [[[461.0, 248.0], [494.0, 248.0], [494.0, 261.0], [461.0, 261.0]], ['$66.10', 0.9974]], [[[530.0, 
248.0], [571.0, 248.0], [571.0, 261.0], [530.0, 261.0]], ['$198.30', 0.997]], [[[438.0, 314.0], [466.0, 314.0], [466.0, 327.0], [438.0, 327.0]], ['Total:', 0.9996]], [[[530.0, 313.0], [570.0, 313.0], [570.0, 327.0], [530.0, 327.0]], ['$198.30', 0.9974]]]}\n"
135 | ]
136 | }
137 | ],
138 | "source": [
139 | "with open(filename, 'rb') as f:\n",
140 | " result = pipeline_api(f)\n",
141 | "print(json.loads(result))"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": null,
147 | "id": "fcb6a317",
148 | "metadata": {},
149 | "outputs": [],
150 | "source": []
151 | }
152 | ],
153 | "metadata": {
154 | "kernelspec": {
155 | "display_name": "python3",
156 | "language": "python",
157 | "name": "python3"
158 | }
159 | },
160 | "nbformat": 4,
161 | "nbformat_minor": 5
162 | }
163 |
--------------------------------------------------------------------------------
/prepline_paddleocr/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Unstructured-IO/pipeline-paddleocr/7142d2bd80b1687424fef0437f18de6a16124b1b/prepline_paddleocr/__init__.py
--------------------------------------------------------------------------------
/prepline_paddleocr/api/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Unstructured-IO/pipeline-paddleocr/7142d2bd80b1687424fef0437f18de6a16124b1b/prepline_paddleocr/api/__init__.py
--------------------------------------------------------------------------------
/prepline_paddleocr/api/app.py:
--------------------------------------------------------------------------------
1 | #####################################################################
2 | # THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS.
3 | # DO NOT MODIFY DIRECTLY
4 | #####################################################################
5 |
6 |
7 | from fastapi import FastAPI, Request, status
8 |
9 | from slowapi import Limiter, _rate_limit_exceeded_handler
10 | from slowapi.errors import RateLimitExceeded
11 | from slowapi.util import get_remote_address
12 |
13 | from .paddleocr import router as paddleocr_router
14 |
15 |
# Limiter built with slowapi's `get_remote_address` as its key function.
limiter = Limiter(key_func=get_remote_address)
app = FastAPI()
# Store the limiter on the application state and register slowapi's handler
# for RateLimitExceeded errors.
app.state.limiter = limiter
app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)

# Mount the routes exported by the `paddleocr` module on this application.
app.include_router(paddleocr_router)
22 |
23 |
@app.get("/healthcheck", status_code=status.HTTP_200_OK)
async def healthcheck(request: Request):
    """Return a fixed status payload with HTTP 200; `request` is not inspected."""
    return {"healthcheck": "HEALTHCHECK STATUS: EVERYTHING OK!"}
27 |
--------------------------------------------------------------------------------
/prepline_paddleocr/api/paddleocr.py:
--------------------------------------------------------------------------------
1 | #####################################################################
2 | # THIS FILE IS AUTOMATICALLY GENERATED BY UNSTRUCTURED API TOOLS.
3 | # DO NOT MODIFY DIRECTLY
4 | #####################################################################
5 |
6 | import os
7 | from typing import List, Union
8 |
9 | from fastapi import status, FastAPI, File, Form, Request, UploadFile, APIRouter
10 | from slowapi.errors import RateLimitExceeded
11 | from slowapi import Limiter, _rate_limit_exceeded_handler
12 | from slowapi.util import get_remote_address
13 | from fastapi.responses import PlainTextResponse
14 |
# Limiter built with slowapi's `get_remote_address` as its key function.
limiter = Limiter(key_func=get_remote_address)
# Module-level application: it carries the limiter, the RateLimitExceeded
# handler, the /healthcheck route and (at the bottom of the file) `router`.
app = FastAPI()
app.state.limiter = limiter
app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
# Router that collects the pipeline route(s) defined in this module.
router = APIRouter()

# Rate-limit string passed to `limiter.limit(...)` on the pipeline route.
# Taken from the PIPELINE_API_RATE_LIMIT environment variable, falling back
# to "1/second" when the variable is not set.
RATE_LIMIT = os.environ.get("PIPELINE_API_RATE_LIMIT", "1/second")
22 |
23 |
24 | # pipeline-api
25 | from paddleocr import PaddleOCR
26 |
27 | import logging
28 |
29 | logging.disable()
30 | from PIL import Image
31 | import numpy as np
32 |
33 |
def pipeline_api(
    file,
    file_content_type=None,
    m_some_parameters=[],
):
    """Run English-language PaddleOCR on one image and return the result as JSON.

    Args:
        file: Image source handed to ``PIL.Image.open`` (the API route passes
            the uploaded file object).
        file_content_type: Accepted for interface compatibility; not used.
        m_some_parameters: Accepted for interface compatibility; not used.
            The list default is never mutated inside this function.

    Returns:
        A JSON string of the form ``{"result": [[box, [text, score]], ...]}``
        where ``score`` is rounded to four decimal places. When nothing is
        detected the list is empty.
    """
    ocr = PaddleOCR(lang="en", use_gpu=False, show_log=False)
    pages = ocr.ocr(img=np.array(Image.open(file)))

    # NOTE(review): PaddleOCR appears to report "no text found" as None
    # (for the whole result or for a single page) in some releases - confirm
    # against the installed version. Treating None as empty keeps such images
    # from raising TypeError in the comprehension below.
    result = [
        (detection[0], (detection[1][0], round(detection[1][1], 4)))
        for page in (pages or [])
        for detection in (page or [])
    ]

    return json.dumps({"result": result})
47 |
48 |
49 | import json
50 | from fastapi.responses import StreamingResponse
51 | from starlette.types import Send
52 | from base64 import b64encode
53 | from typing import Optional, Mapping, Iterator, Tuple
54 | import secrets
55 |
56 |
class MultipartMixedResponse(StreamingResponse):
    """Streaming response that sends every body chunk as a ``multipart/mixed`` part.

    Each chunk from the body iterator is base64-encoded and framed with the
    boundary line plus per-part headers before it is sent.
    """

    CRLF = b"\r\n"

    def __init__(self, *args, content_type: str = None, **kwargs):
        """Forward all arguments to the parent and remember the per-part content type."""
        super().__init__(*args, **kwargs)
        self.content_type = content_type

    def init_headers(self, headers: Optional[Mapping[str, str]] = None) -> None:
        """Build the parent's headers, then add a multipart content-type with a fresh boundary."""
        super().init_headers(headers)
        # A new random 32-hex-character boundary token per response.
        self.boundary_value = secrets.token_hex(16)
        header_value = f'multipart/mixed; boundary="{self.boundary_value}"'
        self.raw_headers.append((b"content-type", header_value.encode("latin-1")))

    @property
    def boundary(self):
        """Boundary delimiter line (``--`` followed by the boundary token) as bytes."""
        return b"--" + self.boundary_value.encode()

    def _build_part_headers(self, headers: dict) -> bytes:
        """Serialize ``headers`` as ``Name: value`` lines, each terminated by CRLF."""
        return b"".join(
            f"{name}: {value}".encode() + self.CRLF for name, value in headers.items()
        )

    def build_part(self, chunk: bytes) -> bytes:
        """Frame an already-encoded ``chunk`` as one multipart part."""
        part_headers = {
            "Content-Length": len(chunk),
            "Content-Transfer-Encoding": "base64",
        }
        if self.content_type is not None:
            part_headers["Content-Type"] = self.content_type
        # Layout: boundary, headers, blank line, payload, trailing CRLF.
        return b"".join(
            (
                self.boundary,
                self.CRLF,
                self._build_part_headers(part_headers),
                self.CRLF,
                chunk,
                self.CRLF,
            )
        )

    async def stream_response(self, send: Send) -> None:
        """Send the response start message, one part per chunk, then an empty final body."""
        start_message = {
            "type": "http.response.start",
            "status": self.status_code,
            "headers": self.raw_headers,
        }
        await send(start_message)

        async for chunk in self.body_iterator:
            # Text chunks are encoded with the response charset before base64.
            raw = chunk if isinstance(chunk, bytes) else chunk.encode(self.charset)
            body_message = {
                "type": "http.response.body",
                "body": self.build_part(b64encode(raw)),
                "more_body": True,
            }
            await send(body_message)

        await send({"type": "http.response.body", "body": b"", "more_body": False})
113 |
114 |
@router.post("/paddleocr/v0.0.1/paddleocr")
@limiter.limit(RATE_LIMIT)
async def pipeline_1(
    request: Request,
    files: Union[List[UploadFile], None] = File(default=None),
    some_parameters: List[str] = Form(default=[]),
):
    """Run ``pipeline_api`` on the uploaded file(s).

    * No files: plain-text 400 response.
    * Exactly one file: the value returned by ``pipeline_api`` for that file.
    * Several files: a ``MultipartMixedResponse`` with one part per file,
      unless the ``Accept`` header is set to something other than ``*/*`` or
      ``multipart/mixed``, in which case a plain-text 406 is returned.
    """
    accept_header = request.headers.get("Accept")

    # Guard: the "files" form field must be a non-empty list.
    if not (isinstance(files, list) and len(files)):
        return PlainTextResponse(
            content='Request parameter "files" is required.\n',
            status_code=status.HTTP_400_BAD_REQUEST,
        )

    # Single upload: hand back the pipeline result directly.
    if len(files) == 1:
        upload = files[0]
        return pipeline_api(
            upload.file,
            m_some_parameters=some_parameters,
            file_content_type=upload.content_type,
        )

    # Several uploads are always answered as multipart/mixed, so the client
    # must be willing to accept that media type.
    if accept_header and accept_header not in ["*/*", "multipart/mixed"]:
        return PlainTextResponse(
            content=(
                f"Conflict in media type {accept_header}"
                ' with response type "multipart/mixed".\n'
            ),
            status_code=status.HTTP_406_NOT_ACCEPTABLE,
        )

    def response_generator():
        for upload in files:
            response = pipeline_api(
                upload.file,
                m_some_parameters=some_parameters,
                file_content_type=upload.content_type,
            )
            # Anything that is not already str/bytes is serialized to JSON.
            if type(response) not in [str, bytes]:
                response = json.dumps(response)
            yield response

    return MultipartMixedResponse(response_generator())
168 |
169 |
@app.get("/healthcheck", status_code=status.HTTP_200_OK)
async def healthcheck(request: Request):
    """Return a fixed status payload with HTTP 200; `request` is not inspected."""
    return {"healthcheck": "HEALTHCHECK STATUS: EVERYTHING OK!"}
173 |
174 |
175 | app.include_router(router)
176 |
--------------------------------------------------------------------------------
/preprocessing-pipeline-family.yaml:
--------------------------------------------------------------------------------
1 | name: paddleocr
2 | version: 0.0.1
3 |
--------------------------------------------------------------------------------
/requirements/base.in:
--------------------------------------------------------------------------------
1 | unstructured>=0.2.4
2 | unstructured-api-tools>=0.4.4
3 |
4 | opencv-python==4.5.5.64
5 | pip-tools>=6.11.0
6 | ipython>=8.7.0
7 | ratelimit
8 |
9 | paddlepaddle
10 | paddleocr
11 | werkzeug>=2.2.3
12 | future>=0.18.3
13 | jupyter-core>=4.11.2
14 | nbdev>=2.3.12
15 | #protobuf>=3.20.2
16 | #starlette>=0.25.0
17 | IPython>=8.10
18 | wheel>=0.38.1
19 | pytest>=7.2.0
20 |
--------------------------------------------------------------------------------
/requirements/base.txt:
--------------------------------------------------------------------------------
1 | #
2 | # This file is autogenerated by pip-compile with Python 3.9
3 | # by the following command:
4 | #
5 | # pip-compile requirements/base.in
6 | #
7 | anyio==3.6.1
8 | # via
9 | # starlette
10 | # watchfiles
11 | astor==0.8.1
12 | # via paddlepaddle
13 | asttokens==2.2.1
14 | # via
15 | # nbdev
16 | # stack-data
17 | astunparse==1.6.3
18 | # via nbdev
19 | attrdict==2.0.1
20 | # via paddleocr
21 | attrs==22.1.0
22 | # via
23 | # jsonschema
24 | # pytest
25 | babel==2.11.0
26 | # via flask-babel
27 | backcall==0.2.0
28 | # via ipython
29 | bce-python-sdk==0.8.74
30 | # via visualdl
31 | beautifulsoup4==4.11.1
32 | # via
33 | # nbconvert
34 | # paddleocr
35 | bleach==5.0.1
36 | # via nbconvert
37 | build==0.9.0
38 | # via pip-tools
39 | cachetools==5.2.0
40 | # via premailer
41 | certifi==2022.12.7
42 | # via requests
43 | charset-normalizer==2.1.1
44 | # via requests
45 | click==8.1.3
46 | # via
47 | # flask
48 | # nltk
49 | # pip-tools
50 | # unstructured-api-tools
51 | # uvicorn
52 | contourpy==1.0.6
53 | # via matplotlib
54 | cssselect==1.2.0
55 | # via premailer
56 | cssutils==2.6.0
57 | # via premailer
58 | cycler==0.11.0
59 | # via matplotlib
60 | cython==0.29.32
61 | # via paddleocr
62 | decorator==5.1.1
63 | # via
64 | # ipython
65 | # paddlepaddle
66 | defusedxml==0.7.1
67 | # via nbconvert
68 | dill==0.3.6
69 | # via multiprocess
70 | entrypoints==0.4
71 | # via jupyter-client
72 | et-xmlfile==1.1.0
73 | # via openpyxl
74 | exceptiongroup==1.1.0
75 | # via pytest
76 | execnb==0.1.5
77 | # via nbdev
78 | executing==1.2.0
79 | # via stack-data
80 | fastapi==0.85.0
81 | # via unstructured-api-tools
82 | fastcore==1.5.28
83 | # via
84 | # execnb
85 | # ghapi
86 | # nbdev
87 | fastjsonschema==2.16.2
88 | # via nbformat
89 | fire==0.4.0
90 | # via
91 | # paddleocr
92 | # pdf2docx
93 | flask==2.2.2
94 | # via
95 | # flask-babel
96 | # visualdl
97 | flask-babel==2.0.0
98 | # via visualdl
99 | fonttools==4.38.0
100 | # via
101 | # matplotlib
102 | # paddleocr
103 | # pdf2docx
104 | future==0.18.3
105 | # via
106 | # -r requirements/base.in
107 | # bce-python-sdk
108 | ghapi==1.0.3
109 | # via nbdev
110 | h11==0.13.0
111 | # via uvicorn
112 | httptools==0.5.0
113 | # via uvicorn
114 | idna==3.4
115 | # via
116 | # anyio
117 | # requests
118 | imageio==2.22.4
119 | # via
120 | # imgaug
121 | # scikit-image
122 | imgaug==0.4.0
123 | # via paddleocr
124 | importlib-metadata==5.0.0
125 | # via
126 | # flask
127 | # nbconvert
128 | iniconfig==2.0.0
129 | # via pytest
130 | ipython==8.10.0
131 | # via
132 | # -r requirements/base.in
133 | # execnb
134 | itsdangerous==2.1.2
135 | # via flask
136 | jedi==0.18.2
137 | # via ipython
138 | jinja2==3.1.2
139 | # via
140 | # flask
141 | # flask-babel
142 | # nbconvert
143 | # unstructured-api-tools
144 | joblib==1.2.0
145 | # via nltk
146 | jsonschema==4.16.0
147 | # via nbformat
148 | jupyter-client==7.3.5
149 | # via nbclient
150 | jupyter-core==5.2.0
151 | # via
152 | # -r requirements/base.in
153 | # jupyter-client
154 | # nbconvert
155 | # nbformat
156 | jupyterlab-pygments==0.2.2
157 | # via nbconvert
158 | kiwisolver==1.4.4
159 | # via matplotlib
160 | limits==1.6
161 | # via slowapi
162 | lmdb==1.4.0
163 | # via paddleocr
164 | lxml==4.9.1
165 | # via
166 | # nbconvert
167 | # paddleocr
168 | # premailer
169 | # python-docx
170 | # unstructured
171 | markupsafe==2.1.1
172 | # via
173 | # jinja2
174 | # nbconvert
175 | # werkzeug
176 | matplotlib==3.6.2
177 | # via
178 | # imgaug
179 | # visualdl
180 | matplotlib-inline==0.1.6
181 | # via ipython
182 | mistune==2.0.4
183 | # via nbconvert
184 | multiprocess==0.70.14
185 | # via visualdl
186 | mypy==0.991
187 | # via unstructured-api-tools
188 | mypy-extensions==0.4.3
189 | # via mypy
190 | nbclient==0.6.8
191 | # via nbconvert
192 | nbconvert==7.0.0
193 | # via unstructured-api-tools
194 | nbdev==2.3.12
195 | # via -r requirements/base.in
196 | nbformat==5.6.0
197 | # via
198 | # nbclient
199 | # nbconvert
200 | nest-asyncio==1.5.5
201 | # via
202 | # jupyter-client
203 | # nbclient
204 | networkx==2.8.8
205 | # via scikit-image
206 | nltk==3.7
207 | # via unstructured
208 | numpy==1.23.5
209 | # via
210 | # contourpy
211 | # imageio
212 | # imgaug
213 | # matplotlib
214 | # opencv-contrib-python
215 | # opencv-python
216 | # opt-einsum
217 | # paddleocr
218 | # paddlepaddle
219 | # pandas
220 | # pdf2docx
221 | # pywavelets
222 | # scikit-image
223 | # scipy
224 | # tifffile
225 | # visualdl
226 | opencv-contrib-python==4.6.0.66
227 | # via paddleocr
228 | opencv-python==4.5.5.64
229 | # via
230 | # -r requirements/base.in
231 | # imgaug
232 | # paddleocr
233 | # pdf2docx
234 | openpyxl==3.0.10
235 | # via paddleocr
236 | opt-einsum==3.3.0
237 | # via paddlepaddle
238 | packaging==21.3
239 | # via
240 | # build
241 | # fastcore
242 | # ghapi
243 | # matplotlib
244 | # nbconvert
245 | # pytest
246 | # scikit-image
247 | # visualdl
248 | paddle-bfloat==0.1.7
249 | # via paddlepaddle
250 | paddleocr==2.6.1.1
251 | # via -r requirements/base.in
252 | paddlepaddle==2.4.0
253 | # via -r requirements/base.in
254 | pandas==1.5.2
255 | # via visualdl
256 | pandocfilters==1.5.0
257 | # via nbconvert
258 | parso==0.8.3
259 | # via jedi
260 | pdf2docx==0.5.6
261 | # via paddleocr
262 | pep517==0.13.0
263 | # via build
264 | pexpect==4.8.0
265 | # via ipython
266 | pickleshare==0.7.5
267 | # via ipython
268 | pillow==9.3.0
269 | # via
270 | # imageio
271 | # imgaug
272 | # matplotlib
273 | # paddlepaddle
274 | # scikit-image
275 | # visualdl
276 | pip-tools==6.11.0
277 | # via -r requirements/base.in
278 | platformdirs==3.0.0
279 | # via jupyter-core
280 | pluggy==1.0.0
281 | # via pytest
282 | premailer==3.10.0
283 | # via paddleocr
284 | prompt-toolkit==3.0.36
285 | # via ipython
286 | protobuf==3.20.0
287 | # via
288 | # paddlepaddle
289 | # visualdl
290 | ptyprocess==0.7.0
291 | # via pexpect
292 | pure-eval==0.2.2
293 | # via stack-data
294 | pyclipper==1.3.0.post4
295 | # via paddleocr
296 | pycryptodome==3.16.0
297 | # via bce-python-sdk
298 | pydantic==1.10.2
299 | # via fastapi
300 | pygments==2.13.0
301 | # via
302 | # ipython
303 | # nbconvert
304 | pymupdf==1.20.2
305 | # via
306 | # paddleocr
307 | # pdf2docx
308 | pyparsing==3.0.9
309 | # via
310 | # matplotlib
311 | # packaging
312 | pyrsistent==0.18.1
313 | # via jsonschema
314 | pytest==7.2.1
315 | # via -r requirements/base.in
316 | python-dateutil==2.8.2
317 | # via
318 | # jupyter-client
319 | # matplotlib
320 | # pandas
321 | python-docx==0.8.11
322 | # via
323 | # paddleocr
324 | # pdf2docx
325 | python-dotenv==0.21.0
326 | # via uvicorn
327 | python-multipart==0.0.5
328 | # via unstructured-api-tools
329 | pytz==2022.6
330 | # via
331 | # babel
332 | # flask-babel
333 | # pandas
334 | pywavelets==1.4.1
335 | # via scikit-image
336 | pyyaml==6.0
337 | # via
338 | # nbdev
339 | # uvicorn
340 | pyzmq==24.0.1
341 | # via jupyter-client
342 | rapidfuzz==2.13.3
343 | # via paddleocr
344 | ratelimit==2.2.1
345 | # via -r requirements/base.in
346 | regex==2022.10.31
347 | # via nltk
348 | requests==2.28.1
349 | # via
350 | # paddlepaddle
351 | # premailer
352 | # visualdl
353 | scikit-image==0.19.3
354 | # via
355 | # imgaug
356 | # paddleocr
357 | scipy==1.9.3
358 | # via
359 | # imgaug
360 | # scikit-image
361 | shapely==1.8.5.post1
362 | # via
363 | # imgaug
364 | # paddleocr
365 | six==1.16.0
366 | # via
367 | # asttokens
368 | # astunparse
369 | # attrdict
370 | # bce-python-sdk
371 | # bleach
372 | # fire
373 | # imgaug
374 | # limits
375 | # paddlepaddle
376 | # python-dateutil
377 | # python-multipart
378 | # visualdl
379 | slowapi==0.1.6
380 | # via unstructured-api-tools
381 | sniffio==1.3.0
382 | # via anyio
383 | soupsieve==2.3.2.post1
384 | # via beautifulsoup4
385 | stack-data==0.6.2
386 | # via ipython
387 | starlette==0.20.4
388 | # via fastapi
389 | termcolor==2.1.1
390 | # via fire
391 | tifffile==2022.10.10
392 | # via scikit-image
393 | tinycss2==1.1.1
394 | # via nbconvert
395 | tomli==2.0.1
396 | # via
397 | # build
398 | # mypy
399 | # pep517
400 | # pytest
401 | tornado==6.2
402 | # via jupyter-client
403 | tqdm==4.64.1
404 | # via
405 | # nltk
406 | # paddleocr
407 | traitlets==5.4.0
408 | # via
409 | # ipython
410 | # jupyter-client
411 | # jupyter-core
412 | # matplotlib-inline
413 | # nbclient
414 | # nbconvert
415 | # nbformat
416 | types-requests==2.28.11
417 | # via unstructured-api-tools
418 | types-ujson==5.5.0
419 | # via unstructured-api-tools
420 | types-urllib3==1.26.24
421 | # via types-requests
422 | typing-extensions==4.3.0
423 | # via
424 | # mypy
425 | # pydantic
426 | # starlette
427 | unstructured==0.2.5
428 | # via -r requirements/base.in
429 | unstructured-api-tools==0.4.6
430 | # via -r requirements/base.in
431 | urllib3==1.26.13
432 | # via requests
433 | uvicorn[standard]==0.18.3
434 | # via unstructured-api-tools
435 | uvloop==0.17.0
436 | # via uvicorn
437 | visualdl==2.4.1
438 | # via paddleocr
439 | watchdog==2.2.1
440 | # via nbdev
441 | watchfiles==0.17.0
442 | # via uvicorn
443 | wcwidth==0.2.5
444 | # via prompt-toolkit
445 | webencodings==0.5.1
446 | # via
447 | # bleach
448 | # tinycss2
449 | websockets==10.3
450 | # via uvicorn
451 | werkzeug==2.2.3
452 | # via
453 | # -r requirements/base.in
454 | # flask
455 | wheel==0.38.4
456 | # via
457 | # -r requirements/base.in
458 | # astunparse
459 | # pip-tools
460 | zipp==3.10.0
461 | # via importlib-metadata
462 |
463 | # The following packages are considered to be unsafe in a requirements file:
464 | # pip
465 | # setuptools
466 |
--------------------------------------------------------------------------------
/requirements/dev.in:
--------------------------------------------------------------------------------
1 | black
2 | flake8
3 | jupyter
4 | mypy
5 | nbdev
6 | pip-tools
7 | # NOTE(crag): consistency with unstructured-api-tools. pinned for a reason, see there.
8 | ipython==8.7.0
9 |
--------------------------------------------------------------------------------
/requirements/dev.txt:
--------------------------------------------------------------------------------
1 | #
2 | # This file is autogenerated by pip-compile with Python 3.9
3 | # by the following command:
4 | #
5 | # pip-compile requirements/dev.in
6 | #
7 | argon2-cffi==21.3.0
8 | # via notebook
9 | argon2-cffi-bindings==21.2.0
10 | # via argon2-cffi
11 | asttokens==2.0.8
12 | # via
13 | # nbdev
14 | # stack-data
15 | astunparse==1.6.3
16 | # via nbdev
17 | attrs==22.1.0
18 | # via jsonschema
19 | backcall==0.2.0
20 | # via ipython
21 | beautifulsoup4==4.11.1
22 | # via nbconvert
23 | black==23.1.0
24 | # via -r requirements/dev.in
25 | bleach==5.0.1
26 | # via nbconvert
27 | build==0.8.0
28 | # via pip-tools
29 | cffi==1.15.1
30 | # via argon2-cffi-bindings
31 | click==8.1.3
32 | # via
33 | # black
34 | # pip-tools
35 | debugpy==1.6.3
36 | # via ipykernel
37 | decorator==5.1.1
38 | # via ipython
39 | defusedxml==0.7.1
40 | # via nbconvert
41 | entrypoints==0.4
42 | # via jupyter-client
43 | execnb==0.1.4
44 | # via nbdev
45 | executing==1.0.0
46 | # via stack-data
47 | fastcore==1.5.27
48 | # via
49 | # execnb
50 | # ghapi
51 | # nbdev
52 | fastjsonschema==2.16.2
53 | # via nbformat
54 | flake8==6.0.0
55 | # via -r requirements/dev.in
56 | ghapi==1.0.3
57 | # via nbdev
58 | importlib-metadata==6.0.0
59 | # via nbconvert
60 | ipykernel==6.15.3
61 | # via
62 | # ipywidgets
63 | # jupyter
64 | # jupyter-console
65 | # notebook
66 | # qtconsole
67 | ipython==8.7.0
68 | # via
69 | # -r requirements/dev.in
70 | # execnb
71 | # ipykernel
72 | # ipywidgets
73 | # jupyter-console
74 | ipython-genutils==0.2.0
75 | # via
76 | # notebook
77 | # qtconsole
78 | ipywidgets==8.0.2
79 | # via jupyter
80 | jedi==0.18.1
81 | # via ipython
82 | jinja2==3.1.2
83 | # via
84 | # nbconvert
85 | # notebook
86 | jsonschema==4.16.0
87 | # via nbformat
88 | jupyter==1.0.0
89 | # via -r requirements/dev.in
90 | jupyter-client==7.3.5
91 | # via
92 | # ipykernel
93 | # jupyter-console
94 | # nbclient
95 | # notebook
96 | # qtconsole
97 | jupyter-console==6.4.4
98 | # via jupyter
99 | jupyter-core==4.11.1
100 | # via
101 | # jupyter-client
102 | # nbconvert
103 | # nbformat
104 | # notebook
105 | # qtconsole
106 | jupyterlab-pygments==0.2.2
107 | # via nbconvert
108 | jupyterlab-widgets==3.0.3
109 | # via ipywidgets
110 | lxml==4.9.1
111 | # via nbconvert
112 | markupsafe==2.1.1
113 | # via
114 | # jinja2
115 | # nbconvert
116 | matplotlib-inline==0.1.6
117 | # via
118 | # ipykernel
119 | # ipython
120 | mccabe==0.7.0
121 | # via flake8
122 | mistune==2.0.4
123 | # via nbconvert
124 | mypy==0.991
125 | # via -r requirements/dev.in
126 | mypy-extensions==0.4.3
127 | # via
128 | # black
129 | # mypy
130 | nbclient==0.6.8
131 | # via nbconvert
132 | nbconvert==7.0.0
133 | # via
134 | # jupyter
135 | # notebook
136 | nbdev==2.3.11
137 | # via -r requirements/dev.in
138 | nbformat==5.6.0
139 | # via
140 | # nbclient
141 | # nbconvert
142 | # notebook
143 | nest-asyncio==1.5.5
144 | # via
145 | # ipykernel
146 | # jupyter-client
147 | # nbclient
148 | # notebook
149 | notebook==6.4.12
150 | # via jupyter
151 | packaging==23.0
152 | # via
153 | # black
154 | # build
155 | # fastcore
156 | # ghapi
157 | # ipykernel
158 | # nbconvert
159 | # qtpy
160 | pandocfilters==1.5.0
161 | # via nbconvert
162 | parso==0.8.3
163 | # via jedi
164 | pathspec==0.10.1
165 | # via black
166 | pep517==0.13.0
167 | # via build
168 | pexpect==4.8.0
169 | # via ipython
170 | pickleshare==0.7.5
171 | # via ipython
172 | pip-tools==6.11.0
173 | # via -r requirements/dev.in
174 | platformdirs==2.5.2
175 | # via black
176 | prometheus-client==0.14.1
177 | # via notebook
178 | prompt-toolkit==3.0.31
179 | # via
180 | # ipython
181 | # jupyter-console
182 | psutil==5.9.2
183 | # via ipykernel
184 | ptyprocess==0.7.0
185 | # via
186 | # pexpect
187 | # terminado
188 | pure-eval==0.2.2
189 | # via stack-data
190 | pycodestyle==2.10.0
191 | # via flake8
192 | pycparser==2.21
193 | # via cffi
194 | pyflakes==3.0.1
195 | # via flake8
196 | pygments==2.13.0
197 | # via
198 | # ipython
199 | # jupyter-console
200 | # nbconvert
201 | # qtconsole
202 | pyrsistent==0.18.1
203 | # via jsonschema
204 | python-dateutil==2.8.2
205 | # via jupyter-client
206 | pyyaml==6.0
207 | # via nbdev
208 | pyzmq==24.0.1
209 | # via
210 | # ipykernel
211 | # jupyter-client
212 | # notebook
213 | # qtconsole
214 | qtconsole==5.3.2
215 | # via jupyter
216 | qtpy==2.2.0
217 | # via qtconsole
218 | send2trash==1.8.0
219 | # via notebook
220 | six==1.16.0
221 | # via
222 | # asttokens
223 | # astunparse
224 | # bleach
225 | # python-dateutil
226 | soupsieve==2.3.2.post1
227 | # via beautifulsoup4
228 | stack-data==0.5.0
229 | # via ipython
230 | terminado==0.15.0
231 | # via notebook
232 | tinycss2==1.1.1
233 | # via nbconvert
234 | tomli==2.0.1
235 | # via
236 | # black
237 | # build
238 | # mypy
239 | # pep517
240 | tornado==6.2
241 | # via
242 | # ipykernel
243 | # jupyter-client
244 | # notebook
245 | # terminado
246 | traitlets==5.4.0
247 | # via
248 | # ipykernel
249 | # ipython
250 | # ipywidgets
251 | # jupyter-client
252 | # jupyter-core
253 | # matplotlib-inline
254 | # nbclient
255 | # nbconvert
256 | # nbformat
257 | # notebook
258 | # qtconsole
259 | typing-extensions==4.3.0
260 | # via
261 | # black
262 | # mypy
263 | watchdog==2.1.9
264 | # via nbdev
265 | wcwidth==0.2.5
266 | # via prompt-toolkit
267 | webencodings==0.5.1
268 | # via
269 | # bleach
270 | # tinycss2
271 | wheel==0.37.1
272 | # via
273 | # astunparse
274 | # pip-tools
275 | widgetsnbextension==4.0.3
276 | # via ipywidgets
277 | zipp==3.12.1
278 | # via importlib-metadata
279 |
280 | # The following packages are considered to be unsafe in a requirements file:
281 | # pip
282 | # setuptools
283 |
--------------------------------------------------------------------------------
/requirements/test.in:
--------------------------------------------------------------------------------
1 | black
2 | # NOTE(mrobinson) - Pinning click due to a unicode issue in black
3 | # can remove after black drops support for Python 3.6
4 | # ref: https://github.com/psf/black/issues/2964
5 | click==8.1.3
6 | flake8
7 | mypy
8 | pytest-cov
9 |
--------------------------------------------------------------------------------
/requirements/test.txt:
--------------------------------------------------------------------------------
1 | #
2 | # This file is autogenerated by pip-compile with Python 3.9
3 | # by the following command:
4 | #
5 | # pip-compile requirements/test.in
6 | #
7 | attrs==22.1.0
8 | # via pytest
9 | black==23.1.0
10 | # via -r requirements/test.in
11 | click==8.1.3
12 | # via
13 | # -r requirements/test.in
14 | # black
15 | coverage[toml]==6.4.4
16 | # via pytest-cov
17 | flake8==6.0.0
18 | # via -r requirements/test.in
19 | iniconfig==1.1.1
20 | # via pytest
21 | mccabe==0.7.0
22 | # via flake8
23 | mypy==0.991
24 | # via -r requirements/test.in
25 | mypy-extensions==0.4.3
26 | # via
27 | # black
28 | # mypy
29 | packaging==23.0
30 | # via
31 | # black
32 | # pytest
33 | pathspec==0.10.1
34 | # via black
35 | platformdirs==2.5.2
36 | # via black
37 | pluggy==1.0.0
38 | # via pytest
39 | py==1.11.0
40 | # via pytest
41 | pycodestyle==2.10.0
42 | # via flake8
43 | pyflakes==3.0.1
44 | # via flake8
45 | pytest==7.1.3
46 | # via pytest-cov
47 | pytest-cov==4.0.0
48 | # via -r requirements/test.in
49 | tomli==2.0.1
50 | # via
51 | # black
52 | # coverage
53 | # mypy
54 | # pytest
55 | typing-extensions==4.3.0
56 | # via
57 | # black
58 | # mypy
59 |
--------------------------------------------------------------------------------
/sample-docs/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Unstructured-IO/pipeline-paddleocr/7142d2bd80b1687424fef0437f18de6a16124b1b/sample-docs/.gitkeep
--------------------------------------------------------------------------------
/sample-docs/sample-receipt.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Unstructured-IO/pipeline-paddleocr/7142d2bd80b1687424fef0437f18de6a16124b1b/sample-docs/sample-receipt.jpg
--------------------------------------------------------------------------------
/scripts/check-and-format-notebooks.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | import argparse
4 | from copy import deepcopy
5 | import difflib
6 | import json
7 | from pathlib import Path
8 | import sys
9 | from typing import List, Tuple, Union
10 |
11 | from nbdev import clean
12 | from nbconvert.preprocessors import ExecutePreprocessor
13 | import nbformat
14 | from unstructured_api_tools.pipelines.convert import read_notebook
15 |
16 |
def process_nb(nb: nbformat.NotebookNode, working_dir: Union[str, Path]) -> nbformat.NotebookNode:
    """Re-execute nb from a clean slate, mutating it in memory, and return it.

    Existing code-cell outputs are discarded, the notebook is run with working_dir as the
    execution path, and adjacent stream outputs of each cell are merged afterwards.
    """
    # Wipe stale outputs so the result only reflects this execution
    for code_cell in (c for c in nb.cells if c.cell_type == "code"):
        code_cell.outputs = []

    executor = ExecutePreprocessor(timeout=600)
    resources = {"metadata": {"path": working_dir}}
    executor.preprocess(nb, resources)

    # Stream output may be split into arbitrary chunks; normalize it cell by cell
    for executed_cell in nb.cells:
        merge_adjacent_text_outputs(executed_cell)
    return nb
30 |
31 |
def merge_adjacent_text_outputs(cell: nbformat.NotebookNode) -> nbformat.NotebookNode:
    """Collapse consecutive stream outputs that share a name into a single output.

    Non-code cells are returned untouched. For code cells, cell.outputs is replaced by a new
    list in which each run of adjacent same-named "stream" outputs has been concatenated into
    the first output of that run. The (mutated) cell is returned.
    """
    if cell.cell_type != "code":
        return cell

    merged = []
    pending = None  # stream output that is still accumulating text

    for out in cell.outputs:
        is_stream = out.output_type == "stream"

        # Same stream as the one being accumulated: just extend its text
        if is_stream and pending is not None and pending.name == out.name:
            pending.text += out.text
            continue

        # Anything else ends the current run, so flush it first
        if pending is not None:
            merged.append(pending)

        if is_stream:
            pending = out
        else:
            pending = None
            merged.append(out)

    if pending is not None:
        merged.append(pending)

    cell.outputs = merged
    return cell
60 |
61 |
def nb_paths(root_path: Union[str, Path]) -> List[Path]:
    """Collect the .ipynb files found directly inside the notebook folders of root_path.

    A notebook folder is an immediate subdirectory of root_path whose stem contains
    'notebooks' (for example pipeline-notebooks or exploration-notebooks). The search does
    not recurse below those folders.
    """
    found: List[Path] = []
    for candidate in Path(root_path).iterdir():
        # Skip anything that is not a '*notebooks*' directory
        if "notebooks" not in candidate.stem or not candidate.is_dir():
            continue
        found.extend(entry for entry in candidate.iterdir() if entry.suffix == ".ipynb")
    return found
75 |
76 |
def to_results_str(
    fns: List[Path], nonmatching_nbs: List[Path], check: Union[bool, None] = None
) -> Tuple[str, str]:
    """Build the human-readable report for a formatting (or checking) run.

    Args:
        fns: every notebook that was examined.
        nonmatching_nbs: the notebooks that differ after being executed and cleaned. Items may
            be paths or strings; they are converted with str() when listed.
        check: True to phrase the report for a dry run ("would be changed"), False for a real
            run ("changed"). When omitted, the module-level ``check`` flag set by the script's
            entry point is used if it exists, otherwise False.

    Returns:
        A (summary, details) tuple. summary is a one-line count of changed/unchanged files
        terminated by a newline. details lists the changed notebooks, or is "" if there are none.
    """
    if check is None:
        # Backward compatibility: callers that predate the parameter rely on the global flag
        check = bool(globals().get("check", False))

    unchanged = len(fns) - len(nonmatching_nbs)
    results = []
    if nonmatching_nbs:
        results.append(
            f"{len(nonmatching_nbs)} "
            f"{'file' if len(nonmatching_nbs) == 1 else 'files'} "
            f"{'would be ' if check else ''}changed"
        )
    if unchanged:
        results.append(
            f"{unchanged} "
            f"{'file' if unchanged == 1 else 'files'} "
            f"{'would be ' if check else ''}left unchanged"
        )
    summary_str = ", ".join(results) + ".\n"
    if nonmatching_nbs:
        # str() so that Path entries (as the annotation allows) can be joined
        listing = "\n* ".join(str(nb_path) for nb_path in nonmatching_nbs)
        details_str = (
            f"The following notebooks {'would have been' if check else 'were'} "
            "changed when executed and cleaned:\n* " + listing + "\n"
        )
    else:
        details_str = ""

    return summary_str, details_str
104 |
105 |
# Script entry point: execute and clean each notebook, report which ones change, and either
# rewrite them in place or (with --check) only report and exit non-zero.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--check",
        default=False,
        action="store_true",
        help="Check notebook format without making changes. Return code 0 means formatting would "
        "produce no changes. Return code 1 means some files would be changed.",
    )
    parser.add_argument(
        "notebooks",
        metavar="notebook",
        nargs="*",
        help="Path(s) to notebook(s) to format (or check). If you don't pass any paths, "
        "notebooks in any subfolders with 'notebooks' in the name will be processed.",
        default=[],
    )
    args = parser.parse_args()
    # NOTE: `check` is a module-level name here; to_results_str reads it when building messages
    check = args.check
    notebooks = args.notebooks

    # Repository root: this script lives one directory below it
    root_path = Path(__file__).parent.parent
    nonmatching_nbs = []
    # Explicit paths win; otherwise discover notebooks under the '*notebooks*' folders
    fns = notebooks if notebooks else nb_paths(root_path)
    for fn in fns:
        print(f"{'checking' if check else 'processing'} {fn}")
        nb = read_notebook(fn)
        # Work on a copy so the on-disk version stays available for comparison
        modified_nb = deepcopy(nb)
        process_nb(modified_nb, root_path)
        clean.clean_nb(modified_nb, allowed_cell_metadata_keys=["tags"])
        if nb != modified_nb:
            nonmatching_nbs.append(str(fn))
            # Serialize both versions with sorted keys so the diff is stable
            nb_json = json.dumps(nb.dict(), indent=2, sort_keys=True)
            modified_nb_json = json.dumps(modified_nb.dict(), indent=2, sort_keys=True)
            sys.stderr.write(f"The following diff shows the modifications made to {fn}\n")
            sys.stderr.writelines(
                (
                    difflib.unified_diff(
                        nb_json.splitlines(keepends=True),
                        modified_nb_json.splitlines(keepends=True),
                    )
                )
            )
            # Only a real (non --check) run rewrites the notebook
            if not check:
                nbformat.write(modified_nb, fn)

    summary_str, details_str = to_results_str(fns, nonmatching_nbs)
    print(summary_str)
    if check:
        # In check mode the list of offending notebooks goes to stderr and fails the run
        sys.stderr.write(details_str)
        if nonmatching_nbs:
            sys.exit(1)
    else:
        print(details_str)
160 |
--------------------------------------------------------------------------------
/scripts/docker-build.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
set -euo pipefail

# Build the pipeline's dev image for linux/amd64 and load it into the local image store.
# PIP_VERSION, PIPELINE_PACKAGE and PIPELINE_FAMILY must be set in the environment;
# `set -u` aborts the script if any of them is missing.
build_args=(
  --load
  --platform=linux/amd64
  -f Dockerfile
  --build-arg "PIP_VERSION=${PIP_VERSION}"
  --build-arg "PIPELINE_PACKAGE=${PIPELINE_PACKAGE}"
  --progress plain
  -t "pipeline-family-${PIPELINE_FAMILY}-dev:latest"
)

DOCKER_BUILDKIT=1 docker buildx build "${build_args[@]}" .
10 |
--------------------------------------------------------------------------------
/scripts/shellcheck.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
# Lint every shell script under scripts/ with a single batched shellcheck invocation.
find scripts -name '*.sh' -exec shellcheck '{}' +
4 |
5 |
--------------------------------------------------------------------------------
/scripts/test-doc-pipeline-apis-consistent.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
set -euo pipefail

# Always operate from the repository root, regardless of where the script was invoked
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd)"
cd "${SCRIPT_DIR}/.."

# Scratch package that receives the freshly generated API files, plus the name of the
# marker file used to signal a mismatch out of the diff loop
PIPELINE_OUTPUT_DIR="tmp-api-check-output-${RANDOM}"
FILE_INDICTATING_FAILURE="${PIPELINE_OUTPUT_DIR}-has-failures"
mkdir -p "${PIPELINE_OUTPUT_DIR}"
touch "${PIPELINE_OUTPUT_DIR}/__init__.py"
12 |
# Remove the failure marker, delete the scratch output directory on success only
# (it is kept on failure), then exit the script with the status passed as $1.
function tmp_pipeline_comp_cleanup () {
  local exit_status="$1"

  cd "${SCRIPT_DIR}/.."
  rm -f "${FILE_INDICTATING_FAILURE}"
  if [[ "${exit_status}" -eq 0 ]]; then
    rm -rf "${PIPELINE_OUTPUT_DIR}"
  fi
  exit "${exit_status}"
}
21 |
# Regenerate the API modules from the pipeline notebooks into the scratch directory
unstructured_api_tools convert-pipeline-notebooks \
  --input-directory ./pipeline-notebooks \
  --output-directory "${PIPELINE_OUTPUT_DIR}"

NUM_PIPELINE_API_FILES_GENERATED="$(find "${PIPELINE_OUTPUT_DIR}" -name '*.py' | wc -l)"

if [[ "${NUM_PIPELINE_API_FILES_GENERATED}" -eq 0 ]]; then
  echo "No pipelines where created by unstructured_api_tools convert-pipeline-notebooks"
  tmp_pipeline_comp_cleanup 1
fi

NUM_EXISTING_PIPELINE_API_FILES="$(find "${PACKAGE_NAME}/api" -name '*.py' | wc -l)"

# The generated and the committed trees must contain the same number of modules
if [[ "${NUM_PIPELINE_API_FILES_GENERATED}" -gt "${NUM_EXISTING_PIPELINE_API_FILES}" ]]; then
  echo "More pipeline api files were autogenerated than appear in the ${PACKAGE_NAME}/api"
  tmp_pipeline_comp_cleanup 1
elif [[ "${NUM_PIPELINE_API_FILES_GENERATED}" -lt "${NUM_EXISTING_PIPELINE_API_FILES}" ]]; then
  echo "Fewer pipeline api files were autogenerated than appear in the ${PACKAGE_NAME}/api"
  tmp_pipeline_comp_cleanup 1
fi

# Diff each committed module against its generated counterpart. The loop body runs in a
# pipeline subshell, so a mismatch is reported through a marker file rather than a variable.
cd "${PACKAGE_NAME}/api"
find . -name '*.py' -print0 | while IFS= read -r -d '' pipeline_file; do
  set +o pipefail
  diff -u "${pipeline_file}" "../../${PIPELINE_OUTPUT_DIR}/${pipeline_file}" \
    || touch "../../${FILE_INDICTATING_FAILURE}"
  set -o pipefail
done
cd -

if [[ -r "${FILE_INDICTATING_FAILURE}" ]]; then
  echo
  echo "Autogenerated pipeline api file(s) do not match existing versions, see above for diff's"
  echo " or run: diff -ru ${PACKAGE_NAME}/api/ ${PIPELINE_OUTPUT_DIR}/"
  tmp_pipeline_comp_cleanup 1
fi
tmp_pipeline_comp_cleanup 0
60 |
--------------------------------------------------------------------------------
/scripts/version-sync.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
# Print the command-line help to stdout.
# The synopsis lists every supported option, including -s, which the option list documents.
function usage {
  echo "Usage: $(basename "$0") [-c] [-s SOURCE_FILE] -f FILE_TO_CHANGE REPLACEMENT_FORMAT [-f FILE_TO_CHANGE REPLACEMENT_FORMAT ...]"
  echo 'Synchronize files to latest version in source file'
  echo ' -s Specifies source file for version (default is CHANGELOG.md)'
  echo ' -f Specifies a file to change and the format for searching and replacing versions'
  echo ' FILE_TO_CHANGE is the file to be updated/checked for updates'
  echo ' REPLACEMENT_FORMAT is one of (semver, release, api-release)'
  echo ' semver indicates to look for a full semver version and replace with the latest full version'
  echo ' release indicates to look for a release semver version (x.x.x) and replace with the latest release version'
  echo ' api-release indicates to look for a release semver version in the context of an api route and replace with the latest release version'
  echo ' -c Compare versions and output proposed changes without changing anything.'
}
14 |
# Extend getopts so that one option can take several arguments.
# Call with "$@" right after getopts has consumed an option and its first argument: every
# following word that does not start with '-' is appended to the OPTARG array (index 1, 2, ...)
# and OPTIND is advanced past it, so the next getopts call resumes at the next option.
function getopts-extra () {
  # Integer attribute so the index really counts 1, 2, 3 (a plain `i+=1` appends the
  # character "1" and yields 1, 11, 111)
  declare -i i=1
  # if the next argument is not an option, then append it to array OPTARG
  while [[ ${OPTIND} -le $# && ${!OPTIND:0:1} != '-' ]]; do
    OPTARG[i]=${!OPTIND}
    i=$((i + 1))
    # Explicit arithmetic: does not depend on OPTIND carrying the integer attribute
    OPTIND=$((OPTIND + 1))
  done
}
24 |
# Parse input options
#   -h        print usage and exit
#   -c        check mode: report differences instead of rewriting files
#   -s FILE   file the latest version is read from
#   -f FILE FORMAT   (repeatable) file to update and the version format to look for
declare CHECK=0
declare SOURCE_FILE="CHANGELOG.md"
declare -a FILES_TO_CHECK=()
declare -a REPLACEMENT_FORMATS=()
declare args
declare OPTIND OPTARG opt
# The leading ':' puts getopts in silent mode: errors are reported through $opt
# ('?' for an unknown option, ':' for a missing argument) instead of being printed.
while getopts ":hcs:f:" opt; do
  case $opt in
    h)
      usage
      exit 0
      ;;
    c)
      CHECK=1
      ;;
    s)
      SOURCE_FILE="$OPTARG"
      ;;
    f)
      # getopts only captured the first word after -f; pull in the following one(s)
      getopts-extra "$@"
      args=( "${OPTARG[@]}" )
      # validate length of args, should be 2
      if [ ${#args[@]} -eq 2 ]; then
        FILES_TO_CHECK+=( "${args[0]}" )
        REPLACEMENT_FORMATS+=( "${args[1]}" )
      else
        echo "Exactly 2 arguments must follow -f option." >&2
        exit 1
      fi
      ;;
    :)
      # -s or -f given as the last word: fail loudly instead of silently ignoring it
      echo "Option -$OPTARG requires an argument." >&2
      usage
      exit 1
      ;;
    \?)
      echo "Invalid option: -$OPTARG." >&2
      usage
      exit 1
      ;;
  esac
done
63 |
# Version patterns (extended regex): full semver with optional pre-release/build metadata,
# a bare x.y.z release, and a release prefixed with "v" as used in API routes.
RE_SEMVER_FULL="(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)(-((0|[1-9][0-9]*|[0-9]*[a-zA-Z-][0-9a-zA-Z-]*)(\.(0|[1-9][0-9]*|[0-9]*[a-zA-Z-][0-9a-zA-Z-]*))*))?(\+([0-9a-zA-Z-]+(\.[0-9a-zA-Z-]+)*))?"
RE_RELEASE="(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)\.(0|[1-9][0-9]*)"
RE_API_RELEASE="v${RE_RELEASE}"

# Take the first match in SOURCE_FILE for each flavour of version.
LAST_VERSION="$(grep -o -m 1 -E "${RE_SEMVER_FULL}" "${SOURCE_FILE}")"
LAST_RELEASE="$(grep -o -m 1 -E "${RE_RELEASE}($|[^-+])" "${SOURCE_FILE}" | grep -o -m 1 -E "${RE_RELEASE}")"
LAST_API_RELEASE="v$(grep -o -m 1 -E "${RE_RELEASE}($|[^-+])$" "${SOURCE_FILE}" | grep -o -m 1 -E "${RE_RELEASE}")"

# Translate every requested format into the pattern to search for and the version to
# substitute; both arrays stay index-aligned with FILES_TO_CHECK.
declare -a RE_SEMVERS=()
declare -a UPDATED_VERSIONS=()
for REPLACEMENT_FORMAT in "${REPLACEMENT_FORMATS[@]}"; do
  case "${REPLACEMENT_FORMAT}" in
    semver)
      RE_SEMVERS+=( "${RE_SEMVER_FULL}" )
      UPDATED_VERSIONS+=( "${LAST_VERSION}" )
      ;;
    release)
      RE_SEMVERS+=( "${RE_RELEASE}" )
      UPDATED_VERSIONS+=( "${LAST_RELEASE}" )
      ;;
    api-release)
      RE_SEMVERS+=( "${RE_API_RELEASE}" )
      UPDATED_VERSIONS+=( "${LAST_API_RELEASE}" )
      ;;
    *)
      echo "Invalid replacement format: \"${REPLACEMENT_FORMAT}\". Use semver, release, or api-release" >&2
      exit 1
      ;;
  esac
done

# Without any semver in SOURCE_FILE there is nothing to synchronize to.
if [[ -z "${LAST_VERSION}" ]]; then
  printf "Error: Unable to find latest version from %s.\n" "${SOURCE_FILE}"
  exit 1
fi
102 |
# Search files in FILES_TO_CHECK and change (or get diffs)
# For each file: find its current version, substitute the latest one into a temporary copy,
# then either report the diff (check mode, sets FAILED_CHECK) or overwrite the file.
declare FAILED_CHECK=0

for i in "${!FILES_TO_CHECK[@]}"; do
  FILE_TO_CHANGE=${FILES_TO_CHECK[$i]}
  RE_SEMVER=${RE_SEMVERS[$i]}
  UPDATED_VERSION=${UPDATED_VERSIONS[$i]}
  FILE_VERSION=$(grep -o -m 1 -E "${RE_SEMVER}" "$FILE_TO_CHANGE")
  if [ -z "$FILE_VERSION" ];
  then
    # No match to semver regex in FILE_TO_CHANGE, so nothing to replace
    printf "Error: No semver version found in file %s.\n" "$FILE_TO_CHANGE"
    exit 1
  fi

  # Check sed version, exit if version < 4.3.
  # Done before the temporary file is created so that this exit path leaves nothing behind.
  if ! sed --version > /dev/null 2>&1; then
    # A sed that rejects --version is treated as too old
    CURRENT_VERSION=1.archaic
  else
    CURRENT_VERSION=$(sed --version | head -n1 | cut -d" " -f4)
  fi
  REQUIRED_VERSION="4.3"
  # The smaller of the two versions must be the required one
  if [ "$(printf '%s\n' "$REQUIRED_VERSION" "$CURRENT_VERSION" | sort -V | head -n1)" != "$REQUIRED_VERSION" ]; then
    echo "sed version must be >= ${REQUIRED_VERSION}" && exit 1
  fi

  # Replace semver in FILE_TO_CHANGE with semver obtained from SOURCE_FILE
  TMPFILE=$(mktemp /tmp/new_version.XXXXXX)
  sed -E "s/$RE_SEMVER/$UPDATED_VERSION/" "$FILE_TO_CHANGE" > "$TMPFILE"
  if [ $CHECK == 1 ];
  then
    DIFF=$(diff "$FILE_TO_CHANGE" "$TMPFILE" )
    if [ -z "$DIFF" ];
    then
      printf "version sync would make no changes to %s.\n" "$FILE_TO_CHANGE"
    else
      FAILED_CHECK=1
      printf "version sync would make the following changes to %s:\n%s\n" "$FILE_TO_CHANGE" "$DIFF"
    fi
  else
    cp "$TMPFILE" "$FILE_TO_CHANGE"
  fi
  # The temporary copy is no longer needed on any path
  rm -f "$TMPFILE"
done
148 |
# Report failure (1) when check mode found pending changes, success (0) otherwise.
[ "${FAILED_CHECK}" -ne 0 ] && exit 1
exit 0
155 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 100
3 | exclude =
4 | prepline_*/api
5 |
--------------------------------------------------------------------------------
/test_paddleocr/api/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Unstructured-IO/pipeline-paddleocr/7142d2bd80b1687424fef0437f18de6a16124b1b/test_paddleocr/api/.gitkeep
--------------------------------------------------------------------------------
/test_paddleocr/api/test_paddleocr.py:
--------------------------------------------------------------------------------
1 | from fastapi.testclient import TestClient
2 |
3 |
4 | from prepline_paddleocr.api.app import app
5 |
6 |
def test_api_health_check():
    """GET /healthcheck answers with HTTP 200."""
    response = TestClient(app).get("/healthcheck")

    assert response.status_code == 200
12 |
13 |
def test_api_call():
    """Posting a single image to the paddleocr route answers with HTTP 200."""
    client = TestClient(app)
    with open("img/0.png", "rb") as image:
        upload = {"files": ("filename", image, "image/jpeg")}
        response = client.post("/paddleocr/v0.0.1/paddleocr", files=upload)

    assert response.status_code == 200
22 |
23 |
def test_api_call_files():
    """Posting two images in one request to the paddleocr route answers with HTTP 200."""
    client = TestClient(app)

    # Context managers guarantee both handles are closed even if the request raises
    with open("img/0.png", "rb") as first, open("img/0.png", "rb") as second:
        files = [("files", first), ("files", second)]

        response = client.post("/paddleocr/v0.0.1/paddleocr", files=files)

    assert response.status_code == 200
32 |
--------------------------------------------------------------------------------