├── .github
    ├── ISSUE_TEMPLATE
    │   ├── bug.md
    │   └── config.yml
    └── workflows
    │   └── gen_whl_to_pypi.yml
├── .gitignore
├── .pre-commit-config.yaml
├── LICENSE
├── README.md
├── cliff.toml
├── demo.py
├── docs
    └── docs.md
├── rapidocr_pdf
    ├── __init__.py
    ├── main.py
    └── utils
    │   ├── __init__.py
    │   ├── logger.py
    │   └── utils.py
├── requirements.txt
├── setup.py
└── tests
    ├── test_files
        ├── direct_and_image.pdf
        ├── direct_extract.pdf
        └── image.pdf
    └── test_main.py


/.github/ISSUE_TEMPLATE/bug.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: 🐞 Bug
3 | about: Bug
4 | title: 'Bug'
5 | labels: 'Bug'
6 | assignees: ''
7 | 
8 | ---
9 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | blank_issues_enabled: false
2 | contact_links:
3 |   - name: ❓ Questions
4 |     url: https://github.com/RapidAI/RapidOCRPDF/discussions/categories/q-a
5 |     about: Please use the community forum for help and questions regarding RapidOCRPDF.
6 |   - name: 💡 Ideas
7 |     url: https://github.com/RapidAI/RapidOCRPDF/discussions/categories/ideas
8 |     about: Please vote for and post new feature ideas in the community forum.


--------------------------------------------------------------------------------
/.github/workflows/gen_whl_to_pypi.yml:
--------------------------------------------------------------------------------
 1 | # Release workflow: on every pushed tag matching v*, run the unit tests and,
 2 | # if they pass, build a wheel and upload it to PyPI.
 3 | name: Push rapidocr_pdf to pypi
 4 | 
 5 | on:
 6 |   push:
 7 |     tags:
 8 |       - 'v*'
 9 | 
10 | jobs:
11 |   UnitTesting:
12 |     runs-on: ubuntu-latest
13 |     steps:
14 |       - name: Pull latest code
15 |         uses: actions/checkout@v3
16 | 
17 |       - name: Set up Python 3.10
18 |         uses: actions/setup-python@v4
19 |         with:
20 |           # Keep the quotes: an unquoted 3.10 is parsed as the float 3.1.
21 |           python-version: '3.10'
22 |           architecture: 'x64'
23 | 
24 |       - name: Display Python version
25 |         run: python -c "import sys; print(sys.version)"
26 | 
27 |       - name: Unit testings with rapidocr
28 |         run: |
29 |           pip install -r requirements.txt
30 |           pip install pytest
31 |           pytest tests/test*.py
32 | 
33 |   GenerateWHL_PushPyPi:
34 |     # Runs only after the UnitTesting job has succeeded.
35 |     needs: UnitTesting
36 |     runs-on: ubuntu-latest
37 | 
38 |     steps:
39 |       - uses: actions/checkout@v3
40 | 
41 |       - name: Set up Python 3.10
42 |         uses: actions/setup-python@v4
43 |         with:
44 |           python-version: '3.10'
45 |           architecture: 'x64'
46 | 
47 |       - name: Run setup.py
48 |         env:
49 |           # Pass the tag through the environment instead of expanding it
50 |           # inside the script, so its text is never interpreted by the shell.
51 |           RELEASE_TAG: ${{ github.ref_name }}
52 |         run: |
53 |           python -m pip install --upgrade pip
54 |           pip install -r requirements.txt
55 |           pip install wheel get_pypi_latest_version
56 |           python setup.py bdist_wheel "$RELEASE_TAG"
57 | 
58 |       - name: Publish distribution 📦 to PyPI
59 |         uses: pypa/gh-action-pypi-publish@v1.5.0
60 |         with:
61 |           password: ${{ secrets.PYPI_API_TOKEN }}
62 |           packages_dir: dist/
55 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | *.pth
  2 | 
  3 | # Created by .ignore support plugin (hsz.mobi)
  4 | ### Python template
  5 | # Byte-compiled / optimized / DLL files
  6 | __pycache__/
  7 | *.py[cod]
  8 | *$py.class
  9 | .pytest_cache
 10 | 
 11 | # C extensions
 12 | *.so
 13 | 
 14 | # Distribution / packaging
 15 | .Python
 16 | build/
 17 | develop-eggs/
 18 | dist/
 19 | downloads/
 20 | eggs/
 21 | .eggs/
 22 | lib/
 23 | lib64/
 24 | parts/
 25 | sdist/
 26 | var/
 27 | wheels/
 28 | pip-wheel-metadata/
 29 | share/python-wheels/
 30 | *.egg-info/
 31 | .installed.cfg
 32 | *.egg
 33 | MANIFEST
 34 | 
 35 | # PyInstaller
 36 | #  Usually these files are written by a python script from a template
 37 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 38 | # *.manifest
 39 | # *.spec
 40 | *.res
 41 | 
 42 | # Installer logs
 43 | pip-log.txt
 44 | pip-delete-this-directory.txt
 45 | 
 46 | # Unit test / coverage reports
 47 | htmlcov/
 48 | .tox/
 49 | .nox/
 50 | .coverage
 51 | .coverage.*
 52 | .cache
 53 | nosetests.xml
 54 | coverage.xml
 55 | *.cover
 56 | *.py,cover
 57 | .hypothesis/
 58 | .pytest_cache/
 59 | 
 60 | # Translations
 61 | *.mo
 62 | *.pot
 63 | 
 64 | # Django stuff:
 65 | *.log
 66 | local_settings.py
 67 | db.sqlite3
 68 | db.sqlite3-journal
 69 | 
 70 | # Flask stuff:
 71 | instance/
 72 | .webassets-cache
 73 | 
 74 | # Scrapy stuff:
 75 | .scrapy
 76 | 
 77 | # Sphinx documentation
 78 | docs/_build/
 79 | 
 80 | # PyBuilder
 81 | target/
 82 | 
 83 | # Jupyter Notebook
 84 | .ipynb_checkpoints
 85 | 
 86 | # IPython
 87 | profile_default/
 88 | ipython_config.py
 89 | 
 90 | # pyenv
 91 | .python-version
 92 | 
 93 | # pipenv
 94 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 95 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 96 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 97 | #   install all needed dependencies.
 98 | #Pipfile.lock
 99 | 
100 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
101 | __pypackages__/
102 | 
103 | # Celery stuff
104 | celerybeat-schedule
105 | celerybeat.pid
106 | 
107 | # SageMath parsed files
108 | *.sage.py
109 | 
110 | # Environments
111 | .env
112 | .venv
113 | env/
114 | venv/
115 | ENV/
116 | env.bak/
117 | venv.bak/
118 | 
119 | # Spyder project settings
120 | .spyderproject
121 | .spyproject
122 | 
123 | # Rope project settings
124 | .ropeproject
125 | 
126 | # mkdocs documentation
127 | /site
128 | 
129 | # mypy
130 | .mypy_cache/
131 | .dmypy.json
132 | dmypy.json
133 | 
134 | # Pyre type checker
135 | .pyre/
136 | 
137 | #idea
138 | .vs
139 | .vscode
140 | .idea
141 | /models
142 | 
143 | #models
144 | 
145 | *.ttf
146 | *.ttc
147 | 
148 | 
149 | *.bin
150 | *.mapping
151 | *.xml
152 | 
153 | *.pdiparams
154 | *.pdiparams.info
155 | *.pdmodel
156 | 
157 | .DS_Store


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | # pre-commit hooks: autoflake cleans up unused imports and variables, then
 2 | # black formats the code. Both hooks are limited to files ending in .py.
 3 | repos:
 4 | - repo: https://gitee.com/SWHL/autoflake
 5 |   rev: v2.1.1
 6 |   hooks:
 7 |     - id: autoflake
 8 |       args:
 9 |         - "--recursive"
10 |         - "--in-place"
11 |         - "--remove-all-unused-imports"
12 |         # Documented option name, spelled out in full.
13 |         - "--remove-unused-variables"
14 |         - "--ignore-init-module-imports"
15 |       files: \.py$
16 | - repo: https://gitee.com/SWHL/black
17 |   rev: 23.1.0
18 |   hooks:
19 |     - id: black
20 |       files: \.py$


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | <div align="center">
 2 |     <div align="center">
 3 |     <h1><b><i>RapidOCR 📄 PDF</i></b></h1>
 4 |     </div>
 5 | 
 6 | <a href="https://huggingface.co/spaces/RapidAI/RapidOCRPDF" target="_blank"><img src="https://img.shields.io/badge/%F0%9F%A4%97-Hugging Face Demo-blue"></a>
 7 | <a href="https://www.modelscope.cn/studios/RapidAI/RapidOCRPDF/summary" target="_blank"><img src="https://img.shields.io/badge/魔搭-Demo-blue"></a>
 8 | <a href=""><img src="https://img.shields.io/badge/Python->=3.6-aff.svg"></a>
 9 | <a href=""><img src="https://img.shields.io/badge/OS-Linux%2C%20Win%2C%20Mac-pink.svg"></a>
10 | <a href="https://pypi.org/project/rapidocr-pdf/"><img alt="PyPI" src="https://img.shields.io/pypi/v/rapidocr-pdf"></a>
11 | <a href="https://pepy.tech/project/rapidocr-pdf"><img src="https://static.pepy.tech/personalized-badge/rapidocr-pdf?period=total&units=abbreviation&left_color=grey&right_color=blue&left_text=Downloads"></a>
12 | <a href="https://semver.org/"><img alt="SemVer2.0" src="https://img.shields.io/badge/SemVer-2.0-brightgreen"></a>
13 | <a href="https://github.com/psf/black"><img src="https://img.shields.io/badge/code%20style-black-000000.svg"></a>
14 | <a href="https://choosealicense.com/licenses/apache-2.0/"><img alt="GitHub" src="https://img.shields.io/github/license/RapidAI/RapidOCRPDF"></a>
15 | 
16 | </div>
17 | 
18 | ### 简介
19 | 
20 | 本仓库依托于[RapidOCR](https://github.com/RapidAI/RapidOCR)仓库，快速提取PDF中文字，包括扫描版PDF、加密版PDF、可直接复制文字版PDF。
21 | 
22 | ### 整体流程
23 | 
24 | ```mermaid
25 | flowchart LR
26 | 
27 | A(PDF) --> B{是否可以直接提取内容} --是--> C(PyMuPDF)
28 | B --否--> D(RapidOCR)
29 | 
30 | C & D --> E(结果)
31 | ```
32 | 
33 | ### 安装
34 | 
35 | ```bash
36 | pip install rapidocr_pdf
37 | ```
38 | 
39 | ### 使用
40 | 
41 | #### 脚本使用
42 | 
43 | ⚠️注意：在`rapidocr_pdf>=0.4.0`中，支持`page_num_list`参数为负数，假设总页数为2，范围为`[-2, 1]`。
44 | 
45 | ⚠️注意：在`rapidocr_pdf>=0.3.0`中，支持了`page_num_list`参数，默认为None，全部提取。**如果指定，页码从0开始**。
46 | 
47 | ⚠️注意：在`rapidocr_pdf>=0.2.0`中，已经适配`rapidocr>=2.0.0`版本，可以通过参数来使用不同OCR推理引擎来提速。
48 | 下面的`ocr_params`为示例参数，详细请参见RapidOCR官方文档：[docs](https://rapidai.github.io/RapidOCRDocs/main/install_usage/rapidocr/usage/#_4) 。
49 | 
50 | ```python
51 | from rapidocr_pdf import RapidOCRPDF
52 | 
53 | pdf_extracter = RapidOCRPDF(ocr_params={"Global.with_torch": True})
54 | 
55 | pdf_path = "tests/test_files/direct_and_image.pdf"
56 | 
57 | # page_num_list=[1]: 仅提取第2页
58 | texts = pdf_extracter(pdf_path, force_ocr=False, page_num_list=[1])
59 | print(texts)
60 | ```
61 | 
62 | #### 命令行使用
63 | 
64 | ```bash
65 | $ rapidocr_pdf -h
66 | usage: rapidocr_pdf [-h] [--dpi DPI] [-f] [--page_num_list [PAGE_NUM_LIST ...]] pdf_path
67 | 
68 | positional arguments:
69 |   pdf_path
70 | 
71 | options:
72 |   -h, --help            show this help message and exit
73 |   --dpi DPI
74 |   -f, --force_ocr       Whether to use ocr for all pages.
75 |   --page_num_list [PAGE_NUM_LIST ...]
76 |                         Which pages will be extracted. e.g. 0 1 2.
77 | 
78 | $ rapidocr_pdf tests/test_files/direct_and_image.pdf --page_num_list 0 1
79 | ```
80 | 
81 | ### 输入输出说明
82 | 
83 | **输入**：`Union[str, Path, bytes]`
84 | 
85 | **输出**：`List` \[**页码**, **文本内容**, **置信度**\]， 具体参见下例：
86 | 
87 | ```python
88 | [
89 |     [0, '人之初，性本善。性相近，习相远。', 0.8969868],
90 |     [1, 'Men at their birth, are naturally good.', 0.8969868],
91 | ]
92 | ```
93 | 


--------------------------------------------------------------------------------
/cliff.toml:
--------------------------------------------------------------------------------
  1 | # git-cliff ~ configuration file
  2 | # https://git-cliff.org/docs/configuration
  3 | 
  4 | [changelog]
  5 | # A Tera template to be rendered as the changelog's header.
  6 | # See https://keats.github.io/tera/docs/#introduction
  7 | # header = """
  8 | # # Changelog\n
  9 | # All notable changes to this project will be documented in this file. See [conventional commits](https://www.conventionalcommits.org/) for commit guidelines.\n
 10 | # """
 11 | # A Tera template to be rendered for each release in the changelog.
 12 | # See https://keats.github.io/tera/docs/#introduction
 13 | body = """
 14 | {% for group, commits in commits | group_by(attribute="group") %}
 15 |     ### {{ group | striptags | trim | upper_first }}
 16 |     {% for commit in commits
 17 |     | filter(attribute="scope")
 18 |     | sort(attribute="scope") %}
 19 |         - **({{commit.scope}})**{% if commit.breaking %} [**breaking**]{% endif %} \
 20 |             {{ commit.message }} by [@{{ commit.author.name }}](https://github.com/{{ commit.author.name }}) in [{{ commit.id | truncate(length=7, end="") }}]($REPO/commit/{{ commit.id }})
 21 |     {%- endfor -%}
 22 |     {% raw %}\n{% endraw %}\
 23 |     {%- for commit in commits %}
 24 |         {%- if commit.scope -%}
 25 |         {% else -%}
 26 |             - {% if commit.breaking %} [**breaking**]{% endif %}\
 27 |                 {{ commit.message }} by [@{{ commit.author.name }}](https://github.com/{{ commit.author.name }}) in [{{ commit.id | truncate(length=7, end="") }}]($REPO/commit/{{ commit.id }})
 28 |         {% endif -%}
 29 |     {% endfor -%}
 30 | {% endfor %}
 31 | 
 32 | 
 33 | {% if github.contributors | length > 0 %}
 34 | ### 🎉 Contributors
 35 | 
 36 | {% for contributor in github.contributors %}
 37 |   - [@{{ contributor.username }}](https://github.com/{{ contributor.username }})
 38 | {%- endfor -%}
 39 | {% endif %}
 40 | 
 41 | 
 42 | {% if version %}
 43 |     {% if previous.version %}\
 44 |         **Full Changelog**:  [{{ version | trim_start_matches(pat="v") }}]($REPO/compare/{{ previous.version }}..{{ version }})
 45 |     {% else %}\
 46 |         **Full Changelog**:  [{{ version | trim_start_matches(pat="v") }}]
 47 |     {% endif %}\
 48 | {% else %}\
 49 |     ## [unreleased]
 50 | {% endif %}
 51 | """
 52 | # A Tera template to be rendered as the changelog's footer.
 53 | # See https://keats.github.io/tera/docs/#introduction
 54 | 
 55 | footer = """
 56 | 
 57 | """
 58 | 
 59 | # Remove leading and trailing whitespaces from the changelog's body.
 60 | trim = true
 61 | # postprocessors
 62 | postprocessors = [
 63 |     # Replace the placeholder `$REPO` with a URL.
 64 |     { pattern = '\$REPO', replace = "https://github.com/RapidAI/RapidOCRPDF" }, # replace repository URL
 65 | ]
 66 | 
 67 | [git]
 68 | # Parse commits according to the conventional commits specification.
 69 | # See https://www.conventionalcommits.org
 70 | conventional_commits = true
 71 | # Exclude commits that do not match the conventional commits specification.
 72 | filter_unconventional = true
 73 | # Split commits on newlines, treating each line as an individual commit.
 74 | split_commits = false
 75 | # An array of regex based parsers to modify commit messages prior to further processing.
 76 | commit_preprocessors = [
 77 |     # Replace issue numbers with link templates to be updated in `changelog.postprocessors`.
 78 |     #{ pattern = '\((\w+\s)?#([0-9]+)\)', replace = "([#${2}](https://github.com/orhun/git-cliff/issues/${2}))"},
 79 | ]
 80 | # An array of regex based parsers for extracting data from the commit message.
 81 | # Assigns commits to groups.
 82 | # Optionally sets the commit's scope and can decide to exclude commits from further processing.
 83 | commit_parsers = [
 84 |   { message = "^feat", group = "<!-- 0 -->🚀 Features" },
 85 |   { message = "^fix", group = "<!-- 1 -->🐛 Bug Fixes" },
 86 |   { message = "^doc", group = "<!-- 3 -->📚 Documentation" },
 87 |   { message = "^perf", group = "<!-- 4 -->⚡ Performance" },
 88 |   { message = "^refactor", group = "<!-- 2 -->🚜 Refactor" },
 89 |   { message = "^style", group = "<!-- 5 -->🎨 Styling" },
 90 |   { message = "^test", group = "<!-- 6 -->🧪 Testing" },
 91 |   { message = "^chore\\(release\\): prepare for", skip = true },
 92 |   { message = "^chore\\(deps.*\\)", skip = true },
 93 |   { message = "^chore\\(pr\\)", skip = true },
 94 |   { message = "^chore\\(pull\\)", skip = true },
 95 |   { message = "^chore|^ci", group = "<!-- 7 -->⚙️ Miscellaneous Tasks" },
 96 |   { body = ".*security", group = "<!-- 8 -->🛡️ Security" },
 97 |   { message = "^revert", group = "<!-- 9 -->◀️ Revert" },
 98 |   { message = ".*", group = "<!-- 10 -->💼 Other" },
 99 | ]
100 | # Exclude commits that are not matched by any commit parser.
101 | filter_commits = false
102 | # Order releases topologically instead of chronologically.
103 | topo_order = false
104 | # Order of commits in each group/release within the changelog.
105 | # Allowed values: newest, oldest
106 | sort_commits = "newest"


--------------------------------------------------------------------------------
/demo.py:
--------------------------------------------------------------------------------
 1 | # -*- encoding: utf-8 -*-
 2 | # @Author: SWHL
 3 | # @Contact: liekkaskono@163.com
 4 | from rapidocr_pdf import RapidOCRPDF
 5 | 
 6 | pdf_extracter = RapidOCRPDF()
 7 | 
 8 | pdf_path = "tests/test_files/direct_and_image.pdf"
 9 | texts = pdf_extracter(pdf_path, force_ocr=False, page_num_list=[-1])
10 | print(texts)
11 | 


--------------------------------------------------------------------------------
/docs/docs.md:
--------------------------------------------------------------------------------
1 | See [link](https://github.com/RapidAI/RapidOCRPDF) for details.
2 | 


--------------------------------------------------------------------------------
/rapidocr_pdf/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- encoding: utf-8 -*-
2 | # @Author: SWHL
3 | # @Contact: liekkaskono@163.com
4 | from .main import RapidOCRPDF, RapidOCRPDFError
5 | 


--------------------------------------------------------------------------------
/rapidocr_pdf/main.py:
--------------------------------------------------------------------------------
  1 | # -*- encoding: utf-8 -*-
  2 | # @Author: SWHL
  3 | # @Contact: liekkaskono@163.com
  4 | import argparse
  5 | from pathlib import Path
  6 | from typing import Dict, List, Optional, Tuple, Union
  7 | 
  8 | import cv2
  9 | import fitz
 10 | import numpy as np
 11 | from rapidocr import RapidOCR
 12 | 
 13 | from .utils.logger import Logger
 14 | from .utils.utils import error_log, which_type
 15 | 
 16 | logger = Logger(logger_name=__name__).get_log()  # module-level logger, named after this module
 17 | 
 18 | 
 19 | class RapidOCRPDF:
 20 |     def __init__(self, dpi=200, ocr_params: Optional[Dict] = None):
 21 |         self.dpi = dpi
 22 |         self.ocr_engine = RapidOCR(params=ocr_params)
 23 |         self.empty_list = []
 24 | 
 25 |     def __call__(
 26 |         self,
 27 |         content: Union[str, Path, bytes],
 28 |         force_ocr: bool = False,
 29 |         page_num_list: Optional[List[int]] = None,
 30 |     ) -> List[List[Union[str, str, str]]]:
 31 |         try:
 32 |             file_type = which_type(content)
 33 |         except (FileExistsError, TypeError) as e:
 34 |             raise RapidOCRPDFError("The input content is empty.") from e
 35 | 
 36 |         if file_type != "pdf":
 37 |             raise RapidOCRPDFError("The file type is not PDF format.")
 38 | 
 39 |         try:
 40 |             pdf_data = self.load_pdf(content)
 41 |         except RapidOCRPDFError as e:
 42 |             logger.error("%s\n%s", e, error_log())
 43 |             return self.empty_list
 44 | 
 45 |         txts_dict, need_ocr_idxs = self.extract_texts(
 46 |             pdf_data, force_ocr, page_num_list
 47 |         )
 48 | 
 49 |         ocr_res_dict = self.get_ocr_res_streaming(pdf_data, need_ocr_idxs)
 50 | 
 51 |         final_result = self.merge_direct_ocr(txts_dict, ocr_res_dict)
 52 |         return final_result
 53 | 
 54 |     @staticmethod
 55 |     def load_pdf(pdf_content: Union[str, Path, bytes]) -> bytes:
 56 |         if isinstance(pdf_content, (str, Path)):
 57 |             if not Path(pdf_content).exists():
 58 |                 raise RapidOCRPDFError(f"{pdf_content} does not exist.")
 59 | 
 60 |             with open(pdf_content, "rb") as f:
 61 |                 data = f.read()
 62 |             return data
 63 | 
 64 |         if isinstance(pdf_content, bytes):
 65 |             return pdf_content
 66 | 
 67 |         raise RapidOCRPDFError(f"{type(pdf_content)} is not in [str, Path, bytes].")
 68 | 
 69 |     def extract_texts(
 70 |         self, pdf_data: bytes, force_ocr: bool, page_num_list: Optional[List[int]]
 71 |     ) -> Tuple[Dict, List]:
 72 |         texts, need_ocr_idxs = {}, []
 73 |         with fitz.open(stream=pdf_data) as doc:
 74 |             page_num_list = self.get_page_num_range(page_num_list, doc.page_count)
 75 |             for i, page in enumerate(doc):
 76 |                 if page_num_list is not None and i not in page_num_list:
 77 |                     continue
 78 | 
 79 |                 if force_ocr:
 80 |                     need_ocr_idxs.append(i)
 81 |                     continue
 82 | 
 83 |                 text = page.get_text("text", sort=True)
 84 |                 if text:
 85 |                     texts[i] = text
 86 |                 else:
 87 |                     need_ocr_idxs.append(i)
 88 |         return texts, need_ocr_idxs
 89 | 
 90 |     @staticmethod
 91 |     def get_page_num_range(
 92 |         page_num_list: Optional[List[int]], page_count: int
 93 |     ) -> Optional[List[int]]:
 94 |         if page_num_list is None:
 95 |             return None
 96 | 
 97 |         if max(page_num_list) >= page_count:
 98 |             raise RapidOCRPDFError(
 99 |                 f"The max value of {page_num_list} is greater than total page nums: {page_count}"
100 |             )
101 | 
102 |         # support negative number
103 |         new_page_num = []
104 |         for page_num in page_num_list:
105 |             if page_num >= 0:
106 |                 new_page_num.append(page_num)
107 | 
108 |             if abs(page_num) > page_count:
109 |                 raise RapidOCRPDFError(
110 |                     f"{page_num} is out of range [{-page_count}, {page_count - 1})"
111 |                 )
112 | 
113 |             positive_num = page_count + page_num
114 |             new_page_num.append(positive_num)
115 | 
116 |         return new_page_num
117 | 
118 |     def get_ocr_res_streaming(self, pdf_data: bytes, need_ocr_idxs: List) -> Dict:
119 |         def convert_img(page):
120 |             pix = page.get_pixmap(dpi=self.dpi)
121 |             img = np.frombuffer(pix.samples, dtype=np.uint8)
122 |             img = img.reshape([pix.h, pix.w, pix.n])
123 |             img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
124 |             return img
125 | 
126 |         ocr_res = {}
127 |         with fitz.open(stream=pdf_data) as doc:
128 |             for i in need_ocr_idxs:
129 |                 img = convert_img(doc[i])
130 | 
131 |                 preds = self.ocr_engine(img)
132 |                 if preds.txts is None:
133 |                     continue
134 | 
135 |                 avg_score = (
136 |                     sum(preds.scores) / len(preds.scores) if preds.scores else 0.0
137 |                 )
138 | 
139 |                 ocr_res[i] = {
140 |                     "text": "\n".join(preds.txts),
141 |                     "avg_confidence": avg_score,
142 |                 }
143 |         return ocr_res
144 | 
145 |     def merge_direct_ocr(self, txts_dict: Dict, ocr_res_dict: Dict) -> List[List[str]]:
146 |         final_result = {}
147 |         for page_idx, text in txts_dict.items():
148 |             final_result[page_idx] = {"text": text, "avg_confidence": "N/A"}
149 | 
150 |         for page_idx, ocr_data in ocr_res_dict.items():
151 |             final_result[page_idx] = {
152 |                 "text": ocr_data["text"],
153 |                 "avg_confidence": ocr_data["avg_confidence"],
154 |             }
155 | 
156 |         final_result = dict(sorted(final_result.items(), key=lambda x: int(x[0])))
157 |         return [[k, v["text"], v["avg_confidence"]] for k, v in final_result.items()]
158 | 
159 | 
class RapidOCRPDFError(Exception):
    """Error type raised by this module for invalid input or page ranges."""
162 | 
163 | 
def parse_args(arg_list: Optional[List[str]] = None):
    """Parse the command-line options of the PDF extraction tool.

    Args:
        arg_list: Argument strings to parse; ``None`` lets argparse read
            them from ``sys.argv``.

    Returns:
        The namespace with ``pdf_path``, ``dpi``, ``force_ocr`` and
        ``page_num_list``.
    """
    cli = argparse.ArgumentParser()

    # Required positional input and rendering resolution.
    cli.add_argument("pdf_path", type=str)
    cli.add_argument("--dpi", default=200, type=int)

    # Optional switches controlling how and which pages are processed.
    cli.add_argument(
        "-f",
        "--force_ocr",
        default=False,
        action="store_true",
        help="Whether to use ocr for all pages.",
    )
    cli.add_argument(
        "--page_num_list",
        default=None,
        nargs="*",
        type=int,
        help="Which pages will be extracted. e.g. 0 1 2. Note: the index of page num starts from 0.",
    )

    return cli.parse_args(arg_list)
184 | 
185 | 
def main(arg_list: Optional[List[str]] = None):
    """Command-line entry point: extract a PDF and print the result.

    Any exception raised during extraction is logged together with its
    traceback instead of being propagated.
    """
    cli_args = parse_args(arg_list)
    extractor = RapidOCRPDF(cli_args.dpi)
    try:
        print(
            extractor(cli_args.pdf_path, cli_args.force_ocr, cli_args.page_num_list)
        )
    except Exception as exc:
        logger.error("%s\n%s", exc, error_log())
194 | 
195 | 
# Run the command-line entry point when this file is executed as a script.
if __name__ == "__main__":
    main()
198 | 


--------------------------------------------------------------------------------
/rapidocr_pdf/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- encoding: utf-8 -*-
2 | # @Author: SWHL
3 | # @Contact: liekkaskono@163.com
4 | 


--------------------------------------------------------------------------------
/rapidocr_pdf/utils/logger.py:
--------------------------------------------------------------------------------
 1 | # -*- encoding: utf-8 -*-
 2 | # @Author: SWHL
 3 | # @Contact: liekkaskono@163.com
 4 | import logging
 5 | 
 6 | import colorlog
 7 | 
 8 | 
 9 | class Logger:
10 |     def __init__(self, log_level=logging.DEBUG, logger_name=None):
11 |         self.logger = logging.getLogger(logger_name)
12 |         self.logger.setLevel(log_level)
13 |         self.logger.propagate = False
14 | 
15 |         formatter = colorlog.ColoredFormatter(
16 |             "%(log_color)s[%(levelname)s] %(asctime)s [RapidOCR] %(filename)s:%(lineno)d: %(message)s",
17 |             log_colors={
18 |                 "DEBUG": "cyan",
19 |                 "INFO": "green",
20 |                 "WARNING": "yellow",
21 |                 "ERROR": "red",
22 |                 "CRITICAL": "red,bg_white",
23 |             },
24 |         )
25 | 
26 |         if not self.logger.handlers:
27 |             console_handler = logging.StreamHandler()
28 |             console_handler.setFormatter(formatter)
29 | 
30 |             for handler in self.logger.handlers:
31 |                 self.logger.removeHandler(handler)
32 | 
33 |             console_handler.setLevel(log_level)
34 |             self.logger.addHandler(console_handler)
35 | 
36 |     def get_log(self):
37 |         return self.logger
38 | 


--------------------------------------------------------------------------------
/rapidocr_pdf/utils/utils.py:
--------------------------------------------------------------------------------
 1 | # -*- encoding: utf-8 -*-
 2 | # @Author: SWHL
 3 | # @Contact: liekkaskono@163.com
 4 | import importlib
 5 | import traceback
 6 | from pathlib import Path
 7 | from typing import Union
 8 | 
 9 | import filetype
10 | 
11 | 
def error_log():
    """Return the formatted traceback of the exception being handled."""
    formatted = traceback.format_exc()
    return formatted
14 | 
15 | 
def import_package(name, package=None):
    """Import and return module ``name``, or ``None`` if it is not installed.

    ``package`` is forwarded to ``importlib.import_module`` as the anchor
    for relative names.
    """
    try:
        loaded = importlib.import_module(name, package=package)
    except ModuleNotFoundError:
        loaded = None
    return loaded
22 | 
23 | 
def which_type(content: Union[bytes, str, Path]) -> str:
    """Guess the file type of ``content`` and return its extension.

    Args:
        content: A file path (``str`` or ``Path``) or raw file bytes.

    Returns:
        The extension reported by ``filetype.guess`` (e.g. ``"pdf"``).

    Raises:
        TypeError: If ``content`` is ``None`` or empty, or its type could
            not be determined.
        FileExistsError: If ``content`` is a path that does not exist.
    """
    # Path("") is the current directory, so an empty string would pass the
    # existence check below; reject missing or empty input explicitly.
    if content is None or (isinstance(content, (str, bytes)) and not content):
        raise TypeError("The input content is empty.")

    if isinstance(content, (str, Path)) and not Path(content).exists():
        raise FileExistsError(f"{content} does not exist.")

    kind = filetype.guess(content)
    if kind is None:
        # Do not interpolate raw file bytes into the message; describe them.
        shown = f"<{len(content)} bytes>" if isinstance(content, bytes) else content
        raise TypeError(f"The type of {shown} does not support.")

    return kind.extension
33 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | filetype>=1.2.0
2 | pymupdf
3 | rapidocr>=2.0.7
4 | colorlog
5 | onnxruntime


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | # -*- encoding: utf-8 -*-
 2 | # @Author: SWHL
 3 | # @Contact: liekkaskono@163.com
 4 | import sys
 5 | import warnings
 6 | from typing import List
 7 | 
 8 | import setuptools
 9 | from get_pypi_latest_version import GetPyPiLatestVersion
10 | 
11 | 
def read_txt(txt_path: str) -> List:
    """Read a UTF-8 text file and return its lines without trailing newlines.

    Non-string paths are converted with ``str`` before opening.
    """
    path_str = txt_path if isinstance(txt_path, str) else str(txt_path)
    with open(path_str, "r", encoding="utf-8") as f:
        return [line.rstrip("\n") for line in f]
19 | 
20 | 
def get_readme():
    """Return the full text of ``./docs/docs.md`` (relative to the cwd)."""
    with open("./docs/docs.md", "r", encoding="utf-8") as readme_file:
        return readme_file.read()
26 | 
27 | 
# Package name and the fallback version used when no other version is found.
MODULE_NAME = "rapidocr_pdf"
VERSION_NUM = "0.0.1"

# NOTE(review): GetPyPiLatestVersion presumably queries PyPI for the latest
# published version of the package; confirm against that helper's docs.
obtainer = GetPyPiLatestVersion()
try:
    latest_version = obtainer(MODULE_NAME)
    if latest_version:
        # Default: derive the new version from the latest known one.
        VERSION_NUM = obtainer.version_add_one(latest_version)

    # Extra command-line arguments (after the setup command) may carry an
    # explicit version string, which takes precedence when one is extracted.
    if len(sys.argv) > 2:
        match_str = " ".join(sys.argv[2:])
        matched_versions = obtainer.extract_version(match_str)
        if matched_versions:
            VERSION_NUM = matched_versions
except ValueError:
    # Keep the fallback VERSION_NUM and only warn.
    warnings.warn(
        f"The package {MODULE_NAME} seems to be submitting for the first time."
    )
# Drop the extra arguments so setuptools only sees the script and its command.
sys.argv = sys.argv[:2]

setuptools.setup(
    name=MODULE_NAME,
    version=VERSION_NUM,
    platforms="Any",
    description="Tools of extracting PDF content based on RapidOCR",
    long_description=get_readme(),
    long_description_content_type="text/markdown",
    author="SWHL",
    author_email="liekkaskono@163.com",
    url="https://github.com/RapidAI/RapidOCRPDF",
    license="Apache-2.0",
    packages=setuptools.find_packages(),
    # Runtime dependencies are taken line by line from requirements.txt.
    install_requires=read_txt("requirements.txt"),
    keywords=["rapidocr_pdf,rapidocr_onnxruntime,ocr,onnxruntime,openvino"],
    classifiers=[
        "Programming Language :: Python :: 3.6",
        "Programming Language :: Python :: 3.7",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Programming Language :: Python :: 3.10",
        "Programming Language :: Python :: 3.11",
        "Programming Language :: Python :: 3.12",
        "Programming Language :: Python :: 3.13",
    ],
    python_requires=">=3.6",
    # Installs a `rapidocr_pdf` console command bound to rapidocr_pdf.main:main.
    entry_points={
        "console_scripts": [f"{MODULE_NAME}={MODULE_NAME}.main:main"],
    },
)
77 | 


--------------------------------------------------------------------------------
/tests/test_files/direct_and_image.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RapidAI/RapidOCRPDF/bb99e81e397c53fc9b6bba71d49408470c02f09e/tests/test_files/direct_and_image.pdf


--------------------------------------------------------------------------------
/tests/test_files/direct_extract.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RapidAI/RapidOCRPDF/bb99e81e397c53fc9b6bba71d49408470c02f09e/tests/test_files/direct_extract.pdf


--------------------------------------------------------------------------------
/tests/test_files/image.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RapidAI/RapidOCRPDF/bb99e81e397c53fc9b6bba71d49408470c02f09e/tests/test_files/image.pdf


--------------------------------------------------------------------------------
/tests/test_main.py:
--------------------------------------------------------------------------------
  1 | # -*- encoding: utf-8 -*-
  2 | # @Author: SWHL
  3 | # @Contact: liekkaskono@163.com
  4 | import ast
  5 | import shlex
  6 | import sys
  7 | from pathlib import Path
  8 | 
# Make the repository root importable before importing the package under
# test; the imports below therefore must stay after this sys.path change.
cur_dir = Path(__file__).resolve().parent
root_dir = cur_dir.parent
sys.path.append(str(root_dir))

import pytest

from rapidocr_pdf import RapidOCRPDF, RapidOCRPDFError
from rapidocr_pdf.main import main

# Directory holding the PDF fixtures used by the tests.
test_dir = cur_dir / "test_files"

# Default fixture, used to build the parametrized CLI command.
pdf_path = test_dir / "direct_and_image.pdf"

# One extractor instance shared by all tests in this module.
extracter = RapidOCRPDF()
 23 | 
 24 | 
 25 | @pytest.mark.parametrize(
 26 |     "command, expected_output",
 27 |     [
 28 |         (
 29 |             f"{pdf_path} --page_num_list 0",
 30 |             "ABCNet: Real-time Scene Text Spotting with Adaptive Bezier-Curve Network∗",
 31 |         )
 32 |     ],
 33 | )
 34 | def test_cli(capsys, command, expected_output):
 35 |     main(shlex.split(command))
 36 |     output = capsys.readouterr().out.rstrip()
 37 |     output = ast.literal_eval(output)
 38 |     assert output[0][1].split("\n")[0].strip() == expected_output
 39 | 
 40 | 
 41 | def test_negative_page_num():
 42 |     pdf_path = test_dir / "direct_and_image.pdf"
 43 |     result = extracter(pdf_path, page_num_list=[-1])
 44 | 
 45 |     assert result[0][1].split("\n")[0].strip() == "Microsoft"
 46 | 
 47 | 
 48 | def test_error_negative_page_num():
 49 |     pdf_path = test_dir / "direct_and_image.pdf"
 50 |     with pytest.raises(RapidOCRPDFError) as exc_info:
 51 |         result = extracter(pdf_path, page_num_list=[-3])
 52 |     assert exc_info.type is RapidOCRPDFError
 53 | 
 54 | 
 55 | def test_page_num():
 56 |     pdf_path = test_dir / "direct_extract.pdf"
 57 |     result = extracter(pdf_path, page_num_list=[0])
 58 | 
 59 |     assert (
 60 |         result[0][1].split("\n")[0].strip()
 61 |         == "Defending Ukraine: Early Lessons from the Cyber War"
 62 |     )
 63 | 
 64 | 
 65 | def test_error_page_num():
 66 |     pdf_path = test_dir / "direct_extract.pdf"
 67 |     with pytest.raises(RapidOCRPDFError) as exc_info:
 68 |         result = extracter(pdf_path, page_num_list=[1])
 69 |     assert exc_info.type is RapidOCRPDFError
 70 | 
 71 | 
 72 | @pytest.mark.parametrize(
 73 |     "pdf_content, result1, result2",
 74 |     [
 75 |         (test_dir / "direct_extract.pdf", 4858, "      "),
 76 |         (test_dir / "image.pdf", 3478, "Kurbas"),
 77 |         (test_dir / "direct_and_image.pdf", 4848, "      "),
 78 |     ],
 79 | )
 80 | def test_different_pdf(pdf_content, result1, result2):
 81 |     result = extracter(pdf_content)
 82 |     assert len(result[0][1]) >= result1
 83 |     assert result[0][1][:6] == result2
 84 | 
 85 | 
 86 | def test_input_bytes():
 87 |     pdf_content = test_dir / "image.pdf"
 88 |     with open(pdf_content, "rb") as f:
 89 |         data = f.read()
 90 | 
 91 |     result = extracter(data)
 92 | 
 93 |     assert len(result[0][1]) > 0
 94 |     assert result[0][1][:6] == "Kurbas"
 95 | 
 96 | 
 97 | def test_force_ocr():
 98 |     pdf_content = test_dir / "image.pdf"
 99 |     with open(pdf_content, "rb") as f:
100 |         data = f.read()
101 | 
102 |     result = extracter(data, force_ocr=True)
103 |     assert len(result[0][1]) > 3400
104 |     assert result[0][1][:6] == "Kurbas"
105 | 
106 | 
@pytest.mark.parametrize("content", [None, ""])
def test_corner_case(content):
    """Missing or empty input is rejected with RapidOCRPDFError."""
    with pytest.raises(RapidOCRPDFError) as raised:
        extracter(content)
    assert raised.type is RapidOCRPDFError
112 | 


--------------------------------------------------------------------------------