├── .github ├── ISSUE_TEMPLATE │ ├── bug.md │ └── config.yml └── workflows │ └── gen_whl_to_pypi.yml ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── cliff.toml ├── demo.py ├── docs └── docs.md ├── rapidocr_pdf ├── __init__.py ├── main.py └── utils │ ├── __init__.py │ ├── logger.py │ └── utils.py ├── requirements.txt ├── setup.py └── tests ├── test_files ├── direct_and_image.pdf ├── direct_extract.pdf └── image.pdf └── test_main.py /.github/ISSUE_TEMPLATE/bug.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: 🐞 Bug 3 | about: Bug 4 | title: 'Bug' 5 | labels: 'Bug' 6 | assignees: '' 7 | 8 | --- 9 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | contact_links: 3 | - name: ❓ Questions 4 | url: https://github.com/RapidAI/RapidOCRPDF/discussions/categories/q-a 5 | about: Please use the community forum for help and questions regarding RapidOCRPDF. 6 | - name: 💡 Ideas 7 | url: https://github.com/RapidAI/RapidOCRPDF/discussions/categories/ideas 8 | about: Please vote for and post new feature ideas in the community forum. 
-------------------------------------------------------------------------------- /.github/workflows/gen_whl_to_pypi.yml: -------------------------------------------------------------------------------- 1 | name: Push rapidocr_pdf to pypi 2 | 3 | on: 4 | push: 5 | tags: 6 | - v* 7 | 8 | jobs: 9 | UnitTesting: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Pull latest code 13 | uses: actions/checkout@v3 14 | 15 | - name: Set up Python 3.10 16 | uses: actions/setup-python@v4 17 | with: 18 | python-version: '3.10' 19 | architecture: 'x64' 20 | 21 | - name: Display Python version 22 | run: python -c "import sys; print(sys.version)" 23 | 24 | - name: Unit testings with rapidocr 25 | run: | 26 | pip install -r requirements.txt 27 | pip install pytest 28 | pytest tests/test*.py 29 | 30 | GenerateWHL_PushPyPi: 31 | needs: UnitTesting 32 | runs-on: ubuntu-latest 33 | 34 | steps: 35 | - uses: actions/checkout@v3 36 | 37 | - name: Set up Python 3.10 38 | uses: actions/setup-python@v4 39 | with: 40 | python-version: '3.10' 41 | architecture: 'x64' 42 | 43 | - name: Run setup.py 44 | run: | 45 | pip install -r requirements.txt 46 | python -m pip install --upgrade pip 47 | pip install wheel get_pypi_latest_version 48 | python setup.py bdist_wheel ${{ github.ref_name }} 49 | 50 | - name: Publish distribution 📦 to PyPI 51 | uses: pypa/gh-action-pypi-publish@v1.5.0 52 | with: 53 | password: ${{ secrets.PYPI_API_TOKEN }} 54 | packages_dir: dist/ 55 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pth 2 | 3 | # Created by .ignore support plugin (hsz.mobi) 4 | ### Python template 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | .pytest_cache 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 
22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | pip-wheel-metadata/ 29 | share/python-wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | MANIFEST 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 38 | # *.manifest 39 | # *.spec 40 | *.res 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .nox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | *.py,cover 57 | .hypothesis/ 58 | .pytest_cache/ 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | local_settings.py 67 | db.sqlite3 68 | db.sqlite3-journal 69 | 70 | # Flask stuff: 71 | instance/ 72 | .webassets-cache 73 | 74 | # Scrapy stuff: 75 | .scrapy 76 | 77 | # Sphinx documentation 78 | docs/_build/ 79 | 80 | # PyBuilder 81 | target/ 82 | 83 | # Jupyter Notebook 84 | .ipynb_checkpoints 85 | 86 | # IPython 87 | profile_default/ 88 | ipython_config.py 89 | 90 | # pyenv 91 | .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 98 | #Pipfile.lock 99 | 100 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 101 | __pypackages__/ 102 | 103 | # Celery stuff 104 | celerybeat-schedule 105 | celerybeat.pid 106 | 107 | # SageMath parsed files 108 | *.sage.py 109 | 110 | # Environments 111 | .env 112 | .venv 113 | env/ 114 | venv/ 115 | ENV/ 116 | env.bak/ 117 | venv.bak/ 118 | 119 | # Spyder project settings 120 | .spyderproject 121 | .spyproject 122 | 123 | # Rope project settings 124 | .ropeproject 125 | 126 | # mkdocs documentation 127 | /site 128 | 129 | # mypy 130 | .mypy_cache/ 131 | .dmypy.json 132 | dmypy.json 133 | 134 | # Pyre type checker 135 | .pyre/ 136 | 137 | #idea 138 | .vs 139 | .vscode 140 | .idea 141 | /models 142 | 143 | #models 144 | 145 | *.ttf 146 | *.ttc 147 | 148 | 149 | *.bin 150 | *.mapping 151 | *.xml 152 | 153 | *.pdiparams 154 | *.pdiparams.info 155 | *.pdmodel 156 | 157 | .DS_Store -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://gitee.com/SWHL/autoflake 3 | rev: v2.1.1 4 | hooks: 5 | - id: autoflake 6 | args: 7 | [ 8 | "--recursive", 9 | "--in-place", 10 | "--remove-all-unused-imports", 11 | "--remove-unused-variable", 12 | "--ignore-init-module-imports", 13 | ] 14 | files: \.py$ 15 | - repo: https://gitee.com/SWHL/black 16 | rev: 23.1.0 17 | hooks: 18 | - id: black 19 | files: \.py$ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 |
3 |

RapidOCR 📄 PDF

4 |
5 | 6 | 7 | 8 | 9 | 10 | PyPI 11 | 12 | SemVer2.0 13 | 14 | GitHub 15 | 16 |
17 | 18 | ### 简介 19 | 20 | 本仓库依托于[RapidOCR](https://github.com/RapidAI/RapidOCR)仓库,快速提取PDF中文字,包括扫描版PDF、加密版PDF、可直接复制文字版PDF。 21 | 22 | ### 整体流程 23 | 24 | ```mermaid 25 | flowchart LR 26 | 27 | A(PDF) --> B{是否可以直接提取内容} --是--> C(PyMuPDF) 28 | B --否--> D(RapidOCR) 29 | 30 | C & D --> E(结果) 31 | ``` 32 | 33 | ### 安装 34 | 35 | ```bash 36 | pip install rapidocr_pdf 37 | ``` 38 | 39 | ### 使用 40 | 41 | #### 脚本使用 42 | 43 | ⚠️注意:在`rapidocr_pdf>=0.4.0`中,支持`page_num_list`参数为负数,假设总页数为2,范围为`[-2, 1]`。 44 | 45 | ⚠️注意:在`rapidocr_pdf>=0.3.0`中,支持了`page_num_list`参数,默认为None,全部提取。**如果指定,页码从0开始**。 46 | 47 | ⚠️注意:在`rapidocr_pdf>=0.2.0`中,已经适配`rapidocr>=2.0.0`版本,可以通过参数来使用不同OCR推理引擎来提速。 48 | 下面的`ocr_params`为示例参数,详细请参见RapidOCR官方文档:[docs](https://rapidai.github.io/RapidOCRDocs/main/install_usage/rapidocr/usage/#_4) 。 49 | 50 | ```python 51 | from rapidocr_pdf import RapidOCRPDF 52 | 53 | pdf_extracter = RapidOCRPDF(ocr_params={"Global.with_torch": True}) 54 | 55 | pdf_path = "tests/test_files/direct_and_image.pdf" 56 | 57 | # page_num_list=[1]: 仅提取第2页 58 | texts = pdf_extracter(pdf_path, force_ocr=False, page_num_list=[1]) 59 | print(texts) 60 | ``` 61 | 62 | #### 命令行使用 63 | 64 | ```bash 65 | $ rapidocr_pdf -h 66 | usage: rapidocr_pdf [-h] [--dpi DPI] [-f] [--page_num_list [PAGE_NUM_LIST ...]] pdf_path 67 | 68 | positional arguments: 69 | pdf_path 70 | 71 | options: 72 | -h, --help show this help message and exit 73 | --dpi DPI 74 | -f, --force_ocr Whether to use ocr for all pages. 75 | --page_num_list [PAGE_NUM_LIST ...] 76 | Which pages will be extracted. e.g. 0 1 2. 
77 | 78 | $ rapidocr_pdf tests/test_files/direct_and_image.pdf --page_num_list 0 1 79 | ``` 80 | 81 | ### 输入输出说明 82 | 83 | **输入**:`Union[str, Path, bytes]` 84 | 85 | **输出**:`List` \[**页码**, **文本内容**, **置信度**\], 具体参见下例: 86 | 87 | ```python 88 | [ 89 | [0, '人之初,性本善。性相近,习相远。', 0.8969868], 90 | [1, 'Men at their birth, are naturally good.', 0.8969868], 91 | ] 92 | ``` 93 | -------------------------------------------------------------------------------- /cliff.toml: -------------------------------------------------------------------------------- 1 | # git-cliff ~ configuration file 2 | # https://git-cliff.org/docs/configuration 3 | 4 | [changelog] 5 | # A Tera template to be rendered as the changelog's header. 6 | # See https://keats.github.io/tera/docs/#introduction 7 | # header = """ 8 | # # Changelog\n 9 | # All notable changes to this project will be documented in this file. See [conventional commits](https://www.conventionalcommits.org/) for commit guidelines.\n 10 | # """ 11 | # A Tera template to be rendered for each release in the changelog. 
12 | # See https://keats.github.io/tera/docs/#introduction 13 | body = """ 14 | {% for group, commits in commits | group_by(attribute="group") %} 15 | ### {{ group | striptags | trim | upper_first }} 16 | {% for commit in commits 17 | | filter(attribute="scope") 18 | | sort(attribute="scope") %} 19 | - **({{commit.scope}})**{% if commit.breaking %} [**breaking**]{% endif %} \ 20 | {{ commit.message }} by [@{{ commit.author.name }}](https://github.com/{{ commit.author.name }}) in [{{ commit.id | truncate(length=7, end="") }}]($REPO/commit/{{ commit.id }}) 21 | {%- endfor -%} 22 | {% raw %}\n{% endraw %}\ 23 | {%- for commit in commits %} 24 | {%- if commit.scope -%} 25 | {% else -%} 26 | - {% if commit.breaking %} [**breaking**]{% endif %}\ 27 | {{ commit.message }} by [@{{ commit.author.name }}](https://github.com/{{ commit.author.name }}) in [{{ commit.id | truncate(length=7, end="") }}]($REPO/commit/{{ commit.id }}) 28 | {% endif -%} 29 | {% endfor -%} 30 | {% endfor %} 31 | 32 | 33 | {% if github.contributors | length > 0 %} 34 | ### 🎉 Contributors 35 | 36 | {% for contributor in github.contributors %} 37 | - [@{{ contributor.username }}](https://github.com/{{ contributor.username }}) 38 | {%- endfor -%} 39 | {% endif %} 40 | 41 | 42 | {% if version %} 43 | {% if previous.version %}\ 44 | **Full Changelog**: [{{ version | trim_start_matches(pat="v") }}]($REPO/compare/{{ previous.version }}..{{ version }}) 45 | {% else %}\ 46 | **Full Changelog**: [{{ version | trim_start_matches(pat="v") }}] 47 | {% endif %}\ 48 | {% else %}\ 49 | ## [unreleased] 50 | {% endif %} 51 | """ 52 | # A Tera template to be rendered as the changelog's footer. 53 | # See https://keats.github.io/tera/docs/#introduction 54 | 55 | footer = """ 56 | 57 | """ 58 | 59 | # Remove leading and trailing whitespaces from the changelog's body. 60 | trim = true 61 | # postprocessors 62 | postprocessors = [ 63 | # Replace the placeholder `` with a URL. 
64 | { pattern = '\$REPO', replace = "https://github.com/RapidAI/RapidOCRPDF" }, # replace repository URL 65 | ] 66 | 67 | [git] 68 | # Parse commits according to the conventional commits specification. 69 | # See https://www.conventionalcommits.org 70 | conventional_commits = true 71 | # Exclude commits that do not match the conventional commits specification. 72 | filter_unconventional = true 73 | # Split commits on newlines, treating each line as an individual commit. 74 | split_commits = false 75 | # An array of regex based parsers to modify commit messages prior to further processing. 76 | commit_preprocessors = [ 77 | # Replace issue numbers with link templates to be updated in `changelog.postprocessors`. 78 | #{ pattern = '\((\w+\s)?#([0-9]+)\)', replace = "([#${2}](https://github.com/orhun/git-cliff/issues/${2}))"}, 79 | ] 80 | # An array of regex based parsers for extracting data from the commit message. 81 | # Assigns commits to groups. 82 | # Optionally sets the commit's scope and can decide to exclude commits from further processing. 83 | commit_parsers = [ 84 | { message = "^feat", group = "🚀 Features" }, 85 | { message = "^fix", group = "🐛 Bug Fixes" }, 86 | { message = "^doc", group = "📚 Documentation" }, 87 | { message = "^perf", group = "⚡ Performance" }, 88 | { message = "^refactor", group = "🚜 Refactor" }, 89 | { message = "^style", group = "🎨 Styling" }, 90 | { message = "^test", group = "🧪 Testing" }, 91 | { message = "^chore\\(release\\): prepare for", skip = true }, 92 | { message = "^chore\\(deps.*\\)", skip = true }, 93 | { message = "^chore\\(pr\\)", skip = true }, 94 | { message = "^chore\\(pull\\)", skip = true }, 95 | { message = "^chore|^ci", group = "⚙️ Miscellaneous Tasks" }, 96 | { body = ".*security", group = "🛡️ Security" }, 97 | { message = "^revert", group = "◀️ Revert" }, 98 | { message = ".*", group = "💼 Other" }, 99 | ] 100 | # Exclude commits that are not matched by any commit parser. 
class RapidOCRPDF:
    """Extract text from a PDF, falling back to OCR for pages without text.

    Each selected page is first read through PyMuPDF (``fitz``). Pages that
    yield no text, or every selected page when ``force_ocr`` is set, are
    rendered to an image and passed to the RapidOCR engine.
    """

    def __init__(self, dpi=200, ocr_params: Optional[Dict] = None):
        """Create the extractor.

        Args:
            dpi: Resolution used when a page is rendered to an image for OCR.
            ocr_params: Forwarded unchanged as ``RapidOCR(params=...)``.
        """
        self.dpi = dpi
        self.ocr_engine = RapidOCR(params=ocr_params)
        self.empty_list = []

    def __call__(
        self,
        content: Union[str, Path, bytes],
        force_ocr: bool = False,
        page_num_list: Optional[List[int]] = None,
    ) -> List[List[Union[int, str, float]]]:
        """Run the extraction pipeline on one PDF.

        Args:
            content: Path to a PDF file, or the raw PDF bytes.
            force_ocr: When true, OCR every selected page instead of trying
                direct text extraction first.
            page_num_list: Zero-based page indices to process; negative
                values count from the end. ``None`` selects every page.

        Returns:
            One ``[page_index, text, confidence]`` row per page that produced
            text, ordered by page index. ``confidence`` is the mean OCR score
            for OCR'd pages and the string ``"N/A"`` for directly extracted
            pages. An empty list is returned when the PDF cannot be loaded.

        Raises:
            RapidOCRPDFError: If the input type cannot be determined, the
                input is not a PDF, or ``page_num_list`` is out of range.
        """
        try:
            file_type = which_type(content)
        except (FileExistsError, TypeError) as e:
            raise RapidOCRPDFError("The input content is empty.") from e

        if file_type != "pdf":
            raise RapidOCRPDFError("The file type is not PDF format.")

        try:
            pdf_data = self.load_pdf(content)
        except RapidOCRPDFError as e:
            logger.error("%s\n%s", e, error_log())
            # Hand back a copy so callers mutating the result cannot pollute
            # the list returned by later calls.
            return list(self.empty_list)

        txts_dict, need_ocr_idxs = self.extract_texts(
            pdf_data, force_ocr, page_num_list
        )
        ocr_res_dict = self.get_ocr_res_streaming(pdf_data, need_ocr_idxs)
        return self.merge_direct_ocr(txts_dict, ocr_res_dict)

    @staticmethod
    def load_pdf(pdf_content: Union[str, Path, bytes]) -> bytes:
        """Return the PDF as bytes, reading it from disk when given a path.

        Raises:
            RapidOCRPDFError: If the path does not exist, or the argument is
                not a ``str``, ``Path`` or ``bytes``.
        """
        if isinstance(pdf_content, (str, Path)):
            if not Path(pdf_content).exists():
                raise RapidOCRPDFError(f"{pdf_content} does not exist.")

            with open(pdf_content, "rb") as f:
                return f.read()

        if isinstance(pdf_content, bytes):
            return pdf_content

        raise RapidOCRPDFError(f"{type(pdf_content)} is not in [str, Path, bytes].")

    def extract_texts(
        self, pdf_data: bytes, force_ocr: bool, page_num_list: Optional[List[int]]
    ) -> Tuple[Dict, List]:
        """Read text directly from the selected pages.

        Returns:
            A tuple ``(texts, need_ocr_idxs)``: ``texts`` maps page index to
            the extracted text, and ``need_ocr_idxs`` lists the page indices
            that produced no text (or all selected pages when ``force_ocr``
            is set) and therefore still need OCR.
        """
        texts, need_ocr_idxs = {}, []
        with fitz.open(stream=pdf_data) as doc:
            selected = self.get_page_num_range(page_num_list, doc.page_count)
            # A set keeps the per-page membership test O(1).
            wanted = None if selected is None else set(selected)

            for i, page in enumerate(doc):
                if wanted is not None and i not in wanted:
                    continue

                if force_ocr:
                    need_ocr_idxs.append(i)
                    continue

                text = page.get_text("text", sort=True)
                if text:
                    texts[i] = text
                else:
                    need_ocr_idxs.append(i)
        return texts, need_ocr_idxs

    @staticmethod
    def get_page_num_range(
        page_num_list: Optional[List[int]], page_count: int
    ) -> Optional[List[int]]:
        """Normalise requested page indices to non-negative indices.

        Args:
            page_num_list: Zero-based indices; negative values count from the
                end (``-1`` is the last page). ``None`` means "all pages".
            page_count: Total number of pages in the document.

        Returns:
            ``None`` when ``page_num_list`` is ``None``; otherwise a list with
            exactly one non-negative index per requested entry, in the
            original order. An empty request yields an empty list.

        Raises:
            RapidOCRPDFError: If any index lies outside
                ``[-page_count, page_count - 1]``.
        """
        if page_num_list is None:
            return None

        # ``max`` of an empty list would raise a bare ValueError.
        if not page_num_list:
            return []

        if max(page_num_list) >= page_count:
            raise RapidOCRPDFError(
                f"The max value of {page_num_list} is greater than total page nums: {page_count}"
            )

        # support negative number
        new_page_num = []
        for page_num in page_num_list:
            if page_num >= 0:
                new_page_num.append(page_num)
                # Already a valid index; it must not also go through the
                # negative-index conversion below.
                continue

            if abs(page_num) > page_count:
                raise RapidOCRPDFError(
                    f"{page_num} is out of range [{-page_count}, {page_count - 1}]"
                )

            new_page_num.append(page_count + page_num)

        return new_page_num

    def get_ocr_res_streaming(self, pdf_data: bytes, need_ocr_idxs: List) -> Dict:
        """OCR the given pages one at a time.

        Returns:
            A dict mapping page index to ``{"text", "avg_confidence"}``.
            Pages for which the engine reports no text are omitted.
        """

        def convert_img(page):
            # Render the page at the configured DPI and view the raw samples
            # as an (height, width, channels) uint8 array.
            pix = page.get_pixmap(dpi=self.dpi)
            img = np.frombuffer(pix.samples, dtype=np.uint8)
            img = img.reshape([pix.h, pix.w, pix.n])
            # NOTE(review): this swaps the first and third channels; PyMuPDF
            # pixmaps are presumably RGB, which would make the result BGR
            # despite the constant's name. Confirm against the OCR engine.
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            return img

        ocr_res = {}
        with fitz.open(stream=pdf_data) as doc:
            for i in need_ocr_idxs:
                img = convert_img(doc[i])

                preds = self.ocr_engine(img)
                if preds.txts is None:
                    continue

                # Guard against an empty score list to avoid dividing by zero.
                avg_score = (
                    sum(preds.scores) / len(preds.scores) if preds.scores else 0.0
                )

                ocr_res[i] = {
                    "text": "\n".join(preds.txts),
                    "avg_confidence": avg_score,
                }
        return ocr_res

    def merge_direct_ocr(
        self, txts_dict: Dict, ocr_res_dict: Dict
    ) -> List[List[Union[int, str, float]]]:
        """Combine direct-extraction and OCR results into page-ordered rows.

        Args:
            txts_dict: Page index -> directly extracted text.
            ocr_res_dict: Page index -> ``{"text", "avg_confidence"}``.

        Returns:
            ``[page_index, text, confidence]`` rows sorted by page index.
            Directly extracted pages carry ``"N/A"`` as confidence. If a page
            appears in both inputs, the OCR entry wins.
        """
        final_result = {
            page_idx: {"text": text, "avg_confidence": "N/A"}
            for page_idx, text in txts_dict.items()
        }

        for page_idx, ocr_data in ocr_res_dict.items():
            final_result[page_idx] = {
                "text": ocr_data["text"],
                "avg_confidence": ocr_data["avg_confidence"],
            }

        ordered = sorted(final_result.items(), key=lambda x: int(x[0]))
        return [[k, v["text"], v["avg_confidence"]] for k, v in ordered]


class RapidOCRPDFError(Exception):
    """Raised for invalid input or page selections in ``RapidOCRPDF``."""


def parse_args(arg_list: Optional[List[str]] = None):
    """Parse command-line arguments.

    Args:
        arg_list: Arguments to parse; ``None`` lets argparse read
            ``sys.argv``.

    Returns:
        The parsed namespace with ``pdf_path``, ``dpi``, ``force_ocr`` and
        ``page_num_list`` attributes.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("pdf_path", type=str)
    parser.add_argument("--dpi", type=int, default=200)
    parser.add_argument(
        "-f",
        "--force_ocr",
        action="store_true",
        default=False,
        help="Whether to use ocr for all pages.",
    )
    parser.add_argument(
        "--page_num_list",
        type=int,
        nargs="*",
        default=None,
        help="Which pages will be extracted. e.g. 0 1 2. Note: the index of page num starts from 0.",
    )
    args = parser.parse_args(arg_list)
    return args


def main(arg_list: Optional[List[str]] = None):
    """Command-line entry point: extract text from one PDF and print it."""
    args = parse_args(arg_list)
    pdf_extracter = RapidOCRPDF(args.dpi)
    try:
        result = pdf_extracter(args.pdf_path, args.force_ocr, args.page_num_list)
        print(result)
    except Exception as e:
        # Top-level boundary: report the failure with its traceback instead
        # of letting the CLI crash.
        logger.error("%s\n%s", e, error_log())


if __name__ == "__main__":
    main()
class Logger:
    """Configure a logger that writes colourised records to the console.

    Args:
        log_level: Level applied to the logger and, when one is created, to
            its console handler.
        logger_name: Name passed to ``logging.getLogger``. With ``None``
            the standard library returns the root logger.
    """

    def __init__(self, log_level=logging.DEBUG, logger_name=None):
        self.logger = logging.getLogger(logger_name)
        self.logger.setLevel(log_level)
        # Do not hand records on to ancestor loggers' handlers.
        self.logger.propagate = False

        # logging.getLogger returns the same object for the same name, so a
        # logger that already owns a handler has been set up before. Adding
        # another console handler would print every record twice.
        if self.logger.handlers:
            return

        formatter = colorlog.ColoredFormatter(
            "%(log_color)s[%(levelname)s] %(asctime)s [RapidOCR] %(filename)s:%(lineno)d: %(message)s",
            log_colors={
                "DEBUG": "cyan",
                "INFO": "green",
                "WARNING": "yellow",
                "ERROR": "red",
                "CRITICAL": "red,bg_white",
            },
        )

        console_handler = logging.StreamHandler()
        console_handler.setFormatter(formatter)
        console_handler.setLevel(log_level)
        self.logger.addHandler(console_handler)

    def get_log(self):
        """Return the configured ``logging.Logger`` instance."""
        return self.logger
def read_txt(txt_path: str) -> List:
    """Read a UTF-8 text file and return its lines.

    Args:
        txt_path: Location of the file. Non-string values (for example a
            ``pathlib.Path``) are converted with ``str()`` before opening.

    Returns:
        One string per line of the file, with trailing newline characters
        removed. Blank lines are kept as empty strings.
    """
    path_str = txt_path if isinstance(txt_path, str) else str(txt_path)

    with open(path_str, "r", encoding="utf-8") as handle:
        return [row.rstrip("\n") for row in handle]
45 | ) 46 | sys.argv = sys.argv[:2] 47 | 48 | setuptools.setup( 49 | name=MODULE_NAME, 50 | version=VERSION_NUM, 51 | platforms="Any", 52 | description="Tools of extracting PDF content based on RapidOCR", 53 | long_description=get_readme(), 54 | long_description_content_type="text/markdown", 55 | author="SWHL", 56 | author_email="liekkaskono@163.com", 57 | url="https://github.com/RapidAI/RapidOCRPDF", 58 | license="Apache-2.0", 59 | packages=setuptools.find_packages(), 60 | install_requires=read_txt("requirements.txt"), 61 | keywords=["rapidocr_pdf,rapidocr_onnxruntime,ocr,onnxruntime,openvino"], 62 | classifiers=[ 63 | "Programming Language :: Python :: 3.6", 64 | "Programming Language :: Python :: 3.7", 65 | "Programming Language :: Python :: 3.8", 66 | "Programming Language :: Python :: 3.9", 67 | "Programming Language :: Python :: 3.10", 68 | "Programming Language :: Python :: 3.11", 69 | "Programming Language :: Python :: 3.12", 70 | "Programming Language :: Python :: 3.13", 71 | ], 72 | python_requires=">=3.6", 73 | entry_points={ 74 | "console_scripts": [f"{MODULE_NAME}={MODULE_NAME}.main:main"], 75 | }, 76 | ) 77 | -------------------------------------------------------------------------------- /tests/test_files/direct_and_image.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RapidAI/RapidOCRPDF/bb99e81e397c53fc9b6bba71d49408470c02f09e/tests/test_files/direct_and_image.pdf -------------------------------------------------------------------------------- /tests/test_files/direct_extract.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RapidAI/RapidOCRPDF/bb99e81e397c53fc9b6bba71d49408470c02f09e/tests/test_files/direct_extract.pdf -------------------------------------------------------------------------------- /tests/test_files/image.pdf: -------------------------------------------------------------------------------- 
def test_error_negative_page_num():
    """A negative page index that reaches before the first page must fail."""
    pdf_path = test_dir / "direct_and_image.pdf"

    # NOTE(review): -3 is presumably below -page_count for this fixture;
    # confirm the fixture's page count if the file is ever replaced.
    with pytest.raises(RapidOCRPDFError) as exc_info:
        # The return value is irrelevant here: the call is expected to raise.
        extracter(pdf_path, page_num_list=[-3])

    # Checked after the `with` block so the assertion actually runs.
    assert exc_info.type is RapidOCRPDFError
"Defending Ukraine: Early Lessons from the Cyber War" 62 | ) 63 | 64 | 65 | def test_error_page_num(): 66 | pdf_path = test_dir / "direct_extract.pdf" 67 | with pytest.raises(RapidOCRPDFError) as exc_info: 68 | result = extracter(pdf_path, page_num_list=[1]) 69 | assert exc_info.type is RapidOCRPDFError 70 | 71 | 72 | @pytest.mark.parametrize( 73 | "pdf_content, result1, result2", 74 | [ 75 | (test_dir / "direct_extract.pdf", 4858, " "), 76 | (test_dir / "image.pdf", 3478, "Kurbas"), 77 | (test_dir / "direct_and_image.pdf", 4848, " "), 78 | ], 79 | ) 80 | def test_different_pdf(pdf_content, result1, result2): 81 | result = extracter(pdf_content) 82 | assert len(result[0][1]) >= result1 83 | assert result[0][1][:6] == result2 84 | 85 | 86 | def test_input_bytes(): 87 | pdf_content = test_dir / "image.pdf" 88 | with open(pdf_content, "rb") as f: 89 | data = f.read() 90 | 91 | result = extracter(data) 92 | 93 | assert len(result[0][1]) > 0 94 | assert result[0][1][:6] == "Kurbas" 95 | 96 | 97 | def test_force_ocr(): 98 | pdf_content = test_dir / "image.pdf" 99 | with open(pdf_content, "rb") as f: 100 | data = f.read() 101 | 102 | result = extracter(data, force_ocr=True) 103 | assert len(result[0][1]) > 3400 104 | assert result[0][1][:6] == "Kurbas" 105 | 106 | 107 | @pytest.mark.parametrize("content", [None, ""]) 108 | def test_corner_case(content): 109 | with pytest.raises(RapidOCRPDFError) as exc_info: 110 | extracter(content) 111 | assert exc_info.type is RapidOCRPDFError 112 | --------------------------------------------------------------------------------