├── .flake8 ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── feature_request.md │ └── question.md ├── dependabot.yml ├── pull_request_template.md ├── scripts │ ├── docs.sh │ └── test.sh └── workflows │ ├── codeql-analysis.yml │ ├── continuous-integration.yml │ └── release.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yml ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── README.md ├── SECURITY.md ├── docker-compose.yml ├── docs ├── Makefile ├── make.bat └── source │ ├── CHANGELOG.md │ ├── conf.py │ ├── example_files │ ├── columns.pdf │ ├── figure.pdf │ ├── grid.pdf │ ├── order_summary.pdf │ ├── simple_memo.pdf │ └── tables.pdf │ ├── examples │ ├── element_ordering.rst │ ├── extracting_text_from_figures.rst │ ├── index.rst │ ├── more_tables.rst │ ├── order_summary.rst │ └── simple_memo.rst │ ├── index.rst │ ├── overview.rst │ ├── reference │ ├── common.rst │ ├── components.rst │ ├── filtering.rst │ ├── index.rst │ ├── loaders.rst │ ├── sectioning.rst │ ├── tables.rst │ └── visualise.rst │ └── screenshots │ ├── order_summary_example │ ├── initial.png │ ├── sections.png │ ├── showing_font_1.png │ ├── showing_font_2.png │ └── zoomed.png │ └── simple_memo_example │ ├── top.png │ └── visualise.png ├── imagemagick_policy.xml ├── mypy.ini ├── py_pdf_parser ├── __init__.py ├── common.py ├── components.py ├── exceptions.py ├── filtering.py ├── loaders.py ├── sectioning.py ├── tables.py └── visualise │ ├── __init__.py │ ├── background.py │ ├── info_figure.py │ ├── main.py │ └── sections.py ├── pycodestyle.cfg ├── pyproject.toml ├── pytype.cfg ├── setup.py └── tests ├── __init__.py ├── base.py ├── data ├── images │ ├── tables1.png │ └── tables2.png └── pdfs │ ├── image.pdf │ ├── test.pdf │ └── test_protected.pdf ├── test_common.py ├── test_components.py ├── test_doc_examples ├── __init__.py ├── test_element_ordering.py ├── test_extracting_text_from_figures.py ├── test_order_summary.py ├── 
test_simple_memo.py └── test_tables.py ├── test_filtering.py ├── test_loaders.py ├── test_sectioning.py ├── test_tables.py ├── test_visualise.py └── utils.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 88 3 | extend-ignore = E203 4 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Bug Report** 11 | 12 | Before submitting an issue, please ensure you have read our `CONTRIBUTING.md`, and follow the Code of Conduct. 13 | 14 | Thanks for taking the time to report a bug. To help us fix it quickly, please include the following information: 15 | 16 | * A good description of the bug, including expected behavior and actual behavior. 17 | * A (small as possible) reproducible example of the bug. Please include code, and any files required to reproduce the issue. 18 | * Any required context. 19 | * If you'd be interested in working on a fix for your issue, please let us know! 20 | 21 | Please also check that your bug is not actually caused by [pdfminer.six](https://github.com/pdfminer/pdfminer.six), and is really an issue with this project. 22 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Feature Request** 11 | 12 | Before submitting an issue, please ensure you have read our `CONTRIBUTING.md`, and 13 | follow the Code of Conduct. 14 | 15 | Thanks for suggesting a new feature.
To enable a useful discussion, please include as 16 | much of the following as you can: 17 | * A good description of the feature. 18 | * Why do you want this feature? What is the use-case and context? 19 | * An example of what you'd like to achieve. 20 | * Any ideas about implementation. 21 | * Please also indicate if you'd be interested in working on the feature yourself. 22 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Question 3 | about: Ask for help using this tool 4 | title: '' 5 | labels: question 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Question** 11 | 12 | Before submitting an issue, please ensure you have read our `CONTRIBUTING.md`, and follow the Code of Conduct. 13 | 14 | Thanks for taking the time to submit an issue. To help us understand your question and answer it quickly, please include the following information where possible: 15 | 16 | * A good description of the question, including what you are trying to achieve and what the problems are. 17 | * A (small as possible) example highlighting your question. Please include code that you have tried, and any files required to run it. 18 | * Any required context.
19 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | # Maintain dependencies from pip 4 | - package-ecosystem: "pip" 5 | directory: "/" 6 | schedule: 7 | interval: "daily" 8 | # Maintain dependencies from GitHub Actions 9 | - package-ecosystem: "github-actions" 10 | directory: "/" 11 | schedule: 12 | interval: "daily" 13 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | **Description** 2 | 3 | Please include a description of the change, and why it was needed. 4 | 5 | **Linked issues** 6 | 7 | Please link any issues this pull request relates to. Using the word "closes" before the 8 | link will mean the issue is automatically closed by this Pull Request. 9 | 10 | **Testing** 11 | 12 | Please describe how your changes have been tested. 13 | 14 | **Checklist** 15 | 16 | - [ ] I have provided a good description of the change above 17 | - [ ] I have added any necessary tests 18 | - [ ] I have added all necessary type hints 19 | - [ ] I have checked my linting (`docker-compose run --rm lint`) 20 | - [ ] I have added/updated all necessary documentation 21 | - [ ] I have updated `CHANGELOG.md`, following the format from 22 | [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
23 | -------------------------------------------------------------------------------- /.github/scripts/docs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo '#######################################################################' 4 | echo '# Building docs #' 5 | echo '#######################################################################' 6 | 7 | export SPHINXOPTS="-W" # Treat warnings as errors 8 | 9 | xvfb-run make --directory $PROJECT_DIR/docs html 10 | 11 | DOCS_STATUS=$? 12 | if [[ ("$DOCS_STATUS" == 0) ]]; then 13 | echo '#######################################################################' 14 | echo '# Build succeded #' 15 | echo '#######################################################################' 16 | exit 0 17 | else 18 | echo '' 19 | echo '#######################################################################' 20 | echo '# Build failed ! #' 21 | echo '#######################################################################' 22 | exit 1 23 | fi 24 | -------------------------------------------------------------------------------- /.github/scripts/test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | clean_pyc () { echo 'cleaning .pyc files'; find . -name "*.pyc" -exec rm -f {} \; ; } 4 | trap clean_pyc EXIT 5 | 6 | echo '' 7 | echo '#######################################################################' 8 | echo '# Running nosetests #' 9 | echo '#######################################################################' 10 | xvfb-run nosetests $PROJECT_DIR 11 | 12 | TEST_STATUS=$? 
13 | if [[ ("$TEST_STATUS" == 0) ]]; then 14 | echo '#######################################################################' 15 | echo '# nosetests succeded #' 16 | echo '#######################################################################' 17 | exit 0 18 | else 19 | echo '' 20 | echo '#######################################################################' 21 | echo '# nosetests failed ! #' 22 | echo '#######################################################################' 23 | exit 1 24 | fi 25 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | name: "CodeQL" 2 | 3 | on: 4 | push: 5 | branches: [master, ] 6 | pull_request: 7 | # The branches below must be a subset of the branches above 8 | branches: [master] 9 | schedule: 10 | - cron: '0 7 * * 2' 11 | 12 | jobs: 13 | analyse: 14 | name: Analyse 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - name: Checkout repository 19 | uses: actions/checkout@v3 20 | with: 21 | # We must fetch at least the immediate parents so that if this is 22 | # a pull request then we can checkout the head. 23 | fetch-depth: 2 24 | 25 | # If this run was triggered by a pull request event, then checkout 26 | # the head of the pull request instead of the merge commit. 27 | - run: git checkout HEAD^2 28 | if: ${{ github.event_name == 'pull_request' }} 29 | 30 | # Initializes the CodeQL tools for scanning. 31 | - name: Initialize CodeQL 32 | uses: github/codeql-action/init@v2 33 | # Override language selection by uncommenting this and choosing your languages 34 | # with: 35 | # languages: go, javascript, csharp, python, cpp, java 36 | 37 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 
38 | # If this step fails, then you should remove it and run the build manually (see below) 39 | - name: Autobuild 40 | uses: github/codeql-action/autobuild@v2 41 | 42 | # ℹ️ Command-line programs to run using the OS shell. 43 | # 📚 https://git.io/JvXDl 44 | 45 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 46 | # and modify them (or add more) to build your code if your project 47 | # uses a compiled language 48 | 49 | #- run: | 50 | # make bootstrap 51 | # make release 52 | 53 | - name: Perform CodeQL Analysis 54 | uses: github/codeql-action/analyze@v2 55 | -------------------------------------------------------------------------------- /.github/workflows/continuous-integration.yml: -------------------------------------------------------------------------------- 1 | name: Continuous Integration 2 | on: 3 | push: 4 | branches: 5 | - master 6 | pull_request: 7 | branches: 8 | - master 9 | jobs: 10 | continuous-integration: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v3 14 | - name: Set up Docker Buildx 15 | uses: docker/setup-buildx-action@v2 16 | - name: Cache Docker layers 17 | uses: actions/cache@v3 18 | with: 19 | path: /tmp/.buildx-cache 20 | key: ${{ runner.os }}-buildx-${{ github.sha }} 21 | restore-keys: | 22 | ${{ runner.os }}-buildx- 23 | - name: Build the tests docker container 24 | uses: docker/build-push-action@v3.1.1 25 | with: 26 | tags: jstockwin/py-pdf-parser-test:test 27 | cache-from: type=local,src=/tmp/.buildx-cache 28 | cache-to: type=local,dest=/tmp/.buildx-cache-new 29 | load: true 30 | # This ugly bit is necessary if you don't want your cache to grow forever 31 | # till it hits GitHub's limit of 5GB. 
32 | # Temp fix 33 | # https://github.com/docker/build-push-action/issues/252 34 | # https://github.com/moby/buildkit/issues/1896 35 | - name: Move cache 36 | run: | 37 | rm -rf /tmp/.buildx-cache 38 | mv /tmp/.buildx-cache-new /tmp/.buildx-cache 39 | - name: Run test 40 | run: | # Note we need '-uroot' so user has permissions to github.workspace 41 | docker run --rm -uroot --volume ${{ github.workspace }}:/py-pdf-parser \ 42 | jstockwin/py-pdf-parser-test:test .github/scripts/test.sh 43 | - name: Check docs build correctly 44 | run: docker run --rm jstockwin/py-pdf-parser-test:test .github/scripts/docs.sh 45 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Publish new version 2 | on: 3 | release: 4 | types: [published] 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: actions/checkout@v3 10 | - uses: actions/setup-python@master 11 | - name: Install build packages 12 | run: pip3 install twine==3.1.1 wheel==0.34.2 13 | - name: Build package 14 | run: python3 setup.py sdist bdist_wheel 15 | - name: Check built package 16 | run: twine check dist/* 17 | - uses: actions/upload-artifact@v3 18 | with: 19 | path: ./dist 20 | 21 | pypi-publish: 22 | needs: ["build"] 23 | environment: "pypi" 24 | 25 | name: upload release to PyPI 26 | runs-on: ubuntu-latest 27 | permissions: 28 | # IMPORTANT: this permission is mandatory for trusted publishing 29 | id-token: write 30 | steps: 31 | - uses: actions/download-artifact@v3 32 | - name: Publish package distributions to PyPI 33 | uses: pypa/gh-action-pypi-publish@release/v1 34 | with: 35 | packages_dir: artifact/ 36 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | 
*.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # pytype 107 | .pytype/ 108 | 109 | .vscode/ 110 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | 
default_language_version: 4 | python: python3.8 # pinned until pytype issue resolved 5 | repos: 6 | - repo: https://github.com/pre-commit/pre-commit-hooks 7 | rev: v4.6.0 8 | hooks: 9 | - id: check-added-large-files 10 | - id: check-merge-conflict 11 | - id: debug-statements 12 | - id: detect-private-key 13 | - id: end-of-file-fixer 14 | - id: mixed-line-ending 15 | - id: trailing-whitespace 16 | - repo: https://github.com/pycqa/isort 17 | rev: 5.13.2 18 | hooks: 19 | - id: isort 20 | args: [--profile, black] 21 | - repo: https://github.com/psf/black 22 | rev: 24.4.2 23 | hooks: 24 | - id: black 25 | - repo: https://github.com/pycqa/flake8 26 | rev: 7.1.0 27 | hooks: 28 | - id: flake8 29 | - repo: https://github.com/mattseymour/pre-commit-pytype 30 | rev: '2023.5.8' 31 | hooks: 32 | - id: pytype 33 | args: ['--disable=pyi-error,import-error', '--exclude=tests'] 34 | - repo: https://github.com/pre-commit/mirrors-mypy 35 | rev: 'v1.10.1' 36 | hooks: 37 | - id: mypy 38 | additional_dependencies: [types-mock] 39 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Build documentation in the docs/ directory with Sphinx 9 | sphinx: 10 | configuration: docs/source/conf.py 11 | 12 | # Build documentation with MkDocs 13 | #mkdocs: 14 | # configuration: mkdocs.yml 15 | 16 | # Optionally build your docs in additional formats such as PDF and ePub 17 | formats: all 18 | 19 | # Optionally set the version of Python and requirements required to build your docs 20 | python: 21 | version: 3.8 22 | install: 23 | - method: pip 24 | path: . 
25 | extra_requirements: 26 | - dev 27 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | All notable changes to this project will be documented in this file. 3 | 4 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 5 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 6 | 7 | ## [Unreleased] 8 | 9 | ## [0.13.0] - 2024-07-23 10 | 11 | ### Added 12 | - Added extra filtering methods for ElementList 13 | - Make sure tests and docs are not included in binary distribution wheels (PyPi) and source distribution (sdist). 14 | 15 | ## [0.12.0] - 2023-11-10 16 | 17 | ### Added 18 | - Added support for opening password protected files ([#350](https://github.com/jstockwin/py-pdf-parser/pull/350)) 19 | 20 | ## [0.11.0] - 2023-08-07 21 | 22 | ### Changed 23 | - Various dependency updates 24 | - PyPI releases now use Trusted Publishers 25 | 26 | ### Fixed 27 | - Fixed typo in docs ([#361](https://github.com/jstockwin/py-pdf-parser/pull/361)) 28 | 29 | ## [0.10.2] - 2022-11-07 30 | 31 | ### Changed 32 | - Various dependency updates 33 | - Removed unused PyYAML dependency ([#262](https://github.com/jstockwin/py-pdf-parser/pull/262)) 34 | 35 | ## [0.10.1] - 2021-10-12 36 | ### Fixed 37 | - The `visualise` function properly uses the _elements_ parameter in order to filter visualised elements. ([#256](https://github.com/jstockwin/py-pdf-parser/pull/256)) 38 | 39 | ### Changed 40 | - Various dependency updates 41 | 42 | ## [0.10.0] - 2021-07-01 43 | - [BREAKING] Changes from using pyqt5 to using tkinter for the visualise tool. This 44 | means we don't need the python3-dev as a requirement, and seems to solve endless 45 | issues with pyqt5 not finding the correct qt bindings. 
This is a potential breaking 46 | change, although the visualise tool is only in the development version. No code 47 | changes are needed, but you will need tkinter installed for visualise to still work. 48 | - Changed python version from 3.6 to 3.8 in `.readthedocs.yml`. 49 | 50 | ## [0.9.0] - 2021-06-09 51 | ### Changed 52 | - Various dependency updates (matplotlib, pyqt5) 53 | - Removed all but the tests dockerfile for simplicity. Use Docker BuildKit. We will no longer be pushing images to DockerHub on release. ([#203](https://github.com/jstockwin/py-pdf-parser/pull/203)) 54 | 55 | ## [0.8.0] - 2021-05-12 56 | ### Changed 57 | - Various dependency updates 58 | - Updated CI to avoid login issue ([#182](https://github.com/jstockwin/py-pdf-parser/pull/182)) 59 | 60 | ## [0.7.0] - 2021-01-15 61 | ### Changed 62 | - Ensure we only accept LTTextBoxes at the top level (not LTTextLines) ([#155](https://github.com/jstockwin/py-pdf-parser/pull/155)) 63 | ## [0.6.0] - 2020-12-11 64 | ### Added 65 | - Enabled dependabot which should help to keep packages up to date ([#124](https://github.com/jstockwin/py-pdf-parser/pull/124)) 66 | 67 | ### Changed 68 | - Various dependency updates 69 | 70 | ### Fixed 71 | - Fixed a typo in simple memo example in the documentation. ([#121](https://github.com/jstockwin/py-pdf-parser/pull/121)) 72 | 73 | ## [0.5.0] - 2020-07-05 74 | ### Added 75 | - New functions on `ElementList`, `move_forwards_from` and `move_backwards_from`, to allow moving forwards and backwards from a certain element in the list easily. ([#113](https://github.com/jstockwin/py-pdf-parser/pull/113)) 76 | 77 | ### Changed 78 | - When the layout parameter all_texts is True, the text inside figures is now also returned as elements in the document. ([#99](https://github.com/jstockwin/py-pdf-parser/pull/99)) 79 | 80 | ### Fixed 81 | - Passing a tolerance less than the width/height of an element no longer causes an error. 
The tolerance is now capped at half the width/height of the element. ([#103](https://github.com/jstockwin/py-pdf-parser/pull/103)) 82 | 83 | ## [0.4.0] - 2020-06-22 84 | ### Added 85 | - Added `__len__` and `__repr__` functions to the Section class. ([#90](https://github.com/jstockwin/py-pdf-parser/pull/90)) 86 | - Added flag to `extract_simple_table` and `extract_table` functions to remove duplicate header rows. ([#89](https://github.com/jstockwin/py-pdf-parser/pull/89)) 87 | - You can now specify `element_ordering` when instantiating a PDFDocument. This defaults to the old behaviour of left to right, top to bottom. ([#95](https://github.com/jstockwin/py-pdf-parser/pull/95)) 88 | 89 | ### Changed 90 | - Advanced layout analysis is now disabled by default. ([#88](https://github.com/jstockwin/py-pdf-parser/pull/88)) 91 | 92 | ## [0.3.0] - 2020-05-14 93 | ### Added 94 | - Published to PyPI as py-pdf-parser. 95 | - Documentation is now hosted [here](https://py-pdf-parser.readthedocs.io/en/latest/). ([#71](https://github.com/jstockwin/py-pdf-parser/pull/71)) 96 | - Added new examples to the documentation. ([#74](https://github.com/jstockwin/py-pdf-parser/pull/74)) 97 | - Font filtering now caches the elements by font. ([#73](https://github.com/jstockwin/py-pdf-parser/pull/73)) (updated in [#78](https://github.com/jstockwin/py-pdf-parser/pull/78)) 98 | - Font filtering now caches the elements by font. ([#73](https://github.com/jstockwin/py-pdf-parser/pull/73)) 99 | - The visualise tool now draws an outline around each section on the page. ([#69](https://github.com/jstockwin/py-pdf-parser/pull/69)) (updated in [#80](https://github.com/jstockwin/py-pdf-parser/pull/80)) 100 | 101 | 102 | ### Changed 103 | - This product is now complete enough for the needs of Optimor Ltd, however `jstockwin` is going to continue development as a personal project. The repository has been moved from `optimor/py-pdf-parser` to `jstockwin/py-pdf-parser`.
104 | 105 | ## [0.2.0] - 2020-04-17 106 | ### Added 107 | - It is now possible to specify `font_size_precision` when instantiating a PDFDocument. This is the number of decimal places the font size will be rounded to. ([#60](https://github.com/jstockwin/py-pdf-parser/pull/60)) 108 | - `extract_simple_table` now allows extracting tables with gaps, provided there is at least one full row and one full column. This is only the case if you pass `allow_gaps=True`, otherwise the original logic of raising an exception if there is a gap remains. You can optionally pass a `reference_element` which must be in both a full row and a full column, this defaults to the first (top-left) element. ([#57](https://github.com/jstockwin/py-pdf-parser/pull/57)) 109 | 110 | ### Changed 111 | - Font sizes are now `float` not `int`. The `font_size_precision` in the additions defaults to 1, and as such all fonts will change to have a single decimal place. To keep the old behaviour, you can pass `font_size_precision=0` when instantiating your PDFDocument. 112 | 113 | ### Fixed 114 | - Improved performance of `extract_simple_table`, which is now much faster. ([#65](https://github.com/jstockwin/py-pdf-parser/pull/65)) 115 | 116 | ## [0.1.0] - 2020-04-08 117 | ### Added 118 | - Initial version of the product. Note: The version is less than 1, so this product should not yet be considered stable. API changes and other breaking changes are possible, if not likely.
119 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 
39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at jstockwin@gmail.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 
67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | Contributions to this project are very welcome in whatever form these may be. We highly 4 | appreciate bug reports, pull requests, and documentation improvements. 5 | 6 | Any interaction with this project should adhere to our Code of Conduct, which you can 7 | find below. 8 | 9 | It should be noted that this project heavily relies on 10 | [pdfminer.six](https://github.com/pdfminer/pdfminer.six) and many issues about loading 11 | PDFs may be due to this package. We ask that you try to avoid filing bugs that are 12 | likely to be caused by pdfminer.six against this repository, but rather you should 13 | report these bugs directly at 14 | [pdfminer.six/issues](https://github.com/pdfminer/pdfminer.six/issues). 15 | 16 | ## Issues 17 | 18 | Issues are very valuable to this project. 19 | 20 | * Ideas are a valuable source of contributions others can make. 21 | * Problems show where this project is lacking. 22 | * With a question you show where contributors can improve the user experience. 23 | 24 | Thank you for creating them. If you are submitting an issue and would be interested in 25 | helping to work on the fix, please indicate this in the issue. 26 | 27 | ## Pull Requests 28 | 29 | Pull requests are also very valuable. Before submitting a pull request, it is probably 30 | a good idea to first submit an issue to discuss the matter.
This helps to avoid wasting 31 | your time working on something that may not be accepted. 32 | 33 | When submitting a Pull Request, you will need to do the following things. There is a 34 | checklist in the template to help make sure you don't forget. 35 | 36 | We run type checks using both pytype and mypy. We also enforce code style using 37 | pycodestyle and black. You can run `docker-compose run --rm lint` to check this. 38 | 39 | * Provide a good description of the change, and the reason for it. 40 | * Ensure the tests, type checks, and linting pass (this is done by continuous 41 | integration). 42 | * Add any additional tests, as required. 43 | * Ensure all of your changes are well documented. 44 | * Update the CHANGELOG.md with a description of your changes, following the format from 45 | [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). 46 | 47 | ## Code of Conduct 48 | 49 | Before contributing, please read our [Code of Conduct](CODE_OF_CONDUCT.md). 50 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # syntax = docker/dockerfile:1.2 2 | FROM phusion/baseimage:focal-1.0.0 3 | 4 | RUN adduser --disabled-password --gecos "" app_user 5 | 6 | RUN apt-get update && \ 7 | apt-get -y install software-properties-common \ 8 | python3-pip \ 9 | python3-virtualenv \ 10 | python3-tk \ 11 | libmagickwand-dev \ 12 | xvfb && \ 13 | apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* 14 | 15 | ENV VIRTUAL_ENV_DIR /.venv 16 | RUN python3 -m virtualenv --python=python3.8 $VIRTUAL_ENV_DIR 17 | # Set the virtual environment as the main Python directory 18 | ENV PATH $VIRTUAL_ENV_DIR/bin:$PATH 19 | 20 | RUN --mount=type=cache,target=/root/.cache/pip pip3 install --upgrade pip 21 | 22 | # Create src dir 23 | ENV PROJECT_DIR /py-pdf-parser 24 | WORKDIR $PROJECT_DIR 25 | 26 | # Add imagemagick policy 27 | ADD ./imagemagick_policy.xml
/etc/ImageMagick-6/policy.xml 28 | 29 | # Install requirements 30 | ADD ./setup.py $PROJECT_DIR/setup.py 31 | ADD ./README.md $PROJECT_DIR/README.md 32 | RUN --mount=type=cache,target=/root/.cache/pip pip3 install -e $PROJECT_DIR[dev] 33 | RUN --mount=type=cache,target=/root/.cache/pip pip3 install -e $PROJECT_DIR[test] 34 | RUN chown -R app_user:app_user $VIRTUAL_ENV_DIR 35 | 36 | # Copy code, chown and switch user 37 | ADD ./ $PROJECT_DIR 38 | RUN chown -R app_user:app_user $PROJECT_DIR 39 | USER app_user 40 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Jake Stockwin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE 3 | prune tests 4 | prune tests/* 5 | prune docs 6 | prune docs/* 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # py-pdf-parser 2 | 3 | [![PyPI version](https://badge.fury.io/py/py-pdf-parser.svg)](https://badge.fury.io/py/py-pdf-parser) 4 | ![Continuous Integration](https://github.com/jstockwin/py-pdf-parser/workflows/Continuous%20Integration/badge.svg) 5 | [![Documentation Status](https://readthedocs.org/projects/py-pdf-parser/badge/?version=latest)](https://py-pdf-parser.readthedocs.io/en/latest/?badge=latest) 6 | 7 | Py PDF Parser is a tool to help extract information from structured PDFs. 8 | 9 | Full details and installation instructions can be found at: 10 | https://py-pdf-parser.readthedocs.io/en/latest/ 11 | 12 | This project is based on an original design and prototype by Sam Whitehall (github.com/samwhitehall). 13 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Reporting a Vulnerability 4 | 5 | Please do not use public GitHub issues to report a security vulnerability. 6 | 7 | Instead, please send an email directly to jstockwin@gmail.com. Do not include any sensitive information in your email. Do try to include as much information as possible to help us understand the issue.
8 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | shell: 2 | extends: 3 | service: base 4 | volumes: 5 | - .:/py-pdf-parser 6 | - /tmp/.X11-unix:/tmp/.X11-unix:rw 7 | - ./imagemagick_policy.xml:/etc/ImageMagick-6/policy.xml 8 | environment: 9 | - DISPLAY 10 | command: bash 11 | 12 | tests: 13 | extends: 14 | service: base 15 | command: .github/scripts/test.sh 16 | 17 | # Run docs to re-build the docs once. 18 | docs: 19 | extends: 20 | service: base 21 | command: make --directory docs html 22 | environment: 23 | - SPHINXOPTS="-W" 24 | 25 | # Use "up" to host the docs on port 8000, watching for changes. 26 | docs-autobuild: 27 | extends: 28 | service: base 29 | ports: 30 | - "8000:8000" 31 | command: make --directory docs livehtml 32 | 33 | base: 34 | build: . 35 | volumes: 36 | - .:/py-pdf-parser 37 | working_dir: /py-pdf-parser 38 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | livehtml: 18 | sphinx-autobuild --host 0.0.0.0 -b html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 19 | 20 | # Catch-all target: route all unknown targets to Sphinx using the new 21 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
22 | %: Makefile 23 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 24 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/source/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ../../CHANGELOG.md -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. 
For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | from typing import List 14 | 15 | import os 16 | import sys 17 | 18 | sys.path.insert(0, os.path.abspath("../../")) 19 | 20 | 21 | # -- Project information ----------------------------------------------------- 22 | 23 | project = "PDF Parser" 24 | copyright = "2019, Jake Stockwin" 25 | author = "Jake Stockwin" 26 | 27 | 28 | # -- General configuration --------------------------------------------------- 29 | 30 | # Add any Sphinx extension module names here, as strings. They can be 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 32 | # ones. 33 | extensions = [ 34 | "sphinx.ext.autodoc", 35 | "sphinx.ext.napoleon", 36 | "sphinx_rtd_theme", 37 | "recommonmark", 38 | ] 39 | 40 | # Add any paths that contain templates here, relative to this directory. 41 | templates_path = ["_templates"] 42 | 43 | # List of patterns, relative to source directory, that match files and 44 | # directories to ignore when looking for source files. 45 | # This pattern also affects html_static_path and html_extra_path. 46 | exclude_patterns: List[str] = [] 47 | 48 | master_doc = "index" 49 | 50 | 51 | # -- Options for HTML output ------------------------------------------------- 52 | 53 | # The theme to use for HTML and HTML Help pages. See the documentation for 54 | # a list of builtin themes. 55 | # 56 | html_theme = "sphinx_rtd_theme" 57 | 58 | # Add any paths that contain custom static files (such as style sheets) here, 59 | # relative to this directory. 
They are copied after the builtin static files, 60 | # so a file named "default.css" will overwrite the builtin "default.css". 61 | html_static_path: List[str] = [] 62 | 63 | 64 | # -- Extension configuration ------------------------------------------------- 65 | -------------------------------------------------------------------------------- /docs/source/example_files/columns.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jstockwin/py-pdf-parser/6aa400371e948307bcaa5073f33507c9b6e84f06/docs/source/example_files/columns.pdf -------------------------------------------------------------------------------- /docs/source/example_files/figure.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jstockwin/py-pdf-parser/6aa400371e948307bcaa5073f33507c9b6e84f06/docs/source/example_files/figure.pdf -------------------------------------------------------------------------------- /docs/source/example_files/grid.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jstockwin/py-pdf-parser/6aa400371e948307bcaa5073f33507c9b6e84f06/docs/source/example_files/grid.pdf -------------------------------------------------------------------------------- /docs/source/example_files/order_summary.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jstockwin/py-pdf-parser/6aa400371e948307bcaa5073f33507c9b6e84f06/docs/source/example_files/order_summary.pdf -------------------------------------------------------------------------------- /docs/source/example_files/simple_memo.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jstockwin/py-pdf-parser/6aa400371e948307bcaa5073f33507c9b6e84f06/docs/source/example_files/simple_memo.pdf 
-------------------------------------------------------------------------------- /docs/source/example_files/tables.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jstockwin/py-pdf-parser/6aa400371e948307bcaa5073f33507c9b6e84f06/docs/source/example_files/tables.pdf -------------------------------------------------------------------------------- /docs/source/examples/element_ordering.rst: -------------------------------------------------------------------------------- 1 | .. _element-ordering: 2 | 3 | Element Ordering 4 | ---------------- 5 | 6 | In this example, we see how to specify a custom ordering for the elements. 7 | 8 | For this we will use a simple pdf, which has a single element in each corner of the 9 | page. You can :download:`download the example here `. 10 | 11 | 12 | Default 13 | ....... 14 | 15 | The default element ordering is left to right, top to bottom. 16 | 17 | .. code-block:: python 18 | 19 | from py_pdf_parser.loaders import load_file 20 | 21 | file_path = "grid.pdf" 22 | 23 | # Default - left to right, top to bottom 24 | document = load_file(file_path) 25 | print([element.text() for element in document.elements]) 26 | 27 | This results in 28 | :: 29 | 30 | ['Top Left', 'Top Right', 'Bottom Left', 'Bottom Right'] 31 | 32 | Presets 33 | ....... 34 | 35 | There are also preset orderings for ``right to left, top to bottom``, 36 | ``top to bottom, left to right``, and ``top to bottom, right to left``. You can use 37 | these by importing the :class:`~py_pdf_parser.components.ElementOrdering` class from 38 | :py:mod:`py_pdf_parser.components` and passing these as the ``element_ordering`` 39 | argument to :class:`~py_pdf_parser.components.PDFDocument`. Note that keyword arguments 40 | to :meth:`~py_pdf_parser.loaders.load` and :meth:`~py_pdf_parser.loaders.load_file` get 41 | passed through to the :class:`~py_pdf_parser.components.PDFDocument`. 42 | 43 | .. 
code-block:: python 44 | 45 | from py_pdf_parser.loaders import load_file 46 | from py_pdf_parser.components import ElementOrdering 47 | 48 | # Preset - right to left, top to bottom 49 | document = load_file( 50 | file_path, element_ordering=ElementOrdering.RIGHT_TO_LEFT_TOP_TO_BOTTOM 51 | ) 52 | print([element.text() for element in document.elements]) 53 | 54 | # Preset - top to bottom, left to right 55 | document = load_file( 56 | file_path, element_ordering=ElementOrdering.TOP_TO_BOTTOM_LEFT_TO_RIGHT 57 | ) 58 | print([element.text() for element in document.elements]) 59 | 60 | # Preset - top to bottom, right to left 61 | document = load_file( 62 | file_path, element_ordering=ElementOrdering.TOP_TO_BOTTOM_RIGHT_TO_LEFT 63 | ) 64 | print([element.text() for element in document.elements]) 65 | 66 | which results in 67 | 68 | :: 69 | 70 | ['Top Right', 'Top Left', 'Bottom Right', 'Bottom Left'] 71 | ['Bottom Left', 'Top Left', 'Bottom Right', 'Top Right'] 72 | ['Top Right', 'Bottom Right', 'Top Left', 'Bottom Left'] 73 | 74 | Custom Ordering 75 | ............... 76 | 77 | If none of the presets give an ordering you are looking for, you can also pass a 78 | callable as the ``element_ordering`` argument of 79 | :class:`~py_pdf_parser.components.PDFDocument`. This callable will be given a list of 80 | elements for each page, and should return a list of the same elements, in the desired 81 | order. 82 | 83 | .. important:: 84 | 85 | The elements which get passed to your function will be PDFMiner.six elements, and NOT 86 | class :class:`~py_pdf_parser.components.PDFElement`. You can access the ``x0``, 87 | ``x1``, ``y0``, ``y1`` directly, and extract the text using ``get_text()``. Other 88 | options are available: please familiarise yourself with the PDFMiner.six 89 | documentation. 90 | 91 | .. note:: 92 | 93 | Your function will be called multiple times, once for each page of the document.
94 | Elements will always be considered in order of increasing page number, your function 95 | only controls the ordering within each page. 96 | 97 | For example, if we wanted to implement an ordering which is bottom to top, left to right 98 | then we can do this as follows: 99 | 100 | .. code-block:: python 101 | 102 | from py_pdf_parser.loaders import load_file 103 | 104 | # Custom - bottom to top, left to right 105 | def ordering_function(elements): 106 | """ 107 | Note: Elements will be PDFMiner.six elements. The x axis is positive as you go left 108 | to right, and the y axis is positive as you go bottom to top, and hence we can 109 | simply sort according to this. 110 | """ 111 | return sorted(elements, key=lambda elem: (elem.x0, elem.y0)) 112 | 113 | 114 | document = load_file(file_path, element_ordering=ordering_function) 115 | print([element.text() for element in document.elements]) 116 | 117 | which results in 118 | 119 | :: 120 | 121 | ['Bottom Left', 'Top Left', 'Bottom Right', 'Top Right'] 122 | 123 | Multiple Columns 124 | ................ 125 | 126 | Finally, suppose our PDF has multiple columns, like 127 | :download:`this example `. 128 | 129 | If we don't specify an ``element_ordering``, the elements will be extracted in the 130 | following order: 131 | 132 | :: 133 | 134 | ['Column 1 Title', 'Column 2 Title', 'Here is some column 1 text.', 'Here is some column 2 text.', 'Col 1 left', 'Col 1 right', 'Col 2 left', 'Col 2 right'] 135 | 136 | If we visualise this document 137 | (see the :ref:`simple-memo` example if you don't know how to do this), then we can see 138 | that the column divider is at an ``x`` value of about 300. Using this information, we 139 | can specify a custom ordering function which will order the elements left to right, 140 | top to bottom, but in each column individually. 141 | 142 | .. 
code-block:: python 143 | 144 | from py_pdf_parser.loaders import load_file 145 | 146 | document = load_file("columns.pdf") 147 | 148 | def column_ordering_function(elements): 149 | """ 150 | The first entry in the key is False for column 1, and True for column 2. The second 151 | and third keys just give left to right, top to bottom. 152 | """ 153 | return sorted(elements, key=lambda elem: (elem.x0 > 300, -elem.y0, elem.x0)) 154 | 155 | 156 | document = load_file(file_path, element_ordering=column_ordering_function) 157 | print([element.text() for element in document.elements]) 158 | 159 | which returns the elements in the correct order: 160 | 161 | :: 162 | 163 | ['Column 1 Title', 'Here is some column 1 text.', 'Col 1 left', 'Col 1 right', 'Column 2 Title', 'Here is some column 2 text.', 'Col 2 left', 'Col 2 right'] 164 | -------------------------------------------------------------------------------- /docs/source/examples/extracting_text_from_figures.rst: -------------------------------------------------------------------------------- 1 | .. _extracting-text-from-figures: 2 | 3 | Extracting Text From Figures 4 | ---------------------------- 5 | PDFs are structured documents, and can contain Figures. By default, PDFMiner.six and 6 | hence py-pdf-parser does not extract text from figures. 7 | 8 | You can :download:`download an example here `. In the 9 | example, there is a figure which contains a red square, and some text. Below the figure 10 | there is some more text. 11 | 12 | By default, the text in the figure will not be included: 13 | 14 | .. code-block:: python 15 | 16 | from py_pdf_parser.loaders import load_file 17 | document = load_file("figure.pdf") 18 | print([element.text() for element in document.elements]) 19 | 20 | which results in: 21 | 22 | :: 23 | 24 | ["Here is some text outside of an image"] 25 | 26 | To include the text inside the figure, we must pass the ``all_texts`` layout parameter.
27 | This is documented in the PDFMiner.six documentation, `here 28 | `_. 29 | 30 | The layout parameters can be passed to both :meth:`~py_pdf_parser.loaders.load` and 31 | :meth:`~py_pdf_parser.loaders.load_file` as a dictionary to the ``la_params`` argument. 32 | 33 | In our case: 34 | 35 | .. code-block:: python 36 | 37 | from py_pdf_parser.loaders import load_file 38 | document = load_file("figure.pdf", la_params={"all_texts": True}) 39 | print([element.text() for element in document.elements]) 40 | 41 | which results in: 42 | 43 | :: 44 | 45 | ["This is some text in an image", "Here is some text outside of an image"] 46 | -------------------------------------------------------------------------------- /docs/source/examples/index.rst: -------------------------------------------------------------------------------- 1 | Examples 2 | ======== 3 | 4 | Below you can find links to the following examples: 5 | 6 | - The :ref:`simple-memo` example shows the very basics of using py-pdf-parser. You will see how to load a pdf document, start filtering the elements, and extract text from certain elements in the document. 7 | - The :ref:`order-summary` example explains how to use font mappings, sections, and how to extract simple tables. 8 | - The :ref:`more-tables` example explains tables in more detail, showing how to extract more complex tables. 9 | - The :ref:`element-ordering` example shows how to specify different orderings for the elements on a page. 10 | - The :ref:`extracting-text-from-figures` example shows how to extract text from figures. 11 | 12 | .. toctree:: 13 | 14 | simple_memo 15 | order_summary 16 | more_tables 17 | element_ordering 18 | extracting_text_from_figures 19 | -------------------------------------------------------------------------------- /docs/source/examples/more_tables.rst: -------------------------------------------------------------------------------- 1 | ..
_more-tables: 2 | 3 | More Tables 4 | ----------- 5 | 6 | In this example, we will learn how to extract different types of table, and the difference between a simple table and more complicated ones. 7 | 8 | You can :download:`download the example here `. 9 | 10 | Please read the :ref:`order-summary` example first, as this covers some other functionality of the table extraction methods. 11 | 12 | Load the file 13 | ............. 14 | 15 | The following code (click "show code" below to see it) loads the file, and assigns the elements for each table to a variable. If this does not make sense, you should go back and look at some of the previous examples. 16 | 17 | .. raw:: html 18 | 19 |
20 | Show code 21 | 22 | .. code-block:: python 23 | 24 | from py_pdf_parser.loaders import load_file 25 | 26 | FONT_MAPPING = { 27 | "BAAAAA+LiberationSerif-Bold,12.0": "header", 28 | "CAAAAA+LiberationSerif,12.0": "table_element", 29 | } 30 | document = load_file("tables.pdf", font_mapping=FONT_MAPPING) 31 | 32 | headers = document.elements.filter_by_font("header") 33 | 34 | # Extract reference elements 35 | simple_table_header = headers.filter_by_text_equal( 36 | "Simple Table" 37 | ).extract_single_element() 38 | 39 | simple_table_with_gaps_header = headers.filter_by_text_equal( 40 | "Simple Table with gaps" 41 | ).extract_single_element() 42 | 43 | simple_table_with_gaps_in_first_row_col_header = headers.filter_by_text_equal( 44 | "Simple Table with gaps in first row/col" 45 | ).extract_single_element() 46 | 47 | non_simple_table_header = headers.filter_by_text_equal( 48 | "Non Simple Table" 49 | ).extract_single_element() 50 | 51 | non_simple_table_with_merged_cols_header = headers.filter_by_text_equal( 52 | "Non Simple Table with Merged Columns" 53 | ).extract_single_element() 54 | 55 | non_simple_table_with_merged_rows_header = headers.filter_by_text_equal( 56 | "Non Simple Table with Merged Rows and Columns" 57 | ).extract_single_element() 58 | 59 | over_the_page_header = headers.filter_by_text_equal( 60 | "Over the page" 61 | ).extract_single_element() 62 | 63 | # Extract table elements 64 | simple_table_elements = document.elements.between( 65 | simple_table_header, simple_table_with_gaps_header 66 | ) 67 | simple_table_with_gaps_elements = document.elements.between( 68 | simple_table_with_gaps_header, simple_table_with_gaps_in_first_row_col_header 69 | ) 70 | 71 | simple_table_with_gaps_in_first_row_col_elements = document.elements.between( 72 | simple_table_with_gaps_in_first_row_col_header, non_simple_table_header 73 | ) 74 | 75 | non_simple_table_elements = document.elements.between( 76 | non_simple_table_header, 
non_simple_table_with_merged_cols_header 77 | ) 78 | 79 | non_simple_table_with_merged_cols_elements = document.elements.between( 80 | non_simple_table_with_merged_cols_header, non_simple_table_with_merged_rows_header 81 | ) 82 | 83 | non_simple_table_with_merged_rows_and_cols_elements = document.elements.between( 84 | non_simple_table_with_merged_rows_header, over_the_page_header 85 | ) 86 | 87 | over_the_page_elements = document.elements.after(over_the_page_header) 88 | 89 | .. raw:: html 90 | 91 |
92 | 93 | Overview 94 | ........ 95 | 96 | The tables in the example pdf are split into "Simple Tables" and "Non Simple Tables". For the simple tables, we will be able to use :meth:`~py_pdf_parser.tables.extract_simple_table`, otherwise we must use :meth:`~py_pdf_parser.tables.extract_table`. The former is much more efficient, and should be used when possible. 97 | 98 | In general, tables can become more complicated by having missing cells, or merged cells which go across multiple columns or multiple rows. In both cases, you will have to pass additional parameters to stop exceptions being raised when this is the case. This is to make the extraction more robust, and protect against unexpected outcomes. 99 | 100 | To use :meth:`~py_pdf_parser.tables.extract_simple_table` we must have at least one column and one row which have no missing cells, and we must have no merged cells at all. We will need to know which row/column has no missing cells, as these must be passed as the reference row and column. 101 | 102 | To understand why: for each column element in the reference row and each row element in the reference column, :meth:`~py_pdf_parser.tables.extract_simple_table` will scan across from the row element (to get the row) and up/down from the column element (to get the column), and see if there is an element there. If there is, it is added to the table. Therefore, if there are gaps in the reference row/column, other elements may get missed. There is a check for this, so an exception will be raised if this is the case. 103 | 104 | This means :meth:`~py_pdf_parser.tables.extract_simple_table` takes time proportional to ``len(cols) + len(rows)``. Conversely, :meth:`~py_pdf_parser.tables.extract_table` is at least ``len(cols) * len(rows)``, and if there are merged cells it will be even worse. (Note in reality the complexity is not quite this simple, but it should give you an idea of the difference.) 
105 | 106 | Below, we will work through increasingly complex examples to explain the functionality, and the steps involved. 107 | 108 | Simple Table 109 | ............ 110 | 111 | This table is as simple as they come - there are no blank or merged cells. This means we can simply use :meth:`~py_pdf_parser.tables.extract_simple_table` as we have seen previously. 112 | 113 | .. code-block:: python 114 | 115 | from py_pdf_parser import tables 116 | table = tables.extract_simple_table(simple_table_elements, as_text=True) 117 | 118 | :: 119 | 120 | >>> table 121 | [['Heading 1', 'Heading 2', 'Heading 3', 'Heading 4'], ['A', '1', 'A', '1'], ['B', '2', 'B', '2'], ['C', '3', 'C', '3']] 122 | 123 | Simple Table with gaps 124 | ...................... 125 | 126 | This table has gaps, however there are no gaps in the first row or column. These are the default reference row and column, and so :meth:`~py_pdf_parser.tables.extract_simple_table` will still work as expected. Blank cells will be empty strings if ``as_text=True``, and otherwise they will be ``None``. However, if we try the same code as above: 127 | 128 | .. code-block:: python 129 | 130 | table = tables.extract_simple_table( 131 | simple_table_with_gaps_elements, as_text=True 132 | ) 133 | 134 | this will raise an exception: 135 | 136 | :: 137 | 138 | py_pdf_parser.exceptions.TableExtractionError: Element not found, there appears to be a gap in the table. If this is expected, pass allow_gaps=True. 139 | 140 | This is to allow py-pdf-parser to be more robust in the case that you're expecting your table to have no empty cells. As the error message says, since this is expected behaviour we can simply pass ``allow_gaps=True``. 141 | 142 | .. 
code-block:: python 143 | 144 | table = tables.extract_simple_table( 145 | simple_table_with_gaps_elements, as_text=True, allow_gaps=True 146 | ) 147 | 148 | :: 149 | 150 | >>> table 151 | [['Heading 1', 'Heading 2', 'Heading 3', 'Heading 4'], ['A', '1', '', '1'], ['B', '', '', ''], ['C', '', 'C', '3']] 152 | 153 | Simple Table with gaps in first row/col 154 | ....................................... 155 | 156 | This table is similar to the above example, but now we have gaps in the first row and the first column (if either of these were true then the above wouldn't work). If we try the above code, a useful exception is raised: 157 | 158 | .. code-block:: python 159 | 160 | table = tables.extract_simple_table( 161 | simple_table_with_gaps_in_first_row_col_elements, as_text=True, allow_gaps=True 162 | ) 163 | 164 | :: 165 | 166 | py_pdf_parser.exceptions.TableExtractionError: Number of elements in table (9) does not match number of elements passed (12). Perhaps try extract_table instead of extract_simple_table, or change you reference element. 167 | 168 | The error message suggests either passing another reference element, or using the more complicated :meth:`~py_pdf_parser.tables.extract_table` method. In this case, as we still have a row and a column which have no missing cells, we can just pass a new reference element. 169 | 170 | As such, we can use the second column and the last row as our references, as neither of these have missing cells. The reference row and column are specified by simply passing the unique element in both the reference row and the reference column (called the reference element). In this case, it's the first number "3" in the table. Here we will be lazy and simply use the fact that this is the 10th element in the table, but you should probably do something smarter. 171 | 172 | .. 
code-block:: python 173 | 174 | reference_element = simple_table_with_gaps_in_first_row_col_elements[9] 175 | table = tables.extract_simple_table( 176 | simple_table_with_gaps_in_first_row_col_elements, 177 | as_text=True, 178 | allow_gaps=True, 179 | reference_element=reference_element, 180 | ) 181 | 182 | :: 183 | 184 | >>> table 185 | [['Heading 1', 'Heading 2', '', 'Heading 4'], ['', '1', 'A', ''], ['B', '2', '', '2'], ['C', '3', 'C', '3']] 186 | 187 | Non Simple Table 188 | ................ 189 | 190 | The next table does not have any row with no empty cells, and as such we must use :meth:`~py_pdf_parser.tables.extract_table`. There is no ``allow_gaps`` parameter for this method, since if you don't want to allow gaps you should be using :meth:`~py_pdf_parser.tables.extract_simple_table` instead. 191 | 192 | Whilst the below may seem easier than working out the reference element in the above example, please note that it will be computationally slower. 193 | 194 | .. code-block:: python 195 | 196 | table = tables.extract_table(non_simple_table_elements, as_text=True) 197 | 198 | :: 199 | 200 | >>> table 201 | [['', 'Heading 2', 'Heading 3', 'Heading 4'], ['A', '1', '', '1'], ['B', '', 'B', '2'], ['C', '3', 'C', '']] 202 | 203 | 204 | Non Simple Table with Merged Columns 205 | .................................... 206 | 207 | This table has text which goes across multiple columns. If we naively run this as above: 208 | 209 | .. code-block:: python 210 | 211 | table = tables.extract_table(non_simple_table_with_merged_cols_elements, as_text=True) 212 | 213 | then we get an exception: 214 | 215 | :: 216 | 217 | py_pdf_parser.exceptions.TableExtractionError: An element is in multiple columns. If this is expected, you can try passing fix_element_in_multiple_cols=True 218 | 219 | Just like ``allow_gaps``, this is so we can be more robust in the case that this is not expected. The error helpfully suggests to try passing ``fix_element_in_multiple_cols=True``. 
220 | 221 | .. code-block:: python 222 | 223 | table = tables.extract_table( 224 | non_simple_table_with_merged_cols_elements, 225 | as_text=True, 226 | fix_element_in_multiple_cols=True, 227 | ) 228 | 229 | :: 230 | 231 | >>> table 232 | [['Heading 1', 'Heading 2', 'Heading 3', 'Heading 4'], ['A', '1', 'A', '1'], ['This text spans across multiple columns', '', 'B', '2'], ['C', '3', 'C', '3']] 233 | 234 | Note that the merged cell has been pushed into the left-most column. Likewise, if we had a cell that was merged across multiple rows, we could pass ``fix_element_in_multiple_rows=True``, and it would be pushed into the top row. 235 | 236 | Non Simple Table with Merged Rows and Columns 237 | ............................................. 238 | 239 | In this case we have both merged rows and merged columns. We can pass both ``fix_element_in_multiple_rows=True`` and ``fix_element_in_multiple_cols=True``. The merged cell will be pushed into the left-most column and the top row. 240 | 241 | .. code-block:: python 242 | 243 | table = tables.extract_table( 244 | non_simple_table_with_merged_rows_and_cols_elements, 245 | as_text=True, 246 | fix_element_in_multiple_rows=True, 247 | fix_element_in_multiple_cols=True, 248 | ) 249 | 250 | :: 251 | 252 | >>> table 253 | [['Heading 1', 'Heading 2', 'Heading 3', 'Heading 4'], ['This text spans across multiple rows and \nmultiple columns.', '', 'A', '1'], ['', '', 'B', '2'], ['C', '3', 'C', '3']] 254 | 255 | 256 | Over the page 257 | ............. 258 | 259 | The final table goes over the page break. This is not a problem, simply pass the elements within the table and the result should be correct. 260 | 261 | If you had e.g. a footer that broke the table in two, simply ensure these elements are not included in the element list you pass to :meth:`~py_pdf_parser.tables.extract_table`, and again it should still work. 262 | 263 | .. 
code-block:: python 264 | 265 | table = tables.extract_simple_table(over_the_page_elements, as_text=True) 266 | 267 | :: 268 | 269 | >>> table 270 | [['Heading 1', 'Heading 2', 'Heading 3', 'Heading 4'], ['A', '1', 'A', '1'], ['B', '2', 'B', '2'], ['C', '3', 'C', '3']] 271 | -------------------------------------------------------------------------------- /docs/source/examples/order_summary.rst: -------------------------------------------------------------------------------- 1 | .. _order-summary: 2 | 3 | Order Summary 4 | ------------- 5 | 6 | In this example we will extract some tabular data from an order summary pdf. 7 | 8 | You can :download:`download the example here `. 9 | 10 | This is a fairly simple PDF, and as such it would be fairly easy to identify the tables and extract the data from them, however we will use this example to introduce font mappings and sections, which will come in useful for larger PDFs. 11 | 12 | Step 1 - Load the file 13 | ...................... 14 | 15 | We can :func:`load ` the file as follows, and take a quick look using the :func:`visualise tool ` to check it looks good. 16 | 17 | .. code-block:: python 18 | 19 | from py_pdf_parser.loaders import load_file 20 | from py_pdf_parser.visualise import visualise 21 | 22 | document = load_file("order_summary.pdf") 23 | visualise(document) 24 | 25 | This should show the following. We should check that py-pdf-parser has detected each element correctly, which in this case it has. 26 | 27 | .. image:: /screenshots/order_summary_example/initial.png 28 | :height: 300px 29 | 30 | Step 2 - Use a font mapping 31 | ........................... 32 | 33 | Each :class:`~py_pdf_parser.components.PDFElement` has a :attr:`~py_pdf_parser.components.PDFElement.font` property, which is the name of the font in the PDF document (including the font size). You can use fonts to help filter elements. 34 | 35 | Fonts often have long, not very useful names. 
However, additional keyword arguments passed to :func:`~py_pdf_parser.loaders.load_file` will be used to initialise the :class:`~py_pdf_parser.components.PDFDocument`. One of these is the font mapping, which allows you to map the fonts in your PDF to more useful names. 36 | 37 | The visualise tool allows you to inspect fonts. If you hover over an element, a summary will be shown in text at the bottom of the window. For example, in the image below we hover over the first cell in the table, and can see that the font is ``EAAAAA+FreeMono,12.0``. 38 | 39 | .. image:: /screenshots/order_summary_example/showing_font_1.png 40 | :height: 300px 41 | 42 | We can easily ask to see all of the available fonts by running 43 | 44 | :: 45 | 46 | >>> set(element.font for element in document.elements) 47 | {'EAAAAA+FreeMono,12.0', 'BAAAAA+LiberationSerif-Bold,16.0', 'CAAAAA+LiberationSerif,12.0', 'DAAAAA+FreeMonoBold,12.0', 'BAAAAA+LiberationSerif-Bold,12.0'} 48 | 49 | Using this and the visualise tool, we can now choose better names for each of the fonts, and then load the document again, but this time providing a font mapping. 50 | 51 | .. code-block:: python 52 | 53 | FONT_MAPPING = { 54 | "BAAAAA+LiberationSerif-Bold,16.0": "title", 55 | "BAAAAA+LiberationSerif-Bold,12.0": "sub_title", 56 | "CAAAAA+LiberationSerif,12.0": "text", 57 | "DAAAAA+FreeMonoBold,12.0": "table_header", 58 | "EAAAAA+FreeMono,12.0": "table_text", 59 | } 60 | document = load_file("order_summary.pdf", font_mapping=FONT_MAPPING) 61 | 62 | Using the visualise tool again, we can now see that our element's font has changed to ``table_text``, which is a much more useful name for us. 63 | 64 | .. image:: /screenshots/order_summary_example/showing_font_2.png 65 | :height: 300px 66 | 67 | Step 3 - Use regex for font mapping 68 | ................................... 69 | In certain use cases (especially when handling many PDF files) you may encounter the problem that the same fonts have different prefixes.
70 | 71 | For example: 72 | 73 | File 1: 74 | :: 75 | 76 | >>> set(element.font for element in document.elements) 77 | {'EAAAAA+FreeMono,12.0', 'BAAAAA+LiberationSerif-Bold,16.0', 'CAAAAA+LiberationSerif,12.0', 'DAAAAA+FreeMonoBold,12.0', 'BAAAAA+LiberationSerif-Bold,12.0'} 78 | 79 | File 2: 80 | :: 81 | 82 | >>> set(element.font for element in document.elements) 83 | {'CIPKDS+FreeMono,12.0', 'FDHZTR+LiberationSerif-Bold,16.0', 'KJVFSL+LiberationSerif,12.0', 'BXNKHF+FreeMonoBold,12.0', 'OKSDFT+LiberationSerif-Bold,12.0'} 84 | 85 | In this case mapping fonts with regex patterns makes more sense. Create your font mapping like before but fill it with regex patterns that don't specify the prefix precisely. Also specify that the font mapping contains regex patterns when loading the document. 86 | 87 | .. code-block:: python 88 | 89 | FONT_MAPPING = { 90 | r"\w{6}\+LiberationSerif-Bold,16.0": "title", 91 | r"\w{6}\+LiberationSerif-Bold,12.0": "sub_title", 92 | r"\w{6}\+LiberationSerif,12.0": "text", 93 | r"\w{6}\+FreeMonoBold,12.0": "table_header", 94 | r"\w{6}\+FreeMono,12.0": "table_text", 95 | } 96 | document = load_file("order_summary.pdf", font_mapping=FONT_MAPPING, font_mapping_is_regex=True) 97 | 98 | Step 4 - Add sections 99 | ..................... 100 | 101 | Another thing we can do to make our job easier is to add :class:`Sections` to our document. A :class:`Sections` class is made available on :attr:`document.sectioning`, which in particular allows us to call :meth:`~py_pdf_parser.sectioning.Sectioning.create_section`. 102 | 103 | A section has a name, and contains all elements between the start element and the end element. You can add multiple sections with the same name, but each section will have both a ``name`` and a ``unique_name`` (which is just the name with an additional ``_n`` on the end, where ``n`` is the number of sections with that name).
104 | 105 | As with the :class:`~py_pdf_parser.components.PDFDocument`, a :class:`~py_pdf_parser.sectioning.Section` has an :attr:`~py_pdf_parser.sectioning.Section.elements` property which returns an :class:`~py_pdf_parser.filtering.ElementList`, allowing you to filter the elements. 106 | 107 | .. important:: Never instantiate a :class:`Sections` yourself. You should always use :meth:`~py_pdf_parser.sectioning.Sectioning.create_section`. 108 | 109 | Calling :meth:`~py_pdf_parser.sectioning.Sectioning.create_section` will return the :class:`~py_pdf_parser.sectioning.Section`, but the :class:`~py_pdf_parser.sectioning.Sectioning` class also has :meth:`~py_pdf_parser.sectioning.Sectioning.get_section` and :meth:`~py_pdf_parser.sectioning.Sectioning.get_sections_with_name` methods. 110 | 111 | Going back to our example, we will create sections for the order summary table, and for the totals table. Our order summary table will start with the "Order Summary:" sub title and end at the "Totals:" sub title. Note that there are two elements on the page with text equal to "Order Summary:", however they have different font and so we can still extract exactly the one we want. 112 | 113 | 114 | .. image:: /screenshots/order_summary_example/zoomed.png 115 | :height: 300px 116 | 117 | By default, :meth:`~py_pdf_parser.sectioning.Sectioning.create_section` will include the last element in the section, but this can be disabled by passing ``include_last_element=False``. 118 | 119 | The totals section will run from the "Totals:" sub title, until the end of the document. An :class:`~py_pdf_parser.filtering.ElementList` (e.g. ``document.elements``) acts like a set of elements, but it does also define an order, and as such we can access the last element in the :class:`~py_pdf_parser.filtering.ElementList` by simply doing ``document.elements[-1]``. 120 | 121 | .. 
code-block:: python 122 | 123 | order_summary_sub_title_element = ( 124 | document.elements.filter_by_font("sub_title") 125 | .filter_by_text_equal("Order Summary:") 126 | .extract_single_element() 127 | ) 128 | 129 | totals_sub_title_element = ( 130 | document.elements.filter_by_font("sub_title") 131 | .filter_by_text_equal("Totals:") 132 | .extract_single_element() 133 | ) 134 | 135 | final_element = document.elements[-1] 136 | 137 | order_summary_section = document.sectioning.create_section( 138 | name="order_summary", 139 | start_element=order_summary_sub_title_element, 140 | end_element=totals_sub_title_element, 141 | include_last_element=False, 142 | ) 143 | 144 | Again, the visualise tool is helpful to check everything worked as expected, as it will draw a border around all of our sections: 145 | 146 | .. image:: /screenshots/order_summary_example/sections.png 147 | :height: 300px 148 | 149 | Step 5 - Extract tables 150 | ....................... 151 | 152 | Now we have mapped our fonts and added some sections, we'd like to extract the table. In this case, we are able to use :meth:`~py_pdf_parser.tables.extract_simple_table`. We need to pass this the elements which form our table, however currently our sections also include the sub titles, "Order Summary:" and "Totals:". We need to exclude these from the elements we pass to :meth:`~py_pdf_parser.tables.extract_simple_table`. We have a reference to the sub title elements, so we could simply use :meth:`~py_pdf_parser.filtering.ElementList.remove_element`. However, since the tables seem to have their own fonts, it may be more robust to use :meth:`~py_pdf_parser.filtering.ElementList.filter_by_fonts`. 153 | 154 | We will also pass ``as_text=True``, since we are interested in the text, not the :class:`PDFElements` themselves. 155 | 156 | .. 
code-block:: python 157 | 158 | order_summary_table = tables.extract_simple_table( 159 | order_summary_section.elements.filter_by_fonts("table_header", "table_text"), 160 | as_text=True, 161 | ) 162 | 163 | totals_table = tables.extract_simple_table( 164 | totals_section.elements.filter_by_fonts("table_header", "table_text"), as_text=True 165 | ) 166 | 167 | This gives: 168 | 169 | :: 170 | 171 | >>> order_summary_table 172 | [['Item', 'Unit Cost', 'Quantity', 'Cost'], ['Challenger 100g\nWhole Hops', '£3.29', '1', '£3.29'], ['Maris Otter \nPale Ale Malt \n(Crushed)', '£1.50/1000g', '4000g', '£6.00'], ['WLP037 \nYorkshire Ale \nYeast', '£7.08', '1', '£7.08'], ['Bottle Caps', '£1 per 100', '500', '£5']] 173 | 174 | >>> totals_table 175 | [['Subtotal:', '£26.28'], ['Shipping', '£6'], ['VAT 20%', '£6.45'], ['Total:', '£38.73']] 176 | 177 | As one final step, since the order summary table has a header row, we can make use of :meth:`~py_pdf_parser.tables.add_header_to_table`, which will change the list of lists to a list of dicts, mapping the header to the values in each row: 178 | 179 | .. code-block:: python 180 | 181 | order_summary_with_header = tables.add_header_to_table(order_summary_table) 182 | 183 | :: 184 | 185 | >>> order_summary_with_header 186 | [{'Item': 'Challenger 100g\nWhole Hops', 'Unit Cost': '£3.29', 'Quantity': '1', 'Cost': '£3.29'}, {'Item': 'Maris Otter \nPale Ale Malt \n(Crushed)', 'Unit Cost': '£1.50/1000g', 'Quantity': '4000g', 'Cost': '£6.00'}, {'Item': 'WLP037 \nYorkshire Ale \nYeast', 'Unit Cost': '£7.08', 'Quantity': '1', 'Cost': '£7.08'}, {'Item': 'Bottle Caps', 'Unit Cost': '£1 per 100', 'Quantity': '500', 'Cost': '£5'}] 187 | 188 | 189 | Full Code 190 | ......... 191 | 192 | .. 
code-block:: python 193 | 194 | from py_pdf_parser.loaders import load_file 195 | from py_pdf_parser import tables 196 | 197 | # from py_pdf_parser.visualise import visualise 198 | 199 | 200 | # Step 1 - Load the file 201 | document = load_file("order_summary.pdf") 202 | 203 | # visualise(document) 204 | 205 | # Step 2 - Use a font mapping 206 | 207 | # Show all fonts: 208 | # set(element.font for element in document.elements) 209 | 210 | FONT_MAPPING = { 211 | "BAAAAA+LiberationSerif-Bold,16.0": "title", 212 | "BAAAAA+LiberationSerif-Bold,12.0": "sub_title", 213 | "CAAAAA+LiberationSerif,12.0": "text", 214 | "DAAAAA+FreeMonoBold,12.0": "table_header", 215 | "EAAAAA+FreeMono,12.0": "table_text", 216 | } 217 | document = load_file("order_summary.pdf", font_mapping=FONT_MAPPING) 218 | 219 | # OR 220 | 221 | # use regex patterns 222 | 223 | FONT_MAPPING = { 224 | r"\w{6}\+LiberationSerif-Bold,16.0": "title", 225 | r"\w{6}\+LiberationSerif-Bold,12.0": "sub_title", 226 | r"\w{6}\+LiberationSerif,12.0": "text", 227 | r"\w{6}\+FreeMonoBold,12.0": "table_header", 228 | r"\w{6}\+FreeMono,12.0": "table_text", 229 | } 230 | document = load_file("order_summary.pdf", font_mapping=FONT_MAPPING, font_mapping_is_regex=True) 231 | 232 | # visualise(document) 233 | 234 | # Step 3 - Add sections 235 | order_summary_sub_title_element = ( 236 | document.elements.filter_by_font("sub_title") 237 | .filter_by_text_equal("Order Summary:") 238 | .extract_single_element() 239 | ) 240 | 241 | totals_sub_title_element = ( 242 | document.elements.filter_by_font("sub_title") 243 | .filter_by_text_equal("Totals:") 244 | .extract_single_element() 245 | ) 246 | 247 | final_element = document.elements[-1] 248 | 249 | order_summary_section = document.sectioning.create_section( 250 | name="order_summary", 251 | start_element=order_summary_sub_title_element, 252 | end_element=totals_sub_title_element, 253 | include_last_element=False, 254 | ) 255 | 256 | totals_section = 
document.sectioning.create_section( 257 | name="totals", start_element=totals_sub_title_element, end_element=final_element 258 | ) 259 | 260 | # visualise(document) 261 | 262 | # Step 4 - Extract tables 263 | 264 | order_summary_table = tables.extract_simple_table( 265 | order_summary_section.elements.filter_by_fonts("table_header", "table_text"), 266 | as_text=True, 267 | ) 268 | 269 | totals_table = tables.extract_simple_table( 270 | totals_section.elements.filter_by_fonts("table_header", "table_text"), as_text=True 271 | ) 272 | 273 | order_summary_with_header = tables.add_header_to_table(order_summary_table) 274 | -------------------------------------------------------------------------------- /docs/source/examples/simple_memo.rst: -------------------------------------------------------------------------------- 1 | .. _simple-memo: 2 | 3 | Simple Memo 4 | ----------- 5 | 6 | Our first example will be extracting information from a simple memo. 7 | 8 | You can :download:`download the example memo here `. 9 | 10 | We will assume that your company issues these memos always in a consistent format, i.e. with the "TO", "FROM", "DATE", and "SUBJECT" fields, the main content of the memo. We would like to write some code such that we can extract the information from each memo. 11 | 12 | Step 1 - Load the file 13 | ...................... 14 | 15 | First, we should load the file into a :class:`~py_pdf_parser.components.PDFDocument`, using :func:`~py_pdf_parser.loaders.load_file`: 16 | 17 | .. code-block:: python 18 | 19 | from py_pdf_parser.loaders import load_file 20 | 21 | document = load_file("simple_memo.pdf") 22 | 23 | To check the PDF loaded as expected, we can use the :func:`~py_pdf_parser.visualise.main.visualise` tool by running 24 | 25 | .. code-block:: python 26 | 27 | from py_pdf_parser.visualise import visualise 28 | 29 | visualise(document) 30 | 31 | This will open a matplotlib window which should look something like the following image: 32 | 33 | .. 
image:: /screenshots/simple_memo_example/visualise.png 34 | :height: 300px 35 | 36 | Py-pdf-parser has extracted each element from the PDF as a :class:`~py_pdf_parser.components.PDFElement`, and is showing a blue box around each element. This is what we are looking for. Always check the visualise tool, since sometimes you will need to adjust the layout parameters so that the tool correctly identifies your elements. We will get on to this in later examples. 37 | 38 | Step 2 - Extract reference elements 39 | ................................... 40 | 41 | Certain elements should be present in every memo. We will use these as reference elements to identify the elements which contain the information we are interested in. We already have our ``document``, which is a :class:`~py_pdf_parser.components.PDFDocument`. We can do :meth:`document.elements ` to get a list (an :class:`~py_pdf_parser.filtering.ElementList`) of all the :class:`~py_pdf_parser.components.PDFElement` in the document, and also to allow us to filter the elements. 42 | 43 | The simplest way to extract the elements we are interested in is by text. There are many other options available to us, and a full list can be found on the :ref:`filtering reference page`. 44 | 45 | We will extract the "TO:", "FROM:", "DATE:" and "SUBJECT:" elements as reference elements, i.e. the elements on the left of the below image. We will then search to the right of each of them in turn, to extract the values for each field. 46 | 47 | .. image:: /screenshots/simple_memo_example/top.png 48 | :height: 200px 49 | 50 | To extract the element which says "TO:", we can simply run :meth:`document.elements.filter_by_text_equal("TO:") `. This returns a new :class:`~py_pdf_parser.filtering.ElementList` which contains all the elements in the document with text equal to "TO:". In this case, there should only be one element in the list. 
We could just use ``[0]`` on the element list to access the element in question, however, there is a convenience function, :func:`~py_pdf_parser.filtering.ElementList.extract_single_element` on the :class:`~py_pdf_parser.filtering.ElementList` class to handle this case. This essentially checks if the list has a single element and returns the element for you, otherwise it raises an exception. Use of this is encouraged to make your code more robust and to make any errors more explicit. 51 | 52 | .. code-block:: python 53 | 54 | to_element = document.elements.filter_by_text_equal("TO:").extract_single_element() 55 | from_element = document.elements.filter_by_text_equal("FROM:").extract_single_element() 56 | date_element = document.elements.filter_by_text_equal("DATE:").extract_single_element() 57 | subject_element = document.elements.filter_by_text_equal( 58 | "SUBJECT:" 59 | ).extract_single_element() 60 | 61 | Each of the above elements will be a :class:`~py_pdf_parser.components.PDFElement`. 62 | 63 | Step 3 - Extract the data 64 | ......................... 65 | 66 | In the above section we have extracted our reference elements. We can now use these to do some more filtering to extract the data we want. In particular, we can use :func:`~py_pdf_parser.filtering.ElementList.to_the_right_of`, which will extract elements directly to the right of a given element. It effectively draws a dotted line from the top and bottom of your element out to the right hand side of the page, and any elements which are partially within the box created by the dotted line will be returned. To extract the text from a :class:`~py_pdf_parser.components.PDFElement`, we must also call :func:`.text() `. 67 | 68 | .. 
code-block:: python 69 | 70 | to_text = document.elements.to_the_right_of(to_element).extract_single_element().text() 71 | from_text = ( 72 | document.elements.to_the_right_of(from_element).extract_single_element().text() 73 | ) 74 | date_text = ( 75 | document.elements.to_the_right_of(date_element).extract_single_element().text() 76 | ) 77 | subject_text_element = document.elements.to_the_right_of( 78 | subject_element 79 | ).extract_single_element() 80 | subject_text = subject_text_element.text() 81 | 82 | Note we keep a reference to the subject text element. This is because we will use it later. 83 | 84 | We have now extracted the data from the top of the memo, for example ``to_text`` will be ``"All Developers"``. The code does not rely on who the memo is to, and so it should still work for a memo with different values. 85 | 86 | The last thing we need to do is extract the content of the memo. In our example there is only one paragraph, and so only one element, but if there were multiple paragraphs there could be multiple elements. There are a few ways to do this. It is probably the case that all the content elements are below the "SUBJECT:" element, however if the text started too far to the right this may not be the case. Instead, we can just use :func:`~py_pdf_parser.filtering.ElementList.after` to filter for elements strictly after the ``subject_text_element``: 87 | 88 | .. code-block:: python 89 | 90 | content_elements = document.elements.after(subject_element) 91 | content_text = "\n".join(element.text() for element in content_elements) 92 | 93 | That is now everything extracted from the memo. We can wrap our output into any data structure we fancy, for example json: 94 | 95 | .. code-block:: python 96 | 97 | output = { 98 | "to": to_text, 99 | "from": from_text, 100 | "date": date_text, 101 | "subject": subject_text, 102 | "content": content_text, 103 | } 104 | 105 | Full Code 106 | ......... 
107 | 108 | Here is the full script constructed above: 109 | 110 | .. code-block:: python 111 | 112 | from py_pdf_parser.loaders import load_file 113 | 114 | # Step 1 - Load the document 115 | document = load_file("simple_memo.pdf") 116 | 117 | # We could visualise it here to check it looks correct: 118 | # from py_pdf_parser.visualise import visualise 119 | # visualise(document) 120 | 121 | # Step 2 - Extract reference elements: 122 | to_element = document.elements.filter_by_text_equal("TO:").extract_single_element() 123 | from_element = document.elements.filter_by_text_equal("FROM:").extract_single_element() 124 | date_element = document.elements.filter_by_text_equal("DATE:").extract_single_element() 125 | subject_element = document.elements.filter_by_text_equal( 126 | "SUBJECT:" 127 | ).extract_single_element() 128 | 129 | # Step 3 - Extract the data 130 | to_text = document.elements.to_the_right_of(to_element).extract_single_element().text() 131 | from_text = ( 132 | document.elements.to_the_right_of(from_element).extract_single_element().text() 133 | ) 134 | date_text = ( 135 | document.elements.to_the_right_of(date_element).extract_single_element().text() 136 | ) 137 | subject_text_element = document.elements.to_the_right_of( 138 | subject_element 139 | ).extract_single_element() 140 | subject_text = subject_text_element.text() 141 | 142 | content_elements = document.elements.after(subject_element) 143 | content_text = "\n".join(element.text() for element in content_elements) 144 | 145 | output = { 146 | "to": to_text, 147 | "from": from_text, 148 | "date": date_text, 149 | "subject": subject_text, 150 | "content": content_text, 151 | } 152 | 153 | This gives: 154 | :: 155 | 156 | >>> from pprint import pprint 157 | >>> pprint(output) 158 | 159 | {'content': 'A new PDF Parsing tool\n' 160 | 'There is a new PDF parsing tool available, called py-pdf-parser - ' 161 | 'you should all check it out!\n' 162 | 'I think it could really help you extract that data we 
need from ' 163 | 'those PDFs.', 164 | 'date': '1st January 2020', 165 | 'from': 'John Smith', 166 | 'subject': 'A new PDF Parsing tool', 167 | 'to': 'All Developers'} 168 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to PDF Parser's documentation! 2 | ====================================== 3 | 4 | .. toctree:: 5 | :maxdepth: 2 6 | :caption: Contents: 7 | 8 | overview 9 | examples/index 10 | reference/index 11 | CHANGELOG.md 12 | -------------------------------------------------------------------------------- /docs/source/overview.rst: -------------------------------------------------------------------------------- 1 | Overview 2 | ======== 3 | 4 | Introduction 5 | ------------ 6 | 7 | This PDF Parser is a tool built on top of PDF Miner to help extracting information from PDFs in Python. The main idea was to create a tool that could be driven by code to interact with the elements on the PDF and slowly classify them by creating sections and adding tags to them. It also comes with a helpful visualisation tool which enables you to examine the current status of your elements. 8 | 9 | This page gives a brief overview of the PDF Parser, but there is also a full :doc:`reference/index` of all the functionality. You may get a more in-depth overview by looking at the :doc:`examples/index`. 10 | 11 | Setup 12 | ----- 13 | 14 | You will need to have Python 3.6 or greater installed, and if you're installing the development requirements to use the visualise tool you will also need tkinter installed on your system. For information on how to do this, see https://tkdocs.com/tutorial/install.html. 15 | 16 | We recommend you install the development requirements with ``pip3 install py-pdf-parser[dev]``, which enables the visualise tool. 
If you don't need the visualise tool (for example in a production app once you've written your parsing scripts) you can simply run ``pip3 install py-pdf-parser``. 17 | 18 | When Should I Use Py PDF Parser? 19 | -------------------------------- 20 | 21 | Py PDF Parser is best suited to locating and extracting specific data in a structured way from a PDF. You can locate contents however you want (by text, location, font, etc), and since it is code-driven you have the flexibility to implement custom logic without having to deal with the PDF itself. Py PDF Parser helps to abstract away things like page breaks (unless you want to use them), which helps to write robust code which will extract data from multiple PDFs of the same type, even if there are differences between each individual document. 22 | 23 | Py PDF Parser is good at extracting tables in PDFs, and allows you to write code to programmatically locate the tables to extract. Page breaks (and even headers or footers) half way through your table can be ignored easily. If you're trying to extract all tables from a PDF, other tools (e.g. https://camelot-py.readthedocs.io/en/master/) are available and may be more appropriate. 24 | 25 | If you're simply trying to extract all of the text from a PDF, other tools (e.g. https://textract.readthedocs.io/en/stable/python_package.html) may be more appropriate. Whilst you can still do this with Py PDF Parser, it is not designed to be a tool where you simply plug in a PDF and it spits it out in text format. Py PDF Parser is not a plug-and-play solution, but rather a tool to help you write code that extracts certain pieces of data from a structured PDF. 26 | 27 | Loading A PDF 28 | ------------- 29 | 30 | To load a PDF, use the :func:`~py_pdf_parser.loaders.load_file` function from the :doc:`reference/loaders`. You will need to use :func:`~py_pdf_parser.loaders.load_file` with a file path to be able to use the visualisation tool with your PDF as the background.
If you don't have this, you can instead use the :func:`~py_pdf_parser.loaders.load` function, but when you use the visualisation tool there will be no background. 31 | 32 | We order the elements in a pdf, left-to-right, top-to-bottom. At the moment, this is not configurable. Each :class:`~py_pdf_parser.components.PDFElement` within the :class:`~py_pdf_parser.components.PDFDocument` is aware of its position, both on the page and within the document, and also has properties allowing you to access its font and text. For more information about :class:`~py_pdf_parser.components.PDFDocument` and :class:`~py_pdf_parser.components.PDFElement`, see :doc:`reference/components`. 33 | 34 | Pay particular attention to the ``la_params`` argument. These will need to be fine-tuned for your PDF. We suggest immediately visualising your PDF using the visualisation tool to see how the elements have been grouped. If multiple elements have been counted as one, or vice versa, you should be able to fix this by tweaking the ``la_params``. 35 | 36 | Filtering 37 | --------- 38 | 39 | Once you have loaded your PDF, say into a variable :class:`document`, you can start interacting with the elements. You can access all the elements by calling :class:`document.elements`. You may now want to filter your elements, for example you could do :meth:`document.elements.filter_by_text_equal("foo")` to filter for all elements which say "foo". To view all available filters, have a look at the :doc:`reference/filtering` reference. 40 | 41 | The :class:`document.elements` object, and any filtered subset thereof, will be an :class:`~py_pdf_parser.filtering.ElementList`. These act like sets of elements, and so you can union (:meth:`|`), intersect (:meth:`&`), difference (:meth:`-`) and symmetric difference (:meth:`^`) different filtered sets of elements.
42 | 43 | You can also chain filters, which will do the same as intersecting multiple filters, for example ``document.elements.filter_by_text_equal("foo").filter_by_tag("bar")`` is the same as ``document.elements.filter_by_text_equal("foo") & document.elements.filter_by_tag("bar")``. 44 | 45 | If you believe you have filtered down to a single element, and would like to examine that element, you can call :meth:`~py_pdf_parser.filtering.ElementList.extract_single_element`. This will return said element, or raise an exception if there is not a single element in your list. 46 | 47 | You can see an example of filtering in the :ref:`simple-memo` example. 48 | 49 | Classifying Elements 50 | -------------------- 51 | 52 | There are three ways to classify elements: 53 | 54 | - add tags 55 | - create sections 56 | - mark certain elements as ignored 57 | 58 | To add a tag, you can simply call :meth:`~py_pdf_parser.components.PDFElement.add_tag` on an :class:`~py_pdf_parser.components.PDFElement`, or :meth:`~py_pdf_parser.filtering.ElementList.add_tag_to_elements` on an :class:`~py_pdf_parser.filtering.ElementList`. You can filter by tags. 59 | 60 | To create a section, you can call :meth:`~py_pdf_parser.sectioning.Sectioning.create_section`. See :doc:`reference/sectioning` for more information and the :ref:`order-summary` example for an example. When you create a section you simply specify a name for the section, and the start and end element for the section. Any elements between the start and end element will be included in your section. You can add multiple sections with the same name, and internally they will be given unique names. You can filter by either the non-unique ``section_name``, or by the unique sections. Elements can be in multiple sections. 61 | 62 | To mark an element as ignored, simply set the ``ignore`` property to ``True``. 
Ignored elements will not be included in any :class:`~py_pdf_parser.filtering.ElementList`, however existing lists which you have assigned to variables will not be re-calculated and so may still include the ignored elements. 63 | 64 | To process a whole pdf, we suggest that you mark any elements you're not interested in as ignored, group any elements which are together into sections, and then add tags to important elements. You can then loop through filtered sets of elements to extract the information you would like. 65 | 66 | Visualisation Tool 67 | ------------------ 68 | 69 | The PDF Parser comes with a visualisation tool. See the :doc:`reference/visualise` documentation. When you visualise your :class:`~py_pdf_parser.components.PDFDocument`, you'll be able to see each page of the document in turn, with every :class:`~py_pdf_parser.components.PDFElement` highlighted. You can hover over the elements to see their sections, tags and whether they are ignored or not. This is very helpful for debugging any problems. 70 | 71 | You can use the arrow key icons to change page, and can press home to return to page 1. You can also use the scroll wheel on your mouse to zoom in and out. 72 | 73 | You can see an example of the visualisation in the :ref:`simple-memo` and :ref:`order-summary` examples. 74 | 75 | Font Mappings 76 | ------------- 77 | 78 | You can filter elements by font. The font will be taken from the PDF itself, however often they have long and confusing names. You can specify a ``font_mapping`` when you load the document to map these to more memorable names. This ``font_mapping`` can either be a regex pattern or an exact string mapping. See the :doc:`reference/components` reference for the :class:`~py_pdf_parser.components.PDFDocument` arguments for more information. 79 | 80 | You can see an example of font mapping in the :ref:`order-summary` example. 81 | 82 | Tables 83 | ------ 84 | 85 | We have many functions to help extract tables. 
All of these use the positioning of the elements on the page to do this. See the :doc:`reference/tables` reference, and the :ref:`order-summary` and :ref:`more-tables` examples. 86 | -------------------------------------------------------------------------------- /docs/source/reference/common.rst: -------------------------------------------------------------------------------- 1 | Common 2 | ------ 3 | 4 | .. automodule:: py_pdf_parser.common 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/source/reference/components.rst: -------------------------------------------------------------------------------- 1 | Components 2 | ---------- 3 | 4 | .. automodule:: py_pdf_parser.components 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/source/reference/filtering.rst: -------------------------------------------------------------------------------- 1 | .. _filtering-reference: 2 | 3 | Filtering 4 | --------- 5 | 6 | .. autoclass:: py_pdf_parser.filtering.ElementList 7 | :members: 8 | :special-members: 9 | -------------------------------------------------------------------------------- /docs/source/reference/index.rst: -------------------------------------------------------------------------------- 1 | Reference 2 | ========= 3 | 4 | .. toctree:: 5 | 6 | common 7 | components 8 | filtering 9 | loaders 10 | sectioning 11 | tables 12 | visualise 13 | -------------------------------------------------------------------------------- /docs/source/reference/loaders.rst: -------------------------------------------------------------------------------- 1 | Loaders 2 | ------- 3 | 4 | .. automodule:: py_pdf_parser.loaders 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/source/reference/sectioning.rst: -------------------------------------------------------------------------------- 1 | Sectioning 2 | ---------- 3 | 4 | .. 
automodule:: py_pdf_parser.sectioning 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/source/reference/tables.rst: -------------------------------------------------------------------------------- 1 | Tables 2 | ------ 3 | 4 | .. automodule:: py_pdf_parser.tables 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/source/reference/visualise.rst: -------------------------------------------------------------------------------- 1 | Visualise 2 | --------- 3 | 4 | .. autofunction:: py_pdf_parser.visualise.main.visualise 5 | -------------------------------------------------------------------------------- /docs/source/screenshots/order_summary_example/initial.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jstockwin/py-pdf-parser/6aa400371e948307bcaa5073f33507c9b6e84f06/docs/source/screenshots/order_summary_example/initial.png -------------------------------------------------------------------------------- /docs/source/screenshots/order_summary_example/sections.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jstockwin/py-pdf-parser/6aa400371e948307bcaa5073f33507c9b6e84f06/docs/source/screenshots/order_summary_example/sections.png -------------------------------------------------------------------------------- /docs/source/screenshots/order_summary_example/showing_font_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jstockwin/py-pdf-parser/6aa400371e948307bcaa5073f33507c9b6e84f06/docs/source/screenshots/order_summary_example/showing_font_1.png -------------------------------------------------------------------------------- /docs/source/screenshots/order_summary_example/showing_font_2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/jstockwin/py-pdf-parser/6aa400371e948307bcaa5073f33507c9b6e84f06/docs/source/screenshots/order_summary_example/showing_font_2.png -------------------------------------------------------------------------------- /docs/source/screenshots/order_summary_example/zoomed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jstockwin/py-pdf-parser/6aa400371e948307bcaa5073f33507c9b6e84f06/docs/source/screenshots/order_summary_example/zoomed.png -------------------------------------------------------------------------------- /docs/source/screenshots/simple_memo_example/top.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jstockwin/py-pdf-parser/6aa400371e948307bcaa5073f33507c9b6e84f06/docs/source/screenshots/simple_memo_example/top.png -------------------------------------------------------------------------------- /docs/source/screenshots/simple_memo_example/visualise.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jstockwin/py-pdf-parser/6aa400371e948307bcaa5073f33507c9b6e84f06/docs/source/screenshots/simple_memo_example/visualise.png -------------------------------------------------------------------------------- /imagemagick_policy.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | ]> 11 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | ignore_missing_imports = True 3 | 
-------------------------------------------------------------------------------- /py_pdf_parser/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jstockwin/py-pdf-parser/6aa400371e948307bcaa5073f33507c9b6e84f06/py_pdf_parser/__init__.py -------------------------------------------------------------------------------- /py_pdf_parser/common.py: -------------------------------------------------------------------------------- 1 | from py_pdf_parser.exceptions import InvalidCoordinatesError 2 | 3 | 4 | class BoundingBox: 5 | """ 6 | A rectangle, stored using the coordinates (x0, y0) of the bottom left corner, and 7 | the coordinates (x1, y1) of the top right corner. 8 | 9 | Args: 10 | x0 (int): The x coordinate of the bottom left corner. 11 | x1 (int): The x coordinate of the top right corner. 12 | y0 (int): The y coordinate of the bottom left corner. 13 | y1 (int): The y coordinate of the top right corner. 14 | 15 | Raises: 16 | InvalidCoordinatesError: if x1 is smaller than x0 or y1 is smaller than y0. 17 | 18 | Attributes: 19 | x0 (int): The x coordinate of the bottom left corner. 20 | x1 (int): The x coordinate of the top right corner. 21 | y0 (int): The y coordinate of the bottom left corner. 22 | y1 (int): The y coordinate of the top right corner. 23 | width (int): The width of the box, equal to x1 - x0. 24 | height (int): The height of the box, equal to y1 - y0. 
25 | """ 26 | 27 | def __init__(self, x0: float, x1: float, y0: float, y1: float): 28 | if x1 < x0: 29 | raise InvalidCoordinatesError( 30 | f"Invalid coordinates, x1 is smaller than x0 ({x1}<{x0})" 31 | ) 32 | if y1 < y0: 33 | raise InvalidCoordinatesError( 34 | f"Invalid coordinates, y1 is smaller than y0 ({y1}<{y0})" 35 | ) 36 | self.x0 = x0 37 | self.x1 = x1 38 | self.y0 = y0 39 | self.y1 = y1 40 | self.width = x1 - x0 41 | self.height = y1 - y0 42 | 43 | def __eq__(self, other: object) -> bool: 44 | if not isinstance(other, BoundingBox): 45 | raise NotImplementedError(f"Can't compare BoundingBox with {type(other)}") 46 | 47 | return all( 48 | [ 49 | self.x0 == other.x0, 50 | self.x1 == other.x1, 51 | self.y0 == other.y0, 52 | self.y1 == other.y1, 53 | ] 54 | ) 55 | 56 | def __repr__(self) -> str: 57 | return f"" 58 | -------------------------------------------------------------------------------- /py_pdf_parser/exceptions.py: -------------------------------------------------------------------------------- 1 | class PDFParserError(Exception): 2 | pass 3 | 4 | 5 | # Components 6 | class PageNotFoundError(PDFParserError): 7 | pass 8 | 9 | 10 | class NoElementsOnPageError(PDFParserError): 11 | pass 12 | 13 | 14 | # Filtering 15 | class NoElementFoundError(PDFParserError): 16 | pass 17 | 18 | 19 | class MultipleElementsFoundError(PDFParserError): 20 | pass 21 | 22 | 23 | class ElementOutOfRangeError(PDFParserError): 24 | pass 25 | 26 | 27 | # Sectioning 28 | class InvalidSectionError(PDFParserError): 29 | pass 30 | 31 | 32 | class SectionNotFoundError(PDFParserError): 33 | pass 34 | 35 | 36 | # Tables 37 | class TableExtractionError(PDFParserError): 38 | pass 39 | 40 | 41 | class InvalidTableError(PDFParserError): 42 | pass 43 | 44 | 45 | class InvalidTableHeaderError(PDFParserError): 46 | pass 47 | 48 | 49 | class InvalidCoordinatesError(PDFParserError): 50 | pass 51 | -------------------------------------------------------------------------------- 
/py_pdf_parser/loaders.py: -------------------------------------------------------------------------------- 1 | from typing import IO, Any, Dict, List, NamedTuple, Optional 2 | 3 | import logging 4 | 5 | from pdfminer.high_level import extract_pages 6 | from pdfminer.layout import LAParams, LTFigure, LTTextBox 7 | 8 | from .components import PDFDocument 9 | 10 | logger = logging.getLogger("PDFParser") 11 | DEFAULT_LA_PARAMS: Dict = {"boxes_flow": None} 12 | 13 | 14 | class Page(NamedTuple): 15 | """ 16 | This is used to pass PDF Miner elements of a page when instantiating PDFDocument. 17 | 18 | Args: 19 | width (int): The width of the page. 20 | height (int): The height of the page. 21 | elements (list): A list of PDF Miner elements (LTTextBox) on the page. 22 | """ 23 | 24 | width: int 25 | height: int 26 | elements: List[LTTextBox] 27 | 28 | 29 | def load_file( 30 | path_to_file: str, la_params: Optional[Dict] = None, **kwargs: Any 31 | ) -> PDFDocument: 32 | """ 33 | Loads a file according to the specified file path. 34 | 35 | All other arguments are passed to `load`, see the documentation for `load`. 36 | 37 | Returns: 38 | PDFDocument: A PDFDocument with the specified file loaded. 39 | """ 40 | with open(path_to_file, "rb") as in_file: 41 | return load(in_file, pdf_file_path=path_to_file, la_params=la_params, **kwargs) 42 | 43 | 44 | def load( 45 | pdf_file: IO, 46 | pdf_file_path: Optional[str] = None, 47 | password: Optional[str] = None, 48 | la_params: Optional[Dict] = None, 49 | **kwargs: Any, 50 | ) -> PDFDocument: 51 | """ 52 | Loads the pdf file into a PDFDocument. 53 | 54 | Args: 55 | pdf_file (io): The PDF file. 56 | pdf_file_path (str, optional): Passed to `PDFDocument`. See the documentation 57 | for `PDFDocument`. 58 | password (str, optional): Password for the encrypted PDF. Required if the 59 | PDF is encrypted. 60 | la_params (dict): The layout parameters passed to PDF Miner for analysis. 
See 61 | the PDFMiner documentation here: 62 | https://pdfminersix.readthedocs.io/en/latest/reference/composable.html#laparams. 63 | Note that py_pdf_parser will re-order the elements it receives from PDFMiner 64 | so options relating to element ordering will have no effect. 65 | kwargs: Passed to `PDFDocument`. See the documentation for `PDFDocument`. 66 | 67 | Returns: 68 | PDFDocument: A PDFDocument with the file loaded. 69 | """ 70 | if la_params is None: 71 | la_params = {} 72 | la_params = {**DEFAULT_LA_PARAMS, **la_params} 73 | 74 | pages: Dict[int, Page] = {} 75 | for page in extract_pages( 76 | pdf_file, laparams=LAParams(**la_params), password=password 77 | ): 78 | elements = [element for element in page if isinstance(element, LTTextBox)] 79 | 80 | # If all_texts=True then we may get some text from inside figures 81 | if la_params.get("all_texts"): 82 | figures = (element for element in page if isinstance(element, LTFigure)) 83 | for figure in figures: 84 | elements += [ 85 | element for element in figure if isinstance(element, LTTextBox) 86 | ] 87 | 88 | if not elements: 89 | logger.warning( 90 | f"No elements detected on page {page.pageid}, skipping this page." 91 | ) 92 | continue 93 | 94 | pages[page.pageid] = Page( 95 | width=page.width, height=page.height, elements=elements 96 | ) 97 | 98 | # Disable pytype check due to false positive. 
See the following issue for details: 99 | # https://github.com/google/pytype/issues/1028 100 | # pytype: disable=wrong-arg-types 101 | return PDFDocument(pages=pages, pdf_file_path=pdf_file_path, **kwargs) 102 | # pytype: enable=wrong-arg-types 103 | -------------------------------------------------------------------------------- /py_pdf_parser/sectioning.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING, Dict, Generator, ValuesView 2 | 3 | from collections import defaultdict 4 | 5 | from .exceptions import InvalidSectionError, SectionNotFoundError 6 | from .filtering import ElementList 7 | 8 | if TYPE_CHECKING: 9 | from .components import PDFDocument, PDFElement 10 | 11 | 12 | class Section: 13 | """ 14 | A continuous group of elements within a document. 15 | 16 | A section is intended to label a group of elements. Said elements must be continuous 17 | in the document. 18 | 19 | Warning: 20 | You should not instantiate a Section class yourself, but should call 21 | `create_section` from the `Sectioning` class below. 22 | 23 | Args: 24 | document (PDFDocument): A reference to the document. 25 | name (str): The name of the section. 26 | unique_name (str): Multiple sections can have the same name, but a unique name 27 | will be generated by the Sectioning class. 28 | start_element (PDFElement): The first element in the section. 29 | end_element (PDFElement): The last element in the section. 
30 | """ 31 | 32 | document: "PDFDocument" 33 | name: str 34 | unique_name: str 35 | start_element: "PDFElement" 36 | end_element: "PDFElement" 37 | 38 | def __init__( 39 | self, 40 | document: "PDFDocument", 41 | name: str, 42 | unique_name: str, 43 | start_element: "PDFElement", 44 | end_element: "PDFElement", 45 | ): 46 | if start_element._index > end_element._index: 47 | raise InvalidSectionError("end_element must come after start_element") 48 | self.document = document 49 | self.name = name 50 | self.unique_name = unique_name 51 | self.start_element = start_element 52 | self.end_element = end_element 53 | 54 | def __contains__(self, element: "PDFElement") -> bool: 55 | return element in self.elements 56 | 57 | @property 58 | def elements(self) -> "ElementList": 59 | """ 60 | All the elements in the section. 61 | 62 | Returns: 63 | ElementList: All the elements in the section. 64 | """ 65 | return self.document.elements.between( 66 | self.start_element, self.end_element, inclusive=True 67 | ) 68 | 69 | def __eq__(self, other: object) -> bool: 70 | """ 71 | Returns True if the two sections have the same unique name and are from the 72 | same document 73 | """ 74 | if not isinstance(other, Section): 75 | raise NotImplementedError(f"Can't compare Section with {type(other)}") 76 | return all( 77 | [ 78 | self.document == other.document, 79 | self.unique_name == other.unique_name, 80 | self.start_element == other.start_element, 81 | self.end_element == other.end_element, 82 | self.__class__ == other.__class__, 83 | ] 84 | ) 85 | 86 | def __len__(self) -> int: 87 | """ 88 | Returns the number of elements in the section. 89 | """ 90 | return len(self.elements) 91 | 92 | def __repr__(self) -> str: 93 | return ( 94 | f"
" 96 | ) 97 | 98 | 99 | class Sectioning: 100 | """ 101 | A sectioning utilities class, made available on all PDFDocuments as ``.sectioning``. 102 | """ 103 | 104 | document: "PDFDocument" 105 | name_counts: Dict[str, int] 106 | sections_dict: Dict[str, Section] 107 | 108 | def __init__(self, document: "PDFDocument"): 109 | self.sections_dict = {} 110 | self.name_counts = defaultdict(int) 111 | self.document = document 112 | 113 | def create_section( 114 | self, 115 | name: str, 116 | start_element: "PDFElement", 117 | end_element: "PDFElement", 118 | include_last_element: bool = True, 119 | ) -> "Section": 120 | """ 121 | Creates a new section with the specified name. 122 | 123 | Creates a new section with the specified name, starting at `start_element` and 124 | ending at `end_element` (inclusive). The unique name will be set to 125 | ``{name}_{idx}``, where ``idx`` counts the existing sections with that name. 126 | 127 | Args: 128 | name (str): The name of the new section. 129 | start_element (PDFElement): The first element in the section. 130 | end_element (PDFElement): The last element in the section. 131 | include_last_element (bool): Whether the end_element should be included in 132 | the section, or only the elements which are strictly before the end 133 | element. Default: True (i.e. include end_element). 134 | 135 | Returns: 136 | Section: The created section. 137 | 138 | Raises: 139 | InvalidSectionError: If the created section would be invalid. This is 140 | usually because the end_element comes before the start element.
141 | """ 142 | current_count = self.name_counts[name] 143 | unique_name = f"{name}_{current_count}" 144 | self.name_counts[name] += 1 145 | 146 | if not include_last_element: 147 | if end_element._index == 0: 148 | raise InvalidSectionError( 149 | "Section would contain no elements as end_element is the first " 150 | "element in the document and include_last_element is False" 151 | ) 152 | # We simply drop the index by one to get the element before 153 | end_element = self.document._element_list[end_element._index - 1] 154 | section = Section(self.document, name, unique_name, start_element, end_element) 155 | self.sections_dict[unique_name] = section 156 | return section 157 | 158 | def get_sections_with_name(self, name: str) -> Generator[Section, None, None]: 159 | """ 160 | Returns a generator of all sections with the given name. 161 | """ 162 | return ( 163 | self.sections_dict[f"{name}_{idx}"] 164 | for idx in range(0, self.name_counts[name]) 165 | ) 166 | 167 | def get_section(self, unique_name: str) -> Section: 168 | """ 169 | Returns the section with the given unique name. 170 | 171 | Raises: 172 | SectionNotFoundError: If there is no section with the given unique_name. 173 | """ 174 | try: 175 | return self.sections_dict[unique_name] 176 | except KeyError as err: 177 | raise SectionNotFoundError( 178 | f"Could not find section with name {unique_name}" 179 | ) from err 180 | 181 | @property 182 | def sections(self) -> ValuesView[Section]: 183 | """ 184 | Returns a view of all created Sections.
185 | """ 186 | return self.sections_dict.values() 187 | -------------------------------------------------------------------------------- /py_pdf_parser/visualise/__init__.py: -------------------------------------------------------------------------------- 1 | from .main import visualise 2 | 3 | __all__ = [ 4 | "visualise", 5 | ] 6 | -------------------------------------------------------------------------------- /py_pdf_parser/visualise/background.py: -------------------------------------------------------------------------------- 1 | import io 2 | 3 | import numpy 4 | import wand.color 5 | import wand.image 6 | from PIL import Image 7 | 8 | 9 | def get_pdf_background(pdf_file_path: str, page_number: int) -> Image.Image: 10 | """ 11 | Create a screenshot of this PDF page using Ghostscript, to use as the 12 | background for the matplotlib chart. 13 | """ 14 | # Appending e.g. [0] to the filename means it only loads the first page 15 | path_with_page = f"{pdf_file_path}[{page_number - 1}]" 16 | pdf_pages = wand.image.Image(filename=path_with_page, resolution=150) 17 | page = pdf_pages.sequence[0] 18 | 19 | with wand.image.Image(page) as image: 20 | # We need to composite this with a white image as a background, 21 | # because disabling the alpha channel doesn't work. 
22 | bg_params = { 23 | "width": image.width, 24 | "height": image.height, 25 | "background": wand.color.Color("white"), 26 | } 27 | with wand.image.Image(**bg_params) as background: 28 | background.composite(image, 0, 0) 29 | img_buffer = numpy.asarray( 30 | bytearray(background.make_blob(format="png")), dtype="uint8" 31 | ) 32 | img_stream = io.BytesIO(img_buffer.tobytes()) 33 | 34 | return Image.open(img_stream).transpose(Image.FLIP_TOP_BOTTOM).convert("RGBA") 35 | -------------------------------------------------------------------------------- /py_pdf_parser/visualise/info_figure.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING, Dict, List, Optional 2 | 3 | from matplotlib.backend_bases import MouseButton 4 | 5 | if TYPE_CHECKING: 6 | from py_pdf_parser.components import PDFElement 7 | 8 | 9 | def get_clicked_element_info(clicked_elements: Dict[MouseButton, "PDFElement"]) -> str: 10 | left_element = clicked_elements.get(MouseButton.LEFT) 11 | right_element = clicked_elements.get(MouseButton.RIGHT) 12 | 13 | output = [] 14 | 15 | output.append("Left clicked element:") 16 | output.append("---------------------") 17 | output += _get_element_info(left_element) 18 | output.append("") 19 | 20 | output.append("Right clicked element:") 21 | output.append("---------------------") 22 | output += _get_element_info(right_element) 23 | output.append("") 24 | 25 | output.append("Element comparison:") 26 | output.append("-------------------") 27 | output += _get_element_comparison_info(left_element, right_element) 28 | return "\n".join(output) 29 | 30 | 31 | def _get_element_info(element: Optional["PDFElement"]) -> List[str]: 32 | if not element: 33 | return ["Click an element to see details"] 34 | return [ 35 | f"Text: {element.text(stripped=False)}", 36 | f"Font: {element.font}", 37 | f"Tags: {element.tags}", 38 | f"Bounding box: {element.bounding_box}", 39 | f"Width: {element.bounding_box.width}", 40 
| f"Height: {element.bounding_box.height}", 41 | ] 42 | 43 | 44 | def _get_element_comparison_info( 45 | element1: Optional["PDFElement"], element2: Optional["PDFElement"] 46 | ) -> List[str]: 47 | if element1 is None or element2 is None: 48 | return ["Left click one element and right click another to see comparison"] 49 | 50 | bbox1 = element1.bounding_box 51 | bbox2 = element2.bounding_box 52 | 53 | # Height 54 | height_diff = abs(bbox1.height - bbox2.height) 55 | relative_height_diff = height_diff / bbox1.height 56 | 57 | # Line margin (i.e. vertical gap) 58 | line_margin = max(bbox1.y0 - bbox2.y1, bbox2.y0 - bbox1.y1) 59 | relative_line_margin = line_margin / bbox1.height 60 | 61 | # Alignment 62 | alignments = { 63 | "left": abs(bbox1.x0 - bbox2.x0), 64 | "right": abs(bbox1.x1 - bbox2.x1), 65 | "center": abs((bbox1.x0 + bbox1.x1) / 2 - (bbox2.x0 + bbox2.x1) / 2), 66 | } 67 | sorted_alignments = sorted(alignments.items(), key=lambda x: x[1]) 68 | alignment_name, alignment_value = sorted_alignments[0] 69 | relative_alignment_value = alignment_value / bbox1.height 70 | 71 | return [ 72 | "Note 'relative' is relative to the left clicked element", 73 | f"Height diff: {height_diff}", 74 | f"Relative height diff {relative_height_diff}", 75 | f"Line margin: {line_margin}", 76 | f"Relative line margin: {relative_line_margin}", 77 | f"Closest alignment: {alignment_value} ({alignment_name})", 78 | f"Relative alignment: {relative_alignment_value}", 79 | ] 80 | -------------------------------------------------------------------------------- /py_pdf_parser/visualise/main.py: -------------------------------------------------------------------------------- 1 | from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple 2 | 3 | import logging 4 | import tkinter as tk 5 | 6 | import matplotlib 7 | from matplotlib.backend_bases import MouseButton 8 | from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg, NavigationToolbar2Tk 9 | from matplotlib.figure import 
Figure 10 | 11 | from py_pdf_parser.components import PDFDocument 12 | 13 | from .background import get_pdf_background 14 | from .info_figure import get_clicked_element_info 15 | from .sections import SectionVisualiser 16 | 17 | if TYPE_CHECKING: 18 | from matplotlib.axes import Axes 19 | from matplotlib.backend_bases import MouseEvent 20 | from matplotlib.figure import Text 21 | 22 | from py_pdf_parser.components import PDFElement 23 | from py_pdf_parser.filtering import ElementList 24 | 25 | logger = logging.getLogger("PDFParser") 26 | 27 | 28 | STYLES = { 29 | "untagged": {"color": "#00a9f4", "linewidth": 1, "alpha": 0.5}, 30 | "tagged": {"color": "#007ac1", "linewidth": 1, "alpha": 0.5}, 31 | "ignored": {"color": "#67daff", "linewidth": 1, "alpha": 0.2, "linestyle": ":"}, 32 | } 33 | 34 | DPI = 100 35 | 36 | 37 | class CustomToolbar(NavigationToolbar2Tk): 38 | def __init__( 39 | self, 40 | canvas: tk.Canvas, 41 | window: tk.Tk, 42 | first_page_callback: Callable, 43 | previous_page_callback: Callable, 44 | next_page_callback: Callable, 45 | last_page_callback: Callable, 46 | *args: Any, 47 | **kwargs: Any, 48 | ): 49 | self.first_page_callback = first_page_callback 50 | self.previous_page_callback = previous_page_callback 51 | self.next_page_callback = next_page_callback 52 | self.last_page_callback = last_page_callback 53 | self.toolitems += ( 54 | (None, None, None, None), # Divider 55 | ("First page", "Go to first page", "back", "first_page_callback"), 56 | ("Previous page", "Go to previous page", "back", "previous_page_callback"), 57 | ("Next page", "Go to next page", "forward", "next_page_callback"), 58 | ("Last page", "Go to last page", "forward", "last_page_callback"), 59 | ) 60 | super().__init__(canvas, window, *args, **kwargs) 61 | 62 | def reset(self, not_first_page: bool, not_last_page: bool) -> None: 63 | map = {True: tk.ACTIVE, False: tk.DISABLED} 64 | self._buttons["First page"]["state"] = map[not_first_page] 65 | self._buttons["Previous
page"]["state"] = map[not_first_page] 66 | self._buttons["Next page"]["state"] = map[not_last_page] 67 | self._buttons["Last page"]["state"] = map[not_last_page] 68 | 69 | 70 | class PDFVisualiser: 71 | """ 72 | Class used to handle visualising the PDF. Do not instantiate this yourself, instead 73 | you should call the `visualise` function. 74 | 75 | We need a class as we have to keep track of the current page etc. 76 | """ 77 | 78 | document: PDFDocument 79 | current_page: int 80 | __ax: "Axes" 81 | __fig: "Figure" 82 | __info_fig: Optional["Figure"] = None 83 | __info_text: Optional["Text"] = None 84 | __section_visualiser: "SectionVisualiser" 85 | 86 | __clicked_elements: Dict[MouseButton, "PDFElement"] = {} 87 | 88 | def __init__( 89 | self, 90 | root: tk.Tk, 91 | document: PDFDocument, 92 | current_page: int = 1, 93 | elements: Optional["ElementList"] = None, 94 | show_info: bool = False, 95 | width: Optional[int] = None, 96 | height: Optional[int] = None, 97 | ): 98 | if not document._pdf_file_path: 99 | logger.warning( 100 | "PDFDocument was not initialised with pdf_file_path and so we cannot " 101 | "add the PDF background for visualisation.
Please use load_file " 102 | "instead of load, or specify pdf_file_path manually" 103 | ) 104 | 105 | self.document = document 106 | self.current_page = current_page 107 | if elements is not None: 108 | self.elements = elements 109 | else: 110 | self.elements = document.elements 111 | self.show_info = show_info 112 | 113 | self.root = root 114 | if width is None: 115 | width = self.root.winfo_screenwidth() 116 | if height is None: 117 | height = self.root.winfo_screenheight() 118 | self.root.geometry(f"{width}x{height}") 119 | title = "py-pdf-parser" 120 | if self.document._pdf_file_path: 121 | title += f" - {self.document._pdf_file_path}" 122 | self.root.title(title) 123 | 124 | self.__fig = Figure(figsize=(5, 4), dpi=DPI) 125 | self.canvas = FigureCanvasTkAgg(self.__fig, master=self.root) 126 | self.canvas.get_tk_widget().pack(side=tk.TOP, fill=tk.BOTH, expand=1) 127 | self.toolbar = CustomToolbar( 128 | self.canvas, 129 | self.root, 130 | next_page_callback=self.__next_page, 131 | first_page_callback=self.__first_page, 132 | previous_page_callback=self.__previous_page, 133 | last_page_callback=self.__last_page, 134 | ) 135 | 136 | self.__ax = self.canvas.figure.add_subplot(111) 137 | 138 | self.__section_visualiser = SectionVisualiser(self.document, self.__ax) 139 | 140 | if self.show_info: 141 | self.__info_fig, self.__info_text = self.__initialise_info_fig() 142 | 143 | self.__plot_current_page() 144 | 145 | def __plot_current_page(self) -> None: 146 | if self.show_info: 147 | self.__clear_clicked_elements() 148 | 149 | self.__ax.cla() 150 | 151 | # draw PDF image as background 152 | page = self.document.get_page(self.current_page) 153 | if self.document._pdf_file_path is not None: 154 | background = get_pdf_background( 155 | self.document._pdf_file_path, self.current_page 156 | ) 157 | self.__ax.imshow( 158 | background, 159 | origin="lower", 160 | extent=[0, page.width, 0, page.height], 161 | interpolation="kaiser", 162 | ) 163 | else: 164 | 
self.__ax.set_aspect("equal") 165 | self.__ax.set_xlim([0, page.width]) 166 | self.__ax.set_ylim([0, page.height]) 167 | 168 | page = self.document.get_page(self.current_page) 169 | for element in page.elements & self.elements: 170 | style = STYLES["tagged"] if element.tags else STYLES["untagged"] 171 | self.__plot_element(element, style) 172 | 173 | # We'd like to draw greyed out rectangles around the ignored elements, but these 174 | # are excluded from ElementLists, so we need to do this manually. 175 | page_indexes = set( 176 | range(page.start_element._index, page.end_element._index + 1) 177 | ) 178 | ignored_indexes_on_page = page_indexes & self.document._ignored_indexes 179 | for index in ignored_indexes_on_page: 180 | element = self.document._element_list[index] 181 | self.__plot_element(element, STYLES["ignored"]) 182 | 183 | self.__section_visualiser.plot_sections_for_page(page) 184 | 185 | self.__ax.format_coord = self.__get_annotations 186 | self.__reset_toolbar() 187 | 188 | def __initialise_info_fig(self) -> Tuple["Figure", "Axes"]: 189 | window = tk.Toplevel(self.root) 190 | 191 | info_fig = Figure() 192 | canvas = FigureCanvasTkAgg(info_fig, window) 193 | canvas.get_tk_widget().pack(side=tk.TOP, fill=tk.BOTH, expand=True) 194 | 195 | self.canvas.mpl_connect("button_press_event", self.__on_click) 196 | 197 | info_text = info_fig.text( 198 | 0.01, 199 | 0.5, 200 | "", 201 | horizontalalignment="left", 202 | verticalalignment="center", 203 | ) 204 | return info_fig, info_text 205 | 206 | def __on_click(self, event: "MouseEvent") -> None: 207 | if event.button == MouseButton.MIDDLE: 208 | self.__clear_clicked_elements() 209 | return 210 | if event.button not in [MouseButton.LEFT, MouseButton.RIGHT]: 211 | return 212 | for rect in self.__ax.patches: 213 | if not rect.contains(event)[0]: 214 | continue 215 | # rect is the rectangle we clicked on! 
216 | self.__clicked_elements[event.button] = rect.element 217 | self.__update_text() 218 | 219 | return 220 | 221 | def __clear_clicked_elements(self) -> None: 222 | self.__clicked_elements = {} 223 | self.__update_text() 224 | 225 | def __update_text(self) -> None: 226 | if self.__info_text is None or self.__info_fig is None: 227 | return 228 | self.__info_text.set_text(get_clicked_element_info(self.__clicked_elements)) 229 | self.__info_fig.canvas.draw() 230 | 231 | def __plot_element(self, element: "PDFElement", style: Dict) -> None: 232 | rect = _ElementRectangle(element, **style) 233 | self.__ax.add_patch(rect) 234 | 235 | def __reset_toolbar(self) -> None: 236 | not_first_page = self.current_page != 1 237 | not_last_page = self.current_page != self.document.number_of_pages 238 | self.toolbar.reset(not_first_page, not_last_page) 239 | 240 | def __get_annotations(self, x: float, y: float) -> str: 241 | annotation = f"({x:.2f}, {y:.2f})" 242 | for element in self.elements.filter_by_page(self.current_page): 243 | bbox = element.bounding_box 244 | if bbox.x0 <= x <= bbox.x1 and bbox.y0 <= y <= bbox.y1: 245 | annotation += f" {element}" 246 | sections_dict = self.document.sectioning.sections_dict 247 | section_names = [ 248 | section_name 249 | for section_name, section in sections_dict.items() 250 | if element in section 251 | ] 252 | if section_names: 253 | sections_str = "', '".join(section_names) 254 | annotation += f", SECTIONS: '{sections_str}'" 255 | 256 | return annotation 257 | 258 | def __first_page(self) -> None: 259 | self.__set_page(min(self.document.page_numbers)) 260 | 261 | def __last_page(self) -> None: 262 | self.__set_page(max(self.document.page_numbers)) 263 | 264 | def __next_page(self) -> None: 265 | current_page_idx = self.document.page_numbers.index(self.current_page) 266 | next_page_idx = min(current_page_idx + 1, self.document.number_of_pages) 267 | next_page = self.document.page_numbers[next_page_idx] 268 | self.__set_page(next_page) 269 
def __previous_page(self) -> None:
    """
    Moves the visualisation back to the page before the current one, selecting the
    first page again if we are already on it.
    """
    pages = self.document.page_numbers
    # Step one position back in the list of page numbers, but never below index 0.
    target_idx = pages.index(self.current_page) - 1
    if target_idx < 0:
        target_idx = 0
    self.__set_page(pages[target_idx])
class SectionVisualiser:
    """
    Used internally to draw outlines of sections on the visualise plot.

    We first try to draw a simple rectangle around the section with a fixed margin, for
    increasingly small margins. If this doesn't work (because an element that is not
    in the section would be within the section outline rectangle) then we instead
    construct the boundary as follows:

    We create a Voronoi diagram around all of the elements on the page, and the page
    boundaries (actually we get a diagram around each side of the bounding box of each
    element). Then for each line in the diagram we check if it was generated between one
    box which is in the section and one which isn't, and if so we draw it.

    This produces some slightly interesting outlines, and so we also run a
    simplification check. This takes three points on the outline, and if the triangle
    created by joining them together doesn't contain any of our elements, we can remove
    the middle point to make the whole shape a bit simpler.

    It can still produce some slightly interesting shapes, but does work fairly well.
    Importantly, every element in the section will be within the outline, and no boxes
    which are not in the section will be (which cannot always be achieved by simply
    drawing a rectangle around all the points in the section).

    It does add some time when changing page on the visualise tool, but the whole
    process is done in <0.5 seconds which is acceptable for a development tool.
    """

    # Elements of the page currently being plotted, including ignored ones.
    all_elements: List["PDFElement"]
    document: "PDFDocument"
    page: "PDFPage"
    # Voronoi diagram for the current page and the segments it was built from. Both
    # are built lazily, the first time a section needs the non-simple outline.
    pv: Optional["pyvoronoi.Pyvoronoi"]
    pv_segments: Optional[List]

    __ax: "Axes"
    __sections_by_page_number: Dict[int, List["Section"]]

    def __init__(self, document: "PDFDocument", ax: "Axes"):
        """
        Args:
            document (PDFDocument): The document whose sections will be outlined.
            ax (Axes): The matplotlib axes the outlines are drawn on.
        """
        self.document = document
        self.__ax = ax

        # Give every section its own colour, cycling through the colour map if there
        # are more sections than colours.
        colour_map = cm.get_cmap("Dark2").colors
        self.__colour_mapping = {
            section.unique_name: colour_map[idx % len(colour_map)]
            for idx, section in enumerate(self.document.sectioning.sections)
        }

        self.__sections_by_page_number = {}

    def __get_sections_for_page(self, page: "PDFPage") -> List["Section"]:
        """
        Returns the sections which have at least one element on the given page. The
        result is cached per page number.
        """
        if page.page_number not in self.__sections_by_page_number:
            self.__sections_by_page_number[page.page_number] = [
                section
                for section in self.document.sectioning.sections
                if section.elements & page.elements
            ]
        return self.__sections_by_page_number[page.page_number]

    def __get_segment_for_element(self, element: "PDFElement") -> List:
        """
        Returns the four sides of the element's bounding box, each as a
        ((x, y), (x, y)) pair of end points.
        """
        bbox = element.bounding_box
        return [
            ((bbox.x0, bbox.y0), (bbox.x0, bbox.y1)),
            ((bbox.x0, bbox.y1), (bbox.x1, bbox.y1)),
            ((bbox.x1, bbox.y1), (bbox.x1, bbox.y0)),
            ((bbox.x1, bbox.y0), (bbox.x0, bbox.y0)),
        ]

    def __get_segments_for_elements(self, elements: List["PDFElement"]) -> List:
        """
        Returns a flat list of the bounding box sides of all the given elements.
        """
        return [
            (start, end)
            for element in elements
            for start, end in self.__get_segment_for_element(element)
        ]

    def __get_element_boxes(
        self, elements: Union[List["PDFElement"], "ElementList"]
    ) -> List:
        """
        Returns a shapely box for the bounding box of each of the given elements.
        """
        return [
            geometry.box(
                element.bounding_box.x0,
                element.bounding_box.y0,
                element.bounding_box.x1,
                element.bounding_box.y1,
            )
            for element in elements
        ]

    def __simplify_outlines(
        self, line: "geometry.LineString"
    ) -> Tuple[List[float], List[float]]:
        """
        Simplifies the outline by considering sets of 3 consecutive vertices, and if
        there are no elements in this triangle, removes the middle vertex from the
        shape. This is done iteratively around the shape until no further changes are
        made.

        Returns:
            The x coordinates and the y coordinates of the simplified outline, with
            the first point repeated at the end to close the shape.
        """
        xs, ys = (list(coords) for coords in line.xy)

        # The last point is the same as the first point, which makes things a bit more
        # complicated. We simply remove the last point and add it back at the end.
        xs.pop(-1)
        ys.pop(-1)
        boxes = self.__get_element_boxes(self.all_elements)
        idx = 0
        since_last_changed = 0
        # Never simplify below three vertices: with fewer points the "triangle" is
        # degenerate (zero area), so every remaining vertex would be removed and the
        # modulo below would divide by zero.
        while len(xs) > 3 and since_last_changed <= len(xs) + 1:
            idx1 = (idx + 1) % len(xs)
            idx2 = (idx + 2) % len(xs)

            x0 = xs[idx]
            x1 = xs[idx1]
            x2 = xs[idx2]

            y0 = ys[idx]
            y1 = ys[idx1]
            y2 = ys[idx2]

            triangle_points = ((x0, y0), (x1, y1), (x2, y2), (x0, y0))
            triangle = geometry.Polygon(triangle_points)
            if triangle.area < 0.1 or not any(
                triangle.intersects(box) for box in boxes
            ):
                xs.pop(idx1)
                ys.pop(idx1)
                since_last_changed = 0
            else:
                since_last_changed += 1

            idx = (idx + 1) % len(xs)

        # Add the last point back
        xs.append(xs[0])
        ys.append(ys[0])
        return xs, ys

    def __plot_edges(
        self, to_plot: List, edges: List, vertices: List, label: str
    ) -> None:
        """
        Draws the given Voronoi edges as the outline of the section called `label`.

        Args:
            to_plot (list): Indexes into `edges` of the edges to draw.
            edges (list): All edges of the Voronoi diagram.
            vertices (list): All vertices of the Voronoi diagram.
            label (str): The unique name of the section, used for the legend entry
                and to look up the section's colour.
        """
        lines = []
        for edge_idx in to_plot:
            edge = edges[edge_idx]
            start_vertex = vertices[edge.start]
            end_vertex = vertices[edge.end]
            # Note it could be that the edge is supposed to be parabola (edge.is_linear
            # will be false), but in our case we always have boxes with 90 degree
            # corners. If it's a parabola then the focus is one of these corners, and by
            # drawing a line instead of a parabola we at worst cut through this point,
            # which is fine.
            lines.append(
                geometry.LineString(
                    [[start_vertex.X, start_vertex.Y], [end_vertex.X, end_vertex.Y]]
                )
            )
        if not lines:
            # Nothing separates the section from the rest of the page, so there is no
            # outline to merge or draw.
            return

        merged_line = ops.linemerge(geometry.MultiLineString(lines))
        kwargs = {"label": label, "alpha": 0.5, "color": self.__colour_mapping[label]}
        # Merged line is either a MultiLineString which means we need to draw multiple
        # lines, or it is a LineString which means we only need to draw one.
        if isinstance(merged_line, geometry.MultiLineString):
            # Iterate over `.geoms`: iterating the multi-part geometry itself is
            # deprecated in Shapely 1.8 and no longer supported in Shapely 2.0.
            for line in merged_line.geoms:
                xs, ys = self.__simplify_outlines(line)
                self.__ax.plot(xs, ys, **kwargs)
                kwargs.pop(
                    "label", None
                )  # Only pass label once for single legend entry
        else:
            xs, ys = self.__simplify_outlines(merged_line)
            self.__ax.plot(xs, ys, **kwargs)

    def __plot_section(self, section: "Section") -> None:
        """
        Draws the outline of the section on the current page using the Voronoi
        diagram of the page. Used when a simple rectangle cannot be drawn.
        """
        if self.pv is None or self.pv_segments is None:
            self.pv, self.pv_segments = self.__get_voronoi()
        edges = self.pv.GetEdges()
        vertices = self.pv.GetVertices()
        cells = self.pv.GetCells()

        # If an ignored element is within the section, we need to draw lines around it.
        # The following code gets the first and last non-ignored elements in the section
        # on the page, and then gets all elements between (inclusive) these elements,
        # even if they are ignored.
        section_elements_on_page = section.elements & self.page.elements
        section_elements = [
            section.document._element_list[index]
            for index in range(
                section_elements_on_page[0]._index,
                section_elements_on_page[-1]._index + 1,
            )
        ]
        section_segments = self.__get_segments_for_elements(section_elements)
        in_section = [point in section_segments for point in self.pv_segments]

        to_plot = []
        for idx, edge in enumerate(edges):
            first_segment = cells[edge.cell].site
            second_segment = cells[edges[edge.twin].cell].site
            # We should plot if the first segment is in the section and the second isn't
            if in_section[first_segment] and not in_section[second_segment]:
                to_plot.append(idx)

        self.__plot_edges(to_plot, edges, vertices, label=section.unique_name)

    def __get_voronoi(self) -> Tuple["pyvoronoi.Pyvoronoi", List]:
        """
        Builds the Voronoi diagram for the sides of every element bounding box on the
        current page, plus the four sides of the page itself.

        Returns:
            The constructed diagram, and the list of segments it was built from (in
            the order they were added).
        """
        all_segments = self.__get_segments_for_elements(self.all_elements)
        # Add the page boundary as segments:
        all_segments += [
            [(0, 0), (0, self.page.height)],
            [(0, 0), (self.page.width, 0)],
            [(0, self.page.height), (self.page.width, self.page.height)],
            [(self.page.width, 0), (self.page.width, self.page.height)],
        ]

        pv = pyvoronoi.Pyvoronoi(10)
        for segment in all_segments:
            pv.AddSegment(segment)

        pv.Construct()
        return pv, all_segments

    def __get_boundary_for_elements(
        self, elements: "ElementList", margin: int
    ) -> Tuple[float, float, float, float]:
        """
        Returns the rectangle enclosing all of the given elements, grown by `margin`
        on every side. Note the order of the result is (x0, x1, y0, y1).
        """
        x0s = [element.bounding_box.x0 for element in elements]
        x1s = [element.bounding_box.x1 for element in elements]
        y0s = [element.bounding_box.y0 for element in elements]
        y1s = [element.bounding_box.y1 for element in elements]

        x0 = min(x0s) - margin
        x1 = max(x1s) + margin
        y0 = min(y0s) - margin
        y1 = max(y1s) + margin

        return x0, x1, y0, y1

    def __plot_section_simple(self, section: "Section") -> bool:
        """
        Tries to outline the section on the current page with a plain rectangle,
        trying each of the SIMPLE_BOUNDARY_MARGINS in turn.

        Returns:
            bool: True if a rectangle was drawn, False if every margin gave a
            rectangle which intersects an element that is not in the section.
        """
        section_elements_on_page = section.elements & self.page.elements
        non_section_elements = self.page.elements - section_elements_on_page
        boxes = self.__get_element_boxes(non_section_elements)

        for margin in SIMPLE_BOUNDARY_MARGINS:
            x0, x1, y0, y1 = self.__get_boundary_for_elements(
                section_elements_on_page, margin=margin
            )

            boundary = geometry.box(x0, y0, x1, y1)

            if not any(box.intersects(boundary) for box in boxes):
                # No elements outside of the section are within this boundary, and as
                # such we can simply draw this boundary as the section outline. Break.
                break
        else:
            # None of the margins gave us a box which did not contain any non-section
            # elements. We cannot use the simple method.
            return False

        label = section.unique_name

        kwargs = {"label": label, "alpha": 0.5, "color": self.__colour_mapping[label]}
        self.__ax.plot([x0, x1, x1, x0, x0], [y0, y0, y1, y1, y0], **kwargs)

        return True

    def plot_sections_for_page(self, page: "PDFPage") -> None:
        """
        Draws an outline around each section which has elements on the given page,
        and adds a legend to the axes.

        Args:
            page (PDFPage): The page being shown on the axes.
        """
        # Reset the per-page state; the Voronoi diagram is rebuilt only if needed.
        self.pv = None
        self.pv_segments = None
        self.page = page

        sections = self.__get_sections_for_page(page)

        if not sections:
            # No sections on page, nothing to plot
            return

        # We want to include ignored elements for this bit.
        page_indexes = set(
            range(page.start_element._index, page.end_element._index + 1)
        )
        ignored_indexes_on_page = page_indexes & self.document._ignored_indexes
        self.all_elements = list(page.elements) + [
            self.document._element_list[index] for index in ignored_indexes_on_page
        ]

        for section in sections:
            plotted = self.__plot_section_simple(section)
            if not plotted:
                self.__plot_section(section)

        # Show the legend
        self.__ax.legend()
When 'auto' is used, this will be equivalent to the 18 | # number of CPUs on the host system. 19 | jobs = 4 20 | 21 | # All pytype output goes here. 22 | output = .pytype 23 | 24 | # Paths to source code directories, separated by ':'. 25 | pythonpath = 26 | . 27 | 28 | # Python version (major.minor) of the target code. 29 | python_version = 3.8 30 | 31 | # Use the enum overlay for more precise enum checking. This flag is temporary 32 | # and will be removed once this behavior is enabled by default. 33 | use_enum_overlay = Use the enum overlay for more precise enum checking. 34 | 35 | # Build dict literals from dict(k=v, ...) calls. This flag is temporary and will 36 | # be removed once this behavior is enabled by default. 37 | build_dict_literals_from_kwargs = Build dict literals from dict(k=v, ...) calls. 38 | 39 | # Enable stricter namedtuple checks, such as unpacking and 'typing.Tuple' 40 | # compatibility. This flag is temporary and will be removed once this behavior 41 | # is enabled by default. 42 | strict_namedtuple_checks = Enable stricter namedtuple checks, such as unpacking and 'typing.Tuple' compatibility. 43 | 44 | # Enable exhaustive checking of function parameter types. This flag is temporary 45 | # and will be removed once this behavior is enabled by default. 46 | strict_parameter_checks = Enable exhaustive checking of function parameter types. 47 | 48 | # Enable support for TypedDicts. This flag is temporary and will be removed once 49 | # this behavior is enabled by default. 50 | enable_typed_dicts = Enable support for TypedDicts. 51 | 52 | # Solve unknown types to label with structural types. This flag is temporary and 53 | # will be removed once this behavior is enabled by default. 54 | protocols = Solve unknown types to label with structural types. 55 | 56 | # Only load submodules that are explicitly imported. This flag is temporary and 57 | # will be removed once this behavior is enabled by default. 
import os
import sys

from setuptools import find_packages, setup

if sys.version_info < (3, 6):
    # Write the diagnostics to stderr. Passing sys.stderr as the first positional
    # argument would instead print the stream object itself to stdout.
    # str.format is used on purpose: an f-string would be a syntax error on the
    # very interpreters this check is meant to reject.
    print("{}: need Python 3.6 or later.".format(sys.argv[0]), file=sys.stderr)
    print("Your Python is {}".format(sys.version), file=sys.stderr)
    sys.exit(1)


ROOT_DIR = os.path.dirname(__file__)

# Read the long description up front so the file handle is closed promptly, and
# decode it explicitly rather than relying on the platform default encoding.
with open(os.path.join(ROOT_DIR, "README.md"), encoding="utf-8") as readme_file:
    LONG_DESCRIPTION = readme_file.read()


setup(
    name="py-pdf-parser",
    packages=find_packages(exclude=["tests", "tests.*", "docs", "docs.*"]),
    version="0.13.0",
    url="https://github.com/jstockwin/py-pdf-parser",
    license="BSD",
    description="A tool to help extracting information from structured PDFs.",
    long_description=LONG_DESCRIPTION,
    long_description_content_type="text/markdown",
    author="Jake Stockwin",
    author_email="jstockwin@gmail.com",
    include_package_data=True,
    install_requires=[
        "pdfminer.six==20220524",
        "docopt==0.6.2",
        "wand==0.6.10",
    ],
    extras_require={
        "dev": [
            "matplotlib==3.5.1",
            "pillow==9.2.0",
            "pyvoronoi==1.0.7",
            "shapely==1.8.2",
        ],
        "test": [
            "ddt==1.6.0",
            "matplotlib==3.5.1",
            "mock==4.0.3",
            "nose==1.3.7",
            "pillow==9.2.0",
            "recommonmark==0.7.1",
            "sphinx-autobuild==2021.3.14",
            "sphinx-rtd-theme==1.0.0",
            "Sphinx==5.2.3",
        ],
    },
)
class BaseVisualiseTestCase(BaseTestCase):
    """
    See the answer from ivan_pozdeev at
    https://stackoverflow.com/questions/4083796/how-do-i-run-unittest-on-a-tkinter-app
    for the setUp, tearDown and pump_events methods. This basically allows us to
    run tk.mainloop() manually using pump_events, thus allowing us to use visualise
    without blocking the thread.

    There is also a custom check_images function to do comparison of the screenshots
    from visualise. You can set self.WRITE_NEW_TEST_IMAGES to True to write new images
    if they don't exist. This also allows you to delete images which are old, and then
    run the tests with WRITE_NEW_TEST_IMAGES=True to replace them.
    """

    WRITE_NEW_TEST_IMAGES = False

    def setUp(self):
        self.root = tk.Tk()
        self.pump_events()

    def tearDown(self):
        if self.root:
            self.root.destroy()
            self.pump_events()

    def pump_events(self):
        """
        Processes all pending tk events without blocking, which stands in for
        running tk.mainloop().
        """
        while self.root.dooneevent(_tkinter.ALL_EVENTS | _tkinter.DONT_WAIT):
            pass

    def check_images(self, visualiser, image_name):
        """
        Saves the visualiser's current figure and fails the test if it differs from
        the stored reference image `tests/data/images/<image_name>.png`.

        If the reference image does not exist it is written when
        WRITE_NEW_TEST_IMAGES is True, otherwise the test fails. When the images
        differ, the freshly rendered `<image_name>-new.png` is left on disk; it is
        only removed when the comparison succeeds.
        """
        self.pump_events()
        root_path = os.path.join(os.path.dirname(__file__), "data", "images")
        existing_file_path = os.path.join(root_path, f"{image_name}.png")
        new_file_path = os.path.join(root_path, f"{image_name}-new.png")

        # Check if file exists (write if not)
        if not os.path.isfile(existing_file_path):
            if not self.WRITE_NEW_TEST_IMAGES:
                self.fail(
                    f"Could not find existing image for {image_name=}. Set "
                    "WRITE_NEW_TEST_IMAGES=True to write a new one."
                )

            visualiser._PDFVisualiser__fig.savefig(existing_file_path)

        # Check images are identical (fail if not)
        visualiser._PDFVisualiser__fig.savefig(new_file_path)

        # Close both images before failing or deleting anything, so that no file
        # handles are leaked and the new file can be removed on every platform.
        with Image.open(existing_file_path) as existing_image:
            with Image.open(new_file_path) as new_image:
                images_match = new_image.tobytes() == existing_image.tobytes()

        if not images_match:
            self.fail(f"Images differ for {image_name=}.")

        os.remove(new_file_path)
-------------------------------------------------------------------------------- /tests/data/pdfs/image.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jstockwin/py-pdf-parser/6aa400371e948307bcaa5073f33507c9b6e84f06/tests/data/pdfs/image.pdf -------------------------------------------------------------------------------- /tests/data/pdfs/test.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jstockwin/py-pdf-parser/6aa400371e948307bcaa5073f33507c9b6e84f06/tests/data/pdfs/test.pdf -------------------------------------------------------------------------------- /tests/data/pdfs/test_protected.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jstockwin/py-pdf-parser/6aa400371e948307bcaa5073f33507c9b6e84f06/tests/data/pdfs/test_protected.pdf -------------------------------------------------------------------------------- /tests/test_common.py: -------------------------------------------------------------------------------- 1 | from py_pdf_parser.common import BoundingBox 2 | from py_pdf_parser.exceptions import InvalidCoordinatesError 3 | 4 | from .base import BaseTestCase 5 | 6 | 7 | class TestBoundingBox(BaseTestCase): 8 | def test_create_bounding_box(self): 9 | bbox = BoundingBox(0, 1, 0, 1) 10 | self.assertEqual(bbox.width, 1) 11 | self.assertEqual(bbox.height, 1) 12 | 13 | # Checks that it raises an exception if coordinates are not valid 14 | with self.assertRaises(InvalidCoordinatesError): 15 | BoundingBox(1, 0, 0, 1) 16 | 17 | with self.assertRaises(InvalidCoordinatesError): 18 | BoundingBox(0, 1, 1, 0) 19 | 20 | def test_eq(self): 21 | bbox_1 = BoundingBox(0, 1, 0, 1) 22 | bbox_2 = BoundingBox(0, 1, 0, 1) 23 | self.assertEqual(bbox_1, bbox_2) 24 | 25 | bbox_3 = BoundingBox(0, 1, 0, 3) 26 | self.assertNotEqual(bbox_1, bbox_3) 27 | 28 | def 
test_repr(self): 29 | bbox = BoundingBox(0, 1, 0, 1) 30 | self.assertEqual(repr(bbox), "") 31 | -------------------------------------------------------------------------------- /tests/test_components.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from ddt import data, ddt 4 | 5 | from py_pdf_parser.common import BoundingBox 6 | from py_pdf_parser.components import ElementOrdering, PDFDocument 7 | from py_pdf_parser.exceptions import NoElementsOnPageError, PageNotFoundError 8 | from py_pdf_parser.filtering import ElementList 9 | from py_pdf_parser.loaders import Page 10 | 11 | from .base import BaseTestCase 12 | from .utils import FakePDFMinerTextElement, create_pdf_document, create_pdf_element 13 | 14 | 15 | @ddt 16 | class TestPDFElement(BaseTestCase): 17 | element_bbox = BoundingBox(2, 5, 2, 5) 18 | 19 | def test_page_number(self): 20 | element = create_pdf_element() 21 | self.assertEqual(element.page_number, 1) 22 | 23 | with self.assertRaises(AttributeError): 24 | element.page_number = 2 25 | 26 | def test_font_name(self): 27 | element = create_pdf_element(font_name="test_font") 28 | self.assertEqual(element.font_name, "test_font") 29 | 30 | def test_font_size(self): 31 | element = create_pdf_element(font_size=2) 32 | self.assertEqual(element.font_size, 2) 33 | 34 | def test_font_size_precision(self): 35 | element = create_pdf_element(font_size=1.234) 36 | self.assertEqual(element.font_size, 1.2) 37 | 38 | element = create_pdf_element(font_size=1.234, font_size_precision=0) 39 | self.assertEqual(element.font_size, 1) 40 | 41 | element = create_pdf_element(font_size=1.234, font_size_precision=3) 42 | self.assertEqual(element.font_size, 1.234) 43 | 44 | def test_font(self): 45 | element = create_pdf_element(font_name="test_font", font_size=2) 46 | self.assertEqual(element.font, "test_font,2") 47 | 48 | element = create_pdf_element( 49 | font_name="test_font", 50 | font_size=3, 51 | 
font_mapping={"test_font,3": "test_named_font"}, 52 | ) 53 | self.assertEqual(element.font, "test_named_font") 54 | 55 | element = create_pdf_element( 56 | font_name="test_font", 57 | font_size=2, 58 | font_mapping={"test_font,3": "test_named_font"}, 59 | ) 60 | self.assertEqual(element.font, "test_font,2") 61 | 62 | # Test when font_mapping argument is passed to PDFDocument 63 | font_mapping = {} 64 | element = create_pdf_element( 65 | font_name="fake_font_1", font_size=10, font_mapping=font_mapping 66 | ) 67 | self.assertEqual(element.font, "fake_font_1,10") 68 | 69 | font_mapping = {"fake_font_1,10": "large_text"} 70 | element = create_pdf_element( 71 | font_name="fake_font_1", font_size=10, font_mapping=font_mapping 72 | ) 73 | self.assertEqual(element.font, "large_text") 74 | 75 | font_mapping = {r"^fake_font_\d,10$": "large_text"} 76 | element = create_pdf_element( 77 | font_name="fake_font_1", 78 | font_size=10, 79 | font_mapping=font_mapping, 80 | font_mapping_is_regex=True, 81 | ) 82 | self.assertEqual(element.font, "large_text") 83 | 84 | font_mapping = {r"^fake_font_\d,10$": "large_text"} 85 | element = create_pdf_element( 86 | font_name="FAKE_FONT_1", 87 | font_size=10, 88 | font_mapping=font_mapping, 89 | font_mapping_is_regex=True, 90 | ) 91 | self.assertEqual(element.font, "FAKE_FONT_1,10") 92 | 93 | font_mapping = {r"^fake_font_\d,10$": "large_text"} 94 | element = create_pdf_element( 95 | font_name="FAKE_FONT_1", 96 | font_size=10, 97 | font_mapping=font_mapping, 98 | font_mapping_is_regex=True, 99 | regex_flags=re.IGNORECASE, 100 | ) 101 | self.assertEqual(element.font, "large_text") 102 | 103 | def test_text(self): 104 | element = create_pdf_element(text=" test ") 105 | self.assertEqual(element.text(), "test") 106 | self.assertEqual(element.text(stripped=False), " test ") 107 | 108 | def test_add_tag(self): 109 | element = create_pdf_element() 110 | self.assertEqual(element.tags, set()) 111 | 112 | element.add_tag("foo") 113 | 
self.assertEqual(element.tags, set(["foo"])) 114 | 115 | element.add_tag("foo") 116 | self.assertEqual(element.tags, set(["foo"])) 117 | 118 | element.add_tag("bar") 119 | self.assertEqual(element.tags, set(["foo", "bar"])) 120 | 121 | def test_repr(self): 122 | element = create_pdf_element(font_name="test_font", font_size=2) 123 | self.assertEqual(repr(element), "") 124 | 125 | element.add_tag("foo") 126 | self.assertEqual( 127 | repr(element), "" 128 | ) 129 | 130 | element.ignore() 131 | self.assertEqual( 132 | repr(element), "" 133 | ) 134 | 135 | @data( 136 | BoundingBox(1, 6, 1, 6), # This box fully encloses the element 137 | BoundingBox(1, 6, 0, 3), # This box intersects the bottom of the element 138 | BoundingBox(1, 6, 0, 2), # This box touches the bottom of the element 139 | BoundingBox(1, 6, 4, 6), # This box intersects the top of the element 140 | BoundingBox(1, 6, 5, 6), # This box touches the top of the element 141 | BoundingBox(1, 6, 3, 4), # This box goes through center horizontally 142 | BoundingBox(1, 3, 1, 6), # This box intersects the left of the element 143 | BoundingBox(1, 2, 1, 6), # This box touches the left of the element 144 | BoundingBox(4, 6, 1, 6), # This box intersects the left of the element 145 | BoundingBox(5, 6, 1, 6), # This box touches the left of the element 146 | BoundingBox(3, 4, 1, 6), # This box goes through the center vertically 147 | BoundingBox(3, 4, 3, 4), # This box is enclosed inside the element 148 | ) 149 | def test_partially_within_true(self, bounding_box): 150 | element = create_pdf_element(self.element_bbox) 151 | self.assertTrue(element.partially_within(bounding_box)) 152 | 153 | @data( 154 | BoundingBox(1, 6, 0, 1), # This box is underneath the element 155 | BoundingBox(1, 6, 6, 7), # This box is above the element 156 | BoundingBox(0, 1, 1, 6), # This box is to the left of the element 157 | BoundingBox(6, 7, 1, 6), # This box is to the lerightft of the element 158 | ) 159 | def test_partially_within_false(self, 
bounding_box): 160 | element = create_pdf_element(self.element_bbox) 161 | self.assertFalse(element.partially_within(bounding_box)) 162 | 163 | @data(BoundingBox(1, 6, 1, 6)) # This box fully encloses the element 164 | def test_entirely_within_true(self, bounding_box): 165 | element = create_pdf_element(self.element_bbox) 166 | self.assertTrue(element.entirely_within(bounding_box)) 167 | 168 | @data( 169 | BoundingBox(1, 6, 0, 3), # This box intersects the bottom of the element 170 | BoundingBox(1, 6, 0, 2), # This box touches the bottom of the element 171 | BoundingBox(1, 6, 4, 6), # This box intersects the top of the element 172 | BoundingBox(1, 6, 5, 6), # This box touches the top of the element 173 | BoundingBox(1, 6, 3, 4), # This box goes through center horizontally 174 | BoundingBox(1, 3, 1, 6), # This box intersects the left of the element 175 | BoundingBox(1, 2, 1, 6), # This box touches the left of the element 176 | BoundingBox(4, 6, 1, 6), # This box intersects the right of the element 177 | BoundingBox(5, 6, 1, 6), # This box touches the right of the element 178 | BoundingBox(3, 4, 1, 6), # This box goes through the center vertically 179 | BoundingBox(1, 6, 0, 1), # This box is underneath the element 180 | BoundingBox(1, 6, 6, 7), # This box is above the element 181 | BoundingBox(0, 1, 1, 6), # This box is to the left of the element 182 | BoundingBox(6, 7, 1, 6), # This box is to the right of the element 183 | BoundingBox(3, 4, 3, 4), # This box is enclosed inside the element 184 | ) 185 | def test_entirely_within_false(self, bounding_box): 186 | element = create_pdf_element(self.element_bbox) 187 | self.assertFalse(element.entirely_within(bounding_box)) 188 | 189 | 190 | class TestPDFDocument(BaseTestCase): 191 | def test_document(self): 192 | el_page_1_top_left = FakePDFMinerTextElement(BoundingBox(0, 1, 2, 3)) 193 | el_page_1_top_right = FakePDFMinerTextElement(BoundingBox(2, 3, 2, 3)) 194 | el_page_1_bottom_left = 
FakePDFMinerTextElement(BoundingBox(0, 1, 0, 1)) 195 | el_page_1_bottom_right = FakePDFMinerTextElement(BoundingBox(2, 3, 0, 1)) 196 | page_1 = Page( 197 | elements=[ 198 | el_page_1_top_left, 199 | el_page_1_top_right, 200 | el_page_1_bottom_left, 201 | el_page_1_bottom_right, 202 | ], 203 | width=100, 204 | height=100, 205 | ) 206 | 207 | el_page_2_top_left = FakePDFMinerTextElement(BoundingBox(0, 1, 2, 3)) 208 | el_page_2_top_right = FakePDFMinerTextElement(BoundingBox(2, 3, 2, 3)) 209 | el_page_2_bottom_left = FakePDFMinerTextElement(BoundingBox(0, 1, 0, 1)) 210 | el_page_2_bottom_right = FakePDFMinerTextElement(BoundingBox(2, 3, 0, 1)) 211 | page_2 = Page( 212 | elements=[ 213 | el_page_2_bottom_right, 214 | el_page_2_bottom_left, 215 | el_page_2_top_right, 216 | el_page_2_top_left, 217 | ], 218 | width=100, 219 | height=100, 220 | ) 221 | 222 | document = PDFDocument(pages={1: page_1, 2: page_2}) 223 | 224 | # Checks elements were reordered 225 | expected_ordered_list = [ 226 | el_page_1_top_left, 227 | el_page_1_top_right, 228 | el_page_1_bottom_left, 229 | el_page_1_bottom_right, 230 | el_page_2_top_left, 231 | el_page_2_top_right, 232 | el_page_2_bottom_left, 233 | el_page_2_bottom_right, 234 | ] 235 | self.assertEqual( 236 | [elem.original_element for elem in document._element_list], 237 | expected_ordered_list, 238 | ) 239 | 240 | # Checks indexes were assigned properly 241 | self.assertEqual( 242 | [elem._index for elem in document._element_list], [0, 1, 2, 3, 4, 5, 6, 7] 243 | ) 244 | 245 | # Checks page numbers are correct 246 | self.assertEqual(document.page_numbers, [1, 2]) 247 | 248 | # Checks number of pages is correct 249 | self.assertEqual(document.number_of_pages, 2) 250 | 251 | # Checks pages were assigned properly 252 | self.assertEqual( 253 | [elem.page_number for elem in document._element_list], 254 | [1, 1, 1, 1, 2, 2, 2, 2], 255 | ) 256 | 257 | # Checks pages were instantiated correctly 258 | pdf_page_1 = document.get_page(1) 259 | 
self.assertEqual(page_1.width, pdf_page_1.width) 260 | self.assertEqual(page_1.height, pdf_page_1.height) 261 | self.assertEqual(el_page_1_top_left, pdf_page_1.start_element.original_element) 262 | self.assertEqual( 263 | el_page_1_bottom_right, pdf_page_1.end_element.original_element 264 | ) 265 | self.assertEqual(pdf_page_1.page_number, 1) 266 | self.assertEqual(pdf_page_1.elements, ElementList(document, set([0, 1, 2, 3]))) 267 | 268 | pdf_page_2 = document.get_page(2) 269 | self.assertEqual(page_2.width, pdf_page_2.width) 270 | self.assertEqual(page_2.height, pdf_page_2.height) 271 | self.assertEqual(el_page_2_top_left, pdf_page_2.start_element.original_element) 272 | self.assertEqual( 273 | el_page_2_bottom_right, pdf_page_2.end_element.original_element 274 | ) 275 | self.assertEqual(pdf_page_2.page_number, 2) 276 | self.assertEqual(pdf_page_2.elements, ElementList(document, set([4, 5, 6, 7]))) 277 | 278 | self.assertEqual(document.pages, [pdf_page_1, pdf_page_2]) 279 | 280 | self.assertEqual( 281 | document.elements, ElementList(document, set([0, 1, 2, 3, 4, 5, 6, 7])) 282 | ) 283 | with self.assertRaises(PageNotFoundError): 284 | document.get_page(3) 285 | 286 | def test_document_with_blank_page(self): 287 | with self.assertRaises(NoElementsOnPageError): 288 | PDFDocument(pages={1: Page(elements=[], width=100, height=100)}) 289 | 290 | def test_element_ordering(self): 291 | # elem_1 elem_2 292 | # elem_3 elem_4 293 | elem_1 = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 6, 10)) 294 | elem_2 = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 6, 10)) 295 | elem_3 = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 0, 5)) 296 | elem_4 = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 0, 5)) 297 | 298 | # Check default: left to right, top to bottom 299 | document = create_pdf_document(elements=[elem_1, elem_2, elem_3, elem_4]) 300 | self.assert_original_element_list_equal( 301 | [elem_1, elem_2, elem_3, elem_4], document.elements 302 
| ) 303 | 304 | # Check other presets 305 | document = create_pdf_document( 306 | elements=[elem_1, elem_2, elem_3, elem_4], 307 | element_ordering=ElementOrdering.RIGHT_TO_LEFT_TOP_TO_BOTTOM, 308 | ) 309 | self.assert_original_element_list_equal( 310 | [elem_2, elem_1, elem_4, elem_3], document.elements 311 | ) 312 | 313 | document = create_pdf_document( 314 | elements=[elem_1, elem_2, elem_3, elem_4], 315 | element_ordering=ElementOrdering.TOP_TO_BOTTOM_LEFT_TO_RIGHT, 316 | ) 317 | self.assert_original_element_list_equal( 318 | [elem_1, elem_3, elem_2, elem_4], document.elements 319 | ) 320 | 321 | document = create_pdf_document( 322 | elements=[elem_1, elem_2, elem_3, elem_4], 323 | element_ordering=ElementOrdering.TOP_TO_BOTTOM_RIGHT_TO_LEFT, 324 | ) 325 | self.assert_original_element_list_equal( 326 | [elem_2, elem_4, elem_1, elem_3], document.elements 327 | ) 328 | 329 | # Check custom function 330 | document = create_pdf_document( 331 | elements=[elem_1, elem_2, elem_3, elem_4], 332 | element_ordering=lambda elements: [ 333 | elements[0], 334 | elements[3], 335 | elements[1], 336 | elements[2], 337 | ], 338 | ) 339 | self.assert_original_element_list_equal( 340 | [elem_1, elem_4, elem_2, elem_3], document.elements 341 | ) 342 | -------------------------------------------------------------------------------- /tests/test_doc_examples/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jstockwin/py-pdf-parser/6aa400371e948307bcaa5073f33507c9b6e84f06/tests/test_doc_examples/__init__.py -------------------------------------------------------------------------------- /tests/test_doc_examples/test_element_ordering.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from py_pdf_parser.components import ElementOrdering 4 | from py_pdf_parser.loaders import load_file 5 | from tests.base import BaseTestCase 6 | 7 | 8 | class 
TestSimpleMemo(BaseTestCase): 9 | def test_output_is_correct(self): 10 | file_path = os.path.join( 11 | os.path.dirname(__file__), "../../docs/source/example_files/grid.pdf" 12 | ) 13 | 14 | # Default - left to right, top to bottom 15 | document = load_file(file_path) 16 | self.assertListEqual( 17 | [element.text() for element in document.elements], 18 | ["Top Left", "Top Right", "Bottom Left", "Bottom Right"], 19 | ) 20 | 21 | # Preset - right to left, top to bottom 22 | document = load_file( 23 | file_path, element_ordering=ElementOrdering.RIGHT_TO_LEFT_TOP_TO_BOTTOM 24 | ) 25 | self.assertListEqual( 26 | [element.text() for element in document.elements], 27 | ["Top Right", "Top Left", "Bottom Right", "Bottom Left"], 28 | ) 29 | 30 | # Preset - top to bottom, left to right 31 | document = load_file( 32 | file_path, element_ordering=ElementOrdering.TOP_TO_BOTTOM_LEFT_TO_RIGHT 33 | ) 34 | self.assertListEqual( 35 | [element.text() for element in document.elements], 36 | ["Bottom Left", "Top Left", "Bottom Right", "Top Right"], 37 | ) 38 | 39 | # Preset - top to bottom, right to left 40 | document = load_file( 41 | file_path, element_ordering=ElementOrdering.TOP_TO_BOTTOM_RIGHT_TO_LEFT 42 | ) 43 | self.assertListEqual( 44 | [element.text() for element in document.elements], 45 | ["Top Right", "Bottom Right", "Top Left", "Bottom Left"], 46 | ) 47 | 48 | # Custom - bottom to top, left to right 49 | def ordering_function(elements): 50 | return sorted(elements, key=lambda elem: (elem.x0, elem.y0)) 51 | 52 | document = load_file(file_path, element_ordering=ordering_function) 53 | self.assertListEqual( 54 | [element.text() for element in document.elements], 55 | ["Bottom Left", "Top Left", "Bottom Right", "Top Right"], 56 | ) 57 | 58 | # Custom - This PDF has columns! 59 | # TODO: CHANGE PATH! 
60 | file_path = os.path.join( 61 | os.path.dirname(__file__), "../../docs/source/example_files/columns.pdf" 62 | ) 63 | 64 | # Default - left to right, top to bottom 65 | document = load_file(file_path) 66 | self.assertListEqual( 67 | [element.text() for element in document.elements], 68 | [ 69 | "Column 1 Title", 70 | "Column 2 Title", 71 | "Here is some column 1 text.", 72 | "Here is some column 2 text.", 73 | "Col 1 left", 74 | "Col 1 right", 75 | "Col 2 left", 76 | "Col 2 right", 77 | ], 78 | ) 79 | 80 | # Visualise, and we can see that the middle is at around x = 300. 81 | # visualise(document) 82 | 83 | def column_ordering_function(elements): 84 | return sorted(elements, key=lambda elem: (elem.x0 > 300, -elem.y0, elem.x0)) 85 | 86 | document = load_file(file_path, element_ordering=column_ordering_function) 87 | self.assertListEqual( 88 | [element.text() for element in document.elements], 89 | [ 90 | "Column 1 Title", 91 | "Here is some column 1 text.", 92 | "Col 1 left", 93 | "Col 1 right", 94 | "Column 2 Title", 95 | "Here is some column 2 text.", 96 | "Col 2 left", 97 | "Col 2 right", 98 | ], 99 | ) 100 | -------------------------------------------------------------------------------- /tests/test_doc_examples/test_extracting_text_from_figures.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from py_pdf_parser.loaders import load_file 4 | from tests.base import BaseTestCase 5 | 6 | 7 | class TestExtractingTextFromFigures(BaseTestCase): 8 | def test_output_is_correct(self): 9 | file_path = os.path.join( 10 | os.path.dirname(__file__), "../../docs/source/example_files/figure.pdf" 11 | ) 12 | 13 | # Without all_texts 14 | document = load_file(file_path) 15 | self.assertListEqual( 16 | [element.text() for element in document.elements], 17 | ["Here is some text outside of an image"], 18 | ) 19 | 20 | document = load_file(file_path, la_params={"all_texts": True}) 21 | self.assertListEqual( 22 | 
[element.text() for element in document.elements], 23 | ["This is some text in an image", "Here is some text outside of an image"], 24 | ) 25 | -------------------------------------------------------------------------------- /tests/test_doc_examples/test_order_summary.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from py_pdf_parser import tables 4 | from py_pdf_parser.loaders import load_file 5 | from tests.base import BaseTestCase 6 | 7 | 8 | class TestSimpleMemo(BaseTestCase): 9 | def test_output_is_correct(self): 10 | # The code below should match that in the documentation example "order_summary" 11 | # Step 1 - Load the document 12 | file_path = os.path.join( 13 | os.path.dirname(__file__), 14 | "../../docs/source/example_files/order_summary.pdf", 15 | ) 16 | FONT_MAPPING = { 17 | "BAAAAA+LiberationSerif-Bold,16.0": "title", 18 | "BAAAAA+LiberationSerif-Bold,12.0": "sub_title", 19 | "CAAAAA+LiberationSerif,12.0": "text", 20 | "DAAAAA+FreeMonoBold,12.0": "table_header", 21 | "EAAAAA+FreeMono,12.0": "table_text", 22 | } 23 | document = load_file(file_path, font_mapping=FONT_MAPPING) 24 | 25 | # visualise(document) 26 | 27 | # Step 3 - Add sections 28 | order_summary_sub_title_element = ( 29 | document.elements.filter_by_font("sub_title") 30 | .filter_by_text_equal("Order Summary:") 31 | .extract_single_element() 32 | ) 33 | 34 | totals_sub_title_element = ( 35 | document.elements.filter_by_font("sub_title") 36 | .filter_by_text_equal("Totals:") 37 | .extract_single_element() 38 | ) 39 | 40 | final_element = document.elements[-1] 41 | 42 | order_summary_section = document.sectioning.create_section( 43 | name="order_summary", 44 | start_element=order_summary_sub_title_element, 45 | end_element=totals_sub_title_element, 46 | include_last_element=False, 47 | ) 48 | 49 | totals_section = document.sectioning.create_section( 50 | name="totals", 51 | start_element=totals_sub_title_element, 52 | 
end_element=final_element, 53 | ) 54 | 55 | # visualise(document) 56 | 57 | # Step 4 - Extract tables 58 | 59 | order_summary_table = tables.extract_simple_table( 60 | order_summary_section.elements.filter_by_fonts( 61 | "table_header", "table_text" 62 | ), 63 | as_text=True, 64 | ) 65 | 66 | totals_table = tables.extract_simple_table( 67 | totals_section.elements.filter_by_fonts("table_header", "table_text"), 68 | as_text=True, 69 | ) 70 | 71 | order_summary_with_header = tables.add_header_to_table(order_summary_table) 72 | 73 | self.assertListEqual( 74 | order_summary_table, 75 | [ 76 | ["Item", "Unit Cost", "Quantity", "Cost"], 77 | ["Challenger 100g\nWhole Hops", "£3.29", "1", "£3.29"], 78 | [ 79 | "Maris Otter \nPale Ale Malt \n(Crushed)", 80 | "£1.50/1000g", 81 | "4000g", 82 | "£6.00", 83 | ], 84 | ["WLP037 \nYorkshire Ale \nYeast", "£7.08", "1", "£7.08"], 85 | ["Bottle Caps", "£1 per 100", "500", "£5"], 86 | ], 87 | ) 88 | 89 | self.assertListEqual( 90 | totals_table, 91 | [ 92 | ["Subtotal:", "£26.28"], 93 | ["Shipping", "£6"], 94 | ["VAT 20%", "£6.45"], 95 | ["Total:", "£38.73"], 96 | ], 97 | ) 98 | 99 | self.assertListEqual( 100 | order_summary_with_header, 101 | [ 102 | { 103 | "Item": "Challenger 100g\nWhole Hops", 104 | "Unit Cost": "£3.29", 105 | "Quantity": "1", 106 | "Cost": "£3.29", 107 | }, 108 | { 109 | "Item": "Maris Otter \nPale Ale Malt \n(Crushed)", 110 | "Unit Cost": "£1.50/1000g", 111 | "Quantity": "4000g", 112 | "Cost": "£6.00", 113 | }, 114 | { 115 | "Item": "WLP037 \nYorkshire Ale \nYeast", 116 | "Unit Cost": "£7.08", 117 | "Quantity": "1", 118 | "Cost": "£7.08", 119 | }, 120 | { 121 | "Item": "Bottle Caps", 122 | "Unit Cost": "£1 per 100", 123 | "Quantity": "500", 124 | "Cost": "£5", 125 | }, 126 | ], 127 | ) 128 | -------------------------------------------------------------------------------- /tests/test_doc_examples/test_simple_memo.py: -------------------------------------------------------------------------------- 1 | import os 2 | 
3 | from py_pdf_parser.loaders import load_file 4 | from tests.base import BaseTestCase 5 | 6 | 7 | class TestSimpleMemo(BaseTestCase): 8 | def test_output_is_correct(self): 9 | # The code below should match that in the documentation example "simple_memo" 10 | # Step 1 - Load the document 11 | file_path = os.path.join( 12 | os.path.dirname(__file__), 13 | "../../docs/source/example_files/simple_memo.pdf", 14 | ) 15 | document = load_file(file_path) 16 | 17 | # We could visualise it here to check it looks correct: 18 | # from py_pdf_parser.visualise import visualise 19 | # visualise(document) 20 | 21 | # Step 2 - Extract reference elements: 22 | to_element = document.elements.filter_by_text_equal( 23 | "TO:" 24 | ).extract_single_element() 25 | from_element = document.elements.filter_by_text_equal( 26 | "FROM:" 27 | ).extract_single_element() 28 | date_element = document.elements.filter_by_text_equal( 29 | "DATE:" 30 | ).extract_single_element() 31 | subject_element = document.elements.filter_by_text_equal( 32 | "SUBJECT:" 33 | ).extract_single_element() 34 | 35 | # Step 3 - Extract the data 36 | to_text = ( 37 | document.elements.to_the_right_of(to_element) 38 | .extract_single_element() 39 | .text() 40 | ) 41 | from_text = ( 42 | document.elements.to_the_right_of(from_element) 43 | .extract_single_element() 44 | .text() 45 | ) 46 | date_text = ( 47 | document.elements.to_the_right_of(date_element) 48 | .extract_single_element() 49 | .text() 50 | ) 51 | subject_text_element = document.elements.to_the_right_of( 52 | subject_element 53 | ).extract_single_element() 54 | subject_text = subject_text_element.text() 55 | 56 | content_elements = document.elements.after(subject_element) 57 | content_text = "\n".join(element.text() for element in content_elements) 58 | 59 | output = { 60 | "to": to_text, 61 | "from": from_text, 62 | "date": date_text, 63 | "subject": subject_text, 64 | "content": content_text, 65 | } 66 | 67 | self.assertDictEqual( 68 | output, 69 | { 70 | 
"content": ( 71 | "A new PDF Parsing tool\n" 72 | "There is a new PDF parsing tool available, called py-pdf-parser - " 73 | "you should all check it out!\n" 74 | "I think it could really help you extract that data we need from " 75 | "those PDFs." 76 | ), 77 | "date": "1st January 2020", 78 | "from": "John Smith", 79 | "subject": "A new PDF Parsing tool", 80 | "to": "All Developers", 81 | }, 82 | ) 83 | -------------------------------------------------------------------------------- /tests/test_doc_examples/test_tables.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from py_pdf_parser import tables 4 | from py_pdf_parser.exceptions import TableExtractionError 5 | from py_pdf_parser.loaders import load_file 6 | from tests.base import BaseTestCase 7 | 8 | 9 | class TestSimpleMemo(BaseTestCase): 10 | def test_output_is_correct(self): 11 | file_path = os.path.join( 12 | os.path.dirname(__file__), "../../docs/source/example_files/tables.pdf" 13 | ) 14 | 15 | # Step 1 - Load the file 16 | FONT_MAPPING = { 17 | "BAAAAA+LiberationSerif-Bold,12.0": "header", 18 | "CAAAAA+LiberationSerif,12.0": "table_element", 19 | } 20 | document = load_file(file_path, font_mapping=FONT_MAPPING) 21 | 22 | headers = document.elements.filter_by_font("header") 23 | 24 | # Extract reference elements 25 | simple_table_header = headers.filter_by_text_equal( 26 | "Simple Table" 27 | ).extract_single_element() 28 | 29 | simple_table_with_gaps_header = headers.filter_by_text_equal( 30 | "Simple Table with gaps" 31 | ).extract_single_element() 32 | 33 | simple_table_with_gaps_in_first_row_col_header = headers.filter_by_text_equal( 34 | "Simple Table with gaps in first row/col" 35 | ).extract_single_element() 36 | 37 | non_simple_table_header = headers.filter_by_text_equal( 38 | "Non Simple Table" 39 | ).extract_single_element() 40 | 41 | non_simple_table_with_merged_cols_header = headers.filter_by_text_equal( 42 | "Non Simple Table with 
Merged Columns" 43 | ).extract_single_element() 44 | 45 | non_simple_table_with_merged_rows_header = headers.filter_by_text_equal( 46 | "Non Simple Table with Merged Rows and Columns" 47 | ).extract_single_element() 48 | 49 | over_the_page_header = headers.filter_by_text_equal( 50 | "Over the page" 51 | ).extract_single_element() 52 | 53 | # Extract table elements 54 | simple_table_elements = document.elements.between( 55 | simple_table_header, simple_table_with_gaps_header 56 | ) 57 | simple_table_with_gaps_elements = document.elements.between( 58 | simple_table_with_gaps_header, 59 | simple_table_with_gaps_in_first_row_col_header, 60 | ) 61 | 62 | simple_table_with_gaps_in_first_row_col_elements = document.elements.between( 63 | simple_table_with_gaps_in_first_row_col_header, non_simple_table_header 64 | ) 65 | 66 | non_simple_table_elements = document.elements.between( 67 | non_simple_table_header, non_simple_table_with_merged_cols_header 68 | ) 69 | 70 | non_simple_table_with_merged_cols_elements = document.elements.between( 71 | non_simple_table_with_merged_cols_header, 72 | non_simple_table_with_merged_rows_header, 73 | ) 74 | 75 | non_simple_table_with_merged_rows_and_cols_elements = document.elements.between( 76 | non_simple_table_with_merged_rows_header, over_the_page_header 77 | ) 78 | 79 | over_the_page_elements = document.elements.after(over_the_page_header) 80 | 81 | # Simple Table 82 | table = tables.extract_simple_table(simple_table_elements, as_text=True) 83 | self.assertListEqual( 84 | table, 85 | [ 86 | ["Heading 1", "Heading 2", "Heading 3", "Heading 4"], 87 | ["A", "1", "A", "1"], 88 | ["B", "2", "B", "2"], 89 | ["C", "3", "C", "3"], 90 | ], 91 | ) 92 | 93 | # Simple Table with gaps 94 | 95 | with self.assertRaises(TableExtractionError): 96 | tables.extract_simple_table(simple_table_with_gaps_elements, as_text=True) 97 | 98 | table = tables.extract_simple_table( 99 | simple_table_with_gaps_elements, as_text=True, allow_gaps=True 100 | ) 101 | 
self.assertListEqual( 102 | table, 103 | [ 104 | ["Heading 1", "Heading 2", "Heading 3", "Heading 4"], 105 | ["A", "1", "", "1"], 106 | ["B", "", "", ""], 107 | ["C", "", "C", "3"], 108 | ], 109 | ) 110 | 111 | # Simple Table with gaps in first row/col 112 | with self.assertRaises(TableExtractionError): 113 | tables.extract_simple_table( 114 | simple_table_with_gaps_in_first_row_col_elements, 115 | as_text=True, 116 | allow_gaps=True, 117 | ) 118 | 119 | reference_element = simple_table_with_gaps_in_first_row_col_elements[9] 120 | table = tables.extract_simple_table( 121 | simple_table_with_gaps_in_first_row_col_elements, 122 | as_text=True, 123 | allow_gaps=True, 124 | reference_element=reference_element, 125 | ) 126 | self.assertListEqual( 127 | table, 128 | [ 129 | ["Heading 1", "Heading 2", "", "Heading 4"], 130 | ["", "1", "A", ""], 131 | ["B", "2", "", "2"], 132 | ["C", "3", "C", "3"], 133 | ], 134 | ) 135 | 136 | # Non Simple Table 137 | table = tables.extract_table(non_simple_table_elements, as_text=True) 138 | self.assertListEqual( 139 | table, 140 | [ 141 | ["", "Heading 2", "Heading 3", "Heading 4"], 142 | ["A", "1", "", "1"], 143 | ["B", "", "B", "2"], 144 | ["C", "3", "C", ""], 145 | ], 146 | ) 147 | 148 | # Non Simple Table with Merged Columns 149 | with self.assertRaises(TableExtractionError): 150 | tables.extract_table( 151 | non_simple_table_with_merged_cols_elements, as_text=True 152 | ) 153 | 154 | table = tables.extract_table( 155 | non_simple_table_with_merged_cols_elements, 156 | as_text=True, 157 | fix_element_in_multiple_cols=True, 158 | ) 159 | self.assertListEqual( 160 | table, 161 | [ 162 | ["Heading 1", "Heading 2", "Heading 3", "Heading 4"], 163 | ["A", "1", "A", "1"], 164 | ["This text spans across multiple columns", "", "B", "2"], 165 | ["C", "3", "C", "3"], 166 | ], 167 | ) 168 | 169 | # Non Simple Table with Merged Rows and Columns 170 | table = tables.extract_table( 171 | non_simple_table_with_merged_rows_and_cols_elements, 172 | 
as_text=True, 173 | fix_element_in_multiple_rows=True, 174 | fix_element_in_multiple_cols=True, 175 | ) 176 | self.assertListEqual( 177 | table, 178 | [ 179 | ["Heading 1", "Heading 2", "Heading 3", "Heading 4"], 180 | [ 181 | "This text spans across multiple rows and \nmultiple columns.", 182 | "", 183 | "A", 184 | "1", 185 | ], 186 | ["", "", "B", "2"], 187 | ["C", "3", "C", "3"], 188 | ], 189 | ) 190 | 191 | # Over the page 192 | table = tables.extract_simple_table(over_the_page_elements, as_text=True) 193 | self.assertListEqual( 194 | table, 195 | [ 196 | ["Heading 1", "Heading 2", "Heading 3", "Heading 4"], 197 | ["A", "1", "A", "1"], 198 | ["B", "2", "B", "2"], 199 | ["C", "3", "C", "3"], 200 | ], 201 | ) 202 | -------------------------------------------------------------------------------- /tests/test_loaders.py: -------------------------------------------------------------------------------- 1 | import os 2 | from unittest import TestCase 3 | 4 | from pdfminer.pdfdocument import PDFPasswordIncorrect 5 | 6 | from py_pdf_parser.components import PDFDocument 7 | from py_pdf_parser.loaders import load, load_file 8 | 9 | 10 | class LoadersTest(TestCase): 11 | def test_load_file(self): 12 | file_path = os.path.join(os.path.dirname(__file__), "data", "pdfs", "test.pdf") 13 | document = load_file(file_path) 14 | self.assertIsInstance(document, PDFDocument) 15 | 16 | def test_load_protected_file(self): 17 | file_path = os.path.join( 18 | os.path.dirname(__file__), "data", "pdfs", "test_protected.pdf" 19 | ) 20 | document = load_file(file_path, password="p4ssword") 21 | self.assertIsInstance(document, PDFDocument) 22 | 23 | def test_load_protected_file_wrong_password(self): 24 | file_path = os.path.join( 25 | os.path.dirname(__file__), "data", "pdfs", "test_protected.pdf" 26 | ) 27 | with self.assertRaises(PDFPasswordIncorrect): 28 | load_file(file_path, password="wrong_password") 29 | 30 | def test_load(self): 31 | file_path = os.path.join(os.path.dirname(__file__), 
"data", "pdfs", "test.pdf") 32 | with open(file_path, "rb") as in_file: 33 | document = load(in_file) 34 | self.assertIsInstance(document, PDFDocument) 35 | 36 | def test_load_with_text_in_image(self): 37 | file_path = os.path.join(os.path.dirname(__file__), "data", "pdfs", "image.pdf") 38 | with open(file_path, "rb") as in_file: 39 | document = load(in_file) 40 | self.assertIsInstance(document, PDFDocument) 41 | self.assertEqual(len(document.elements), 1) 42 | 43 | with open(file_path, "rb") as in_file: 44 | document = load(in_file, la_params={"all_texts": True}) 45 | self.assertIsInstance(document, PDFDocument) 46 | self.assertEqual(len(document.elements), 2) 47 | 48 | def test_load_file_with_text_in_image(self): 49 | file_path = os.path.join(os.path.dirname(__file__), "data", "pdfs", "image.pdf") 50 | document = load_file(file_path, la_params={"all_texts": True}) 51 | self.assertIsInstance(document, PDFDocument) 52 | self.assertEqual(len(document.elements), 2) 53 | -------------------------------------------------------------------------------- /tests/test_sectioning.py: -------------------------------------------------------------------------------- 1 | import types 2 | 3 | from py_pdf_parser.exceptions import InvalidSectionError, SectionNotFoundError 4 | from py_pdf_parser.sectioning import Sectioning 5 | 6 | from .base import BaseTestCase 7 | from .utils import FakePDFMinerTextElement, create_pdf_document, create_section 8 | 9 | 10 | class TestSection(BaseTestCase): 11 | def test_contains(self): 12 | elem_1 = FakePDFMinerTextElement() 13 | elem_2 = FakePDFMinerTextElement() 14 | elem_3 = FakePDFMinerTextElement() 15 | document = create_pdf_document([elem_1, elem_2, elem_3]) 16 | 17 | pdf_elem_1 = self.extract_element_from_list(elem_1, document._element_list) 18 | pdf_elem_2 = self.extract_element_from_list(elem_2, document._element_list) 19 | pdf_elem_3 = self.extract_element_from_list(elem_3, document._element_list) 20 | 21 | section = create_section( 22 | 
document, start_element=pdf_elem_1, end_element=pdf_elem_2 23 | ) 24 | 25 | self.assertIn(pdf_elem_1, section) 26 | self.assertIn(pdf_elem_2, section) 27 | self.assertNotIn(pdf_elem_3, section) 28 | 29 | def test_eq(self): 30 | elem_1 = FakePDFMinerTextElement() 31 | elem_2 = FakePDFMinerTextElement() 32 | elem_3 = FakePDFMinerTextElement() 33 | document = create_pdf_document([elem_1, elem_2, elem_3]) 34 | 35 | pdf_elem_1 = self.extract_element_from_list(elem_1, document._element_list) 36 | pdf_elem_2 = self.extract_element_from_list(elem_2, document._element_list) 37 | pdf_elem_3 = self.extract_element_from_list(elem_3, document._element_list) 38 | 39 | section_1 = create_section( 40 | document, start_element=pdf_elem_1, end_element=pdf_elem_2 41 | ) 42 | section_2 = create_section( 43 | document, start_element=pdf_elem_1, end_element=pdf_elem_2 44 | ) 45 | self.assertEqual(section_1, section_2) 46 | section_3 = create_section( 47 | document, start_element=pdf_elem_1, end_element=pdf_elem_3 48 | ) 49 | self.assertNotEqual(section_1, section_3) 50 | 51 | def test_exceptions(self): 52 | elem_1 = FakePDFMinerTextElement() 53 | elem_2 = FakePDFMinerTextElement() 54 | document = create_pdf_document([elem_1, elem_2]) 55 | 56 | pdf_elem_1 = self.extract_element_from_list(elem_1, document._element_list) 57 | pdf_elem_2 = self.extract_element_from_list(elem_2, document._element_list) 58 | with self.assertRaises(InvalidSectionError): 59 | create_section(document, start_element=pdf_elem_2, end_element=pdf_elem_1) 60 | 61 | def test_len(self): 62 | elem_1 = FakePDFMinerTextElement() 63 | elem_2 = FakePDFMinerTextElement() 64 | elem_3 = FakePDFMinerTextElement() 65 | document = create_pdf_document([elem_1, elem_2, elem_3]) 66 | 67 | pdf_elem_1 = self.extract_element_from_list(elem_1, document._element_list) 68 | pdf_elem_2 = self.extract_element_from_list(elem_2, document._element_list) 69 | pdf_elem_3 = self.extract_element_from_list(elem_3, document._element_list) 70 | 71 | 
section = create_section( 72 | document, 73 | name="fake_section", 74 | start_element=pdf_elem_1, 75 | end_element=pdf_elem_3, 76 | ) 77 | self.assertEqual(len(section), 3) 78 | 79 | # Ignoring an element should affect the length of the section. 80 | pdf_elem_2.ignore() 81 | self.assertEqual(len(section), 2) 82 | 83 | def test_repr(self): 84 | elem_1 = FakePDFMinerTextElement() 85 | elem_2 = FakePDFMinerTextElement() 86 | document = create_pdf_document([elem_1, elem_2]) 87 | 88 | pdf_elem_1 = self.extract_element_from_list(elem_1, document._element_list) 89 | pdf_elem_2 = self.extract_element_from_list(elem_2, document._element_list) 90 | 91 | section = create_section( 92 | document, 93 | name="fake_section", 94 | unique_name="fake_section_0", 95 | start_element=pdf_elem_1, 96 | end_element=pdf_elem_2, 97 | ) 98 | 99 | self.assertEqual( 100 | repr(section), 101 | ( 102 | "
" 104 | ), 105 | ) 106 | 107 | # Ignoring an element should affect the number of elements of the section. 108 | pdf_elem_2.ignore() 109 | self.assertEqual( 110 | repr(section), 111 | ( 112 | "
" 114 | ), 115 | ) 116 | 117 | 118 | class TestSectioning(BaseTestCase): 119 | def test_create_section(self): 120 | elem_1 = FakePDFMinerTextElement() 121 | elem_2 = FakePDFMinerTextElement() 122 | elem_3 = FakePDFMinerTextElement() 123 | document = create_pdf_document([elem_1, elem_2, elem_3]) 124 | 125 | pdf_elem_1 = self.extract_element_from_list(elem_1, document._element_list) 126 | pdf_elem_2 = self.extract_element_from_list(elem_2, document._element_list) 127 | pdf_elem_3 = self.extract_element_from_list(elem_3, document._element_list) 128 | 129 | sectioning = Sectioning(document) 130 | sectioning.create_section( 131 | "fake_section", start_element=pdf_elem_1, end_element=pdf_elem_2 132 | ) 133 | 134 | section_1 = create_section( 135 | document, 136 | unique_name="fake_section_0", 137 | start_element=pdf_elem_1, 138 | end_element=pdf_elem_2, 139 | ) 140 | self.assertEqual(len(sectioning.sections), 1) 141 | self.assertIn(section_1, sectioning.sections) 142 | 143 | # Checks that section with the same name would have different unique names when 144 | # added in Sectioning 145 | section_2 = create_section( 146 | document, 147 | unique_name="fake_section_1", 148 | start_element=pdf_elem_2, 149 | end_element=pdf_elem_3, 150 | ) 151 | sectioning.create_section( 152 | name="fake_section", start_element=pdf_elem_2, end_element=pdf_elem_3 153 | ) 154 | self.assertEqual(len(sectioning.sections), 2) 155 | self.assertIn(section_1, sectioning.sections) 156 | self.assertIn(section_2, sectioning.sections) 157 | 158 | # Test with include_end_element being False 159 | section_3 = sectioning.create_section( 160 | name="test", 161 | start_element=pdf_elem_1, 162 | end_element=pdf_elem_3, 163 | include_last_element=False, 164 | ) 165 | self.assertEqual(len(section_3.elements), 2) 166 | self.assertIn(pdf_elem_1, section_3.elements) 167 | self.assertIn(pdf_elem_2, section_3.elements) 168 | self.assertNotIn(pdf_elem_3, section_3.elements) 169 | 170 | with 
self.assertRaises(InvalidSectionError): 171 | sectioning.create_section( 172 | name="test", 173 | start_element=pdf_elem_1, 174 | end_element=pdf_elem_1, 175 | include_last_element=False, 176 | ) 177 | 178 | def test_get_sections_with_name(self): 179 | elem_1 = FakePDFMinerTextElement() 180 | elem_2 = FakePDFMinerTextElement() 181 | document = create_pdf_document([elem_1, elem_2]) 182 | 183 | pdf_elem_1 = self.extract_element_from_list(elem_1, document._element_list) 184 | pdf_elem_2 = self.extract_element_from_list(elem_2, document._element_list) 185 | 186 | self.assertTrue( 187 | isinstance( 188 | document.sectioning.get_sections_with_name("foo"), types.GeneratorType 189 | ) 190 | ) 191 | self.assertEqual(list(document.sectioning.get_sections_with_name("foo")), []) 192 | 193 | section_1 = document.sectioning.create_section("foo", pdf_elem_1, pdf_elem_2) 194 | section_2 = document.sectioning.create_section("foo", pdf_elem_1, pdf_elem_2) 195 | document.sectioning.create_section("bar", pdf_elem_1, pdf_elem_2) 196 | 197 | self.assertTrue( 198 | isinstance( 199 | document.sectioning.get_sections_with_name("foo"), types.GeneratorType 200 | ) 201 | ) 202 | self.assertEqual( 203 | list(document.sectioning.get_sections_with_name("foo")), 204 | [section_1, section_2], 205 | ) 206 | 207 | def test_get_section(self): 208 | elem_1 = FakePDFMinerTextElement() 209 | elem_2 = FakePDFMinerTextElement() 210 | document = create_pdf_document([elem_1, elem_2]) 211 | 212 | pdf_elem_1 = self.extract_element_from_list(elem_1, document._element_list) 213 | pdf_elem_2 = self.extract_element_from_list(elem_2, document._element_list) 214 | 215 | with self.assertRaises(SectionNotFoundError): 216 | document.sectioning.get_section("foo") 217 | 218 | self.assertTrue( 219 | isinstance( 220 | document.sectioning.get_sections_with_name("foo"), types.GeneratorType 221 | ) 222 | ) 223 | self.assertEqual(list(document.sectioning.get_sections_with_name("foo")), []) 224 | 225 | section_1 = 
document.sectioning.create_section("foo", pdf_elem_1, pdf_elem_2) 226 | section_2 = document.sectioning.create_section("foo", pdf_elem_1, pdf_elem_2) 227 | 228 | self.assertEqual(document.sectioning.get_section("foo_0"), section_1) 229 | self.assertEqual(document.sectioning.get_section("foo_1"), section_2) 230 | -------------------------------------------------------------------------------- /tests/test_visualise.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from py_pdf_parser.loaders import load_file 4 | from py_pdf_parser.visualise.main import PDFVisualiser 5 | 6 | from .base import BaseVisualiseTestCase 7 | 8 | 9 | class TestVisualise(BaseVisualiseTestCase): 10 | def test_visualise(self): 11 | file_path = os.path.join( 12 | os.path.dirname(__file__), "../docs/source/example_files/tables.pdf" 13 | ) 14 | 15 | FONT_MAPPING = { 16 | "BAAAAA+LiberationSerif-Bold,12.0": "header", 17 | "CAAAAA+LiberationSerif,12.0": "table_element", 18 | } 19 | document = load_file(file_path, font_mapping=FONT_MAPPING) 20 | 21 | visualiser = PDFVisualiser( 22 | self.root, document, show_info=True, width=1920, height=1080 23 | ) 24 | 25 | self.check_images(visualiser, "tables1") 26 | 27 | visualiser.toolbar._buttons["Next page"].invoke() 28 | self.check_images(visualiser, "tables2") 29 | -------------------------------------------------------------------------------- /tests/utils.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, Dict, List, NamedTuple, Optional, Union 2 | 3 | import re 4 | 5 | from pdfminer.layout import LTComponent 6 | 7 | from py_pdf_parser.common import BoundingBox 8 | from py_pdf_parser.components import ElementOrdering, PDFDocument, PDFElement 9 | from py_pdf_parser.loaders import Page 10 | from py_pdf_parser.sectioning import Section 11 | 12 | 13 | class FakePDFMinerCharacter(NamedTuple): 14 | fontname: str = "fake_fontname" 15 | 
height: float = 10 16 | 17 | 18 | class FakePDFMinerIterator: 19 | def __init__(self, font_name: str = "fake_font", font_size: float = 10): 20 | self.finished = False 21 | self.font_name = font_name 22 | self.font_size = font_size 23 | 24 | def __next__(self): 25 | if self.finished: 26 | raise StopIteration() 27 | 28 | self.finished = True 29 | return [FakePDFMinerCharacter(fontname=self.font_name, height=self.font_size)] 30 | 31 | 32 | class FakePDFMinerTextElement(LTComponent): 33 | """ 34 | This is a stub to help create something which looks like a PDFMiner text element 35 | for use in testing. 36 | 37 | The fontname and size are detected by getting the first character of the first row 38 | of the contained text. This is done by iterating, hence we define an iterator which 39 | simply returns one list of length one and then raises StopIteration. This is the 40 | minimum needed to pretend to allow extraction of the first character, for which 41 | we use FakePDFMinerCharacter, a namedtuple with fontname and height attributes. 
42 | """ 43 | 44 | def __init__( 45 | self, 46 | bounding_box: "BoundingBox" = BoundingBox(0, 1, 0, 1), 47 | text: str = "fake_text", 48 | font_name: str = "fake_font", 49 | font_size: float = 10, 50 | ): 51 | super().__init__( 52 | bbox=[bounding_box.x0, bounding_box.y0, bounding_box.x1, bounding_box.y1] 53 | ) 54 | self.text = text 55 | self.font_name = font_name 56 | self.font_size = font_size 57 | 58 | def __iter__(self): 59 | return FakePDFMinerIterator(font_name=self.font_name, font_size=self.font_size) 60 | 61 | def get_text(self) -> str: 62 | if self.text is None: 63 | return "" 64 | return self.text 65 | 66 | 67 | def create_pdf_element( 68 | bounding_box: "BoundingBox" = BoundingBox(0, 1, 0, 1), 69 | text: str = "fake_text", 70 | font_name: str = "fake_font", 71 | font_size: float = 10, 72 | font_mapping: Optional[Dict[str, str]] = None, 73 | font_mapping_is_regex: bool = False, 74 | regex_flags: Union[int, re.RegexFlag] = 0, 75 | font_size_precision: int = 1, 76 | ) -> "PDFElement": 77 | document = create_pdf_document( 78 | elements=[ 79 | FakePDFMinerTextElement( 80 | bounding_box, text=text, font_name=font_name, font_size=font_size 81 | ) 82 | ], 83 | font_mapping=font_mapping, 84 | font_mapping_is_regex=font_mapping_is_regex, 85 | regex_flags=regex_flags, 86 | font_size_precision=font_size_precision, 87 | ) 88 | return document.elements[0] 89 | 90 | 91 | def create_pdf_document( 92 | elements: Union[List[LTComponent], Dict[int, List[LTComponent]]], 93 | font_mapping: Optional[Dict[str, str]] = None, 94 | font_mapping_is_regex: bool = False, 95 | regex_flags: Union[int, re.RegexFlag] = 0, 96 | font_size_precision: int = 1, 97 | element_ordering: Union[ 98 | ElementOrdering, Callable[[List], List] 99 | ] = ElementOrdering.LEFT_TO_RIGHT_TOP_TO_BOTTOM, 100 | ) -> "PDFDocument": 101 | """ 102 | Creates a PDF document with the given elements. 
103 | "elements" can be a list of elements (in which case a document with a single page 104 | will be created) or a dictionary mapping page number to its list of elements. 105 | """ 106 | if not isinstance(elements, dict): 107 | pages = {1: Page(elements=elements, width=100, height=100)} 108 | else: 109 | pages = { 110 | page_number: Page(elements=elements_list, width=100, height=100) 111 | for page_number, elements_list in elements.items() 112 | } 113 | 114 | return PDFDocument( 115 | pages=pages, 116 | font_mapping=font_mapping, 117 | font_mapping_is_regex=font_mapping_is_regex, 118 | regex_flags=regex_flags, 119 | font_size_precision=font_size_precision, 120 | element_ordering=element_ordering, 121 | ) 122 | 123 | 124 | def create_section( 125 | document: "PDFDocument", 126 | name: str = "fake_name", 127 | unique_name: str = "fake_name_1", 128 | start_element: Optional["PDFElement"] = None, 129 | end_element: Optional["PDFElement"] = None, 130 | ) -> "Section": 131 | """ 132 | Creates a simple section 133 | """ 134 | if start_element is None: 135 | start_element = document._element_list[0] 136 | if end_element is None: 137 | end_element = document._element_list[-1] 138 | 139 | return Section(document, name, unique_name, start_element, end_element) 140 | --------------------------------------------------------------------------------