├── .flake8
├── .github
    ├── ISSUE_TEMPLATE
    │   ├── bug_report.md
    │   ├── feature_request.md
    │   └── question.md
    ├── dependabot.yml
    ├── pull_request_template.md
    ├── scripts
    │   ├── docs.sh
    │   └── test.sh
    └── workflows
    │   ├── codeql-analysis.yml
    │   ├── continuous-integration.yml
    │   └── release.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .readthedocs.yml
├── CHANGELOG.md
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── Dockerfile
├── LICENSE
├── MANIFEST.in
├── README.md
├── SECURITY.md
├── docker-compose.yml
├── docs
    ├── Makefile
    ├── make.bat
    └── source
    │   ├── CHANGELOG.md
    │   ├── conf.py
    │   ├── example_files
    │       ├── columns.pdf
    │       ├── figure.pdf
    │       ├── grid.pdf
    │       ├── order_summary.pdf
    │       ├── simple_memo.pdf
    │       └── tables.pdf
    │   ├── examples
    │       ├── element_ordering.rst
    │       ├── extracting_text_from_figures.rst
    │       ├── index.rst
    │       ├── more_tables.rst
    │       ├── order_summary.rst
    │       └── simple_memo.rst
    │   ├── index.rst
    │   ├── overview.rst
    │   ├── reference
    │       ├── common.rst
    │       ├── components.rst
    │       ├── filtering.rst
    │       ├── index.rst
    │       ├── loaders.rst
    │       ├── sectioning.rst
    │       ├── tables.rst
    │       └── visualise.rst
    │   └── screenshots
    │       ├── order_summary_example
    │           ├── initial.png
    │           ├── sections.png
    │           ├── showing_font_1.png
    │           ├── showing_font_2.png
    │           └── zoomed.png
    │       └── simple_memo_example
    │           ├── top.png
    │           └── visualise.png
├── imagemagick_policy.xml
├── mypy.ini
├── py_pdf_parser
    ├── __init__.py
    ├── common.py
    ├── components.py
    ├── exceptions.py
    ├── filtering.py
    ├── loaders.py
    ├── sectioning.py
    ├── tables.py
    └── visualise
    │   ├── __init__.py
    │   ├── background.py
    │   ├── info_figure.py
    │   ├── main.py
    │   └── sections.py
├── pycodestyle.cfg
├── pyproject.toml
├── pytype.cfg
├── setup.py
└── tests
    ├── __init__.py
    ├── base.py
    ├── data
        ├── images
        │   ├── tables1.png
        │   └── tables2.png
        └── pdfs
        │   ├── image.pdf
        │   ├── test.pdf
        │   └── test_protected.pdf
    ├── test_common.py
    ├── test_components.py
    ├── test_doc_examples
        ├── __init__.py
        ├── test_element_ordering.py
        ├── test_extracting_text_from_figures.py
        ├── test_order_summary.py
        ├── test_simple_memo.py
        └── test_tables.py
    ├── test_filtering.py
    ├── test_loaders.py
    ├── test_sectioning.py
    ├── test_tables.py
    ├── test_visualise.py
    └── utils.py


/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 88
3 | extend-ignore = E203
4 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report
 3 | about: Create a report to help us improve
 4 | title: ''
 5 | labels: bug
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Bug Report**
11 | 
12 | Before submitting an issue, please ensure you have read our `CONTRIBUTING.md`, and follow the Code of Conduct.
13 | 
14 | Thanks for taking the time to report a bug. To help us fix it quickly, please include the following information:
15 | 
16 | * A good description of the bug, including expected behavior and actual behavior.
17 | * A (small as possible) reproducible example of the bug. Please include code, and any files required to reproduce the issue.
18 | * Any required context.
19 | * If you'd be interested in working on a fix for your issue, please let us know!
20 | 
21 | Please also check that your bug is not actually caused by [pdfminer.six](https://github.com/pdfminer/pdfminer.six), and is really an issue with this project.
22 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Feature request
 3 | about: Suggest an idea for this project
 4 | title: ''
 5 | labels: enhancement
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Feature Request**
11 | 
12 | Before submitting an issue, please ensure you have read our `CONTRIBUTING.md`, and
13 | follow the Code of Conduct.
14 | 
15 | Thanks for suggesting a new feature. To enable a useful discussion, please include as
16 | much of the following as you can:
17 | * A good description of the feature.
18 | * Why do you want this feature? What is the use-case and context?
19 | * An example of what you'd like to achieve.
20 | * Any ideas about implementation.
21 | * Please also indicate if you'd be interested in working on the feature yourself.
22 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/question.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Question
 3 | about: Ask for help using this tool
 4 | title: ''
 5 | labels: question
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Question**
11 | 
12 | Before submitting an issue, please ensure you have read our `CONTRIBUTING.md`, and follow the Code of Conduct.
13 | 
14 | Thanks for taking the time to submit an issue. To help us understand your question and answer it quickly, please include the following information where possible:
15 | 
16 | * A good description of the question, including what you are trying to achieve and what the problems are.
17 | * A (small as possible) example highlighting your question. Please include code that you have tried, and any files required to run it.
18 | * Any required context.
19 | 


--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | updates:
 3 |   # Maintain dependencies from pip
 4 |   - package-ecosystem: "pip"
 5 |     directory: "/"
 6 |     schedule:
 7 |       interval: "daily"
 8 |   # Maintain dependencies from GitHub Actions
 9 |   - package-ecosystem: "github-actions"
10 |     directory: "/"
11 |     schedule:
12 |       interval: "daily"
13 | 


--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
 1 | **Description**
 2 | 
 3 | Please include a description of the change, and why it was needed.
 4 | 
 5 | **Linked issues**
 6 | 
 7 | Please link any issues this pull request is related to. Using the word "closes" before the
 8 | link will mean the issue is automatically closed by this Pull Request.
 9 | 
10 | **Testing**
11 | 
12 | Please describe how your changes have been tested.
13 | 
14 | **Checklist**
15 | 
16 | - [ ] I have provided a good description of the change above
17 | - [ ] I have added any necessary tests
18 | - [ ] I have added all necessary type hints
19 | - [ ] I have checked my linting (`docker-compose run --rm lint`)
20 | - [ ] I have added/updated all necessary documentation
21 | - [ ] I have updated `CHANGELOG.md`, following the format from
22 |       [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
23 | 


--------------------------------------------------------------------------------
/.github/scripts/docs.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | echo '#######################################################################'
 4 | echo '#                             Building docs                           #'
 5 | echo '#######################################################################'
 6 | 
 7 | export SPHINXOPTS="-W"  # Treat warnings as errors
 8 | 
 9 | xvfb-run make --directory $PROJECT_DIR/docs html
10 | 
11 | DOCS_STATUS=$?
12 | if [[ ("$DOCS_STATUS" == 0) ]]; then
13 |   echo '#######################################################################'
14 |   echo '#                            Build succeded                           #'
15 |   echo '#######################################################################'
16 |   exit 0
17 | else
18 |   echo ''
19 |   echo '#######################################################################'
20 |   echo '#                            Build failed    !                        #'
21 |   echo '#######################################################################'
22 |   exit 1
23 | fi
24 | 


--------------------------------------------------------------------------------
/.github/scripts/test.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | clean_pyc () { echo 'cleaning .pyc files'; find . -name "*.pyc" -exec rm -f {} \; ; }
 4 | trap clean_pyc EXIT
 5 | 
 6 | echo ''
 7 | echo '#######################################################################'
 8 | echo '#                          Running nosetests                          #'
 9 | echo '#######################################################################'
10 | xvfb-run nosetests $PROJECT_DIR
11 | 
12 | TEST_STATUS=$?
13 | if [[ ("$TEST_STATUS" == 0) ]]; then
14 |   echo '#######################################################################'
15 |   echo '#                          nosetests succeded                         #'
16 |   echo '#######################################################################'
17 |   exit 0
18 | else
19 |   echo ''
20 |   echo '#######################################################################'
21 |   echo '#                          nosetests failed    !                      #'
22 |   echo '#######################################################################'
23 |   exit 1
24 | fi
25 | 


--------------------------------------------------------------------------------
/.github/workflows/codeql-analysis.yml:
--------------------------------------------------------------------------------
 1 | name: "CodeQL"
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [master, ]
 6 |   pull_request:
 7 |     # The branches below must be a subset of the branches above
 8 |     branches: [master]
 9 |   schedule:
10 |     - cron: '0 7 * * 2'
11 | 
12 | jobs:
13 |   analyse:
14 |     name: Analyse
15 |     runs-on: ubuntu-latest
16 | 
17 |     steps:
18 |     - name: Checkout repository
19 |       uses: actions/checkout@v3
20 |       with:
21 |         # We must fetch at least the immediate parents so that if this is
22 |         # a pull request then we can checkout the head.
23 |         fetch-depth: 2
24 | 
25 |     # If this run was triggered by a pull request event, then checkout
26 |     # the head of the pull request instead of the merge commit.
27 |     - run: git checkout HEAD^2
28 |       if: ${{ github.event_name == 'pull_request' }}
29 | 
30 |     # Initializes the CodeQL tools for scanning.
31 |     - name: Initialize CodeQL
32 |       uses: github/codeql-action/init@v2
33 |       # Override language selection by uncommenting this and choosing your languages
34 |       # with:
35 |       #   languages: go, javascript, csharp, python, cpp, java
36 | 
37 |     # Autobuild attempts to build any compiled languages  (C/C++, C#, or Java).
38 |     # If this step fails, then you should remove it and run the build manually (see below)
39 |     - name: Autobuild
40 |       uses: github/codeql-action/autobuild@v2
41 | 
42 |     # ℹ️ Command-line programs to run using the OS shell.
43 |     # 📚 https://git.io/JvXDl
44 | 
45 |     # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines
46 |     #    and modify them (or add more) to build your code if your project
47 |     #    uses a compiled language
48 | 
49 |     #- run: |
50 |     #   make bootstrap
51 |     #   make release
52 | 
53 |     - name: Perform CodeQL Analysis
54 |       uses: github/codeql-action/analyze@v2
55 | 


--------------------------------------------------------------------------------
/.github/workflows/continuous-integration.yml:
--------------------------------------------------------------------------------
 1 | name: Continuous Integration
 2 | on:
 3 |   push:
 4 |     branches:
 5 |     - master
 6 |   pull_request:
 7 |     branches:
 8 |     - master
 9 | jobs:
10 |   continuous-integration:
11 |     runs-on: ubuntu-latest
12 |     steps:
13 |       - uses: actions/checkout@v3
14 |       - name: Set up Docker Buildx
15 |         uses: docker/setup-buildx-action@v2
16 |       - name: Cache Docker layers
17 |         uses: actions/cache@v3
18 |         with:
19 |           path: /tmp/.buildx-cache
20 |           key: ${{ runner.os }}-buildx-${{ github.sha }}
21 |           restore-keys: |
22 |             ${{ runner.os }}-buildx-
23 |       - name: Build the tests docker container
24 |         uses: docker/build-push-action@v3.1.1
25 |         with:
26 |           tags: jstockwin/py-pdf-parser-test:test
27 |           cache-from: type=local,src=/tmp/.buildx-cache
28 |           cache-to: type=local,dest=/tmp/.buildx-cache-new
29 |           load: true
30 |         # This ugly bit is necessary if you don't want your cache to grow forever
31 |         # till it hits GitHub's limit of 5GB.
32 |         # Temp fix
33 |         # https://github.com/docker/build-push-action/issues/252
34 |         # https://github.com/moby/buildkit/issues/1896
35 |       - name: Move cache
36 |         run: |
37 |           rm -rf /tmp/.buildx-cache
38 |           mv /tmp/.buildx-cache-new /tmp/.buildx-cache
39 |       - name: Run test
40 |         run: |  # Note we need '-uroot' so user has permissions to github.workspace
41 |           docker run --rm -uroot --volume ${{ github.workspace }}:/py-pdf-parser \
42 |           jstockwin/py-pdf-parser-test:test .github/scripts/test.sh
43 |       - name: Check docs build correctly
44 |         run: docker run --rm jstockwin/py-pdf-parser-test:test .github/scripts/docs.sh
45 | 


--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
 1 | name: Publish new version
 2 | on:
 3 |   release:
 4 |     types: [published]
 5 | jobs:
 6 |   build:
 7 |     runs-on: ubuntu-latest
 8 |     steps:
 9 |       - uses: actions/checkout@v3
10 |       - uses: actions/setup-python@master
11 |       - name: Install build packages
12 |         run: pip3 install twine==3.1.1 wheel==0.34.2
13 |       - name: Build package
14 |         run: python3 setup.py sdist bdist_wheel
15 |       - name: Check built package
16 |         run: twine check dist/*
17 |       - uses: actions/upload-artifact@v3
18 |         with:
19 |           path: ./dist
20 | 
21 |   pypi-publish:
22 |     needs: ["build"]
23 |     environment: "pypi"
24 | 
25 |     name: upload release to PyPI
26 |     runs-on: ubuntu-latest
27 |     permissions:
28 |       # IMPORTANT: this permission is mandatory for trusted publishing
29 |       id-token: write
30 |     steps:
31 |       - uses: actions/download-artifact@v3
32 |       - name: Publish package distributions to PyPI
33 |         uses: pypa/gh-action-pypi-publish@release/v1
34 |         with:
35 |           packages_dir: artifact/
36 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | .pytest_cache/
 49 | 
 50 | # Translations
 51 | *.mo
 52 | *.pot
 53 | 
 54 | # Django stuff:
 55 | *.log
 56 | local_settings.py
 57 | db.sqlite3
 58 | 
 59 | # Flask stuff:
 60 | instance/
 61 | .webassets-cache
 62 | 
 63 | # Scrapy stuff:
 64 | .scrapy
 65 | 
 66 | # Sphinx documentation
 67 | docs/_build/
 68 | 
 69 | # PyBuilder
 70 | target/
 71 | 
 72 | # Jupyter Notebook
 73 | .ipynb_checkpoints
 74 | 
 75 | # pyenv
 76 | .python-version
 77 | 
 78 | # celery beat schedule file
 79 | celerybeat-schedule
 80 | 
 81 | # SageMath parsed files
 82 | *.sage.py
 83 | 
 84 | # Environments
 85 | .env
 86 | .venv
 87 | env/
 88 | venv/
 89 | ENV/
 90 | env.bak/
 91 | venv.bak/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 
106 | # pytype
107 | .pytype/
108 | 
109 | .vscode/
110 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | # See https://pre-commit.com for more information
 2 | # See https://pre-commit.com/hooks.html for more hooks
 3 | default_language_version:
 4 |     python: python3.8  # pinned until pytype issue resolved
 5 | repos:
 6 | -   repo: https://github.com/pre-commit/pre-commit-hooks
 7 |     rev: v4.6.0
 8 |     hooks:
 9 |     -   id: check-added-large-files
10 |     -   id: check-merge-conflict
11 |     -   id: debug-statements
12 |     -   id: detect-private-key
13 |     -   id: end-of-file-fixer
14 |     -   id: mixed-line-ending
15 |     -   id: trailing-whitespace
16 | -   repo: https://github.com/pycqa/isort
17 |     rev: 5.13.2
18 |     hooks:
19 |     -   id: isort
20 |         args: [--profile, black]
21 | -   repo: https://github.com/psf/black
22 |     rev: 24.4.2
23 |     hooks:
24 |     -   id: black
25 | -   repo: https://github.com/pycqa/flake8
26 |     rev: 7.1.0
27 |     hooks:
28 |     -   id: flake8
29 | -   repo: https://github.com/mattseymour/pre-commit-pytype
30 |     rev: '2023.5.8'
31 |     hooks:
32 |     -   id: pytype
33 |         args: ['--disable=pyi-error,import-error', '--exclude=tests']
34 | -   repo: https://github.com/pre-commit/mirrors-mypy
35 |     rev: 'v1.10.1'
36 |     hooks:
37 |     -   id: mypy
38 |         additional_dependencies: [types-mock]
39 | 


--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
 1 | # .readthedocs.yml
 2 | # Read the Docs configuration file
 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
 4 | 
 5 | # Required
 6 | version: 2
 7 | 
 8 | # Build documentation in the docs/ directory with Sphinx
 9 | sphinx:
10 |   configuration: docs/source/conf.py
11 | 
12 | # Build documentation with MkDocs
13 | #mkdocs:
14 | #  configuration: mkdocs.yml
15 | 
16 | # Optionally build your docs in additional formats such as PDF and ePub
17 | formats: all
18 | 
19 | # Optionally set the version of Python and requirements required to build your docs
20 | python:
21 |   version: 3.8
22 |   install:
23 |     - method: pip
24 |       path: .
25 |       extra_requirements:
26 |         - dev
27 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
  1 | # Changelog
  2 | All notable changes to this project will be documented in this file.
  3 | 
  4 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
  5 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
  6 | 
  7 | ## [Unreleased]
  8 | 
  9 | ## [0.13.0] - 2024-07-23
 10 | 
 11 | ### Added
 12 | - Added extra filtering methods for ElementList
 13 | - Make sure tests and docs are not included in binary distribution wheels (PyPI) and source distribution (sdist).
 14 | 
 15 | ## [0.12.0] - 2023-11-10
 16 | 
 17 | ### Added
 18 | - Added support for opening password protected files ([#350](https://github.com/jstockwin/py-pdf-parser/pull/350))
 19 | 
 20 | ## [0.11.0] - 2023-08-07
 21 | 
 22 | ### Changed
 23 | - Various dependency updates
 24 | - PyPI releases now use Trusted Publishers
 25 | 
 26 | ### Fixed
 27 | - Fixed typo in docs ([#361](https://github.com/jstockwin/py-pdf-parser/pull/361))
 28 | 
 29 | ## [0.10.2] - 2022-11-07
 30 | 
 31 | ### Changed
 32 | - Various dependency updates
 33 | - Removed unused PyYAML dependency ([#262](https://github.com/jstockwin/py-pdf-parser/pull/262))
 34 | 
 35 | ## [0.10.1] - 2021-10-12
 36 | ### Fixed
 37 | - The `visualise` function properly uses the _elements_ parameter in order to filter visualised elements. ([#256](https://github.com/jstockwin/py-pdf-parser/pull/256))
 38 | 
 39 | ### Changed
 40 | - Various dependency updates
 41 | 
 42 | ## [0.10.0] - 2021-07-01
 43 | - [BREAKING] Changes from using pyqt5 to using tkinter for the visualise tool. This
 44 |   means we don't need the python3-dev as a requirement, and seems to solve endless
 45 |   issues with pyqt5 not finding the correct qt bindings. This is a potential breaking
 46 |   change, although the visualise tool is only in the development version. No code
 47 |   changes are needed, but you will need tkinter installed for visualise to still work.
 48 | - Changed python version from 3.6 to 3.8 in `.readthedocs.yml`.
 49 | 
 50 | ## [0.9.0] - 2021-06-09
 51 | ### Changed
 52 | - Various dependency updates (matplotlib, pyqt5)
 53 | - Removed all but the tests dockerfile for simplicity. Use Docker BuildKit. We will no longer be pushing images to DockerHub on release. ([#203](https://github.com/jstockwin/py-pdf-parser/pull/203))
 54 | 
 55 | ## [0.8.0] - 2021-05-12
 56 | ### Changed
 57 | - Various dependency updates
 58 | - Updated CI to avoid login issue ([#182](https://github.com/jstockwin/py-pdf-parser/pull/182))
 59 | 
 60 | ## [0.7.0] - 2021-01-15
 61 | ### Changed
 62 | - Ensure we only accept LTTextBoxes at the top level (not LTTextLines) ([#155](https://github.com/jstockwin/py-pdf-parser/pull/155))
 63 | ## [0.6.0] - 2020-12-11
 64 | ### Added
 65 | - Enabled dependabot which should help to keep packages up to date ([#124](https://github.com/jstockwin/py-pdf-parser/pull/124))
 66 | 
 67 | ### Changed
 68 | - Various dependency updates
 69 | 
 70 | ### Fixed
 71 | - Fixed a typo in simple memo example in the documentation. ([#121](https://github.com/jstockwin/py-pdf-parser/pull/121))
 72 | 
 73 | ## [0.5.0] - 2020-07-05
 74 | ### Added
 75 | - New functions on `ElementList`, `move_forwards_from` and `move_backwards_from`, to allow moving forwards and backwards from a certain element in the list easily. ([#113](https://github.com/jstockwin/py-pdf-parser/pull/113))
 76 | 
 77 | ### Changed
 78 | - When the layout parameter all_texts is True, the text inside figures is now also returned as elements in the document. ([#99](https://github.com/jstockwin/py-pdf-parser/pull/99))
 79 | 
 80 | ### Fixed
 81 | - Passing a tolerance less than the width/height of an element no longer causes an error. The tolerance is now capped at half the width/height of the element. ([#103](https://github.com/jstockwin/py-pdf-parser/pull/103))
 82 | 
 83 | ## [0.4.0] - 2020-06-22
 84 | ### Added
 85 | - Added `__len__` and `__repr__` functions to the Section class. ([#90](https://github.com/jstockwin/py-pdf-parser/pull/90))
 86 | - Added flag to `extract_simple_table` and `extract_table` functions to remove duplicate header rows. ([#89](https://github.com/jstockwin/py-pdf-parser/pull/89))
 87 | - You can now specify `element_ordering` when instantiating a PDFDocument. This defaults to the old behaviour of left to right, top to bottom. ([#95](https://github.com/jstockwin/py-pdf-parser/pull/95))
 88 | 
 89 | ### Changed
 90 | - Advanced layout analysis is now disabled by default. ([#88](https://github.com/jstockwin/py-pdf-parser/pull/88))
 91 | 
 92 | ## [0.3.0] - 2020-05-14
 93 | ### Added
 94 | - Published to PyPI as py-pdf-parser.
 95 | - Documentation is now hosted [here](https://py-pdf-parser.readthedocs.io/en/latest/). ([#71](https://github.com/jstockwin/py-pdf-parser/pull/71))
 96 | - Added new examples to the documentation. ([#74](https://github.com/jstockwin/py-pdf-parser/pull/74))
 97 | - Font filtering now caches the elements by font. ([#73](https://github.com/jstockwin/py-pdf-parser/pull/73)) (updated in [#78](https://github.com/jstockwin/py-pdf-parser/pull/78))
 98 | - Font filtering now caches the elements by font. ([#73](https://github.com/jstockwin/py-pdf-parser/pull/73))
 99 | - The visualise tool now draws an outline around each section on the page. ([#69](https://github.com/jstockwin/py-pdf-parser/pull/69)) (updated in [#80](https://github.com/jstockwin/py-pdf-parser/pull/80))
100 | 
101 | 
102 | ### Changed
103 | - This product is now complete enough for the needs of Optimor Ltd, however `jstockwin` is going to continue development as a personal project. The repository has been moved from `optimor/py-pdf-parser` to `jstockwin/py-pdf-parser`.
104 | 
105 | ## [0.2.0] - 2020-04-17
106 | ### Added
107 | - It is now possible to specify `font_size_precision` when instantiating a PDFDocument. This is the number of decimal places the font size will be rounded to. ([#60](https://github.com/jstockwin/py-pdf-parser/pull/60))
108 | - `extract_simple_table` now allows extracting tables with gaps, provided there is at least one full row and one full column. This is only the case if you pass `allow_gaps=True`, otherwise the original logic of raising an exception if there is a gap remains. You can optionally pass a `reference_element` which must be in both a full row and a full column, this defaults to the first (top-left) element. ([#57](https://github.com/jstockwin/py-pdf-parser/pull/57))
109 | 
110 | ### Changed
111 | - Font sizes are now `float` not `int`. The `font_size_precision` in the additions defaults to 1, and as such all fonts will change to have a single decimal place. To keep the old behaviour, you can pass `font_size_precision=0` when instantiating your PDFDocument.
112 | 
113 | ### Fixed
114 | - Improved performance of `extract_simple_table`, which is now much faster. ([#65](https://github.com/jstockwin/py-pdf-parser/pull/65))
115 | 
116 | ## [0.1.0] - 2020-04-08
117 | ### Added
118 | - Initial version of the product. Note: The version is less than 1, so this product should not yet be considered stable. API changes and other breaking changes are possible, if not likely.
119 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Contributor Covenant Code of Conduct
 2 | 
 3 | ## Our Pledge
 4 | 
 5 | In the interest of fostering an open and welcoming environment, we as
 6 | contributors and maintainers pledge to making participation in our project and
 7 | our community a harassment-free experience for everyone, regardless of age, body
 8 | size, disability, ethnicity, sex characteristics, gender identity and expression,
 9 | level of experience, education, socio-economic status, nationality, personal
10 | appearance, race, religion, or sexual identity and orientation.
11 | 
12 | ## Our Standards
13 | 
14 | Examples of behavior that contributes to creating a positive environment
15 | include:
16 | 
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 | 
23 | Examples of unacceptable behavior by participants include:
24 | 
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 |  advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 |  address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 |  professional setting
33 | 
34 | ## Our Responsibilities
35 | 
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behavior and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behavior.
39 | 
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviors that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 | 
46 | ## Scope
47 | 
48 | This Code of Conduct applies both within project spaces and in public spaces
49 | when an individual is representing the project or its community. Examples of
50 | representing a project or community include using an official project e-mail
51 | address, posting via an official social media account, or acting as an appointed
52 | representative at an online or offline event. Representation of a project may be
53 | further defined and clarified by project maintainers.
54 | 
55 | ## Enforcement
56 | 
57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
58 | reported by contacting the project team at jstockwin@gmail.com. All
59 | complaints will be reviewed and investigated and will result in a response that
60 | is deemed necessary and appropriate to the circumstances. The project team is
61 | obligated to maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 | 
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 | 
68 | ## Attribution
69 | 
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
72 | 
73 | [homepage]: https://www.contributor-covenant.org
74 | 
75 | For answers to common questions about this code of conduct, see
76 | https://www.contributor-covenant.org/faq
77 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing
 2 | 
 3 | Contributions to this project are very welcome in whatever form these may be. We highly
 4 | appreciate bug reports, pull requests, and documentation improvements.
 5 | 
 6 | Any interaction with this project should adhere to our Code of Conduct, which you can
 7 | find below.
 8 | 
 9 | It should be noted that this project heavily relies on
10 | [pdfminer.six](https://github.com/pdfminer/pdfminer.six) and many issues about loading
11 | PDFs may be due to this package. We ask that you try to avoid filing bugs that are
12 | likely to be caused by pdfminer.six against this repository, but rather you should
13 | report these bugs directly at
14 | [pdfminer.six/issues](https://github.com/pdfminer/pdfminer.six/issues).
15 | 
16 | ## Issues
17 | 
18 | Issues are very valuable to this project.
19 | 
20 | * Ideas are a valuable source of contributions others can make.
21 | * Problems show where this project is lacking.
22 | * With a question you show where contributors can improve the user experience.
23 | 
24 | Thank you for creating them. If you are submitting an issue and would be interested in
25 | helping to work on the fix, please indicate this in the issue.
26 | 
27 | ## Pull Requests
28 | 
29 | Pull requests are also very valuable. Before submitting a pull request, it is probably
30 | a good idea to first submit an issue to discuss the matter. This helps to avoid wasting
31 | your time working on something that may not be accepted.
32 | 
33 | When submitting a Pull Request, you will need to do the following things. There is a
34 | checklist in the template to help make sure you don't forget.
35 | 
36 | We run type checks using both pytype and mypy. We also enforce code style using
37 | pycodestyle and black. You can run `docker-compose run --rm lint` to check this.
38 | 
39 | * Provide a good description of the change, and the reason for it.
40 | * Ensure the tests, type checks, and linting pass (this is done by continuous
41 |   integration).
42 | * Add any additional tests, as required.
43 | * Ensure all of your changes are well documented.
44 | * Update the CHANGELOG.md with a description of your changes, following the format from
45 |   [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
46 | 
47 | ## Code of Conduct
48 | 
49 | Before contributing, please read our [Code of Conduct](CODE_OF_CONDUCT.md).
50 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | # syntax = docker/dockerfile:1.2
 2 | FROM phusion/baseimage:focal-1.0.0
 3 | 
 4 | RUN adduser --disabled-password --gecos "" app_user
 5 | 
 6 | RUN apt-get update && \
 7 |     apt-get -y install software-properties-common \
 8 |                        python3-pip \
 9 |                        python3-virtualenv \
10 |                        python3-tk \
11 |                        libmagickwand-dev \
12 |                        xvfb && \
13 |     apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
14 | 
15 | ENV VIRTUAL_ENV_DIR /.venv
16 | RUN python3 -m virtualenv --python=python3.8 $VIRTUAL_ENV_DIR
17 | # Set the virtual environment as the main Python directory
18 | ENV PATH $VIRTUAL_ENV_DIR/bin:$PATH
19 | 
20 | RUN --mount=type=cache,target=/root/.cache/pip pip3 install --upgrade pip
21 | 
22 | # Create src dir
23 | ENV PROJECT_DIR /py-pdf-parser
24 | WORKDIR $PROJECT_DIR
25 | 
26 | # Add imagemagick policy
27 | ADD ./imagemagick_policy.xml /etc/ImageMagick-6/policy.xml
28 | 
29 | # Install requirements
30 | ADD ./setup.py $PROJECT_DIR/setup.py
31 | ADD ./README.md $PROJECT_DIR/README.md
32 | RUN --mount=type=cache,target=/root/.cache/pip pip3 install -e $PROJECT_DIR[dev]
33 | RUN --mount=type=cache,target=/root/.cache/pip pip3 install -e $PROJECT_DIR[test]
34 | RUN chown -R app_user:app_user $VIRTUAL_ENV_DIR
35 | 
36 | # Copy code, chown and switch user
37 | ADD ./ $PROJECT_DIR
38 | RUN chown -R app_user:app_user $PROJECT_DIR
39 | USER app_user
40 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Jake Stockwin
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.md
2 | include LICENSE
3 | prune tests
4 | prune tests/*
5 | prune docs
6 | prune docs/*
7 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # py-pdf-parser
 2 | 
 3 | [![PyPI version](https://badge.fury.io/py/py-pdf-parser.svg)](https://badge.fury.io/py/py-pdf-parser)
 4 | ![Continuous Integration](https://github.com/jstockwin/py-pdf-parser/workflows/Continuous%20Integration/badge.svg)
 5 | [![Documentation Status](https://readthedocs.org/projects/py-pdf-parser/badge/?version=latest)](https://py-pdf-parser.readthedocs.io/en/latest/?badge=latest)
 6 | 
 7 | Py PDF Parser is a tool to help extract information from structured PDFs.
 8 | 
 9 | Full details and installation instructions can be found at:
10 | https://py-pdf-parser.readthedocs.io/en/latest/
11 | 
12 | This project is based on an original design and prototype by Sam Whitehall (github.com/samwhitehall).
13 | 


--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 | # Security Policy
2 | 
3 | ## Reporting a Vulnerability
4 | 
5 | Please do not use public GitHub issues to report a security vulnerability.
6 | 
7 | Instead, please send an email directly to jstockwin@gmail.com. Do not include any sensitive information in your email. Do try to include as much information as possible to help us understand the issue.
8 | 


--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | shell:
 2 |   extends:
 3 |     service: base
 4 |   volumes:
 5 |     - .:/py-pdf-parser
 6 |     - /tmp/.X11-unix:/tmp/.X11-unix:rw
 7 |     - ./imagemagick_policy.xml:/etc/ImageMagick-6/policy.xml
 8 |   environment:
 9 |     - DISPLAY
10 |   command: bash
11 | 
12 | tests:
13 |   extends:
14 |     service: base
15 |   command: .github/scripts/test.sh
16 | 
17 | # Run docs to re-build the docs once.
18 | docs:
19 |   extends:
20 |     service: base
21 |   command: make --directory docs html
22 |   environment:
23 |     - SPHINXOPTS="-W"
24 | 
25 | # Use "up" to host the docs on port 8000, watching for changes.
26 | docs-autobuild:
27 |   extends:
28 |     service: base
29 |   ports:
30 |     - "8000:8000"
31 |   command: make --directory docs livehtml
32 | 
33 | base:
34 |   build: .
35 |   volumes:
36 |     - .:/py-pdf-parser
37 |   working_dir: /py-pdf-parser
38 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line, and also
 5 | # from the environment for the first two.
 6 | SPHINXOPTS    ?=
 7 | SPHINXBUILD   ?= sphinx-build
 8 | SOURCEDIR     = source
 9 | BUILDDIR      = build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | livehtml:
18 | 	sphinx-autobuild --host 0.0.0.0 -b html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
19 | 
20 | # Catch-all target: route all unknown targets to Sphinx using the new
21 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
22 | %: Makefile
23 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
24 | 


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 | 
13 | if "%1" == "" goto help
14 | 
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | 	echo.
18 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | 	echo.installed, then set the SPHINXBUILD environment variable to point
20 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | 	echo.may add the Sphinx directory to PATH.
22 | 	echo.
23 | 	echo.If you don't have Sphinx installed, grab it from
24 | 	echo.http://sphinx-doc.org/
25 | 	exit /b 1
26 | )
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
36 | 


--------------------------------------------------------------------------------
/docs/source/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | ../../CHANGELOG.md


--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
 1 | # Configuration file for the Sphinx documentation builder.
 2 | #
 3 | # This file only contains a selection of the most common options. For a full
 4 | # list see the documentation:
 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
 6 | 
 7 | # -- Path setup --------------------------------------------------------------
 8 | 
 9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | from typing import List
14 | 
15 | import os
16 | import sys
17 | 
18 | sys.path.insert(0, os.path.abspath("../../"))
19 | 
20 | 
21 | # -- Project information -----------------------------------------------------
22 | 
23 | project = "PDF Parser"
24 | copyright = "2019, Jake Stockwin"
25 | author = "Jake Stockwin"
26 | 
27 | 
28 | # -- General configuration ---------------------------------------------------
29 | 
30 | # Add any Sphinx extension module names here, as strings. They can be
31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
32 | # ones.
33 | extensions = [
34 |     "sphinx.ext.autodoc",
35 |     "sphinx.ext.napoleon",
36 |     "sphinx_rtd_theme",
37 |     "recommonmark",
38 | ]
39 | 
40 | # Add any paths that contain templates here, relative to this directory.
41 | templates_path = ["_templates"]
42 | 
43 | # List of patterns, relative to source directory, that match files and
44 | # directories to ignore when looking for source files.
45 | # This pattern also affects html_static_path and html_extra_path.
46 | exclude_patterns: List[str] = []
47 | 
48 | master_doc = "index"
49 | 
50 | 
51 | # -- Options for HTML output -------------------------------------------------
52 | 
53 | # The theme to use for HTML and HTML Help pages.  See the documentation for
54 | # a list of builtin themes.
55 | #
56 | html_theme = "sphinx_rtd_theme"
57 | 
58 | # Add any paths that contain custom static files (such as style sheets) here,
59 | # relative to this directory. They are copied after the builtin static files,
60 | # so a file named "default.css" will overwrite the builtin "default.css".
61 | html_static_path: List[str] = []
62 | 
63 | 
64 | # -- Extension configuration -------------------------------------------------
65 | 


--------------------------------------------------------------------------------
/docs/source/example_files/columns.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jstockwin/py-pdf-parser/6aa400371e948307bcaa5073f33507c9b6e84f06/docs/source/example_files/columns.pdf


--------------------------------------------------------------------------------
/docs/source/example_files/figure.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jstockwin/py-pdf-parser/6aa400371e948307bcaa5073f33507c9b6e84f06/docs/source/example_files/figure.pdf


--------------------------------------------------------------------------------
/docs/source/example_files/grid.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jstockwin/py-pdf-parser/6aa400371e948307bcaa5073f33507c9b6e84f06/docs/source/example_files/grid.pdf


--------------------------------------------------------------------------------
/docs/source/example_files/order_summary.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jstockwin/py-pdf-parser/6aa400371e948307bcaa5073f33507c9b6e84f06/docs/source/example_files/order_summary.pdf


--------------------------------------------------------------------------------
/docs/source/example_files/simple_memo.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jstockwin/py-pdf-parser/6aa400371e948307bcaa5073f33507c9b6e84f06/docs/source/example_files/simple_memo.pdf


--------------------------------------------------------------------------------
/docs/source/example_files/tables.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jstockwin/py-pdf-parser/6aa400371e948307bcaa5073f33507c9b6e84f06/docs/source/example_files/tables.pdf


--------------------------------------------------------------------------------
/docs/source/examples/element_ordering.rst:
--------------------------------------------------------------------------------
  1 | .. _element-ordering:
  2 | 
  3 | Element Ordering
  4 | ----------------
  5 | 
  6 | In this example, we see how to specify a custom ordering for the elements.
  7 | 
  8 | For this we will use a simple pdf, which has a single element in each corner of the
  9 | page. You can :download:`download the example here </example_files/grid.pdf>`.
 10 | 
 11 | 
 12 | Default
 13 | .......
 14 | 
 15 | The default element ordering is left to right, top to bottom.
 16 | 
 17 | .. code-block:: python
 18 | 
 19 |    from py_pdf_parser.loaders import load_file
 20 | 
 21 |    file_path = "grid.pdf"
 22 | 
 23 |    # Default - left to right, top to bottom
 24 |    document = load_file(file_path)
 25 |    print([element.text() for element in document.elements])
 26 | 
 27 | This results in
 28 | ::
 29 | 
 30 |    ['Top Left', 'Top Right', 'Bottom Left', 'Bottom Right']
 31 | 
 32 | Presets
 33 | .......
 34 | 
 35 | There are also preset orderings for ``right to left, top to bottom``,
 36 | ``top to bottom, left to right``, and ``top to bottom, right to left``. You can use
 37 | these by importing the :class:`~py_pdf_parser.components.ElementOrdering` class from
 38 | :py:mod:`py_pdf_parser.components` and passing these as the ``element_ordering``
 39 | argument to :class:`~py_pdf_parser.components.PDFDocument`. Note that keyword arguments
 40 | to :meth:`~py_pdf_parser.loaders.load` and :meth:`~py_pdf_parser.loaders.load_file` get
 41 | passed through to the :class:`~py_pdf_parser.components.PDFDocument`.
 42 | 
 43 | .. code-block:: python
 44 | 
 45 |    from py_pdf_parser.loaders import load_file
 46 |    from py_pdf_parser.components import ElementOrdering
 47 | 
 48 |    # Preset - right to left, top to bottom
 49 |    document = load_file(
 50 |        file_path, element_ordering=ElementOrdering.RIGHT_TO_LEFT_TOP_TO_BOTTOM
 51 |    )
 52 |    print([element.text() for element in document.elements])
 53 | 
 54 |    # Preset - top to bottom, left to right
 55 |    document = load_file(
 56 |        file_path, element_ordering=ElementOrdering.TOP_TO_BOTTOM_LEFT_TO_RIGHT
 57 |    )
 58 |    print([element.text() for element in document.elements])
 59 | 
 60 |    # Preset - top to bottom, right to left
 61 |    document = load_file(
 62 |        file_path, element_ordering=ElementOrdering.TOP_TO_BOTTOM_RIGHT_TO_LEFT
 63 |    )
 64 |    print([element.text() for element in document.elements])
 65 | 
 66 | which results in
 67 | 
 68 | ::
 69 | 
 70 |    ['Top Right', 'Top Left', 'Bottom Right', 'Bottom Left']
 71 |    ['Top Left', 'Bottom Left', 'Top Right', 'Bottom Right']
 72 |    ['Top Right', 'Bottom Right', 'Top Left', 'Bottom Left']
 73 | 
 74 | Custom Ordering
 75 | ...............
 76 | 
 77 | If none of the presets give an ordering you are looking for, you can also pass a
 78 | callable as the ``element_ordering`` argument of
 79 | :class:`~py_pdf_parser.components.PDFDocument`. This callable will be given a list of
 80 | elements for each page, and should return a list of the same elements, in the desired
 81 | order.
 82 | 
 83 | .. important::
 84 | 
 85 |    The elements which get passed to your function will be PDFMiner.six elements, and NOT
 86 |    :class:`~py_pdf_parser.components.PDFElement` instances. You can access the ``x0``,
 87 |    ``x1``, ``y0``, ``y1`` directly, and extract the text using ``get_text()``. Other
 88 |    options are available: please familiarise yourself with the PDFMiner.six
 89 |    documentation.
 90 | 
 91 | .. note::
 92 | 
 93 |    Your function will be called multiple times, once for each page of the document.
 94 |    Elements will always be considered in order of increasing page number, your function
 95 |    only controls the ordering within each page.
 96 | 
 97 | For example, if we wanted to implement an ordering which is bottom to top, left to right
 98 | then we can do this as follows:
 99 | 
100 | .. code-block:: python
101 | 
102 |    from py_pdf_parser.loaders import load_file
103 | 
104 |    # Custom - bottom to top, left to right
105 |    def ordering_function(elements):
106 |        """
107 |        Note: Elements will be PDFMiner.six elements. The x axis is positive as you go left
108 |        to right, and the y axis is positive as you go bottom to top, and hence we can
109 |        simply sort according to this.
110 |        """
111 |        return sorted(elements, key=lambda elem: (elem.x0, elem.y0))
112 | 
113 | 
114 |    document = load_file(file_path, element_ordering=ordering_function)
115 |    print([element.text() for element in document.elements])
116 | 
117 | which results in
118 | 
119 | ::
120 | 
121 |    ['Bottom Left', 'Top Left', 'Bottom Right', 'Top Right']
122 | 
123 | Multiple Columns
124 | ................
125 | 
126 | Finally, suppose our PDF has multiple columns, like
127 | :download:`this example </example_files/columns.pdf>`.
128 | 
129 | If we don't specify an ``element_ordering``, the elements will be extracted in the
130 | following order:
131 | 
132 | ::
133 | 
134 |    ['Column 1 Title', 'Column 2 Title', 'Here is some column 1 text.', 'Here is some column 2 text.', 'Col 1 left', 'Col 1 right', 'Col 2 left', 'Col 2 right']
135 | 
136 | If we visualise this document
137 | (see the :ref:`simple-memo` example if you don't know how to do this), then we can see
138 | that the column divider is at an ``x`` value of about 300. Using this information, we
139 | can specify a custom ordering function which will order the elements left to right,
140 | top to bottom, but in each column individually.
141 | 
142 | .. code-block:: python
143 | 
144 |    from py_pdf_parser.loaders import load_file
145 | 
146 |    document = load_file("columns.pdf")
147 | 
148 |    def column_ordering_function(elements):
149 |        """
150 |        The first entry in the key is False for column 1, and True for column 2. The second
151 |        and third keys just give left to right, top to bottom.
152 |        """
153 |        return sorted(elements, key=lambda elem: (elem.x0 > 300, -elem.y0, elem.x0))
154 | 
155 | 
156 |    document = load_file("columns.pdf", element_ordering=column_ordering_function)
157 |    print([element.text() for element in document.elements])
158 | 
159 | which returns the elements in the correct order:
160 | 
161 | ::
162 | 
163 |    ['Column 1 Title', 'Here is some column 1 text.', 'Col 1 left', 'Col 1 right', 'Column 2 Title', 'Here is some column 2 text.', 'Col 2 left', 'Col 2 right']
164 | 


--------------------------------------------------------------------------------
/docs/source/examples/extracting_text_from_figures.rst:
--------------------------------------------------------------------------------
 1 | .. _extracting-text-from-figures:
 2 | 
 3 | Extracting Text From Figures
 4 | ----------------------------
 5 | PDFs are structured documents, and can contain Figures. By default, PDFMiner.six and
 6 | hence py-pdf-parser does not extract text from figures.
 7 | 
 8 | You can :download:`download an example here </example_files/figure.pdf>`. In the
 9 | example, there is a figure which contains a red square, and some text. Below the figure
10 | there is some more text.
11 | 
12 | By default, the text in the figure will not be included:
13 | 
14 | .. code-block:: python
15 | 
16 |    from py_pdf_parser.loaders import load_file
17 |    document = load_file("figure.pdf")
18 |    print([element.text() for element in document.elements])
19 | 
20 | which results in:
21 | 
22 | ::
23 | 
24 |    ["Here is some text outside of an image"]
25 | 
26 | To include the text inside the figure, we must pass the ``all_texts`` layout parameter.
27 | This is documented in the PDFMiner.six documentation, `here
28 | <https://pdfminersix.readthedocs.io/en/latest/reference/composable.html#laparams>`_.
29 | 
30 | The layout parameters can be passed to both :meth:`~py_pdf_parser.loaders.load` and
31 | :meth:`~py_pdf_parser.loaders.load_file` as a dictionary to the ``la_params`` argument.
32 | 
33 | In our case:
34 | 
35 | .. code-block:: python
36 | 
37 |    from py_pdf_parser.loaders import load_file
38 |    document = load_file("figure.pdf", la_params={"all_texts": True})
39 |    print([element.text() for element in document.elements])
40 | 
41 | which results in:
42 | 
43 | ::
44 | 
45 |    ["This is some text in an image", "Here is some text outside of an image"]
46 | 


--------------------------------------------------------------------------------
/docs/source/examples/index.rst:
--------------------------------------------------------------------------------
 1 | Examples
 2 | ========
 3 | 
 4 | Below you can find links to the following examples:
 5 | 
 6 | - The :ref:`simple-memo` example shows the very basics of using py-pdf-parser. You will see how to load a pdf document, start filtering the elements, and extract text from certain elements in the document.
 7 | - The :ref:`order-summary` example explains how to use font mappings, sections, and how to extract simple tables.
 8 | - The :ref:`more-tables` example explains tables in more detail, showing how to extract more complex tables.
 9 | - The :ref:`element-ordering` example shows how to specify different orderings for the elements on a page.
10 | - The :ref:`extracting-text-from-figures` example shows how to extract text from figures.
11 | 
12 | .. toctree::
13 | 
14 |    simple_memo
15 |    order_summary
16 |    more_tables
17 |    element_ordering
18 |    extracting_text_from_figures
19 | 


--------------------------------------------------------------------------------
/docs/source/examples/more_tables.rst:
--------------------------------------------------------------------------------
  1 | .. _more-tables:
  2 | 
  3 | More Tables
  4 | -----------
  5 | 
  6 | In this example, we will learn how to extract different types of table, and the difference between a simple table and more complicated ones.
  7 | 
  8 | You can :download:`download the example here </example_files/tables.pdf>`.
  9 | 
 10 | Please read the :ref:`order-summary` example first, as this covers some other functionality of the table extraction methods.
 11 | 
 12 | Load the file
 13 | .............
 14 | 
 15 | The following code (click "show code" below to see it) loads the file, and assigns the elements for each table to a variable. If this does not make sense, you should go back and look at some of the previous examples.
 16 | 
 17 | .. raw:: html
 18 | 
 19 |    <details>
 20 |    <summary><a>Show code</a></summary>
 21 | 
 22 | .. code-block:: python
 23 | 
 24 |    from py_pdf_parser.loaders import load_file
 25 | 
 26 |    FONT_MAPPING = {
 27 |        "BAAAAA+LiberationSerif-Bold,12.0": "header",
 28 |        "CAAAAA+LiberationSerif,12.0": "table_element",
 29 |    }
 30 |    document = load_file("tables.pdf", font_mapping=FONT_MAPPING)
 31 | 
 32 |    headers = document.elements.filter_by_font("header")
 33 | 
 34 |    # Extract reference elements
 35 |    simple_table_header = headers.filter_by_text_equal(
 36 |        "Simple Table"
 37 |    ).extract_single_element()
 38 | 
 39 |    simple_table_with_gaps_header = headers.filter_by_text_equal(
 40 |        "Simple Table with gaps"
 41 |    ).extract_single_element()
 42 | 
 43 |    simple_table_with_gaps_in_first_row_col_header = headers.filter_by_text_equal(
 44 |        "Simple Table with gaps in first row/col"
 45 |    ).extract_single_element()
 46 | 
 47 |    non_simple_table_header = headers.filter_by_text_equal(
 48 |        "Non Simple Table"
 49 |    ).extract_single_element()
 50 | 
 51 |    non_simple_table_with_merged_cols_header = headers.filter_by_text_equal(
 52 |        "Non Simple Table with Merged Columns"
 53 |    ).extract_single_element()
 54 | 
 55 |    non_simple_table_with_merged_rows_header = headers.filter_by_text_equal(
 56 |        "Non Simple Table with Merged Rows and Columns"
 57 |    ).extract_single_element()
 58 | 
 59 |    over_the_page_header = headers.filter_by_text_equal(
 60 |        "Over the page"
 61 |    ).extract_single_element()
 62 | 
 63 |    # Extract table elements
 64 |    simple_table_elements = document.elements.between(
 65 |        simple_table_header, simple_table_with_gaps_header
 66 |    )
 67 |    simple_table_with_gaps_elements = document.elements.between(
 68 |        simple_table_with_gaps_header, simple_table_with_gaps_in_first_row_col_header
 69 |    )
 70 | 
 71 |    simple_table_with_gaps_in_first_row_col_elements = document.elements.between(
 72 |        simple_table_with_gaps_in_first_row_col_header, non_simple_table_header
 73 |    )
 74 | 
 75 |    non_simple_table_elements = document.elements.between(
 76 |        non_simple_table_header, non_simple_table_with_merged_cols_header
 77 |    )
 78 | 
 79 |    non_simple_table_with_merged_cols_elements = document.elements.between(
 80 |        non_simple_table_with_merged_cols_header, non_simple_table_with_merged_rows_header
 81 |    )
 82 | 
 83 |    non_simple_table_with_merged_rows_and_cols_elements = document.elements.between(
 84 |        non_simple_table_with_merged_rows_header, over_the_page_header
 85 |    )
 86 | 
 87 |    over_the_page_elements = document.elements.after(over_the_page_header)
 88 | 
 89 | .. raw:: html
 90 | 
 91 |    </details>
 92 | 
 93 | Overview
 94 | ........
 95 | 
 96 | The tables in the example pdf are split into "Simple Tables" and "Non Simple Tables". For the simple tables, we will be able to use :meth:`~py_pdf_parser.tables.extract_simple_table`, otherwise we must use :meth:`~py_pdf_parser.tables.extract_table`. The former is much more efficient, and should be used when possible.
 97 | 
 98 | In general, tables can become more complicated by having missing cells, or merged cells which go across multiple columns or multiple rows. In both cases, you will have to pass additional parameters to stop exceptions being raised when this is the case. This is to make the extraction more robust, and protect against unexpected outcomes.
 99 | 
100 | To use :meth:`~py_pdf_parser.tables.extract_simple_table` we must have at least one column and one row which have no missing cells, and we must have no merged cells at all. We will need to know which row/column has no missing cells, as these must be passed as the reference row and column.
101 | 
102 | To understand why: for each column element in the reference row and each row element in the reference column, :meth:`~py_pdf_parser.tables.extract_simple_table` will scan across from the row element (to get the row) and up/down from the column element (to get the column), and see if there is an element there. If there is, it is added to the table. Therefore, if there are gaps in the reference row/column, other elements may get missed. There is a check for this, so an exception will be raised if this is the case.
103 | 
104 | This means :meth:`~py_pdf_parser.tables.extract_simple_table` takes time proportional to ``len(cols) + len(rows)``. Conversely,  :meth:`~py_pdf_parser.tables.extract_table` is at least ``len(cols) * len(rows)``, and if there are merged cells it will be even worse. (Note in reality the complexity is not quite this simple, but it should give you an idea of the difference.)
105 | 
106 | Below, we will work through increasingly complex examples to explain the functionality, and the steps involved.
107 | 
108 | Simple Table
109 | ............
110 | 
111 | This table is as simple as they come - there are no blank or merged cells. This means we can simply use :meth:`~py_pdf_parser.tables.extract_simple_table` as we have seen previously.
112 | 
113 | .. code-block:: python
114 | 
115 |    from py_pdf_parser import tables
116 |    table = tables.extract_simple_table(simple_table_elements, as_text=True)
117 | 
118 | ::
119 | 
120 |    >>> table
121 |    [['Heading 1', 'Heading 2', 'Heading 3', 'Heading 4'], ['A', '1', 'A', '1'], ['B', '2', 'B', '2'], ['C', '3', 'C', '3']]
122 | 
123 | Simple Table with gaps
124 | ......................
125 | 
126 | This table has gaps, however there are no gaps in the first row or column. These are the default reference row and column, and so :meth:`~py_pdf_parser.tables.extract_simple_table` will still work as expected. Blank cells will be empty strings if ``as_text=True``, and otherwise they will be ``None``. However, if we try the same code as above:
127 | 
128 | .. code-block:: python
129 | 
130 |    table = tables.extract_simple_table(
131 |        simple_table_with_gaps_elements, as_text=True
132 |    )
133 | 
134 | this will raise an exception:
135 | 
136 | ::
137 | 
138 |    py_pdf_parser.exceptions.TableExtractionError: Element not found, there appears to be a gap in the table. If this is expected, pass allow_gaps=True.
139 | 
140 | This is to allow py-pdf-parser to be more robust in the case that you're expecting your table to have no empty cells. As the error message says, since this is expected behaviour we can simply pass ``allow_gaps=True``.
141 | 
142 | .. code-block:: python
143 | 
144 |    table = tables.extract_simple_table(
145 |        simple_table_with_gaps_elements, as_text=True, allow_gaps=True
146 |    )
147 | 
148 | ::
149 | 
150 |    >>> table
151 |    [['Heading 1', 'Heading 2', 'Heading 3', 'Heading 4'], ['A', '1', '', '1'], ['B', '', '', ''], ['C', '', 'C', '3']]
152 | 
153 | Simple Table with gaps in first row/col
154 | .......................................
155 | 
156 | This table is similar to the above example, but now we have gaps in the first row and the first column (if either of these were true then the above wouldn't work). If we try the above code, a useful exception is raised:
157 | 
158 | .. code-block:: python
159 | 
160 |    table = tables.extract_simple_table(
161 |        simple_table_with_gaps_in_first_row_col_elements, as_text=True, allow_gaps=True
162 |    )
163 | 
164 | ::
165 | 
166 |    py_pdf_parser.exceptions.TableExtractionError: Number of elements in table (9) does not match number of elements passed (12). Perhaps try extract_table instead of extract_simple_table, or change you reference element.
167 | 
168 | The error message suggests either passing another reference element, or using the more complicated :meth:`~py_pdf_parser.tables.extract_table` method. In this case, as we still have a row and a column which have no missing cells, we can just pass a new reference element.
169 | 
170 | As such, we can use the second column and the last row as our references, as neither of these have missing cells. The reference row and column are specified by simply passing the unique element in both the reference row and the reference column (called the reference element). In this case, it's the first number "3" in the table. Here we will be lazy and simply use the fact that this is the 10th element in the table, but you should probably do something smarter.
171 | 
172 | .. code-block:: python
173 | 
174 |    reference_element = simple_table_with_gaps_in_first_row_col_elements[9]
175 |    table = tables.extract_simple_table(
176 |        simple_table_with_gaps_in_first_row_col_elements,
177 |        as_text=True,
178 |        allow_gaps=True,
179 |        reference_element=reference_element,
180 |    )
181 | 
182 | ::
183 | 
184 |     >>> table
185 |     [['Heading 1', 'Heading 2', '', 'Heading 4'], ['', '1', 'A', ''], ['B', '2', '', '2'], ['C', '3', 'C', '3']]
186 | 
187 | Non Simple Table
188 | ................
189 | 
190 | The next table does not have any row with no empty cells, and as such we must use :meth:`~py_pdf_parser.tables.extract_table`. There is no ``allow_gaps`` parameter for this method, since if you don't want to allow gaps you should be using :meth:`~py_pdf_parser.tables.extract_simple_table` instead.
191 | 
192 | Whilst the below may seem easier than working out the reference element in the above example, please note that it will be computationally slower.
193 | 
194 | .. code-block:: python
195 | 
196 |    table = tables.extract_table(non_simple_table_elements, as_text=True)
197 | 
198 | ::
199 | 
200 |    >>> table
201 |    [['', 'Heading 2', 'Heading 3', 'Heading 4'], ['A', '1', '', '1'], ['B', '', 'B', '2'], ['C', '3', 'C', '']]
202 | 
203 | 
204 | Non Simple Table with Merged Columns
205 | ....................................
206 | 
207 | This table has text which goes across multiple columns. If we naively run this as above:
208 | 
209 | .. code-block:: python
210 | 
211 |    table = tables.extract_table(non_simple_table_with_merged_cols_elements, as_text=True)
212 | 
213 | then we get an exception:
214 | 
215 | ::
216 | 
217 |    py_pdf_parser.exceptions.TableExtractionError: An element is in multiple columns. If this is expected, you can try passing fix_element_in_multiple_cols=True
218 | 
219 | Just like ``allow_gaps``, this is so we can be more robust in the case that this is not expected. The error helpfully suggests to try passing ``fix_element_in_multiple_cols=True``.
220 | 
221 | .. code-block:: python
222 | 
223 |    table = tables.extract_table(
224 |        non_simple_table_with_merged_cols_elements,
225 |        as_text=True,
226 |        fix_element_in_multiple_cols=True,
227 |    )
228 | 
229 | ::
230 | 
231 |    >>> table
232 |    [['Heading 1', 'Heading 2', 'Heading 3', 'Heading 4'], ['A', '1', 'A', '1'], ['This text spans across multiple columns', '', 'B', '2'], ['C', '3', 'C', '3']]
233 | 
234 | Note that the merged cell has been pushed into the left-most column. Likewise, if we had a cell that was merged across multiple rows, we could pass ``fix_element_in_multiple_rows=True``, and it would be pushed into the top row.
235 | 
236 | Non Simple Table with Merged Rows and Columns
237 | .............................................
238 | 
239 | In this case we have both merged rows and merged columns. We can pass both ``fix_element_in_multiple_rows=True`` and ``fix_element_in_multiple_cols=True``. The merged cell will be pushed into the left-most column and the top row.
240 | 
241 | .. code-block:: python
242 | 
243 |    table = tables.extract_table(
244 |        non_simple_table_with_merged_rows_and_cols_elements,
245 |        as_text=True,
246 |        fix_element_in_multiple_rows=True,
247 |        fix_element_in_multiple_cols=True,
248 |    )
249 | 
250 | ::
251 | 
252 |    >>> table
253 |    [['Heading 1', 'Heading 2', 'Heading 3', 'Heading 4'], ['This text spans across multiple rows and \nmultiple columns.', '', 'A', '1'], ['', '', 'B', '2'], ['C', '3', 'C', '3']]
254 | 
255 | 
256 | Over the page
257 | .............
258 | 
259 | The final table goes over the page break. This is not a problem, simply pass the elements within the table and the result should be correct.
260 | 
261 | If you had e.g. a footer that broke the table in two, simply ensure these elements are not included in the element list you pass to :meth:`~py_pdf_parser.tables.extract_table`, and again it should still work.
262 | 
263 | .. code-block:: python
264 | 
265 |    table = tables.extract_simple_table(over_the_page_elements, as_text=True)
266 | 
267 | ::
268 | 
269 |    >>> table
270 |    [['Heading 1', 'Heading 2', 'Heading 3', 'Heading 4'], ['A', '1', 'A', '1'], ['B', '2', 'B', '2'], ['C', '3', 'C', '3']]
271 | 


--------------------------------------------------------------------------------
/docs/source/examples/order_summary.rst:
--------------------------------------------------------------------------------
  1 | .. _order-summary:
  2 | 
  3 | Order Summary
  4 | -------------
  5 | 
  6 | In this example we will extract some tabular data from an order summary pdf.
  7 | 
  8 | You can :download:`download the example here </example_files/order_summary.pdf>`.
  9 | 
 10 | This is a fairly simple PDF, and as such it would be fairly easy to identify the tables and extract the data from them, however we will use this example to introduce font mappings and sections, which will come in useful for larger PDFs.
 11 | 
 12 | Step 1 - Load the file
 13 | ......................
 14 | 
 15 | We can :func:`load <py_pdf_parser.loaders.load_file>` the file as follows, and take a quick look using the :func:`visualise tool <py_pdf_parser.visualise.main.visualise>` to check it looks good.
 16 | 
 17 | .. code-block:: python
 18 | 
 19 |    from py_pdf_parser.loaders import load_file
 20 |    from py_pdf_parser.visualise import visualise
 21 | 
 22 |    document = load_file("order_summary.pdf")
 23 |    visualise(document)
 24 | 
 25 | This should show the following. We should check that py-pdf-parser has detected each element correctly, which in this case it has.
 26 | 
 27 | .. image:: /screenshots/order_summary_example/initial.png
 28 |    :height: 300px
 29 | 
 30 | Step 2 - Use a font mapping
 31 | ...........................
 32 | 
 33 | Each :class:`~py_pdf_parser.components.PDFElement` has a :attr:`~py_pdf_parser.components.PDFElement.font` property, which is the name of the font in the PDF document (including the font size). You can use fonts to help filter elements.
 34 | 
 35 | Fonts often have long, not very useful names. However, additional keyword arguments passed to :func:`~py_pdf_parser.loaders.load_file` will be used to initialise the :class:`~py_pdf_parser.components.PDFDocument`. One of these is the font mapping, which allows you to map the fonts in your PDF to more useful names.
 36 | 
 37 | The visualise tool allows you to inspect fonts. If you hover over an element, a summary will be shown in text at the bottom of the window. For example, in the image below we hover over the first cell in the table, and can see that the font is ``EAAAAA+FreeMono,12.0``.
 38 | 
 39 | .. image:: /screenshots/order_summary_example/showing_font_1.png
 40 |    :height: 300px
 41 | 
 42 | We can easily ask to see all of the available fonts by running
 43 | 
 44 | ::
 45 | 
 46 |     >>> set(element.font for element in document.elements)
 47 |     {'EAAAAA+FreeMono,12.0', 'BAAAAA+LiberationSerif-Bold,16.0', 'CAAAAA+LiberationSerif,12.0', 'DAAAAA+FreeMonoBold,12.0', 'BAAAAA+LiberationSerif-Bold,12.0'}
 48 | 
 49 | Using this and the visualise tool, we can now choose better names for each of the fonts, and then load the document again, but this time providing a font mapping.
 50 | 
 51 | .. code-block:: python
 52 | 
 53 |    FONT_MAPPING = {
 54 |        "BAAAAA+LiberationSerif-Bold,16.0": "title",
 55 |        "BAAAAA+LiberationSerif-Bold,12.0": "sub_title",
 56 |        "CAAAAA+LiberationSerif,12.0": "text",
 57 |        "DAAAAA+FreeMonoBold,12.0": "table_header",
 58 |        "EAAAAA+FreeMono,12.0": "table_text",
 59 |    }
 60 |    document = load_file("order_summary.pdf", font_mapping=FONT_MAPPING)
 61 | 
 62 | Using the visualise tool again, we can now see that our element's font has changed to ``table_text``, which is a much more useful name for us.
 63 | 
 64 | .. image:: /screenshots/order_summary_example/showing_font_2.png
 65 |    :height: 300px
 66 | 
 67 | Step 3 - Use regex for font mapping
 68 | ...................................
 69 | In certain use cases (especially when handling many PDF files) you may encounter the problem that the same fonts have different prefixes.
 70 | 
 71 | For example:
 72 | 
 73 | File 1:
 74 | ::
 75 | 
 76 |     >>> set(element.font for element in document.elements)
 77 |     {'EAAAAA+FreeMono,12.0', 'BAAAAA+LiberationSerif-Bold,16.0', 'CAAAAA+LiberationSerif,12.0', 'DAAAAA+FreeMonoBold,12.0', 'BAAAAA+LiberationSerif-Bold,12.0'}
 78 | 
 79 | File 2:
 80 | ::
 81 | 
 82 |     >>> set(element.font for element in document.elements)
 83 |     {'CIPKDS+FreeMono,12.0', 'FDHZTR+LiberationSerif-Bold,16.0', 'KJVFSL+LiberationSerif,12.0', 'BXNKHF+FreeMonoBold,12.0', 'OKSDFT+LiberationSerif-Bold,12.0'}
 84 | 
 85 | In this case mapping fonts with regex patterns makes more sense. Create your font mapping like before, but fill it with regex patterns that don't specify the prefix precisely. Also specify that the font mapping contains regex patterns when loading the document.
 86 | 
 87 | .. code-block:: python
 88 | 
 89 |    FONT_MAPPING = {
 90 |        r"\w{6}\+LiberationSerif-Bold,16.0": "title",
 91 |        r"\w{6}\+LiberationSerif-Bold,12.0": "sub_title",
 92 |        r"\w{6}\+LiberationSerif,12.0": "text",
 93 |        r"\w{6}\+FreeMonoBold,12.0": "table_header",
 94 |        r"\w{6}\+FreeMono,12.0": "table_text",
 95 |    }
 96 |    document = load_file("order_summary.pdf", font_mapping=FONT_MAPPING, font_mapping_is_regex=True)
 97 | 
 98 | Step 4 - Add sections
 99 | .....................
100 | 
101 | Another thing we can do to make our job easier is to add :class:`Sections<py_pdf_parser.sectioning.Section>` to our document. A :class:`~py_pdf_parser.sectioning.Sectioning` class is made available on :attr:`document.sectioning<py_pdf_parser.components.PDFDocument.sectioning>`, which in particular allows us to call :meth:`~py_pdf_parser.sectioning.Sectioning.create_section`.
102 | 
103 | A section has a name, and contains all elements between the start element and the end element. You can add multiple sections with the same name, but each section will have both a ``name`` and a ``unique_name`` (which is just the name with an additional ``_n`` on the end, where ``n`` is the number of sections with that name).
104 | 
105 | As with the :class:`~py_pdf_parser.components.PDFDocument`, a :class:`~py_pdf_parser.sectioning.Section` has an :attr:`~py_pdf_parser.sectioning.Section.elements` property which returns an :class:`~py_pdf_parser.filtering.ElementList`, allowing you to filter the elements.
106 | 
107 | .. important:: Never instantiate a :class:`~py_pdf_parser.sectioning.Section` yourself. You should always use :meth:`~py_pdf_parser.sectioning.Sectioning.create_section`.
108 | 
109 | Calling :meth:`~py_pdf_parser.sectioning.Sectioning.create_section` will return the :class:`~py_pdf_parser.sectioning.Section`, but the :class:`~py_pdf_parser.sectioning.Sectioning` class also has :meth:`~py_pdf_parser.sectioning.Sectioning.get_section` and :meth:`~py_pdf_parser.sectioning.Sectioning.get_sections_with_name` methods.
110 | 
111 | Going back to our example, we will create sections for the order summary table, and for the totals table. Our order summary table will start with the "Order Summary:" sub title and end at the "Totals:" sub title. Note that there are two elements on the page with text equal to "Order Summary:", however they have different fonts and so we can still extract exactly the one we want.
112 | 
113 | 
114 | .. image:: /screenshots/order_summary_example/zoomed.png
115 |    :height: 300px
116 | 
117 | By default, :meth:`~py_pdf_parser.sectioning.Sectioning.create_section` will include the last element in the section, but this can be disabled by passing ``include_last_element=False``.
118 | 
119 | The totals section will run from the "Totals:" sub title, until the end of the document. An :class:`~py_pdf_parser.filtering.ElementList` (e.g. ``document.elements``) acts like a set of elements, but it does also define an order, and as such we can access the last element in the :class:`~py_pdf_parser.filtering.ElementList` by simply doing ``document.elements[-1]``.
120 | 
121 | .. code-block:: python
122 | 
123 |    order_summary_sub_title_element = (
124 |        document.elements.filter_by_font("sub_title")
125 |        .filter_by_text_equal("Order Summary:")
126 |        .extract_single_element()
127 |    )
128 | 
129 |    totals_sub_title_element = (
130 |        document.elements.filter_by_font("sub_title")
131 |        .filter_by_text_equal("Totals:")
132 |        .extract_single_element()
133 |    )
134 | 
135 |    final_element = document.elements[-1]
136 | 
137 |    order_summary_section = document.sectioning.create_section(
138 |        name="order_summary",
139 |        start_element=order_summary_sub_title_element,
140 |        end_element=totals_sub_title_element,
141 |        include_last_element=False,
142 |    )
143 | 
144 | Again, the visualise tool is helpful to check everything worked as expected, as it will draw a border around all of our sections:
145 | 
146 | .. image:: /screenshots/order_summary_example/sections.png
147 |    :height: 300px
148 | 
149 | Step 5 - Extract tables
150 | .......................
151 | 
152 | Now we have mapped our fonts and added some sections, we'd like to extract the table. In this case, we are able to use :meth:`~py_pdf_parser.tables.extract_simple_table`. We need to pass this the elements which form our table, however currently our sections also include the sub titles, "Order Summary:" and "Totals:". We need to exclude these from the elements we pass to :meth:`~py_pdf_parser.tables.extract_simple_table`. We have a reference to the sub title elements, so we could simply use :meth:`~py_pdf_parser.filtering.ElementList.remove_element`. However, since the tables seem to have their own fonts, it may be more robust to use :meth:`~py_pdf_parser.filtering.ElementList.filter_by_fonts`.
153 | 
154 | We will also pass ``as_text=True``, since we are interested in the text, not the :class:`PDFElements<py_pdf_parser.components.PDFElement>` themselves.
155 | 
156 | .. code-block:: python
157 | 
158 |    order_summary_table = tables.extract_simple_table(
159 |        order_summary_section.elements.filter_by_fonts("table_header", "table_text"),
160 |        as_text=True,
161 |    )
162 | 
163 |    totals_table = tables.extract_simple_table(
164 |        totals_section.elements.filter_by_fonts("table_header", "table_text"), as_text=True
165 |    )
166 | 
167 | This gives:
168 | 
169 | ::
170 | 
171 |    >>> order_summary_table
172 |    [['Item', 'Unit Cost', 'Quantity', 'Cost'], ['Challenger 100g\nWhole Hops', '£3.29', '1', '£3.29'], ['Maris Otter \nPale Ale Malt \n(Crushed)', '£1.50/1000g', '4000g', '£6.00'], ['WLP037 \nYorkshire Ale \nYeast', '£7.08', '1', '£7.08'], ['Bottle Caps', '£1 per 100', '500', '£5']]
173 | 
174 |    >>> totals_table
175 |    [['Subtotal:', '£26.28'], ['Shipping', '£6'], ['VAT 20%', '£6.45'], ['Total:', '£38.73']]
176 | 
177 | As one final step, since the order summary table has a header row, we can make use of :meth:`~py_pdf_parser.tables.add_header_to_table`, which will change the list of lists to a list of dicts, mapping the header to the values in each row:
178 | 
179 | .. code-block:: python
180 | 
181 |    order_summary_with_header = tables.add_header_to_table(order_summary_table)
182 | 
183 | ::
184 | 
185 |    >>> order_summary_with_header
186 |    [{'Item': 'Challenger 100g\nWhole Hops', 'Unit Cost': '£3.29', 'Quantity': '1', 'Cost': '£3.29'}, {'Item': 'Maris Otter \nPale Ale Malt \n(Crushed)', 'Unit Cost': '£1.50/1000g', 'Quantity': '4000g', 'Cost': '£6.00'}, {'Item': 'WLP037 \nYorkshire Ale \nYeast', 'Unit Cost': '£7.08', 'Quantity': '1', 'Cost': '£7.08'}, {'Item': 'Bottle Caps', 'Unit Cost': '£1 per 100', 'Quantity': '500', 'Cost': '£5'}]
187 | 
188 | 
189 | Full Code
190 | .........
191 | 
192 | .. code-block:: python
193 | 
194 |    from py_pdf_parser.loaders import load_file
195 |    from py_pdf_parser import tables
196 | 
197 |    # from py_pdf_parser.visualise import visualise
198 | 
199 | 
200 |    # Step 1 - Load the file
201 |    document = load_file("order_summary.pdf")
202 | 
203 |    # visualise(document)
204 | 
205 |    # Step 2 - Use a font mapping
206 | 
207 |    # Show all fonts:
208 |    # set(element.font for element in document.elements)
209 | 
210 |    FONT_MAPPING = {
211 |        "BAAAAA+LiberationSerif-Bold,16.0": "title",
212 |        "BAAAAA+LiberationSerif-Bold,12.0": "sub_title",
213 |        "CAAAAA+LiberationSerif,12.0": "text",
214 |        "DAAAAA+FreeMonoBold,12.0": "table_header",
215 |        "EAAAAA+FreeMono,12.0": "table_text",
216 |    }
217 |    document = load_file("order_summary.pdf", font_mapping=FONT_MAPPING)
218 | 
219 |    # OR
220 | 
221 |    # use regex patterns
222 | 
223 |    FONT_MAPPING = {
224 |        r"\w{6}\+LiberationSerif-Bold,16.0": "title",
225 |        r"\w{6}\+LiberationSerif-Bold,12.0": "sub_title",
226 |        r"\w{6}\+LiberationSerif,12.0": "text",
227 |        r"\w{6}\+FreeMonoBold,12.0": "table_header",
228 |        r"\w{6}\+FreeMono,12.0": "table_text",
229 |    }
230 |    document = load_file("order_summary.pdf", font_mapping=FONT_MAPPING, font_mapping_is_regex=True)
231 | 
232 |    # visualise(document)
233 | 
234 |    # Step 4 - Add sections
235 |    order_summary_sub_title_element = (
236 |        document.elements.filter_by_font("sub_title")
237 |        .filter_by_text_equal("Order Summary:")
238 |        .extract_single_element()
239 |    )
240 | 
241 |    totals_sub_title_element = (
242 |        document.elements.filter_by_font("sub_title")
243 |        .filter_by_text_equal("Totals:")
244 |        .extract_single_element()
245 |    )
246 | 
247 |    final_element = document.elements[-1]
248 | 
249 |    order_summary_section = document.sectioning.create_section(
250 |        name="order_summary",
251 |        start_element=order_summary_sub_title_element,
252 |        end_element=totals_sub_title_element,
253 |        include_last_element=False,
254 |    )
255 | 
256 |    totals_section = document.sectioning.create_section(
257 |        name="totals", start_element=totals_sub_title_element, end_element=final_element
258 |    )
259 | 
260 |    # visualise(document)
261 | 
262 |    # Step 5 - Extract tables
263 | 
264 |    order_summary_table = tables.extract_simple_table(
265 |        order_summary_section.elements.filter_by_fonts("table_header", "table_text"),
266 |        as_text=True,
267 |    )
268 | 
269 |    totals_table = tables.extract_simple_table(
270 |        totals_section.elements.filter_by_fonts("table_header", "table_text"), as_text=True
271 |    )
272 | 
273 |    order_summary_with_header = tables.add_header_to_table(order_summary_table)
274 | 


--------------------------------------------------------------------------------
/docs/source/examples/simple_memo.rst:
--------------------------------------------------------------------------------
  1 | .. _simple-memo:
  2 | 
  3 | Simple Memo
  4 | -----------
  5 | 
  6 | Our first example will be extracting information from a simple memo.
  7 | 
  8 | You can :download:`download the example memo here </example_files/simple_memo.pdf>`.
  9 | 
 10 | We will assume that your company issues these memos always in a consistent format, i.e. with the "TO", "FROM", "DATE", and "SUBJECT" fields, and the main content of the memo. We would like to write some code such that we can extract the information from each memo.
 11 | 
 12 | Step 1 - Load the file
 13 | ......................
 14 | 
 15 | First, we should load the file into a :class:`~py_pdf_parser.components.PDFDocument`, using :func:`~py_pdf_parser.loaders.load_file`:
 16 | 
 17 | .. code-block:: python
 18 | 
 19 |    from py_pdf_parser.loaders import load_file
 20 | 
 21 |    document = load_file("simple_memo.pdf")
 22 | 
 23 | To check the PDF loaded as expected, we can use the :func:`~py_pdf_parser.visualise.main.visualise` tool by running
 24 | 
 25 | .. code-block:: python
 26 | 
 27 |    from py_pdf_parser.visualise import visualise
 28 | 
 29 |    visualise(document)
 30 | 
 31 | This will open a matplotlib window which should look something like the following image:
 32 | 
 33 | .. image:: /screenshots/simple_memo_example/visualise.png
 34 |    :height: 300px
 35 | 
 36 | Py-pdf-parser has extracted each element from the PDF as a :class:`~py_pdf_parser.components.PDFElement`, and is showing a blue box around each element. This is what we are looking for. Always check the visualise tool, since sometimes you will need to adjust the layout parameters so that the tool correctly identifies your elements. We will get on to this in later examples.
 37 | 
 38 | Step 2 - Extract reference elements
 39 | ...................................
 40 | 
 41 | Certain elements should be present in every memo. We will use these as reference elements to identify the elements which contain the information we are interested in. We already have our ``document``, which is a :class:`~py_pdf_parser.components.PDFDocument`. We can do :meth:`document.elements <py_pdf_parser.components.PDFDocument.elements>` to get a list (an :class:`~py_pdf_parser.filtering.ElementList`) of all the :class:`~py_pdf_parser.components.PDFElement` in the document, and also to allow us to filter the elements.
 42 | 
 43 | The simplest way to extract the elements we are interested in is by text. There are many other options available to us, and a full list can be found on the :ref:`filtering reference page<filtering-reference>`.
 44 | 
 45 | We will extract the "TO:", "FROM:", "DATE:" and "SUBJECT:" elements as reference elements, i.e. the elements on the left of the below image. We will then search to the right of each of them in turn, to extract the values for each field.
 46 | 
 47 | .. image:: /screenshots/simple_memo_example/top.png
 48 |    :height: 200px
 49 | 
 50 | To extract the element which says "TO:", we can simply run :meth:`document.elements.filter_by_text_equal("TO:") <py_pdf_parser.filtering.ElementList.filter_by_text_equal>`. This returns a new :class:`~py_pdf_parser.filtering.ElementList` which contains all the elements in the document with text equal to "TO:". In this case, there should only be one element in the list. We could just use ``[0]`` on the element list to access the element in question, however, there is a convenience function, :func:`~py_pdf_parser.filtering.ElementList.extract_single_element` on the :class:`~py_pdf_parser.filtering.ElementList` class to handle this case. This essentially checks if the list has a single element and returns the element for you, otherwise it raises an exception. Use of this is encouraged to make your code more robust and to make any errors more explicit.
 51 | 
 52 | .. code-block:: python
 53 | 
 54 |    to_element = document.elements.filter_by_text_equal("TO:").extract_single_element()
 55 |    from_element = document.elements.filter_by_text_equal("FROM:").extract_single_element()
 56 |    date_element = document.elements.filter_by_text_equal("DATE:").extract_single_element()
 57 |    subject_element = document.elements.filter_by_text_equal(
 58 |        "SUBJECT:"
 59 |    ).extract_single_element()
 60 | 
 61 | Each of the above elements will be a :class:`~py_pdf_parser.components.PDFElement`.
 62 | 
 63 | Step 3 - Extract the data
 64 | .........................
 65 | 
 66 | In the above section we have extracted our reference elements. We can now use these to do some more filtering to extract the data we want. In particular, we can use :func:`~py_pdf_parser.filtering.ElementList.to_the_right_of`, which will extract elements directly to the right of a given element. It effectively draws a dotted line from the top and bottom of your element out to the right hand side of the page, and any elements which are partially within the box created by the dotted line will be returned. To extract the text from a :class:`~py_pdf_parser.components.PDFElement`, we must also call :func:`.text() <py_pdf_parser.components.PDFElement.text>`.
 67 | 
 68 | .. code-block:: python
 69 | 
 70 |    to_text = document.elements.to_the_right_of(to_element).extract_single_element().text()
 71 |    from_text = (
 72 |        document.elements.to_the_right_of(from_element).extract_single_element().text()
 73 |    )
 74 |    date_text = (
 75 |        document.elements.to_the_right_of(date_element).extract_single_element().text()
 76 |    )
 77 |    subject_text_element = document.elements.to_the_right_of(
 78 |        subject_element
 79 |    ).extract_single_element()
 80 |    subject_text = subject_text_element.text()
 81 | 
 82 | Note we keep a reference to the subject text element. This is because we will use it later.
 83 | 
 84 | We have now extracted the data from the top of the memo, for example ``to_text`` will be ``"All Developers"``. The code does not rely on who the memo is to, and so it should still work for a memo with different values.
 85 | 
 86 | The last thing we need to do is extract the content of the memo. In our example there is only one paragraph, and so only one element, but if there were multiple paragraphs there could be multiple elements. There are a few ways to do this. It is probably the case that all the content elements are below the "SUBJECT:" element, however if the text started too far to the right this may not be the case. Instead, we can just use :func:`~py_pdf_parser.filtering.ElementList.after` to filter for elements strictly after the ``subject_text_element``:
 87 | 
 88 | .. code-block:: python
 89 | 
 90 |    content_elements = document.elements.after(subject_element)
 91 |    content_text = "\n".join(element.text() for element in content_elements)
 92 | 
 93 | That is now everything extracted from the memo. We can wrap our output into any data structure we fancy, for example json:
 94 | 
 95 | .. code-block:: python
 96 | 
 97 |    output = {
 98 |        "to": to_text,
 99 |        "from": from_text,
100 |        "date": date_text,
101 |        "subject": subject_text,
102 |        "content": content_text,
103 |    }
104 | 
105 | Full Code
106 | .........
107 | 
108 | Here is the full script constructed above:
109 | 
110 | .. code-block:: python
111 | 
112 |    from py_pdf_parser.loaders import load_file
113 | 
114 |    # Step 1 - Load the document
115 |    document = load_file("simple_memo.pdf")
116 | 
117 |    # We could visualise it here to check it looks correct:
118 |    # from py_pdf_parser.visualise import visualise
119 |    # visualise(document)
120 | 
121 |    # Step 2 - Extract reference elements:
122 |    to_element = document.elements.filter_by_text_equal("TO:").extract_single_element()
123 |    from_element = document.elements.filter_by_text_equal("FROM:").extract_single_element()
124 |    date_element = document.elements.filter_by_text_equal("DATE:").extract_single_element()
125 |    subject_element = document.elements.filter_by_text_equal(
126 |        "SUBJECT:"
127 |    ).extract_single_element()
128 | 
129 |    # Step 3 - Extract the data
130 |    to_text = document.elements.to_the_right_of(to_element).extract_single_element().text()
131 |    from_text = (
132 |        document.elements.to_the_right_of(from_element).extract_single_element().text()
133 |    )
134 |    date_text = (
135 |        document.elements.to_the_right_of(date_element).extract_single_element().text()
136 |    )
137 |    subject_text_element = document.elements.to_the_right_of(
138 |        subject_element
139 |    ).extract_single_element()
140 |    subject_text = subject_text_element.text()
141 | 
142 |    content_elements = document.elements.after(subject_element)
143 |    content_text = "\n".join(element.text() for element in content_elements)
144 | 
145 |    output = {
146 |        "to": to_text,
147 |        "from": from_text,
148 |        "date": date_text,
149 |        "subject": subject_text,
150 |        "content": content_text,
151 |    }
152 | 
153 | This gives:
154 | ::
155 | 
156 |    >>> from pprint import pprint
157 |    >>> pprint(output)
158 | 
159 |    {'content': 'A new PDF Parsing tool\n'
160 |                'There is a new PDF parsing tool available, called py-pdf-parser - '
161 |                'you should all check it out!\n'
162 |                'I think it could really help you extract that data we need from '
163 |                'those PDFs.',
164 |     'date': '1st January 2020',
165 |     'from': 'John Smith',
166 |     'subject': 'A new PDF Parsing tool',
167 |     'to': 'All Developers'}
168 | 


--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
 1 | Welcome to PDF Parser's documentation!
 2 | ======================================
 3 | 
 4 | .. toctree::
 5 |    :maxdepth: 2
 6 |    :caption: Contents:
 7 | 
 8 |    overview
 9 |    examples/index
10 |    reference/index
11 |    CHANGELOG.md
12 | 


--------------------------------------------------------------------------------
/docs/source/overview.rst:
--------------------------------------------------------------------------------
 1 | Overview
 2 | ========
 3 | 
 4 | Introduction
 5 | ------------
 6 | 
 7 | This PDF Parser is a tool built on top of PDF Miner to help extract information from PDFs in Python. The main idea was to create a tool that could be driven by code to interact with the elements on the PDF and slowly classify them by creating sections and adding tags to them. It also comes with a helpful visualisation tool which enables you to examine the current status of your elements.
 8 | 
 9 | This page gives a brief overview of the PDF Parser, but there is also a full :doc:`reference/index` of all the functionality. You may get a more in-depth overview by looking at the :doc:`examples/index`.
10 | 
11 | Setup
12 | -----
13 | 
14 | You will need to have Python 3.6 or greater installed, and if you're installing the development requirements to use the visualise tool you will also need tkinter installed on your system. For information on how to do this, see https://tkdocs.com/tutorial/install.html.
15 | 
16 | We recommend you install the development requirements with ``pip3 install py-pdf-parser[dev]``, which enables the visualise tool. If you don't need the visualise tool (for example in a production app once you've written your parsing scripts) you can simply run ``pip3 install py-pdf-parser``.
17 | 
18 | When Should I Use Py PDF Parser?
19 | --------------------------------
20 | 
21 | Py PDF Parser is best suited to locating and extracting specific data in a structured way from a PDF. You can locate contents however you want (by text, location, font, etc), and since it is code-driven you have the flexibility to implement custom logic without having to deal with the PDF itself. Py PDF Parser helps to abstract away things like page breaks (unless you want to use them), which helps you write robust code that will extract data from multiple PDFs of the same type, even if there are differences between each individual document.
22 | 
23 | Py PDF Parser is good at extracting tables in PDFs, and allows you to write code to programmatically locate the tables to extract. Page breaks (and even headers or footers) half way through your table can be ignored easily. If you're trying to extract all tables from a PDF, other tools (e.g. https://camelot-py.readthedocs.io/en/master/) are available and may be more appropriate.
24 | 
25 | If you're simply trying to extract all of the text from a PDF, other tools (e.g. https://textract.readthedocs.io/en/stable/python_package.html) may be more appropriate. Whilst you can still do this with Py PDF Parser, it is not designed to be a tool where you simply plug in a PDF and it spits it out in text format. Py PDF Parser is not a plug-and-play solution, but rather a tool to help you write code that extracts certain pieces of data from a structured PDF.
26 | 
27 | Loading A PDF
28 | -------------
29 | 
30 | To load a PDF, use the :func:`~py_pdf_parser.loaders.load_file` function from the :doc:`reference/loaders`. You will need to use :func:`~py_pdf_parser.loaders.load_file` with a file path to be able to use the visualisation tool with your PDF as the background. If you don't have this, you can instead use the :func:`~py_pdf_parser.loaders.load` function, but when you use the visualisation tool there will be no background.
31 | 
32 | We order the elements in a pdf, left-to-right, top-to-bottom. At the moment, this is not configurable. Each :class:`~py_pdf_parser.components.PDFElement` within the :class:`~py_pdf_parser.components.PDFDocument` is aware of its position, both on the page and within the document, and also has properties allowing you to access its font and text. For more information about :class:`~py_pdf_parser.components.PDFDocument` and :class:`~py_pdf_parser.components.PDFElement`, see :doc:`reference/components`.
33 | 
34 | Pay particular attention to the ``la_params`` argument. These will need to be fine-tuned for your PDF. We suggest immediately visualising your PDF using the visualisation tool to see how the elements have been grouped. If multiple elements have been counted as one, or vice versa, you should be able to fix this by tweaking the ``la_params``.
35 | 
36 | Filtering
37 | ---------
38 | 
39 | Once you have loaded your PDF, say into a variable :class:`document<py_pdf_parser.components.PDFDocument>`, you can start interacting with the elements. You can access all the elements by calling :class:`document.elements<py_pdf_parser.filtering.ElementList>`. You may now want to filter your elements, for example you could do :meth:`document.elements.filter_by_text_equal("foo")<py_pdf_parser.filtering.ElementList.filter_by_text_equal>` to filter for all elements which say "foo". To view all available filters, have a look at the :doc:`reference/filtering` reference.
40 | 
41 | The :class:`document.elements<py_pdf_parser.filtering.ElementList>` object, and any filtered subset thereof, will be an :class:`~py_pdf_parser.filtering.ElementList`. These act like sets of elements, and so you can union (:meth:`|<py_pdf_parser.filtering.ElementList.__or__>`), intersect (:meth:`&<py_pdf_parser.filtering.ElementList.__and__>`), difference (:meth:`-<py_pdf_parser.filtering.ElementList.__sub__>`) and symmetric difference (:meth:`^<py_pdf_parser.filtering.ElementList.__xor__>`) different filtered sets of elements.
42 | 
43 | You can also chain filters, which will do the same as intersecting multiple filters, for example ``document.elements.filter_by_text_equal("foo").filter_by_tag("bar")`` is the same as ``document.elements.filter_by_text_equal("foo") & document.elements.filter_by_tag("bar")``.
44 | 
45 | If you believe you have filtered down to a single element, and would like to examine that element, you can call :meth:`~py_pdf_parser.filtering.ElementList.extract_single_element`. This will return said element, or raise an exception if there is not a single element in your list.
46 | 
47 | You can see an example of filtering in the :ref:`simple-memo` example.
48 | 
49 | Classifying Elements
50 | --------------------
51 | 
52 | There are three ways to classify elements:
53 | 
54 | - add tags
55 | - create sections
56 | - mark certain elements as ignored
57 | 
58 | To add a tag, you can simply call :meth:`~py_pdf_parser.components.PDFElement.add_tag` on an :class:`~py_pdf_parser.components.PDFElement`, or :meth:`~py_pdf_parser.filtering.ElementList.add_tag_to_elements` on an :class:`~py_pdf_parser.filtering.ElementList`. You can filter by tags.
59 | 
60 | To create a section, you can call :meth:`~py_pdf_parser.sectioning.Sectioning.create_section`. See :doc:`reference/sectioning` for more information and the :ref:`order-summary` example for an example. When you create a section you simply specify a name for the section, and the start and end element for the section. Any elements between the start and end element will be included in your section. You can add multiple sections with the same name, and internally they will be given unique names. You can filter by either the non-unique ``section_name``, or by the unique sections. Elements can be in multiple sections.
61 | 
62 | To mark an element as ignored, simply set the ``ignore`` property to ``True``. Ignored elements will not be included in any :class:`~py_pdf_parser.filtering.ElementList`, however existing lists which you have assigned to variables will not be re-calculated and so may still include the ignored elements.
63 | 
64 | To process a whole pdf, we suggest that you mark any elements you're not interested in as ignored, group any elements which are together into sections, and then add tags to important elements. You can then loop through filtered sets of elements to extract the information you would like.
65 | 
66 | Visualisation Tool
67 | ------------------
68 | 
69 | The PDF Parser comes with a visualisation tool. See the :doc:`reference/visualise` documentation. When you visualise your :class:`~py_pdf_parser.components.PDFDocument`, you'll be able to see each page of the document in turn, with every :class:`~py_pdf_parser.components.PDFElement` highlighted. You can hover over the elements to see their sections, tags and whether they are ignored or not. This is very helpful for debugging any problems.
70 | 
71 | You can use the arrow key icons to change page, and can press home to return to page 1. You can also use the scroll wheel on your mouse to zoom in and out.
72 | 
73 | You can see an example of the visualisation in the :ref:`simple-memo` and :ref:`order-summary` examples.
74 | 
75 | Font Mappings
76 | -------------
77 | 
78 | You can filter elements by font. The font will be taken from the PDF itself, however often they have long and confusing names. You can specify a ``font_mapping`` when you load the document to map these to more memorable names. This ``font_mapping`` can either be a regex pattern or an exact string mapping. See the :doc:`reference/components` reference for the :class:`~py_pdf_parser.components.PDFDocument` arguments for more information.
79 | 
80 | You can see an example of font mapping in the :ref:`order-summary` example.
81 | 
82 | Tables
83 | ------
84 | 
85 | We have many functions to help extract tables. All of these use the positioning of the elements on the page to do this. See the :doc:`reference/tables` reference, and the :ref:`order-summary` and :ref:`more-tables` examples.
86 | 


--------------------------------------------------------------------------------
/docs/source/reference/common.rst:
--------------------------------------------------------------------------------
1 | Common
2 | ------
3 | 
4 | .. automodule:: py_pdf_parser.common
5 |     :members:
6 | 


--------------------------------------------------------------------------------
/docs/source/reference/components.rst:
--------------------------------------------------------------------------------
1 | Components
2 | ----------
3 | 
4 | .. automodule:: py_pdf_parser.components
5 |     :members:
6 | 


--------------------------------------------------------------------------------
/docs/source/reference/filtering.rst:
--------------------------------------------------------------------------------
1 | .. _filtering-reference:
2 | 
3 | Filtering
4 | ---------
5 | 
6 | .. autoclass:: py_pdf_parser.filtering.ElementList
7 |     :members:
8 |     :special-members:
9 | 


--------------------------------------------------------------------------------
/docs/source/reference/index.rst:
--------------------------------------------------------------------------------
 1 | Reference
 2 | =========
 3 | 
 4 | .. toctree::
 5 | 
 6 |    common
 7 |    components
 8 |    filtering
 9 |    loaders
10 |    sectioning
11 |    tables
12 |    visualise
13 | 


--------------------------------------------------------------------------------
/docs/source/reference/loaders.rst:
--------------------------------------------------------------------------------
1 | Loaders
2 | -------
3 | 
4 | .. automodule:: py_pdf_parser.loaders
5 |     :members:
6 | 


--------------------------------------------------------------------------------
/docs/source/reference/sectioning.rst:
--------------------------------------------------------------------------------
1 | Sectioning
2 | ----------
3 | 
4 | .. automodule:: py_pdf_parser.sectioning
5 |     :members:
6 | 


--------------------------------------------------------------------------------
/docs/source/reference/tables.rst:
--------------------------------------------------------------------------------
1 | Tables
2 | ------
3 | 
4 | .. automodule:: py_pdf_parser.tables
5 |     :members:
6 | 


--------------------------------------------------------------------------------
/docs/source/reference/visualise.rst:
--------------------------------------------------------------------------------
1 | Visualise
2 | ---------
3 | 
4 | .. autofunction:: py_pdf_parser.visualise.main.visualise
5 | 


--------------------------------------------------------------------------------
/docs/source/screenshots/order_summary_example/initial.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jstockwin/py-pdf-parser/6aa400371e948307bcaa5073f33507c9b6e84f06/docs/source/screenshots/order_summary_example/initial.png


--------------------------------------------------------------------------------
/docs/source/screenshots/order_summary_example/sections.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jstockwin/py-pdf-parser/6aa400371e948307bcaa5073f33507c9b6e84f06/docs/source/screenshots/order_summary_example/sections.png


--------------------------------------------------------------------------------
/docs/source/screenshots/order_summary_example/showing_font_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jstockwin/py-pdf-parser/6aa400371e948307bcaa5073f33507c9b6e84f06/docs/source/screenshots/order_summary_example/showing_font_1.png


--------------------------------------------------------------------------------
/docs/source/screenshots/order_summary_example/showing_font_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jstockwin/py-pdf-parser/6aa400371e948307bcaa5073f33507c9b6e84f06/docs/source/screenshots/order_summary_example/showing_font_2.png


--------------------------------------------------------------------------------
/docs/source/screenshots/order_summary_example/zoomed.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jstockwin/py-pdf-parser/6aa400371e948307bcaa5073f33507c9b6e84f06/docs/source/screenshots/order_summary_example/zoomed.png


--------------------------------------------------------------------------------
/docs/source/screenshots/simple_memo_example/top.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jstockwin/py-pdf-parser/6aa400371e948307bcaa5073f33507c9b6e84f06/docs/source/screenshots/simple_memo_example/top.png


--------------------------------------------------------------------------------
/docs/source/screenshots/simple_memo_example/visualise.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jstockwin/py-pdf-parser/6aa400371e948307bcaa5073f33507c9b6e84f06/docs/source/screenshots/simple_memo_example/visualise.png


--------------------------------------------------------------------------------
/imagemagick_policy.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <!DOCTYPE policymap [
 3 | <!ELEMENT policymap (policy)+>
 4 | <!ELEMENT policy (#PCDATA)>
 5 | <!ATTLIST policy domain (delegate|coder|filter|path|resource) #IMPLIED>
 6 | <!ATTLIST policy name CDATA #IMPLIED>
 7 | <!ATTLIST policy rights CDATA #IMPLIED>
 8 | <!ATTLIST policy pattern CDATA #IMPLIED>
 9 | <!ATTLIST policy value CDATA #IMPLIED>
10 | ]>
11 | <!--
12 |   Configure ImageMagick policies.
13 | 
14 |   Domains include system, delegate, coder, filter, path, or resource.
15 | 
16 |   Rights include none, read, write, and execute.  Use | to combine them,
17 |   for example: "read | write" to permit read from, or write to, a path.
18 | 
19 |   Use a glob expression as a pattern.
20 | 
21 |   Suppose we do not want users to process MPEG video images:
22 | 
23 |     <policy domain="delegate" rights="none" pattern="mpeg:decode" />
24 | 
25 |   Here we do not want users reading images from HTTP:
26 | 
27 |     <policy domain="coder" rights="none" pattern="HTTP" />
28 | 
29 |   Let's prevent users from executing any image filters:
30 | 
31 |     <policy domain="filter" rights="none" pattern="*" />
32 | 
33 |   The /repository file system is restricted to read only.  We use a glob
34 |   expression to match all paths that start with /repository:
35 | 
36 |     <policy domain="path" rights="read" pattern="/repository/*" />
37 | 
38 |   Any large image is cached to disk rather than memory:
39 | 
40 |     <policy domain="resource" name="area" value="1GB"/>
41 | 
42 |   Define arguments for the memory, map, area, and disk resources with
43 |   SI prefixes (e.g. 100MB).  In addition, resource policies are maximums for
44 |   each instance of ImageMagick (e.g. policy memory limit 1GB, -limit 2GB
45 |   exceeds policy maximum so memory limit is 1GB).
46 | -->
47 | <policymap>
48 |   <!-- <policy domain="resource" name="temporary-path" value="/tmp"/> -->
49 |   <!-- <policy domain="resource" name="memory" value="2GiB"/> -->
50 |   <!-- <policy domain="resource" name="map" value="4GiB"/> -->
51 |   <!-- <policy domain="resource" name="area" value="1GB"/> -->
52 |   <!-- <policy domain="resource" name="disk" value="16EB"/> -->
53 |   <!-- <policy domain="resource" name="file" value="768"/> -->
54 |   <!-- <policy domain="resource" name="thread" value="4"/> -->
55 |   <!-- <policy domain="resource" name="throttle" value="0"/> -->
56 |   <!-- <policy domain="resource" name="time" value="3600"/> -->
57 |   <!-- <policy domain="system" name="precision" value="6"/> -->
58 |   <policy domain="cache" name="shared-secret" value="passphrase"/>
59 |   <policy domain="coder" rights="none" pattern="EPHEMERAL" />
60 |   <policy domain="coder" rights="none" pattern="URL" />
61 |   <policy domain="coder" rights="none" pattern="HTTPS" />
62 |   <policy domain="coder" rights="none" pattern="MVG" />
63 |   <policy domain="coder" rights="none" pattern="MSL" />
64 |   <policy domain="coder" rights="none" pattern="TEXT" />
65 |   <policy domain="coder" rights="none" pattern="SHOW" />
66 |   <policy domain="coder" rights="none" pattern="WIN" />
67 |   <policy domain="coder" rights="none" pattern="PLT" />
68 |   <policy domain="path" rights="none" pattern="@*" />
69 |   <!-- disable ghostscript format types -->
70 |   <policy domain="coder" rights="none" pattern="PS" />
71 |   <policy domain="coder" rights="none" pattern="EPS" />
72 |   <policy domain="coder" rights="read" pattern="PDF" />
73 |   <policy domain="coder" rights="none" pattern="XPS" />
74 | </policymap>
75 | 


--------------------------------------------------------------------------------
/mypy.ini:
--------------------------------------------------------------------------------
1 | [mypy]
2 | ignore_missing_imports = True
3 | 


--------------------------------------------------------------------------------
/py_pdf_parser/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jstockwin/py-pdf-parser/6aa400371e948307bcaa5073f33507c9b6e84f06/py_pdf_parser/__init__.py


--------------------------------------------------------------------------------
/py_pdf_parser/common.py:
--------------------------------------------------------------------------------
 1 | from py_pdf_parser.exceptions import InvalidCoordinatesError
 2 | 
 3 | 
 4 | class BoundingBox:
 5 |     """
 6 |     A rectangle, stored using the coordinates (x0, y0) of the bottom left corner, and
 7 |     the coordinates (x1, y1) of the top right corner.
 8 | 
 9 |     Args:
10 |         x0 (int): The x coordinate of the bottom left corner.
11 |         x1 (int): The x coordinate of the top right corner.
12 |         y0 (int): The y coordinate of the bottom left corner.
13 |         y1 (int): The y coordinate of the top right corner.
14 | 
15 |     Raises:
16 |         InvalidCoordinatesError: if x1 is smaller than x0 or y1 is smaller than y0.
17 | 
18 |     Attributes:
19 |         x0 (int): The x coordinate of the bottom left corner.
20 |         x1 (int): The x coordinate of the top right corner.
21 |         y0 (int): The y coordinate of the bottom left corner.
22 |         y1 (int): The y coordinate of the top right corner.
23 |         width (int): The width of the box, equal to x1 - x0.
24 |         height (int): The height of the box, equal to y1 - y0.
25 |     """
26 | 
27 |     def __init__(self, x0: float, x1: float, y0: float, y1: float):
28 |         if x1 < x0:
29 |             raise InvalidCoordinatesError(
30 |                 f"Invalid coordinates, x1 is smaller than x0 ({x1}<{x0})"
31 |             )
32 |         if y1 < y0:
33 |             raise InvalidCoordinatesError(
34 |                 f"Invalid coordinates, y1 is smaller than y0 ({y1}<{y0})"
35 |             )
36 |         self.x0 = x0
37 |         self.x1 = x1
38 |         self.y0 = y0
39 |         self.y1 = y1
40 |         self.width = x1 - x0
41 |         self.height = y1 - y0
42 | 
43 |     def __eq__(self, other: object) -> bool:
44 |         if not isinstance(other, BoundingBox):
45 |             raise NotImplementedError(f"Can't compare BoundingBox with {type(other)}")
46 | 
47 |         return all(
48 |             [
49 |                 self.x0 == other.x0,
50 |                 self.x1 == other.x1,
51 |                 self.y0 == other.y0,
52 |                 self.y1 == other.y1,
53 |             ]
54 |         )
55 | 
56 |     def __repr__(self) -> str:
57 |         return f"<BoundingBox x0={self.x0}, x1={self.x1}, y0={self.y0}, y1={self.y1}>"
58 | 


--------------------------------------------------------------------------------
/py_pdf_parser/exceptions.py:
--------------------------------------------------------------------------------
 1 | class PDFParserError(Exception):
 2 |     pass
 3 | 
 4 | 
 5 | # Components
 6 | class PageNotFoundError(PDFParserError):
 7 |     pass
 8 | 
 9 | 
10 | class NoElementsOnPageError(PDFParserError):
11 |     pass
12 | 
13 | 
14 | # Filtering
15 | class NoElementFoundError(PDFParserError):
16 |     pass
17 | 
18 | 
19 | class MultipleElementsFoundError(PDFParserError):
20 |     pass
21 | 
22 | 
23 | class ElementOutOfRangeError(PDFParserError):
24 |     pass
25 | 
26 | 
27 | # Sectioning
28 | class InvalidSectionError(PDFParserError):
29 |     pass
30 | 
31 | 
32 | class SectionNotFoundError(PDFParserError):
33 |     pass
34 | 
35 | 
36 | # Tables
37 | class TableExtractionError(PDFParserError):
38 |     pass
39 | 
40 | 
41 | class InvalidTableError(PDFParserError):
42 |     pass
43 | 
44 | 
45 | class InvalidTableHeaderError(PDFParserError):
46 |     pass
47 | 
48 | 
49 | class InvalidCoordinatesError(PDFParserError):
50 |     pass
51 | 


--------------------------------------------------------------------------------
/py_pdf_parser/loaders.py:
--------------------------------------------------------------------------------
  1 | from typing import IO, Any, Dict, List, NamedTuple, Optional
  2 | 
  3 | import logging
  4 | 
  5 | from pdfminer.high_level import extract_pages
  6 | from pdfminer.layout import LAParams, LTFigure, LTTextBox
  7 | 
  8 | from .components import PDFDocument
  9 | 
# Module-level logger; `load` uses it to warn about pages with no text elements.
logger = logging.getLogger("PDFParser")
# Baseline layout-analysis parameters. `load` merges the caller's `la_params` over
# these, so any key supplied by the caller takes precedence over the default.
DEFAULT_LA_PARAMS: Dict = {"boxes_flow": None}
 12 | 
 13 | 
class Page(NamedTuple):
    """
    This is used to pass PDF Miner elements of a page when instantiating PDFDocument.

    `load` builds one `Page` per PDF Miner page that has at least one text element,
    and hands them to `PDFDocument` keyed by page id.

    Args:
        width (int): The width of the page.
        height (int): The height of the page.
        elements (list): A list of PDF Miner elements (LTTextBox) on the page.
    """

    # Page dimensions, copied from the PDF Miner page object by `load`.
    width: int
    height: int
    # The LTTextBox elements collected for this page.
    elements: List[LTTextBox]
 27 | 
 28 | 
 29 | def load_file(
 30 |     path_to_file: str, la_params: Optional[Dict] = None, **kwargs: Any
 31 | ) -> PDFDocument:
 32 |     """
 33 |     Loads a file according to the specified file path.
 34 | 
 35 |     All other arguments are passed to `load`, see the documentation for `load`.
 36 | 
 37 |     Returns:
 38 |         PDFDocument: A PDFDocument with the specified file loaded.
 39 |     """
 40 |     with open(path_to_file, "rb") as in_file:
 41 |         return load(in_file, pdf_file_path=path_to_file, la_params=la_params, **kwargs)
 42 | 
 43 | 
 44 | def load(
 45 |     pdf_file: IO,
 46 |     pdf_file_path: Optional[str] = None,
 47 |     password: Optional[str] = None,
 48 |     la_params: Optional[Dict] = None,
 49 |     **kwargs: Any,
 50 | ) -> PDFDocument:
 51 |     """
 52 |     Loads the pdf file into a PDFDocument.
 53 | 
 54 |     Args:
 55 |         pdf_file (io): The PDF file.
 56 |         pdf_file_path (str, optional): Passed to `PDFDocument`. See the documentation
 57 |             for `PDFDocument`.
 58 |         password (str, optional): Password for the encrypted PDF. Required if the
 59 |             PDF is encrypted.
 60 |         la_params (dict): The layout parameters passed to PDF Miner for analysis. See
 61 |             the PDFMiner documentation here:
 62 |             https://pdfminersix.readthedocs.io/en/latest/reference/composable.html#laparams.
 63 |             Note that py_pdf_parser will re-order the elements it receives from PDFMiner
 64 |             so options relating to element ordering will have no effect.
 65 |         kwargs: Passed to `PDFDocument`. See the documentation for `PDFDocument`.
 66 | 
 67 |     Returns:
 68 |         PDFDocument: A PDFDocument with the file loaded.
 69 |     """
 70 |     if la_params is None:
 71 |         la_params = {}
 72 |     la_params = {**DEFAULT_LA_PARAMS, **la_params}
 73 | 
 74 |     pages: Dict[int, Page] = {}
 75 |     for page in extract_pages(
 76 |         pdf_file, laparams=LAParams(**la_params), password=password
 77 |     ):
 78 |         elements = [element for element in page if isinstance(element, LTTextBox)]
 79 | 
 80 |         # If all_texts=True then we may get some text from inside figures
 81 |         if la_params.get("all_texts"):
 82 |             figures = (element for element in page if isinstance(element, LTFigure))
 83 |             for figure in figures:
 84 |                 elements += [
 85 |                     element for element in figure if isinstance(element, LTTextBox)
 86 |                 ]
 87 | 
 88 |         if not elements:
 89 |             logger.warning(
 90 |                 f"No elements detected on page {page.pageid}, skipping this page."
 91 |             )
 92 |             continue
 93 | 
 94 |         pages[page.pageid] = Page(
 95 |             width=page.width, height=page.height, elements=elements
 96 |         )
 97 | 
 98 |     # Disable pytype check due to false positive. See the following issue for details:
 99 |     # https://github.com/google/pytype/issues/1028
100 |     # pytype: disable=wrong-arg-types
101 |     return PDFDocument(pages=pages, pdf_file_path=pdf_file_path, **kwargs)
102 |     # pytype: enable=wrong-arg-types
103 | 


--------------------------------------------------------------------------------
/py_pdf_parser/sectioning.py:
--------------------------------------------------------------------------------
  1 | from typing import TYPE_CHECKING, Dict, Generator, ValuesView
  2 | 
  3 | from collections import defaultdict
  4 | 
  5 | from .exceptions import InvalidSectionError, SectionNotFoundError
  6 | from .filtering import ElementList
  7 | 
  8 | if TYPE_CHECKING:
  9 |     from .components import PDFDocument, PDFElement
 10 | 
 11 | 
 12 | class Section:
 13 |     """
 14 |     A continuous group of elements within a document.
 15 | 
 16 |     A section is intended to label a group of elements. Said elements must be continuous
 17 |     in the document.
 18 | 
 19 |     Warning:
 20 |         You should not instantiate a Section class yourself, but should call
 21 |         `create_section` from the `Sectioning` class below.
 22 | 
 23 |     Args:
 24 |         document (PDFDocument): A reference to the document.
 25 |         name (str): The name of the section.
 26 |         unique_name (str): Multiple sections can have the same name, but a unique name
 27 |             will be generated by the Sectioning class.
 28 |         start_element (PDFElement): The first element in the section.
 29 |         end_element (PDFElement): The last element in the section.
 30 |     """
 31 | 
 32 |     document: "PDFDocument"
 33 |     name: str
 34 |     unique_name: str
 35 |     start_element: "PDFElement"
 36 |     end_element: "PDFElement"
 37 | 
 38 |     def __init__(
 39 |         self,
 40 |         document: "PDFDocument",
 41 |         name: str,
 42 |         unique_name: str,
 43 |         start_element: "PDFElement",
 44 |         end_element: "PDFElement",
 45 |     ):
 46 |         if start_element._index > end_element._index:
 47 |             raise InvalidSectionError("end_element must come after start_element")
 48 |         self.document = document
 49 |         self.name = name
 50 |         self.unique_name = unique_name
 51 |         self.start_element = start_element
 52 |         self.end_element = end_element
 53 | 
 54 |     def __contains__(self, element: "PDFElement") -> bool:
 55 |         return element in self.elements
 56 | 
 57 |     @property
 58 |     def elements(self) -> "ElementList":
 59 |         """
 60 |         All the elements in the section.
 61 | 
 62 |         Returns:
 63 |             ElementList: All the elements in the section.
 64 |         """
 65 |         return self.document.elements.between(
 66 |             self.start_element, self.end_element, inclusive=True
 67 |         )
 68 | 
 69 |     def __eq__(self, other: object) -> bool:
 70 |         """
 71 |         Returns True if the two sections have the same unique name and are from the
 72 |         same document
 73 |         """
 74 |         if not isinstance(other, Section):
 75 |             raise NotImplementedError(f"Can't compare Section with {type(other)}")
 76 |         return all(
 77 |             [
 78 |                 self.document == other.document,
 79 |                 self.unique_name == other.unique_name,
 80 |                 self.start_element == other.start_element,
 81 |                 self.end_element == other.end_element,
 82 |                 self.__class__ == other.__class__,
 83 |             ]
 84 |         )
 85 | 
 86 |     def __len__(self) -> int:
 87 |         """
 88 |         Returns the number of elements in the section.
 89 |         """
 90 |         return len(self.elements)
 91 | 
 92 |     def __repr__(self) -> str:
 93 |         return (
 94 |             f"<Section name: '{self.name}', unique_name: '{self.unique_name}', "
 95 |             f"number of elements: {len(self)}>"
 96 |         )
 97 | 
 98 | 
 99 | class Sectioning:
100 |     """
101 |     A sectioning utilities class, made available on all PDFDocuments as ``.sectioning``.
102 |     """
103 | 
104 |     document: "PDFDocument"
105 |     name_counts: Dict[str, int]
106 |     sections_dict: Dict[str, Section]
107 | 
108 |     def __init__(self, document: "PDFDocument"):
109 |         self.sections_dict = {}
110 |         self.name_counts = defaultdict(int)
111 |         self.document = document
112 | 
113 |     def create_section(
114 |         self,
115 |         name: str,
116 |         start_element: "PDFElement",
117 |         end_element: "PDFElement",
118 |         include_last_element: bool = True,
119 |     ) -> "Section":
120 |         """
121 |         Creates a new section with the specified name.
122 | 
123 |         Creates a new section with the specified name, starting at `start_element` and
124 |         ending at `end_element` (inclusive). The unique name will be set to name_<idx>
125 |         where <idx> is the number of existing sections with that name.
126 | 
127 |         Args:
128 |             name (str): The name of the new section.
129 |             start_element (PDFElement): The first element in the section.
130 |             end_element (PDFElement): The last element in the section.
131 |             include_last_element (bool): Whether the end_element should be included in
132 |                 the section, or only the elements which are strictly before the end
133 |                 element. Default: True (i.e. include end_element).
134 | 
135 |         Returns:
136 |             Section: The created section.
137 | 
138 |         Raises:
139 |             InvalidSectionError: If a the created section would be invalid. This is
140 |                 usually because the end_element comes after the start element.
141 |         """
142 |         current_count = self.name_counts[name]
143 |         unique_name = f"{name}_{current_count}"
144 |         self.name_counts[name] += 1
145 | 
146 |         if not include_last_element:
147 |             if end_element._index == 0:
148 |                 raise InvalidSectionError(
149 |                     "Section would contain no elements as end_element is the first "
150 |                     "element in the document and include_last_element is False"
151 |                 )
152 |             # We simply drop the index by one to get the element before
153 |             end_element = self.document._element_list[end_element._index - 1]
154 |         section = Section(self.document, name, unique_name, start_element, end_element)
155 |         self.sections_dict[unique_name] = section
156 |         return section
157 | 
158 |     def get_sections_with_name(self, name: str) -> Generator[Section, None, None]:
159 |         """
160 |         Returns a list of all sections with the given name.
161 |         """
162 |         return (
163 |             self.sections_dict[f"{name}_{idx}"]
164 |             for idx in range(0, self.name_counts[name])
165 |         )
166 | 
167 |     def get_section(self, unique_name: str) -> Section:
168 |         """
169 |         Returns the section with the given unique name.
170 | 
171 |         Raises:
172 |             SectionNotFoundError: If there is no section with the given unique_name.
173 |         """
174 |         try:
175 |             return self.sections_dict[unique_name]
176 |         except KeyError as err:
177 |             raise SectionNotFoundError(
178 |                 f"Could not find section with name {unique_name}"
179 |             ) from err
180 | 
181 |     @property
182 |     def sections(self) -> ValuesView[Section]:
183 |         """
184 |         Returns the list of all created Sections.
185 |         """
186 |         return self.sections_dict.values()
187 | 


--------------------------------------------------------------------------------
/py_pdf_parser/visualise/__init__.py:
--------------------------------------------------------------------------------
1 | from .main import visualise
2 | 
3 | __all__ = [
4 |     "visualise",
5 | ]
6 | 


--------------------------------------------------------------------------------
/py_pdf_parser/visualise/background.py:
--------------------------------------------------------------------------------
 1 | import io
 2 | 
 3 | import numpy
 4 | import wand.color
 5 | import wand.image
 6 | from PIL import Image
 7 | 
 8 | 
 9 | def get_pdf_background(pdf_file_path: str, page_number: int) -> Image.Image:
10 |     """
11 |     Create a screenshot of this PDF page using Ghostscript, to use as the
12 |     background for the matplotlib chart.
13 |     """
14 |     # Appending e.g. [0] to the filename means it only loads the first page
15 |     path_with_page = f"{pdf_file_path}[{page_number - 1}]"
16 |     pdf_pages = wand.image.Image(filename=path_with_page, resolution=150)
17 |     page = pdf_pages.sequence[0]
18 | 
19 |     with wand.image.Image(page) as image:
20 |         # We need to composite this with a white image as a background,
21 |         # because disabling the alpha channel doesn't work.
22 |         bg_params = {
23 |             "width": image.width,
24 |             "height": image.height,
25 |             "background": wand.color.Color("white"),
26 |         }
27 |         with wand.image.Image(**bg_params) as background:
28 |             background.composite(image, 0, 0)
29 |             img_buffer = numpy.asarray(
30 |                 bytearray(background.make_blob(format="png")), dtype="uint8"
31 |             )
32 |             img_stream = io.BytesIO(img_buffer.tobytes())
33 | 
34 |     return Image.open(img_stream).transpose(Image.FLIP_TOP_BOTTOM).convert("RGBA")
35 | 


--------------------------------------------------------------------------------
/py_pdf_parser/visualise/info_figure.py:
--------------------------------------------------------------------------------
 1 | from typing import TYPE_CHECKING, Dict, List, Optional
 2 | 
 3 | from matplotlib.backend_bases import MouseButton
 4 | 
 5 | if TYPE_CHECKING:
 6 |     from py_pdf_parser.components import PDFElement
 7 | 
 8 | 
 9 | def get_clicked_element_info(clicked_elements: Dict[MouseButton, "PDFElement"]) -> str:
10 |     left_element = clicked_elements.get(MouseButton.LEFT)
11 |     right_element = clicked_elements.get(MouseButton.RIGHT)
12 | 
13 |     output = []
14 | 
15 |     output.append("Left clicked element:")
16 |     output.append("---------------------")
17 |     output += _get_element_info(left_element)
18 |     output.append("")
19 | 
20 |     output.append("Right clicked element:")
21 |     output.append("---------------------")
22 |     output += _get_element_info(right_element)
23 |     output.append("")
24 | 
25 |     output.append("Element comparison:")
26 |     output.append("-------------------")
27 |     output += _get_element_comparison_info(left_element, right_element)
28 |     return "\n".join(output)
29 | 
30 | 
def _get_element_info(element: Optional["PDFElement"]) -> List[str]:
    """
    Describes a single element, one detail per line.

    Args:
        element (PDFElement, optional): The element to describe.

    Returns:
        list[str]: The text, font, tags, bounding box, width and height of the
            element, or a single hint line if no element was given.
    """
    if element:
        details = (
            ("Text", element.text(stripped=False)),
            ("Font", element.font),
            ("Tags", element.tags),
            ("Bounding box", element.bounding_box),
            ("Width", element.bounding_box.width),
            ("Height", element.bounding_box.height),
        )
        return [f"{label}: {value}" for label, value in details]
    return ["Click an element to see details"]
42 | 
43 | 
def _get_element_comparison_info(
    element1: Optional["PDFElement"], element2: Optional["PDFElement"]
) -> List[str]:
    """
    Compares the bounding boxes of two elements.

    Reports the difference in height, the vertical gap between the elements and
    the closest horizontal alignment (left, right or center). Each value is also
    given relative to the height of `element1`.

    Args:
        element1 (PDFElement, optional): The reference (left clicked) element.
        element2 (PDFElement, optional): The element to compare against.

    Returns:
        list[str]: The comparison, one value per line, or a single hint line if
            either element is missing. If `element1` has zero height the relative
            values are reported as nan.
    """
    if element1 is None or element2 is None:
        return ["Left click one element and right click another to see comparison"]

    bbox1 = element1.bounding_box
    bbox2 = element2.bounding_box
    reference_height = bbox1.height

    def relative(value: float) -> float:
        # A zero-height reference element would raise ZeroDivisionError here,
        # so report the relative value as undefined instead.
        if not reference_height:
            return float("nan")
        return value / reference_height

    # Height
    height_diff = abs(bbox1.height - bbox2.height)

    # Line margin (i.e. vertical gap)
    line_margin = max(bbox1.y0 - bbox2.y1, bbox2.y0 - bbox1.y1)

    # Alignment: pick whichever of left/right/center edges are closest together.
    # min() returns the first of equally close candidates.
    alignments = {
        "left": abs(bbox1.x0 - bbox2.x0),
        "right": abs(bbox1.x1 - bbox2.x1),
        "center": abs((bbox1.x0 + bbox1.x1) / 2 - (bbox2.x0 + bbox2.x1) / 2),
    }
    alignment_name, alignment_value = min(alignments.items(), key=lambda x: x[1])

    return [
        "Note 'relative' is relative to the left clicked element",
        f"Height diff: {height_diff}",
        f"Relative height diff {relative(height_diff)}",
        f"Line margin: {line_margin}",
        f"Relative line margin: {relative(line_margin)}",
        f"Closest alignment: {alignment_value} ({alignment_name})",
        f"Relative alignment: {relative(alignment_value)}",
    ]
80 | 


--------------------------------------------------------------------------------
/py_pdf_parser/visualise/main.py:
--------------------------------------------------------------------------------
  1 | from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple
  2 | 
  3 | import logging
  4 | import tkinter as tk
  5 | 
  6 | import matplotlib
  7 | from matplotlib.backend_bases import MouseButton
  8 | from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg, NavigationToolbar2Tk
  9 | from matplotlib.figure import Figure
 10 | 
 11 | from py_pdf_parser.components import PDFDocument
 12 | 
 13 | from .background import get_pdf_background
 14 | from .info_figure import get_clicked_element_info
 15 | from .sections import SectionVisualiser
 16 | 
 17 | if TYPE_CHECKING:
 18 |     from matplotlib.axes import Axes
 19 |     from matplotlib.backend_bases import MouseEvent
 20 |     from matplotlib.figure import Text
 21 | 
 22 |     from py_pdf_parser.components import PDFElement
 23 |     from py_pdf_parser.filtering import ElementList
 24 | 
 25 | logger = logging.getLogger("PDFParser")
 26 | 
 27 | 
# Keyword arguments used to style the rectangle drawn around each element, keyed by
# the kind of element: "tagged" for elements which have tags, "untagged" for those
# without, and "ignored" for elements which are ignored by the document.
STYLES = {
    "untagged": {"color": "#00a9f4", "linewidth": 1, "alpha": 0.5},
    "tagged": {"color": "#007ac1", "linewidth": 1, "alpha": 0.5},
    "ignored": {"color": "#67daff", "linewidth": 1, "alpha": 0.2, "linestyle": ":"},
}

# The dpi passed to the main matplotlib figure.
DPI = 100
 35 | 
 36 | 
 37 | class CustomToolbar(NavigationToolbar2Tk):
 38 |     def __init__(
 39 |         self,
 40 |         canvas: tk.Canvas,
 41 |         window: tk.Tk,
 42 |         first_page_callback: Callable,
 43 |         previous_page_callback: Callable,
 44 |         next_page_callback: Callable,
 45 |         last_page_callback: Callable,
 46 |         *args: Any,
 47 |         **kwargs: Any,
 48 |     ):
 49 |         self.first_page_callback = first_page_callback
 50 |         self.previous_page_callback = previous_page_callback
 51 |         self.next_page_callback = next_page_callback
 52 |         self.last_page_callback = last_page_callback
 53 |         self.toolitems += (
 54 |             (None, None, None, None),  # Divider
 55 |             ("First page", "Go to fist page", "back", "first_page_callback"),
 56 |             ("Previous page", "Go to previous page", "back", "previous_page_callback"),
 57 |             ("Next page", "Go to next page", "forward", "next_page_callback"),
 58 |             ("Last page", "Go to last page", "forward", "last_page_callback"),
 59 |         )
 60 |         super().__init__(canvas, window, *args, **kwargs)
 61 | 
 62 |     def reset(self, not_first_page: bool, not_last_page: bool) -> None:
 63 |         map = {True: tk.ACTIVE, False: tk.DISABLED}
 64 |         self._buttons["First page"]["state"] = map[not_first_page]
 65 |         self._buttons["Previous page"]["state"] = map[not_first_page]
 66 |         self._buttons["Next page"]["state"] = map[not_last_page]
 67 |         self._buttons["Last page"]["state"] = map[not_last_page]
 68 | 
 69 | 
 70 | class PDFVisualiser:
 71 |     """
 72 |     Class used to handle visualising the PDF. Do not instantiate this yourself, instead
 73 |     you should call the `visualise` function.
 74 | 
 75 |     We need a class as we have to keep track of the current page etc.
 76 |     """
 77 | 
 78 |     document: PDFDocument
 79 |     current_page: int
 80 |     __ax: "Axes"
 81 |     __fig: "Figure"
 82 |     __info_fig: Optional["Figure"] = None
 83 |     __info_text: Optional["Text"] = None
 84 |     __section_visualiser: "SectionVisualiser"
 85 | 
 86 |     __clicked_elements: Dict[MouseButton, "PDFElement"] = {}
 87 | 
 88 |     def __init__(
 89 |         self,
 90 |         root: tk.Tk,
 91 |         document: PDFDocument,
 92 |         current_page: int = 1,
 93 |         elements: Optional["ElementList"] = None,
 94 |         show_info: bool = False,
 95 |         width: Optional[int] = None,
 96 |         height: Optional[int] = None,
 97 |     ):
 98 |         if not document._pdf_file_path:
 99 |             logger.warning(
100 |                 "PDFDocument does not initialised with pdf_file_path and so we cannot "
101 |                 "add the PDF background for visualisation. Please use load_file "
102 |                 "instead of load, or specify pdf_file_path manually"
103 |             )
104 | 
105 |         self.document = document
106 |         self.current_page = current_page
107 |         if elements is not None:
108 |             self.elements = elements
109 |         else:
110 |             self.elements = document.elements
111 |         self.show_info = show_info
112 | 
113 |         self.root = root
114 |         if width is None:
115 |             width = self.root.winfo_screenwidth()
116 |         if height is None:
117 |             height = self.root.winfo_screenheight()
118 |         self.root.geometry(f"{width}x{height}")
119 |         title = "py-pdf-parser"
120 |         if self.document._pdf_file_path:
121 |             title += f" - {self.document._pdf_file_path}"
122 |         self.root.title(title)
123 | 
124 |         self.__fig = Figure(figsize=(5, 4), dpi=DPI)
125 |         self.canvas = FigureCanvasTkAgg(self.__fig, master=self.root)
126 |         self.canvas.get_tk_widget().pack(side=tk.TOP, fill=tk.BOTH, expand=1)
127 |         self.toolbar = CustomToolbar(
128 |             self.canvas,
129 |             self.root,
130 |             next_page_callback=self.__next_page,
131 |             first_page_callback=self.__first_page,
132 |             previous_page_callback=self.__previous_page,
133 |             last_page_callback=self.__last_page,
134 |         )
135 | 
136 |         self.__ax = self.canvas.figure.add_subplot(111)
137 | 
138 |         self.__section_visualiser = SectionVisualiser(self.document, self.__ax)
139 | 
140 |         if self.show_info:
141 |             self.__info_fig, self.__info_text = self.__initialise_info_fig()
142 | 
143 |         self.__plot_current_page()
144 | 
145 |     def __plot_current_page(self) -> None:
146 |         if self.show_info:
147 |             self.__clear_clicked_elements()
148 | 
149 |         self.__ax.cla()
150 | 
151 |         # draw PDF image as background
152 |         page = self.document.get_page(self.current_page)
153 |         if self.document._pdf_file_path is not None:
154 |             background = get_pdf_background(
155 |                 self.document._pdf_file_path, self.current_page
156 |             )
157 |             self.__ax.imshow(
158 |                 background,
159 |                 origin="lower",
160 |                 extent=[0, page.width, 0, page.height],
161 |                 interpolation="kaiser",
162 |             )
163 |         else:
164 |             self.__ax.set_aspect("equal")
165 |             self.__ax.set_xlim([0, page.width])
166 |             self.__ax.set_ylim([0, page.height])
167 | 
168 |         page = self.document.get_page(self.current_page)
169 |         for element in page.elements & self.elements:
170 |             style = STYLES["tagged"] if element.tags else STYLES["untagged"]
171 |             self.__plot_element(element, style)
172 | 
173 |         # We'd like to draw greyed out rectangles around the ignored elements, but these
174 |         # are excluded from ElementLists, so we need to do this manually.
175 |         page_indexes = set(
176 |             range(page.start_element._index, page.end_element._index + 1)
177 |         )
178 |         ignored_indexes_on_page = page_indexes & self.document._ignored_indexes
179 |         for index in ignored_indexes_on_page:
180 |             element = self.document._element_list[index]
181 |             self.__plot_element(element, STYLES["ignored"])
182 | 
183 |         self.__section_visualiser.plot_sections_for_page(page)
184 | 
185 |         self.__ax.format_coord = self.__get_annotations
186 |         self.__reset_toolbar()
187 | 
188 |     def __initialise_info_fig(self) -> Tuple["Figure", "Axes"]:
189 |         window = tk.Toplevel(self.root)
190 | 
191 |         info_fig = Figure()
192 |         canvas = FigureCanvasTkAgg(info_fig, window)
193 |         canvas.get_tk_widget().pack(side=tk.TOP, fill=tk.BOTH, expand=True)
194 | 
195 |         self.canvas.mpl_connect("button_press_event", self.__on_click)
196 | 
197 |         info_text = info_fig.text(
198 |             0.01,
199 |             0.5,
200 |             "",
201 |             horizontalalignment="left",
202 |             verticalalignment="center",
203 |         )
204 |         return info_fig, info_text
205 | 
206 |     def __on_click(self, event: "MouseEvent") -> None:
207 |         if event.button == MouseButton.MIDDLE:
208 |             self.__clear_clicked_elements()
209 |             return
210 |         if event.button not in [MouseButton.LEFT, MouseButton.RIGHT]:
211 |             return
212 |         for rect in self.__ax.patches:
213 |             if not rect.contains(event)[0]:
214 |                 continue
215 |             # rect is the rectangle we clicked on!
216 |             self.__clicked_elements[event.button] = rect.element
217 |             self.__update_text()
218 | 
219 |             return
220 | 
221 |     def __clear_clicked_elements(self) -> None:
222 |         self.__clicked_elements = {}
223 |         self.__update_text()
224 | 
225 |     def __update_text(self) -> None:
226 |         if self.__info_text is None or self.__info_fig is None:
227 |             return
228 |         self.__info_text.set_text(get_clicked_element_info(self.__clicked_elements))
229 |         self.__info_fig.canvas.draw()
230 | 
231 |     def __plot_element(self, element: "PDFElement", style: Dict) -> None:
232 |         rect = _ElementRectangle(element, **style)
233 |         self.__ax.add_patch(rect)
234 | 
235 |     def __reset_toolbar(self) -> None:
236 |         not_first_page = self.current_page != 1
237 |         not_last_page = self.current_page != self.document.number_of_pages
238 |         self.toolbar.reset(not_first_page, not_last_page)
239 | 
240 |     def __get_annotations(self, x: float, y: float) -> str:
241 |         annotation = f"({x:.2f}, {y:.2f})"
242 |         for element in self.elements.filter_by_page(self.current_page):
243 |             bbox = element.bounding_box
244 |             if bbox.x0 <= x <= bbox.x1 and bbox.y0 <= y <= bbox.y1:
245 |                 annotation += f" {element}"
246 |                 sections_dict = self.document.sectioning.sections_dict
247 |                 section_names = [
248 |                     section_name
249 |                     for section_name, section in sections_dict.items()
250 |                     if element in section
251 |                 ]
252 |                 if section_names:
253 |                     sections_str = "', '".join(section_names)
254 |                     annotation += f", SECTIONS: '{sections_str}'"
255 | 
256 |         return annotation
257 | 
258 |     def __first_page(self) -> None:
259 |         self.__set_page(min(self.document.page_numbers))
260 | 
261 |     def __last_page(self) -> None:
262 |         self.__set_page(max(self.document.page_numbers))
263 | 
264 |     def __next_page(self) -> None:
265 |         current_page_idx = self.document.page_numbers.index(self.current_page)
266 |         next_page_idx = min(current_page_idx + 1, self.document.number_of_pages)
267 |         next_page = self.document.page_numbers[next_page_idx]
268 |         self.__set_page(next_page)
269 | 
270 |     def __previous_page(self) -> None:
271 |         current_page_idx = self.document.page_numbers.index(self.current_page)
272 |         previous_page_idx = max(current_page_idx - 1, 0)
273 |         previous_page = self.document.page_numbers[previous_page_idx]
274 |         self.__set_page(previous_page)
275 | 
276 |     def __set_page(self, page_number: int) -> None:
277 |         if self.current_page != page_number:
278 |             self.current_page = page_number
279 |             self.__plot_current_page()
280 |             self.__fig.canvas.draw()
281 | 
282 | 
class _ElementRectangle(matplotlib.patches.Rectangle):
    """
    A matplotlib.patches.Rectangle which remembers the element it was drawn for.

    The position and size of the rectangle are taken from the bounding box of the
    element, and the element itself is available as the `element` attribute. Any
    keyword arguments are passed on to the Rectangle.
    """

    def __init__(self, element: "PDFElement", **style: str):
        self.element = element
        box = element.bounding_box
        anchor = (box.x0, box.y0)
        super().__init__(anchor, box.width, box.height, **style)
294 | 
295 | 
def visualise(
    document: PDFDocument,
    page_number: int = 1,
    elements: Optional["ElementList"] = None,
    show_info: bool = False,
    width: Optional[int] = None,
    height: Optional[int] = None,
) -> None:
    """
    Visualises a PDFDocument, allowing you to inspect all the elements.

    Opens a window showing the page with the given page_number and blocks until
    that window is closed. The page buttons on the toolbar can be used to navigate
    through the pages.

    Warning:
        In order to show you the actual PDF behind the elements, your document
        must be initialised with pdf_file_path, and your PDF must be at the given path.
        If this is not done, the background will be white.

    Args:
        document (PDFDocument): The pdf document to visualise.
        page_number (int): The page to visualise initially. Default: 1.
        elements (ElementList, optional): Which elements of the document to visualise.
            Defaults to all of the elements in the document.
        show_info (bool): Shows an additional window allowing you to click on
            PDFElements and see details about them. Default: False.
        width: (int, optional): The initial width of the visualisation window.
            Default: Screen width.
        height: (int, optional): The initial height of the visualisation window.
            Default: Screen height.
    """
    window = tk.Tk()
    PDFVisualiser(
        root=window,
        document=document,
        current_page=page_number,
        elements=elements,
        show_info=show_info,
        width=width,
        height=height,
    )
    window.mainloop()
331 | 


--------------------------------------------------------------------------------
/py_pdf_parser/visualise/sections.py:
--------------------------------------------------------------------------------
  1 | from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
  2 | 
  3 | import pyvoronoi
  4 | from matplotlib import cm
  5 | from shapely import geometry, ops
  6 | 
  7 | if TYPE_CHECKING:
  8 |     from matplotlib.axes import Axes
  9 | 
 10 |     from py_pdf_parser.components import ElementList, PDFDocument, PDFElement, PDFPage
 11 |     from py_pdf_parser.sectioning import Section
 12 | 
 13 | 
 14 | # The simple boundary margins are used when trying to draw simple rectangles around
 15 | # sections - we see if each of them work in turn. A higher margin means more space
 16 | # between the elements and the section boundary line.
 17 | SIMPLE_BOUNDARY_MARGINS = [10, 5, 2, 0]
 18 | 
 19 | 
 20 | class SectionVisualiser:
 21 |     """
 22 |     Used internally to draw outlines of sections on the visualise plot.
 23 | 
 24 |     We first try to draw a simple rectangle around the section with a fixed margin, for
 25 |     increasingly small margins. If this doesn't work (because an element that is not
 26 |     in the section would be within the section outline rectangle) then we instead
 27 |     construct the boundary as follows:
 28 | 
 29 |     We create a Voronoi diagram around all of the elements on the page, and the page
 30 |     boundaries (actually we get a diagram around each side of the bounding box of each
 31 |     element). Then for each line in the diagram we check if it was generated between one
 32 |     box which is in the section and one which isn't, and if so we draw it.
 33 | 
 34 |     This produces some slightly interesting outlines, and so we also run a
 35 |     simplification check. This takes three points on the outline, and if the triangle
 36 |     created by joining them together doesn't contain any of our elements, we can remove
 37 |     the middle point to make the whole shape a bit simpler.
 38 | 
 39 |     It can still produce some slightly interesting shapes, but does work fairly well.
 40 |     Importantly, every element in the section will be within the outline, and no boxes
 41 |     which are not in the section will be (which cannot always be achieved by simply
 42 |     drawing a rectangle around all the points in the section).
 43 | 
 44 |     It does add some time when changing page on the visualise tool, but the whole
 45 |     process is done in <0.5 seconds which is acceptable for a development tool.
 46 |     """
 47 | 
 48 |     all_elements: List["PDFElement"]
 49 |     document: "PDFDocument"
 50 |     page: "PDFPage"
 51 |     pv: Optional["pyvoronoi.Pyvoronoi"]
 52 |     pv_segments: Optional[List]
 53 | 
 54 |     __ax: "Axes"
 55 |     __sections_by_page_number: Dict[int, List["Section"]]
 56 | 
 57 |     def __init__(self, document: "PDFDocument", ax: "Axes"):
 58 |         self.document = document
 59 |         self.__ax = ax
 60 | 
 61 |         colour_map = cm.get_cmap("Dark2").colors
 62 |         self.__colour_mapping = {
 63 |             section.unique_name: colour_map[idx % len(colour_map)]
 64 |             for idx, section in enumerate(self.document.sectioning.sections)
 65 |         }
 66 | 
 67 |         self.__sections_by_page_number = {}
 68 | 
 69 |     def __get_sections_for_page(self, page: "PDFPage") -> List["Section"]:
 70 |         if page.page_number not in self.__sections_by_page_number:
 71 |             self.__sections_by_page_number[page.page_number] = [
 72 |                 section
 73 |                 for section in self.document.sectioning.sections
 74 |                 if section.elements & page.elements
 75 |             ]
 76 |         return self.__sections_by_page_number[page.page_number]
 77 | 
 78 |     def __get_segment_for_element(self, element: "PDFElement") -> List:
 79 |         bbox = element.bounding_box
 80 |         return [
 81 |             ((bbox.x0, bbox.y0), (bbox.x0, bbox.y1)),
 82 |             ((bbox.x0, bbox.y1), (bbox.x1, bbox.y1)),
 83 |             ((bbox.x1, bbox.y1), (bbox.x1, bbox.y0)),
 84 |             ((bbox.x1, bbox.y0), (bbox.x0, bbox.y0)),
 85 |         ]
 86 | 
 87 |     def __get_segments_for_elements(self, elements: List["PDFElement"]) -> List:
 88 |         return [
 89 |             (start, end)
 90 |             for element in elements
 91 |             for start, end in self.__get_segment_for_element(element)
 92 |         ]
 93 | 
 94 |     def __get_element_boxes(
 95 |         self, elements: Union[List["PDFElement"], "ElementList"]
 96 |     ) -> List:
 97 |         return [
 98 |             geometry.box(
 99 |                 element.bounding_box.x0,
100 |                 element.bounding_box.y0,
101 |                 element.bounding_box.x1,
102 |                 element.bounding_box.y1,
103 |             )
104 |             for element in elements
105 |         ]
106 | 
    def __simplify_outlines(
        self, line: geometry.LineString
    ) -> Tuple[List[int], List[int]]:
        """
        Simplifies the outline by considering sets of 3 consecutive vertices, and if
        there are no elements in this triangle, removes the middle vertex from the
        shape. This is done iteratively around the shape until no further changes are
        made.

        A triangle with an area below 0.1 is also treated as removable, regardless of
        whether it touches an element.

        Args:
            line (LineString): The outline to simplify.

        Returns:
            tuple: The x coordinates and the y coordinates of the simplified outline,
                with the first point repeated at the end.
        """
        xs, ys = line.xy

        # The last point is the same as the first point, which makes things a bit more
        # complicated. We simply remove the last point and add it back at the end.
        xs.pop(-1)
        ys.pop(-1)
        boxes = self.__get_element_boxes(self.all_elements)
        idx = 0
        # Number of consecutive triangles checked without removing a vertex.
        since_last_changed = 0
        # Keep walking round the shape (indexes wrap around) until more than a full
        # lap has been made without any vertex being removed.
        while since_last_changed <= len(xs) + 1:
            idx1 = (idx + 1) % len(xs)
            idx2 = (idx + 2) % len(xs)

            x0 = xs[idx]
            x1 = xs[idx1]
            x2 = xs[idx2]

            y0 = ys[idx]
            y1 = ys[idx1]
            y2 = ys[idx2]

            triangle_points = ((x0, y0), (x1, y1), (x2, y2), (x0, y0))
            triangle = geometry.Polygon(triangle_points)
            if triangle.area < 0.1 or not any(
                triangle.intersects(box) for box in boxes
            ):
                # The middle vertex can go: the triangle is (nearly) flat, or it
                # doesn't touch any element.
                xs.pop(idx1)
                ys.pop(idx1)
                since_last_changed = 0
            else:
                since_last_changed += 1

            idx = (idx + 1) % len(xs)

        # Add the last point back
        xs.append(xs[0])
        ys.append(ys[0])
        return xs, ys
154 | 
155 |     def __plot_edges(
156 |         self, to_plot: List, edges: List, vertices: List, label: str
157 |     ) -> None:
158 |         lines = []
159 |         for edge_idx in to_plot:
160 |             edge = edges[edge_idx]
161 |             start_vertex = vertices[edge.start]
162 |             end_vertex = vertices[edge.end]
163 |             # Note it could be that the edge is supposed to be parabola (edge.is_linear
164 |             # will be false), but in our case we always have boxes with 90 degree
165 |             # corners. If it's a parabola then the focus is one of these corners, and by
166 |             # drawing a line instead of a parabola we at worse cut through this point,
167 |             # which is fine.
168 |             lines.append(
169 |                 geometry.LineString(
170 |                     [[start_vertex.X, start_vertex.Y], [end_vertex.X, end_vertex.Y]]
171 |                 )
172 |             )
173 |         merged_line = ops.linemerge(geometry.MultiLineString(lines))
174 |         kwargs = {"label": label, "alpha": 0.5, "color": self.__colour_mapping[label]}
175 |         # Merged line is either a MultiLineString which means we need to draw multiple
176 |         # lines, or it is a LineString which means we only need to draw one.
177 |         if isinstance(merged_line, geometry.MultiLineString):
178 |             for line in merged_line:
179 |                 xs, ys = self.__simplify_outlines(line)
180 |                 self.__ax.plot(xs, ys, **kwargs)
181 |                 kwargs.pop(
182 |                     "label", None
183 |                 )  # Only pass label once for single legend entry
184 |         else:
185 |             xs, ys = self.__simplify_outlines(merged_line)
186 |             self.__ax.plot(xs, ys, **kwargs)
187 | 
188 |     def __plot_section(self, section: "Section") -> None:
189 |         if self.pv is None or self.pv_segments is None:
190 |             self.pv, self.pv_segments = self.__get_voronoi()
191 |         edges = self.pv.GetEdges()
192 |         vertices = self.pv.GetVertices()
193 |         cells = self.pv.GetCells()
194 | 
195 |         # If an ignored element is within the section, we need to draw lines around it.
196 |         # The following code gets the first and last non-ignored elements in the section
197 |         # on the page, and then gets all elements between (inclusive) these elements,
198 |         # even if they are ignored.
199 |         section_elements_on_page = section.elements & self.page.elements
200 |         section_elements = [
201 |             section.document._element_list[index]
202 |             for index in range(
203 |                 section_elements_on_page[0]._index,
204 |                 section_elements_on_page[-1]._index + 1,
205 |             )
206 |         ]
207 |         section_segments = self.__get_segments_for_elements(section_elements)
208 |         in_section = [point in section_segments for point in self.pv_segments]
209 | 
210 |         to_plot = []
211 |         for idx, edge in enumerate(edges):
212 |             first_segment = cells[edge.cell].site
213 |             second_segment = cells[edges[edge.twin].cell].site
214 |             # We should plot if the first segment is in the section and the second isn't
215 |             if in_section[first_segment] and not in_section[second_segment]:
216 |                 to_plot.append(idx)
217 | 
218 |         self.__plot_edges(to_plot, edges, vertices, label=section.unique_name)
219 | 
220 |     def __get_voronoi(self) -> Tuple[pyvoronoi.Pyvoronoi, List]:
221 |         all_segments = self.__get_segments_for_elements(self.all_elements)
222 |         # Add the page boundary as segments:
223 |         all_segments += [
224 |             [(0, 0), (0, self.page.height)],
225 |             [(0, 0), (self.page.width, 0)],
226 |             [(0, self.page.height), (self.page.width, self.page.height)],
227 |             [(self.page.width, 0), (self.page.width, self.page.height)],
228 |         ]
229 | 
230 |         pv = pyvoronoi.Pyvoronoi(10)
231 |         for segment in all_segments:
232 |             pv.AddSegment(segment)
233 | 
234 |         pv.Construct()
235 |         return pv, all_segments
236 | 
237 |     def __get_boundary_for_elements(
238 |         self, elements: "ElementList", margin: int
239 |     ) -> Tuple[float, float, float, float]:
240 |         x0s = [element.bounding_box.x0 for element in elements]
241 |         x1s = [element.bounding_box.x1 for element in elements]
242 |         y0s = [element.bounding_box.y0 for element in elements]
243 |         y1s = [element.bounding_box.y1 for element in elements]
244 | 
245 |         x0 = min(x0s) - margin
246 |         x1 = max(x1s) + margin
247 |         y0 = min(y0s) - margin
248 |         y1 = max(y1s) + margin
249 | 
250 |         return x0, x1, y0, y1
251 | 
252 |     def __plot_section_simple(self, section: "Section") -> bool:
253 |         section_elements_on_page = section.elements & self.page.elements
254 |         non_section_elements = self.page.elements - section_elements_on_page
255 |         boxes = self.__get_element_boxes(non_section_elements)
256 | 
257 |         for margin in SIMPLE_BOUNDARY_MARGINS:
258 |             x0, x1, y0, y1 = self.__get_boundary_for_elements(
259 |                 section_elements_on_page, margin=margin
260 |             )
261 | 
262 |             boundary = geometry.box(x0, y0, x1, y1)
263 | 
264 |             if not any(box.intersects(boundary) for box in boxes):
265 |                 # No elements outside of the section are within this boundary, and as
266 |                 # such we can simply draw this boundary as the section outline. Break.
267 |                 break
268 |         else:
269 |             # None of the margins gave us a box which did not contain any non-section
270 |             # elements. We cannot use the simple method.
271 |             return False
272 | 
273 |         label = section.unique_name
274 | 
275 |         kwargs = {"label": label, "alpha": 0.5, "color": self.__colour_mapping[label]}
276 |         self.__ax.plot([x0, x1, x1, x0, x0], [y0, y0, y1, y1, y0], **kwargs)
277 | 
278 |         return True
279 | 
280 |     def plot_sections_for_page(self, page: "PDFPage") -> None:
281 |         self.pv = None
282 |         self.pv_segments = None
283 |         self.page = page
284 | 
285 |         sections = self.__get_sections_for_page(page)
286 | 
287 |         if not sections:
288 |             # No sections on page, nothing to plot
289 |             return
290 | 
291 |         # We want to include ignored elements for this bit.
292 |         page_indexes = set(
293 |             range(page.start_element._index, page.end_element._index + 1)
294 |         )
295 |         ignored_indexes_on_page = page_indexes & self.document._ignored_indexes
296 |         self.all_elements = list(page.elements) + [
297 |             self.document._element_list[index] for index in ignored_indexes_on_page
298 |         ]
299 | 
300 |         for section in sections:
301 |             plotted = self.__plot_section_simple(section)
302 |             if not plotted:
303 |                 self.__plot_section(section)
304 | 
305 |         # Show the legend
306 |         self.__ax.legend()
307 | 


--------------------------------------------------------------------------------
/pycodestyle.cfg:
--------------------------------------------------------------------------------
1 | [pycodestyle]
2 | # E231 is ignored because current released version of black can't handle it, we should
3 | # remove it from the ignore list when https://github.com/psf/black/issues/1010 is solved
4 | ignore = E722, E731, E241, E402, E203, W503, E231
5 | max-line-length = 88
6 | count = True
7 | exclude=.venv,.pytype
8 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.isort]
2 | profile = "black"
3 | known_typing = "typing"
4 | sections = "FUTURE,TYPING,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER"
5 | 


--------------------------------------------------------------------------------
/pytype.cfg:
--------------------------------------------------------------------------------
 1 | # NOTE: All relative paths are relative to the location of this file.
 2 | 
 3 | [pytype]
 4 | 
 5 | # Space-separated list of files or directories to exclude.
 6 | exclude =
 7 |     **/*_test.py
 8 |     **/test_*.py
 9 | 
10 | # Space-separated list of files or directories to process.
11 | inputs =
12 |     .
13 | 
14 | # Keep going past errors to analyze as many files as possible.
15 | keep_going = False
16 | 
17 | # Run N jobs in parallel. When 'auto' is used, this will be equivalent to the
18 | # number of CPUs on the host system.
19 | jobs = 4
20 | 
21 | # All pytype output goes here.
22 | output = .pytype
23 | 
24 | # Paths to source code directories, separated by ':'.
25 | pythonpath =
26 |     .
27 | 
28 | # Python version (major.minor) of the target code.
29 | python_version = 3.8
30 | 
31 | # Use the enum overlay for more precise enum checking. This flag is temporary
32 | # and will be removed once this behavior is enabled by default.
33 | use_enum_overlay = Use the enum overlay for more precise enum checking.
34 | 
35 | # Build dict literals from dict(k=v, ...) calls. This flag is temporary and will
36 | # be removed once this behavior is enabled by default.
37 | build_dict_literals_from_kwargs = Build dict literals from dict(k=v, ...) calls.
38 | 
39 | # Enable stricter namedtuple checks, such as unpacking and 'typing.Tuple'
40 | # compatibility. This flag is temporary and will be removed once this behavior
41 | # is enabled by default.
42 | strict_namedtuple_checks = Enable stricter namedtuple checks, such as unpacking and 'typing.Tuple' compatibility.
43 | 
44 | # Enable exhaustive checking of function parameter types. This flag is temporary
45 | # and will be removed once this behavior is enabled by default.
46 | strict_parameter_checks = Enable exhaustive checking of function parameter types.
47 | 
48 | # Enable support for TypedDicts. This flag is temporary and will be removed once
49 | # this behavior is enabled by default.
50 | enable_typed_dicts = Enable support for TypedDicts.
51 | 
52 | # Solve unknown types to label with structural types. This flag is temporary and
53 | # will be removed once this behavior is enabled by default.
54 | protocols = Solve unknown types to label with structural types.
55 | 
56 | # Only load submodules that are explicitly imported. This flag is temporary and
57 | # will be removed once this behavior is enabled by default.
58 | strict_import = Only load submodules that are explicitly imported.
59 | 
60 | # Infer precise return types even for invalid function calls. This flag is
61 | # temporary and will be removed once this behavior is enabled by default.
62 | precise_return = Infer precise return types even for invalid function calls.
63 | 
64 | # Comma or space separated list of error names to ignore.
65 | disable =
66 |     pyi-error
67 | 
68 | # Don't report errors.
69 | report_errors = True
70 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | 
 4 | from setuptools import find_packages, setup
 5 | 
 6 | if sys.version_info < (3, 6):
 7 |     print(sys.stderr, "{}: need Python 3.6 or later.".format(sys.argv[0]))
 8 |     print(sys.stderr, "Your Python is {}".format(sys.version))
 9 |     sys.exit(1)
10 | 
11 | 
12 | ROOT_DIR = os.path.dirname(__file__)
13 | 
14 | 
15 | setup(
16 |     name="py-pdf-parser",
17 |     packages=find_packages(exclude=["tests", "tests.*", "docs", "docs.*"]),
18 |     version="0.13.0",
19 |     url="https://github.com/jstockwin/py-pdf-parser",
20 |     license="BSD",
21 |     description="A tool to help extracting information from structured PDFs.",
22 |     long_description=open(os.path.join(ROOT_DIR, "README.md")).read(),
23 |     long_description_content_type="text/markdown",
24 |     author="Jake Stockwin",
25 |     author_email="jstockwin@gmail.com",
26 |     include_package_data=True,
27 |     install_requires=[
28 |         "pdfminer.six==20220524",
29 |         "docopt==0.6.2",
30 |         "wand==0.6.10",
31 |     ],
32 |     extras_require={
33 |         "dev": [
34 |             "matplotlib==3.5.1",
35 |             "pillow==9.2.0",
36 |             "pyvoronoi==1.0.7",
37 |             "shapely==1.8.2",
38 |         ],
39 |         "test": [
40 |             "ddt==1.6.0",
41 |             "matplotlib==3.5.1",
42 |             "mock==4.0.3",
43 |             "nose==1.3.7",
44 |             "pillow==9.2.0",
45 |             "recommonmark==0.7.1",
46 |             "sphinx-autobuild==2021.3.14",
47 |             "sphinx-rtd-theme==1.0.0",
48 |             "Sphinx==5.2.3",
49 |         ],
50 |     },
51 | )
52 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jstockwin/py-pdf-parser/6aa400371e948307bcaa5073f33507c9b6e84f06/tests/__init__.py


--------------------------------------------------------------------------------
/tests/base.py:
--------------------------------------------------------------------------------
  1 | from typing import TYPE_CHECKING, List, Optional, Union
  2 | 
  3 | import _tkinter
  4 | import logging
  5 | import os
  6 | import tkinter as tk
  7 | from unittest import TestCase
  8 | 
  9 | from PIL import Image
 10 | 
 11 | if TYPE_CHECKING:
 12 |     from pdfminer.layout import LTComponent
 13 | 
 14 |     from py_pdf_parser.components import PDFElement
 15 |     from py_pdf_parser.filtering import ElementList
 16 | 
 17 | 
 18 | # Turn off debug spam from pdfminer, matplotlib, shapely
 19 | logging.getLogger("pdfminer").setLevel(logging.WARNING)
 20 | logging.getLogger("matplotlib").setLevel(logging.WARNING)
 21 | logging.getLogger("shapely").setLevel(logging.WARNING)
 22 | 
 23 | 
 24 | class BaseTestCase(TestCase):
 25 |     # Helper functions
 26 |     def assert_original_element_in(
 27 |         self, original_element: "LTComponent", element_list: "ElementList"
 28 |     ):
 29 |         pdf_element = self.extract_element_from_list(original_element, element_list)
 30 |         self.assertIn(pdf_element, element_list)
 31 | 
 32 |     def assert_original_element_list_list_equal(
 33 |         self,
 34 |         original_element_list_list: List[List[Optional["LTComponent"]]],
 35 |         element_list_list: List[List[Optional["PDFElement"]]],
 36 |     ):
 37 |         self.assertEqual(len(original_element_list_list), len(element_list_list))
 38 |         for original_element_list, element_list in zip(
 39 |             original_element_list_list, element_list_list
 40 |         ):
 41 |             self.assert_original_element_list_equal(original_element_list, element_list)
 42 | 
 43 |     def assert_original_element_list_equal(
 44 |         self,
 45 |         original_element_list: List[Optional["LTComponent"]],
 46 |         element_list: Union[List[Optional["PDFElement"]], "ElementList"],
 47 |     ):
 48 |         self.assertEqual(len(original_element_list), len(element_list))
 49 |         for original_element, element in zip(original_element_list, element_list):
 50 |             if original_element is None or element is None:
 51 |                 self.assertIsNone(original_element)
 52 |                 self.assertIsNone(element)
 53 |             else:
 54 |                 self.assert_original_element_equal(original_element, element)
 55 | 
 56 |     def assert_original_element_equal(
 57 |         self, original_element: "LTComponent", element: "PDFElement"
 58 |     ):
 59 |         self.assertEqual(original_element, element.original_element)
 60 | 
 61 |     def extract_element_from_list(
 62 |         self,
 63 |         original_element: "LTComponent",
 64 |         element_list: Union[List[Optional["PDFElement"]], "ElementList"],
 65 |     ) -> "PDFElement":
 66 |         return [
 67 |             elem
 68 |             for elem in element_list
 69 |             if elem is not None
 70 |             if elem.original_element == original_element
 71 |         ][0]
 72 | 
 73 | 
 74 | class BaseVisualiseTestCase(BaseTestCase):
 75 |     """
 76 |     See the answer from ivan_pozdeev at
 77 |     https://stackoverflow.com/questions/4083796/how-do-i-run-unittest-on-a-tkinter-app
 78 |     for the setUp, tearDown and pump_events methods. This basically allows us to
 79 |     run tk.mainloop() manually using pump_events, thus allowing us to use visualise
 80 |     without blocking the thread.
 81 | 
 82 |     There is also a custom check_images function to do comparison of the screenshots
 83 |     from visualise. You can set self.WRITE_NEW_TEST_IMAGES to True to write new images
 84 |     if they don't exist. This also allows you to delete images which are old, and then
 85 |     run the tests with WRITE_NEW_TEST_IMAGES=True to replace them.
 86 |     """
 87 | 
 88 |     WRITE_NEW_TEST_IMAGES = False
 89 | 
 90 |     def setUp(self):
 91 |         self.root = tk.Tk()
 92 |         self.pump_events()
 93 | 
 94 |     def tearDown(self):
 95 |         if self.root:
 96 |             self.root.destroy()
 97 |             self.pump_events()
 98 | 
 99 |     def pump_events(self):
100 |         while self.root.dooneevent(_tkinter.ALL_EVENTS | _tkinter.DONT_WAIT):
101 |             pass
102 | 
103 |     def check_images(self, visualiser, image_name):
104 |         self.pump_events()
105 |         root_path = os.path.join(os.path.dirname(__file__), "data", "images")
106 |         existing_file_path = os.path.join(root_path, f"{image_name}.png")
107 |         new_file_path = os.path.join(root_path, f"{image_name}-new.png")
108 | 
109 |         # Check if file exists (write if not)
110 |         if not os.path.isfile(existing_file_path):
111 |             if not self.WRITE_NEW_TEST_IMAGES:
112 |                 self.fail(f"Could not find existing image for {image_name=}. Set ")
113 | 
114 |             visualiser._PDFVisualiser__fig.savefig(existing_file_path)
115 | 
116 |         # Check images are identical (fail if not)
117 |         existing_image = Image.open(existing_file_path)
118 | 
119 |         visualiser._PDFVisualiser__fig.savefig(new_file_path)
120 |         new_image = Image.open(new_file_path)
121 | 
122 |         if new_image.tobytes() != existing_image.tobytes():
123 |             self.fail(f"Images differ for {image_name=}.")
124 | 
125 |         os.remove(new_file_path)
126 | 


--------------------------------------------------------------------------------
/tests/data/images/tables1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jstockwin/py-pdf-parser/6aa400371e948307bcaa5073f33507c9b6e84f06/tests/data/images/tables1.png


--------------------------------------------------------------------------------
/tests/data/images/tables2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jstockwin/py-pdf-parser/6aa400371e948307bcaa5073f33507c9b6e84f06/tests/data/images/tables2.png


--------------------------------------------------------------------------------
/tests/data/pdfs/image.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jstockwin/py-pdf-parser/6aa400371e948307bcaa5073f33507c9b6e84f06/tests/data/pdfs/image.pdf


--------------------------------------------------------------------------------
/tests/data/pdfs/test.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jstockwin/py-pdf-parser/6aa400371e948307bcaa5073f33507c9b6e84f06/tests/data/pdfs/test.pdf


--------------------------------------------------------------------------------
/tests/data/pdfs/test_protected.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jstockwin/py-pdf-parser/6aa400371e948307bcaa5073f33507c9b6e84f06/tests/data/pdfs/test_protected.pdf


--------------------------------------------------------------------------------
/tests/test_common.py:
--------------------------------------------------------------------------------
 1 | from py_pdf_parser.common import BoundingBox
 2 | from py_pdf_parser.exceptions import InvalidCoordinatesError
 3 | 
 4 | from .base import BaseTestCase
 5 | 
 6 | 
 7 | class TestBoundingBox(BaseTestCase):
 8 |     def test_create_bounding_box(self):
 9 |         bbox = BoundingBox(0, 1, 0, 1)
10 |         self.assertEqual(bbox.width, 1)
11 |         self.assertEqual(bbox.height, 1)
12 | 
13 |         # Checks that it raises an exception if coordinates are not valid
14 |         with self.assertRaises(InvalidCoordinatesError):
15 |             BoundingBox(1, 0, 0, 1)
16 | 
17 |         with self.assertRaises(InvalidCoordinatesError):
18 |             BoundingBox(0, 1, 1, 0)
19 | 
20 |     def test_eq(self):
21 |         bbox_1 = BoundingBox(0, 1, 0, 1)
22 |         bbox_2 = BoundingBox(0, 1, 0, 1)
23 |         self.assertEqual(bbox_1, bbox_2)
24 | 
25 |         bbox_3 = BoundingBox(0, 1, 0, 3)
26 |         self.assertNotEqual(bbox_1, bbox_3)
27 | 
28 |     def test_repr(self):
29 |         bbox = BoundingBox(0, 1, 0, 1)
30 |         self.assertEqual(repr(bbox), "<BoundingBox x0=0, x1=1, y0=0, y1=1>")
31 | 


--------------------------------------------------------------------------------
/tests/test_components.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | 
  3 | from ddt import data, ddt
  4 | 
  5 | from py_pdf_parser.common import BoundingBox
  6 | from py_pdf_parser.components import ElementOrdering, PDFDocument
  7 | from py_pdf_parser.exceptions import NoElementsOnPageError, PageNotFoundError
  8 | from py_pdf_parser.filtering import ElementList
  9 | from py_pdf_parser.loaders import Page
 10 | 
 11 | from .base import BaseTestCase
 12 | from .utils import FakePDFMinerTextElement, create_pdf_document, create_pdf_element
 13 | 
 14 | 
 15 | @ddt
 16 | class TestPDFElement(BaseTestCase):
 17 |     element_bbox = BoundingBox(2, 5, 2, 5)
 18 | 
 19 |     def test_page_number(self):
 20 |         element = create_pdf_element()
 21 |         self.assertEqual(element.page_number, 1)
 22 | 
 23 |         with self.assertRaises(AttributeError):
 24 |             element.page_number = 2
 25 | 
 26 |     def test_font_name(self):
 27 |         element = create_pdf_element(font_name="test_font")
 28 |         self.assertEqual(element.font_name, "test_font")
 29 | 
 30 |     def test_font_size(self):
 31 |         element = create_pdf_element(font_size=2)
 32 |         self.assertEqual(element.font_size, 2)
 33 | 
 34 |     def test_font_size_precision(self):
 35 |         element = create_pdf_element(font_size=1.234)
 36 |         self.assertEqual(element.font_size, 1.2)
 37 | 
 38 |         element = create_pdf_element(font_size=1.234, font_size_precision=0)
 39 |         self.assertEqual(element.font_size, 1)
 40 | 
 41 |         element = create_pdf_element(font_size=1.234, font_size_precision=3)
 42 |         self.assertEqual(element.font_size, 1.234)
 43 | 
 44 |     def test_font(self):
 45 |         element = create_pdf_element(font_name="test_font", font_size=2)
 46 |         self.assertEqual(element.font, "test_font,2")
 47 | 
 48 |         element = create_pdf_element(
 49 |             font_name="test_font",
 50 |             font_size=3,
 51 |             font_mapping={"test_font,3": "test_named_font"},
 52 |         )
 53 |         self.assertEqual(element.font, "test_named_font")
 54 | 
 55 |         element = create_pdf_element(
 56 |             font_name="test_font",
 57 |             font_size=2,
 58 |             font_mapping={"test_font,3": "test_named_font"},
 59 |         )
 60 |         self.assertEqual(element.font, "test_font,2")
 61 | 
 62 |         # Test when font_mapping argument is passed to PDFDocument
 63 |         font_mapping = {}
 64 |         element = create_pdf_element(
 65 |             font_name="fake_font_1", font_size=10, font_mapping=font_mapping
 66 |         )
 67 |         self.assertEqual(element.font, "fake_font_1,10")
 68 | 
 69 |         font_mapping = {"fake_font_1,10": "large_text"}
 70 |         element = create_pdf_element(
 71 |             font_name="fake_font_1", font_size=10, font_mapping=font_mapping
 72 |         )
 73 |         self.assertEqual(element.font, "large_text")
 74 | 
 75 |         font_mapping = {r"^fake_font_\d,10$": "large_text"}
 76 |         element = create_pdf_element(
 77 |             font_name="fake_font_1",
 78 |             font_size=10,
 79 |             font_mapping=font_mapping,
 80 |             font_mapping_is_regex=True,
 81 |         )
 82 |         self.assertEqual(element.font, "large_text")
 83 | 
 84 |         font_mapping = {r"^fake_font_\d,10$": "large_text"}
 85 |         element = create_pdf_element(
 86 |             font_name="FAKE_FONT_1",
 87 |             font_size=10,
 88 |             font_mapping=font_mapping,
 89 |             font_mapping_is_regex=True,
 90 |         )
 91 |         self.assertEqual(element.font, "FAKE_FONT_1,10")
 92 | 
 93 |         font_mapping = {r"^fake_font_\d,10$": "large_text"}
 94 |         element = create_pdf_element(
 95 |             font_name="FAKE_FONT_1",
 96 |             font_size=10,
 97 |             font_mapping=font_mapping,
 98 |             font_mapping_is_regex=True,
 99 |             regex_flags=re.IGNORECASE,
100 |         )
101 |         self.assertEqual(element.font, "large_text")
102 | 
103 |     def test_text(self):
104 |         element = create_pdf_element(text=" test ")
105 |         self.assertEqual(element.text(), "test")
106 |         self.assertEqual(element.text(stripped=False), " test ")
107 | 
108 |     def test_add_tag(self):
109 |         element = create_pdf_element()
110 |         self.assertEqual(element.tags, set())
111 | 
112 |         element.add_tag("foo")
113 |         self.assertEqual(element.tags, set(["foo"]))
114 | 
115 |         element.add_tag("foo")
116 |         self.assertEqual(element.tags, set(["foo"]))
117 | 
118 |         element.add_tag("bar")
119 |         self.assertEqual(element.tags, set(["foo", "bar"]))
120 | 
121 |     def test_repr(self):
122 |         element = create_pdf_element(font_name="test_font", font_size=2)
123 |         self.assertEqual(repr(element), "<PDFElement tags: set(), font: 'test_font,2'>")
124 | 
125 |         element.add_tag("foo")
126 |         self.assertEqual(
127 |             repr(element), "<PDFElement tags: {'foo'}, font: 'test_font,2'>"
128 |         )
129 | 
130 |         element.ignore()
131 |         self.assertEqual(
132 |             repr(element), "<PDFElement tags: {'foo'}, font: 'test_font,2', ignored>"
133 |         )
134 | 
135 |     @data(
136 |         BoundingBox(1, 6, 1, 6),  # This box fully encloses the element
137 |         BoundingBox(1, 6, 0, 3),  # This box intersects the bottom of the element
138 |         BoundingBox(1, 6, 0, 2),  # This box touches the bottom of the element
139 |         BoundingBox(1, 6, 4, 6),  # This box intersects the top of the element
140 |         BoundingBox(1, 6, 5, 6),  # This box touches the top of the element
141 |         BoundingBox(1, 6, 3, 4),  # This box goes through center horizontally
142 |         BoundingBox(1, 3, 1, 6),  # This box intersects the left of the element
143 |         BoundingBox(1, 2, 1, 6),  # This box touches the left of the element
144 |         BoundingBox(4, 6, 1, 6),  # This box intersects the left of the element
145 |         BoundingBox(5, 6, 1, 6),  # This box touches the left of the element
146 |         BoundingBox(3, 4, 1, 6),  # This box goes through the center vertically
147 |         BoundingBox(3, 4, 3, 4),  # This box is enclosed inside the element
148 |     )
149 |     def test_partially_within_true(self, bounding_box):
150 |         element = create_pdf_element(self.element_bbox)
151 |         self.assertTrue(element.partially_within(bounding_box))
152 | 
153 |     @data(
154 |         BoundingBox(1, 6, 0, 1),  # This box is underneath the element
155 |         BoundingBox(1, 6, 6, 7),  # This box is above the element
156 |         BoundingBox(0, 1, 1, 6),  # This box is to the left of the element
157 |         BoundingBox(6, 7, 1, 6),  # This box is to the lerightft of the element
158 |     )
159 |     def test_partially_within_false(self, bounding_box):
160 |         element = create_pdf_element(self.element_bbox)
161 |         self.assertFalse(element.partially_within(bounding_box))
162 | 
163 |     @data(BoundingBox(1, 6, 1, 6))  # This box fully encloses the element
164 |     def test_entirely_within_true(self, bounding_box):
165 |         element = create_pdf_element(self.element_bbox)
166 |         self.assertTrue(element.entirely_within(bounding_box))
167 | 
168 |     @data(
169 |         BoundingBox(1, 6, 0, 3),  # This box intersects the bottom of the element
170 |         BoundingBox(1, 6, 0, 2),  # This box touches the bottom of the element
171 |         BoundingBox(1, 6, 4, 6),  # This box intersects the top of the element
172 |         BoundingBox(1, 6, 5, 6),  # This box touches the top of the element
173 |         BoundingBox(1, 6, 3, 4),  # This box goes through center horizontally
174 |         BoundingBox(1, 3, 1, 6),  # This box intersects the left of the element
175 |         BoundingBox(1, 2, 1, 6),  # This box touches the left of the element
176 |         BoundingBox(4, 6, 1, 6),  # This box intersects the left of the element
177 |         BoundingBox(5, 6, 1, 6),  # This box touches the left of the element
178 |         BoundingBox(3, 4, 1, 6),  # This box goes through the center vertically
179 |         BoundingBox(1, 6, 0, 1),  # This box is underneath the element
180 |         BoundingBox(1, 6, 6, 7),  # This box is above the element
181 |         BoundingBox(0, 1, 1, 6),  # This box is to the left of the element
182 |         BoundingBox(6, 7, 1, 6),  # This box is to the right of the element
183 |         BoundingBox(3, 4, 3, 4),  # This box is enclosed inside the element
184 |     )
185 |     def test_entirely_within_false(self, bounding_box):
186 |         element = create_pdf_element(self.element_bbox)
187 |         self.assertFalse(element.entirely_within(bounding_box))
188 | 
189 | 
190 | class TestPDFDocument(BaseTestCase):
191 |     def test_document(self):
192 |         el_page_1_top_left = FakePDFMinerTextElement(BoundingBox(0, 1, 2, 3))
193 |         el_page_1_top_right = FakePDFMinerTextElement(BoundingBox(2, 3, 2, 3))
194 |         el_page_1_bottom_left = FakePDFMinerTextElement(BoundingBox(0, 1, 0, 1))
195 |         el_page_1_bottom_right = FakePDFMinerTextElement(BoundingBox(2, 3, 0, 1))
196 |         page_1 = Page(
197 |             elements=[
198 |                 el_page_1_top_left,
199 |                 el_page_1_top_right,
200 |                 el_page_1_bottom_left,
201 |                 el_page_1_bottom_right,
202 |             ],
203 |             width=100,
204 |             height=100,
205 |         )
206 | 
207 |         el_page_2_top_left = FakePDFMinerTextElement(BoundingBox(0, 1, 2, 3))
208 |         el_page_2_top_right = FakePDFMinerTextElement(BoundingBox(2, 3, 2, 3))
209 |         el_page_2_bottom_left = FakePDFMinerTextElement(BoundingBox(0, 1, 0, 1))
210 |         el_page_2_bottom_right = FakePDFMinerTextElement(BoundingBox(2, 3, 0, 1))
211 |         page_2 = Page(
212 |             elements=[
213 |                 el_page_2_bottom_right,
214 |                 el_page_2_bottom_left,
215 |                 el_page_2_top_right,
216 |                 el_page_2_top_left,
217 |             ],
218 |             width=100,
219 |             height=100,
220 |         )
221 | 
222 |         document = PDFDocument(pages={1: page_1, 2: page_2})
223 | 
224 |         # Checks elements were reordered
225 |         expected_ordered_list = [
226 |             el_page_1_top_left,
227 |             el_page_1_top_right,
228 |             el_page_1_bottom_left,
229 |             el_page_1_bottom_right,
230 |             el_page_2_top_left,
231 |             el_page_2_top_right,
232 |             el_page_2_bottom_left,
233 |             el_page_2_bottom_right,
234 |         ]
235 |         self.assertEqual(
236 |             [elem.original_element for elem in document._element_list],
237 |             expected_ordered_list,
238 |         )
239 | 
240 |         # Checks indexes were assigned properly
241 |         self.assertEqual(
242 |             [elem._index for elem in document._element_list], [0, 1, 2, 3, 4, 5, 6, 7]
243 |         )
244 | 
245 |         # Checks page numbers is correct
246 |         self.assertEqual(document.page_numbers, [1, 2])
247 | 
248 |         # Checks number of pages is correct
249 |         self.assertEqual(document.number_of_pages, 2)
250 | 
251 |         # Checks pages were assigned properly
252 |         self.assertEqual(
253 |             [elem.page_number for elem in document._element_list],
254 |             [1, 1, 1, 1, 2, 2, 2, 2],
255 |         )
256 | 
257 |         # Checks pages were instantiated correctly
258 |         pdf_page_1 = document.get_page(1)
259 |         self.assertEqual(page_1.width, pdf_page_1.width)
260 |         self.assertEqual(page_1.height, pdf_page_1.height)
261 |         self.assertEqual(el_page_1_top_left, pdf_page_1.start_element.original_element)
262 |         self.assertEqual(
263 |             el_page_1_bottom_right, pdf_page_1.end_element.original_element
264 |         )
265 |         self.assertEqual(pdf_page_1.page_number, 1)
266 |         self.assertEqual(pdf_page_1.elements, ElementList(document, set([0, 1, 2, 3])))
267 | 
268 |         pdf_page_2 = document.get_page(2)
269 |         self.assertEqual(page_2.width, pdf_page_2.width)
270 |         self.assertEqual(page_2.height, pdf_page_2.height)
271 |         self.assertEqual(el_page_2_top_left, pdf_page_2.start_element.original_element)
272 |         self.assertEqual(
273 |             el_page_2_bottom_right, pdf_page_2.end_element.original_element
274 |         )
275 |         self.assertEqual(pdf_page_2.page_number, 2)
276 |         self.assertEqual(pdf_page_2.elements, ElementList(document, set([4, 5, 6, 7])))
277 | 
278 |         self.assertEqual(document.pages, [pdf_page_1, pdf_page_2])
279 | 
280 |         self.assertEqual(
281 |             document.elements, ElementList(document, set([0, 1, 2, 3, 4, 5, 6, 7]))
282 |         )
283 |         with self.assertRaises(PageNotFoundError):
284 |             document.get_page(3)
285 | 
286 |     def test_document_with_blank_page(self):
287 |         with self.assertRaises(NoElementsOnPageError):
288 |             PDFDocument(pages={1: Page(elements=[], width=100, height=100)})
289 | 
290 |     def test_element_ordering(self):
291 |         #       elem_1      elem_2
292 |         #       elem_3      elem_4
293 |         elem_1 = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 6, 10))
294 |         elem_2 = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 6, 10))
295 |         elem_3 = FakePDFMinerTextElement(bounding_box=BoundingBox(0, 5, 0, 5))
296 |         elem_4 = FakePDFMinerTextElement(bounding_box=BoundingBox(6, 10, 0, 5))
297 | 
298 |         # Check default: left to right, top to bottom
299 |         document = create_pdf_document(elements=[elem_1, elem_2, elem_3, elem_4])
300 |         self.assert_original_element_list_equal(
301 |             [elem_1, elem_2, elem_3, elem_4], document.elements
302 |         )
303 | 
304 |         # Check other presets
305 |         document = create_pdf_document(
306 |             elements=[elem_1, elem_2, elem_3, elem_4],
307 |             element_ordering=ElementOrdering.RIGHT_TO_LEFT_TOP_TO_BOTTOM,
308 |         )
309 |         self.assert_original_element_list_equal(
310 |             [elem_2, elem_1, elem_4, elem_3], document.elements
311 |         )
312 | 
313 |         document = create_pdf_document(
314 |             elements=[elem_1, elem_2, elem_3, elem_4],
315 |             element_ordering=ElementOrdering.TOP_TO_BOTTOM_LEFT_TO_RIGHT,
316 |         )
317 |         self.assert_original_element_list_equal(
318 |             [elem_1, elem_3, elem_2, elem_4], document.elements
319 |         )
320 | 
321 |         document = create_pdf_document(
322 |             elements=[elem_1, elem_2, elem_3, elem_4],
323 |             element_ordering=ElementOrdering.TOP_TO_BOTTOM_RIGHT_TO_LEFT,
324 |         )
325 |         self.assert_original_element_list_equal(
326 |             [elem_2, elem_4, elem_1, elem_3], document.elements
327 |         )
328 | 
329 |         # Check custom function
330 |         document = create_pdf_document(
331 |             elements=[elem_1, elem_2, elem_3, elem_4],
332 |             element_ordering=lambda elements: [
333 |                 elements[0],
334 |                 elements[3],
335 |                 elements[1],
336 |                 elements[2],
337 |             ],
338 |         )
339 |         self.assert_original_element_list_equal(
340 |             [elem_1, elem_4, elem_2, elem_3], document.elements
341 |         )
342 | 


--------------------------------------------------------------------------------
/tests/test_doc_examples/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jstockwin/py-pdf-parser/6aa400371e948307bcaa5073f33507c9b6e84f06/tests/test_doc_examples/__init__.py


--------------------------------------------------------------------------------
/tests/test_doc_examples/test_element_ordering.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | 
  3 | from py_pdf_parser.components import ElementOrdering
  4 | from py_pdf_parser.loaders import load_file
  5 | from tests.base import BaseTestCase
  6 | 
  7 | 
  8 | class TestSimpleMemo(BaseTestCase):
  9 |     def test_output_is_correct(self):
 10 |         file_path = os.path.join(
 11 |             os.path.dirname(__file__), "../../docs/source/example_files/grid.pdf"
 12 |         )
 13 | 
 14 |         # Default - left to right, top to bottom
 15 |         document = load_file(file_path)
 16 |         self.assertListEqual(
 17 |             [element.text() for element in document.elements],
 18 |             ["Top Left", "Top Right", "Bottom Left", "Bottom Right"],
 19 |         )
 20 | 
 21 |         # Preset - right to left, top to bottom
 22 |         document = load_file(
 23 |             file_path, element_ordering=ElementOrdering.RIGHT_TO_LEFT_TOP_TO_BOTTOM
 24 |         )
 25 |         self.assertListEqual(
 26 |             [element.text() for element in document.elements],
 27 |             ["Top Right", "Top Left", "Bottom Right", "Bottom Left"],
 28 |         )
 29 | 
 30 |         # Preset - top to bottom, left to right
 31 |         document = load_file(
 32 |             file_path, element_ordering=ElementOrdering.TOP_TO_BOTTOM_LEFT_TO_RIGHT
 33 |         )
 34 |         self.assertListEqual(
 35 |             [element.text() for element in document.elements],
 36 |             ["Bottom Left", "Top Left", "Bottom Right", "Top Right"],
 37 |         )
 38 | 
 39 |         # Preset - top to bottom, right to left
 40 |         document = load_file(
 41 |             file_path, element_ordering=ElementOrdering.TOP_TO_BOTTOM_RIGHT_TO_LEFT
 42 |         )
 43 |         self.assertListEqual(
 44 |             [element.text() for element in document.elements],
 45 |             ["Top Right", "Bottom Right", "Top Left", "Bottom Left"],
 46 |         )
 47 | 
 48 |         # Custom - bottom to top, left to right
 49 |         def ordering_function(elements):
 50 |             return sorted(elements, key=lambda elem: (elem.x0, elem.y0))
 51 | 
 52 |         document = load_file(file_path, element_ordering=ordering_function)
 53 |         self.assertListEqual(
 54 |             [element.text() for element in document.elements],
 55 |             ["Bottom Left", "Top Left", "Bottom Right", "Top Right"],
 56 |         )
 57 | 
 58 |         # Custom - This PDF has columns!
 59 |         # TODO: CHANGE PATH!
 60 |         file_path = os.path.join(
 61 |             os.path.dirname(__file__), "../../docs/source/example_files/columns.pdf"
 62 |         )
 63 | 
 64 |         # Default - left to right, top to bottom
 65 |         document = load_file(file_path)
 66 |         self.assertListEqual(
 67 |             [element.text() for element in document.elements],
 68 |             [
 69 |                 "Column 1 Title",
 70 |                 "Column 2 Title",
 71 |                 "Here is some column 1 text.",
 72 |                 "Here is some column 2 text.",
 73 |                 "Col 1 left",
 74 |                 "Col 1 right",
 75 |                 "Col 2 left",
 76 |                 "Col 2 right",
 77 |             ],
 78 |         )
 79 | 
 80 |         # Visualise, and we can see that the middle is at around x = 300.
 81 |         # visualise(document)
 82 | 
 83 |         def column_ordering_function(elements):
 84 |             return sorted(elements, key=lambda elem: (elem.x0 > 300, -elem.y0, elem.x0))
 85 | 
 86 |         document = load_file(file_path, element_ordering=column_ordering_function)
 87 |         self.assertListEqual(
 88 |             [element.text() for element in document.elements],
 89 |             [
 90 |                 "Column 1 Title",
 91 |                 "Here is some column 1 text.",
 92 |                 "Col 1 left",
 93 |                 "Col 1 right",
 94 |                 "Column 2 Title",
 95 |                 "Here is some column 2 text.",
 96 |                 "Col 2 left",
 97 |                 "Col 2 right",
 98 |             ],
 99 |         )
100 | 


--------------------------------------------------------------------------------
/tests/test_doc_examples/test_extracting_text_from_figures.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | from py_pdf_parser.loaders import load_file
 4 | from tests.base import BaseTestCase
 5 | 
 6 | 
class TestExtractingTextFromFigures(BaseTestCase):
    """Checks the element texts loaded from the figure.pdf example file."""

    def test_output_is_correct(self):
        """Compare the loaded texts with and without the ``all_texts`` layout param."""
        file_path = os.path.join(
            os.path.dirname(__file__), "../../docs/source/example_files/figure.pdf"
        )

        # Without all_texts
        # Only the text outside of the image is expected.
        document = load_file(file_path)
        self.assertListEqual(
            [element.text() for element in document.elements],
            ["Here is some text outside of an image"],
        )

        # With all_texts, the text inside the image is expected as well.
        document = load_file(file_path, la_params={"all_texts": True})
        self.assertListEqual(
            [element.text() for element in document.elements],
            ["This is some text in an image", "Here is some text outside of an image"],
        )
25 | 


--------------------------------------------------------------------------------
/tests/test_doc_examples/test_order_summary.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | 
  3 | from py_pdf_parser import tables
  4 | from py_pdf_parser.loaders import load_file
  5 | from tests.base import BaseTestCase
  6 | 
  7 | 
class TestSimpleMemo(BaseTestCase):
    """Checks the tables extracted from the order_summary.pdf example file.

    NOTE(review): the class name looks copy-pasted from the simple memo example
    test - confirm before renaming.
    """

    def test_output_is_correct(self):
        """Section the document, extract both tables and compare them as text."""
        # The code below should match that in the documentation example "order_summary"
        # Step 1 - Load the document
        file_path = os.path.join(
            os.path.dirname(__file__),
            "../../docs/source/example_files/order_summary.pdf",
        )
        # Maps the font identifiers found in the PDF to the short names used by the
        # font filters below.
        FONT_MAPPING = {
            "BAAAAA+LiberationSerif-Bold,16.0": "title",
            "BAAAAA+LiberationSerif-Bold,12.0": "sub_title",
            "CAAAAA+LiberationSerif,12.0": "text",
            "DAAAAA+FreeMonoBold,12.0": "table_header",
            "EAAAAA+FreeMono,12.0": "table_text",
        }
        document = load_file(file_path, font_mapping=FONT_MAPPING)

        # visualise(document)

        # Step 3 - Add sections
        order_summary_sub_title_element = (
            document.elements.filter_by_font("sub_title")
            .filter_by_text_equal("Order Summary:")
            .extract_single_element()
        )

        totals_sub_title_element = (
            document.elements.filter_by_font("sub_title")
            .filter_by_text_equal("Totals:")
            .extract_single_element()
        )

        final_element = document.elements[-1]

        # The "Totals:" sub title starts the next section, so it is excluded from
        # the order summary section via include_last_element=False.
        order_summary_section = document.sectioning.create_section(
            name="order_summary",
            start_element=order_summary_sub_title_element,
            end_element=totals_sub_title_element,
            include_last_element=False,
        )

        totals_section = document.sectioning.create_section(
            name="totals",
            start_element=totals_sub_title_element,
            end_element=final_element,
        )

        # visualise(document)

        # Step 4 - Extract tables

        # Only elements in the table fonts are passed on, which leaves out the
        # sub titles that belong to each section.
        order_summary_table = tables.extract_simple_table(
            order_summary_section.elements.filter_by_fonts(
                "table_header", "table_text"
            ),
            as_text=True,
        )

        totals_table = tables.extract_simple_table(
            totals_section.elements.filter_by_fonts("table_header", "table_text"),
            as_text=True,
        )

        # Expected below to be a list of dicts keyed by the first row of the table.
        order_summary_with_header = tables.add_header_to_table(order_summary_table)

        self.assertListEqual(
            order_summary_table,
            [
                ["Item", "Unit Cost", "Quantity", "Cost"],
                ["Challenger 100g\nWhole Hops", "£3.29", "1", "£3.29"],
                [
                    "Maris Otter \nPale Ale Malt \n(Crushed)",
                    "£1.50/1000g",
                    "4000g",
                    "£6.00",
                ],
                ["WLP037 \nYorkshire Ale \nYeast", "£7.08", "1", "£7.08"],
                ["Bottle Caps", "£1 per 100", "500", "£5"],
            ],
        )

        self.assertListEqual(
            totals_table,
            [
                ["Subtotal:", "£26.28"],
                ["Shipping", "£6"],
                ["VAT 20%", "£6.45"],
                ["Total:", "£38.73"],
            ],
        )

        self.assertListEqual(
            order_summary_with_header,
            [
                {
                    "Item": "Challenger 100g\nWhole Hops",
                    "Unit Cost": "£3.29",
                    "Quantity": "1",
                    "Cost": "£3.29",
                },
                {
                    "Item": "Maris Otter \nPale Ale Malt \n(Crushed)",
                    "Unit Cost": "£1.50/1000g",
                    "Quantity": "4000g",
                    "Cost": "£6.00",
                },
                {
                    "Item": "WLP037 \nYorkshire Ale \nYeast",
                    "Unit Cost": "£7.08",
                    "Quantity": "1",
                    "Cost": "£7.08",
                },
                {
                    "Item": "Bottle Caps",
                    "Unit Cost": "£1 per 100",
                    "Quantity": "500",
                    "Cost": "£5",
                },
            ],
        )
128 | 


--------------------------------------------------------------------------------
/tests/test_doc_examples/test_simple_memo.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | from py_pdf_parser.loaders import load_file
 4 | from tests.base import BaseTestCase
 5 | 
 6 | 
class TestSimpleMemo(BaseTestCase):
    """Checks the data extracted from the simple_memo.pdf example file."""

    def test_output_is_correct(self):
        """Extract the memo's header fields and body text and compare them."""
        # The code below should match that in the documentation example "simple_memo"
        # Step 1 - Load the document
        file_path = os.path.join(
            os.path.dirname(__file__),
            "../../docs/source/example_files/simple_memo.pdf",
        )
        document = load_file(file_path)

        # We could visualise it here to check it looks correct:
        # from py_pdf_parser.visualise import visualise
        # visualise(document)

        # Step 2 - Extract reference elements:
        # Each label is expected to appear exactly once in the document.
        to_element = document.elements.filter_by_text_equal(
            "TO:"
        ).extract_single_element()
        from_element = document.elements.filter_by_text_equal(
            "FROM:"
        ).extract_single_element()
        date_element = document.elements.filter_by_text_equal(
            "DATE:"
        ).extract_single_element()
        subject_element = document.elements.filter_by_text_equal(
            "SUBJECT:"
        ).extract_single_element()

        # Step 3 - Extract the data
        # The value of each field is the single element to the right of its label.
        to_text = (
            document.elements.to_the_right_of(to_element)
            .extract_single_element()
            .text()
        )
        from_text = (
            document.elements.to_the_right_of(from_element)
            .extract_single_element()
            .text()
        )
        date_text = (
            document.elements.to_the_right_of(date_element)
            .extract_single_element()
            .text()
        )
        subject_text_element = document.elements.to_the_right_of(
            subject_element
        ).extract_single_element()
        subject_text = subject_text_element.text()

        # Everything after the "SUBJECT:" label counts as content; per the expected
        # output below this includes the subject text itself as the first line.
        content_elements = document.elements.after(subject_element)
        content_text = "\n".join(element.text() for element in content_elements)

        output = {
            "to": to_text,
            "from": from_text,
            "date": date_text,
            "subject": subject_text,
            "content": content_text,
        }

        self.assertDictEqual(
            output,
            {
                "content": (
                    "A new PDF Parsing tool\n"
                    "There is a new PDF parsing tool available, called py-pdf-parser - "
                    "you should all check it out!\n"
                    "I think it could really help you extract that data we need from "
                    "those PDFs."
                ),
                "date": "1st January 2020",
                "from": "John Smith",
                "subject": "A new PDF Parsing tool",
                "to": "All Developers",
            },
        )
83 | 


--------------------------------------------------------------------------------
/tests/test_doc_examples/test_tables.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | 
  3 | from py_pdf_parser import tables
  4 | from py_pdf_parser.exceptions import TableExtractionError
  5 | from py_pdf_parser.loaders import load_file
  6 | from tests.base import BaseTestCase
  7 | 
  8 | 
class TestSimpleMemo(BaseTestCase):
    """Checks the tables extracted from the tables.pdf example file.

    NOTE(review): the class name looks copy-pasted from the simple memo example
    test - confirm before renaming.
    """

    def test_output_is_correct(self):
        """Extract every table in the example PDF and compare each one as text."""
        file_path = os.path.join(
            os.path.dirname(__file__), "../../docs/source/example_files/tables.pdf"
        )

        # Step 1 - Load the file
        # Maps the font identifiers found in the PDF to the short names used below.
        FONT_MAPPING = {
            "BAAAAA+LiberationSerif-Bold,12.0": "header",
            "CAAAAA+LiberationSerif,12.0": "table_element",
        }
        document = load_file(file_path, font_mapping=FONT_MAPPING)

        headers = document.elements.filter_by_font("header")

        # Extract reference elements
        # One header element per table; each is matched by its exact text.
        simple_table_header = headers.filter_by_text_equal(
            "Simple Table"
        ).extract_single_element()

        simple_table_with_gaps_header = headers.filter_by_text_equal(
            "Simple Table with gaps"
        ).extract_single_element()

        simple_table_with_gaps_in_first_row_col_header = headers.filter_by_text_equal(
            "Simple Table with gaps in first row/col"
        ).extract_single_element()

        non_simple_table_header = headers.filter_by_text_equal(
            "Non Simple Table"
        ).extract_single_element()

        non_simple_table_with_merged_cols_header = headers.filter_by_text_equal(
            "Non Simple Table with Merged Columns"
        ).extract_single_element()

        non_simple_table_with_merged_rows_header = headers.filter_by_text_equal(
            "Non Simple Table with Merged Rows and Columns"
        ).extract_single_element()

        over_the_page_header = headers.filter_by_text_equal(
            "Over the page"
        ).extract_single_element()

        # Extract table elements
        # The elements of each table are those between its own header and the
        # header of the next table.
        simple_table_elements = document.elements.between(
            simple_table_header, simple_table_with_gaps_header
        )
        simple_table_with_gaps_elements = document.elements.between(
            simple_table_with_gaps_header,
            simple_table_with_gaps_in_first_row_col_header,
        )

        simple_table_with_gaps_in_first_row_col_elements = document.elements.between(
            simple_table_with_gaps_in_first_row_col_header, non_simple_table_header
        )

        non_simple_table_elements = document.elements.between(
            non_simple_table_header, non_simple_table_with_merged_cols_header
        )

        non_simple_table_with_merged_cols_elements = document.elements.between(
            non_simple_table_with_merged_cols_header,
            non_simple_table_with_merged_rows_header,
        )

        non_simple_table_with_merged_rows_and_cols_elements = document.elements.between(
            non_simple_table_with_merged_rows_header, over_the_page_header
        )

        # The last table has no following header, so it takes everything after its
        # own header.
        over_the_page_elements = document.elements.after(over_the_page_header)

        # Simple Table
        table = tables.extract_simple_table(simple_table_elements, as_text=True)
        self.assertListEqual(
            table,
            [
                ["Heading 1", "Heading 2", "Heading 3", "Heading 4"],
                ["A", "1", "A", "1"],
                ["B", "2", "B", "2"],
                ["C", "3", "C", "3"],
            ],
        )

        # Simple Table with gaps

        # Without allow_gaps the extraction is expected to fail.
        with self.assertRaises(TableExtractionError):
            tables.extract_simple_table(simple_table_with_gaps_elements, as_text=True)

        table = tables.extract_simple_table(
            simple_table_with_gaps_elements, as_text=True, allow_gaps=True
        )
        self.assertListEqual(
            table,
            [
                ["Heading 1", "Heading 2", "Heading 3", "Heading 4"],
                ["A", "1", "", "1"],
                ["B", "", "", ""],
                ["C", "", "C", "3"],
            ],
        )

        # Simple Table with gaps in first row/col
        # Even with allow_gaps the extraction is expected to fail until a
        # reference element is supplied.
        with self.assertRaises(TableExtractionError):
            tables.extract_simple_table(
                simple_table_with_gaps_in_first_row_col_elements,
                as_text=True,
                allow_gaps=True,
            )

        # NOTE(review): index 9 is tied to the element order of this PDF -
        # presumably an element whose row and column are both complete; confirm
        # against the documentation example.
        reference_element = simple_table_with_gaps_in_first_row_col_elements[9]
        table = tables.extract_simple_table(
            simple_table_with_gaps_in_first_row_col_elements,
            as_text=True,
            allow_gaps=True,
            reference_element=reference_element,
        )
        self.assertListEqual(
            table,
            [
                ["Heading 1", "Heading 2", "", "Heading 4"],
                ["", "1", "A", ""],
                ["B", "2", "", "2"],
                ["C", "3", "C", "3"],
            ],
        )

        # Non Simple Table
        table = tables.extract_table(non_simple_table_elements, as_text=True)
        self.assertListEqual(
            table,
            [
                ["", "Heading 2", "Heading 3", "Heading 4"],
                ["A", "1", "", "1"],
                ["B", "", "B", "2"],
                ["C", "3", "C", ""],
            ],
        )

        # Non Simple Table with Merged Columns
        # Without fix_element_in_multiple_cols the extraction is expected to fail.
        with self.assertRaises(TableExtractionError):
            tables.extract_table(
                non_simple_table_with_merged_cols_elements, as_text=True
            )

        table = tables.extract_table(
            non_simple_table_with_merged_cols_elements,
            as_text=True,
            fix_element_in_multiple_cols=True,
        )
        self.assertListEqual(
            table,
            [
                ["Heading 1", "Heading 2", "Heading 3", "Heading 4"],
                ["A", "1", "A", "1"],
                ["This text spans across multiple columns", "", "B", "2"],
                ["C", "3", "C", "3"],
            ],
        )

        # Non Simple Table with Merged Rows and Columns
        table = tables.extract_table(
            non_simple_table_with_merged_rows_and_cols_elements,
            as_text=True,
            fix_element_in_multiple_rows=True,
            fix_element_in_multiple_cols=True,
        )
        self.assertListEqual(
            table,
            [
                ["Heading 1", "Heading 2", "Heading 3", "Heading 4"],
                [
                    "This text spans across multiple rows and \nmultiple columns.",
                    "",
                    "A",
                    "1",
                ],
                ["", "", "B", "2"],
                ["C", "3", "C", "3"],
            ],
        )

        # Over the page
        table = tables.extract_simple_table(over_the_page_elements, as_text=True)
        self.assertListEqual(
            table,
            [
                ["Heading 1", "Heading 2", "Heading 3", "Heading 4"],
                ["A", "1", "A", "1"],
                ["B", "2", "B", "2"],
                ["C", "3", "C", "3"],
            ],
        )
202 | 


--------------------------------------------------------------------------------
/tests/test_loaders.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from unittest import TestCase
 3 | 
 4 | from pdfminer.pdfdocument import PDFPasswordIncorrect
 5 | 
 6 | from py_pdf_parser.components import PDFDocument
 7 | from py_pdf_parser.loaders import load, load_file
 8 | 
 9 | 
class LoadersTest(TestCase):
    """Tests for ``load`` and ``load_file`` using the PDFs under ``data/pdfs``."""

    @staticmethod
    def _pdf_path(file_name):
        """Return the path of ``file_name`` in the ``data/pdfs`` folder of this package."""
        tests_dir = os.path.dirname(__file__)
        return os.path.join(tests_dir, "data", "pdfs", file_name)

    def test_load_file(self):
        """``load_file`` returns a PDFDocument for an unprotected PDF."""
        loaded = load_file(self._pdf_path("test.pdf"))
        self.assertIsInstance(loaded, PDFDocument)

    def test_load_protected_file(self):
        """``load_file`` opens a protected PDF when given the right password."""
        protected_path = self._pdf_path("test_protected.pdf")
        loaded = load_file(protected_path, password="p4ssword")
        self.assertIsInstance(loaded, PDFDocument)

    def test_load_protected_file_wrong_password(self):
        """``load_file`` raises PDFPasswordIncorrect for a wrong password."""
        protected_path = self._pdf_path("test_protected.pdf")
        self.assertRaises(
            PDFPasswordIncorrect, load_file, protected_path, password="wrong_password"
        )

    def test_load(self):
        """``load`` returns a PDFDocument when given an open binary file."""
        with open(self._pdf_path("test.pdf"), "rb") as pdf_file:
            loaded = load(pdf_file)
        self.assertIsInstance(loaded, PDFDocument)

    def test_load_with_text_in_image(self):
        """``load`` only yields the text inside an image when ``all_texts`` is set."""
        image_pdf_path = self._pdf_path("image.pdf")

        # Default layout parameters: a single element is expected.
        with open(image_pdf_path, "rb") as pdf_file:
            loaded = load(pdf_file)
        self.assertIsInstance(loaded, PDFDocument)
        self.assertEqual(len(loaded.elements), 1)

        # With all_texts enabled: two elements are expected.
        with open(image_pdf_path, "rb") as pdf_file:
            loaded = load(pdf_file, la_params={"all_texts": True})
        self.assertIsInstance(loaded, PDFDocument)
        self.assertEqual(len(loaded.elements), 2)

    def test_load_file_with_text_in_image(self):
        """``load_file`` forwards ``la_params`` so both elements are found."""
        loaded = load_file(self._pdf_path("image.pdf"), la_params={"all_texts": True})
        self.assertIsInstance(loaded, PDFDocument)
        self.assertEqual(len(loaded.elements), 2)
53 | 


--------------------------------------------------------------------------------
/tests/test_sectioning.py:
--------------------------------------------------------------------------------
  1 | import types
  2 | 
  3 | from py_pdf_parser.exceptions import InvalidSectionError, SectionNotFoundError
  4 | from py_pdf_parser.sectioning import Sectioning
  5 | 
  6 | from .base import BaseTestCase
  7 | from .utils import FakePDFMinerTextElement, create_pdf_document, create_section
  8 | 
  9 | 
 10 | class TestSection(BaseTestCase):
 11 |     def test_contains(self):
 12 |         elem_1 = FakePDFMinerTextElement()
 13 |         elem_2 = FakePDFMinerTextElement()
 14 |         elem_3 = FakePDFMinerTextElement()
 15 |         document = create_pdf_document([elem_1, elem_2, elem_3])
 16 | 
 17 |         pdf_elem_1 = self.extract_element_from_list(elem_1, document._element_list)
 18 |         pdf_elem_2 = self.extract_element_from_list(elem_2, document._element_list)
 19 |         pdf_elem_3 = self.extract_element_from_list(elem_3, document._element_list)
 20 | 
 21 |         section = create_section(
 22 |             document, start_element=pdf_elem_1, end_element=pdf_elem_2
 23 |         )
 24 | 
 25 |         self.assertIn(pdf_elem_1, section)
 26 |         self.assertIn(pdf_elem_2, section)
 27 |         self.assertNotIn(pdf_elem_3, section)
 28 | 
 29 |     def test_eq(self):
 30 |         elem_1 = FakePDFMinerTextElement()
 31 |         elem_2 = FakePDFMinerTextElement()
 32 |         elem_3 = FakePDFMinerTextElement()
 33 |         document = create_pdf_document([elem_1, elem_2, elem_3])
 34 | 
 35 |         pdf_elem_1 = self.extract_element_from_list(elem_1, document._element_list)
 36 |         pdf_elem_2 = self.extract_element_from_list(elem_2, document._element_list)
 37 |         pdf_elem_3 = self.extract_element_from_list(elem_3, document._element_list)
 38 | 
 39 |         section_1 = create_section(
 40 |             document, start_element=pdf_elem_1, end_element=pdf_elem_2
 41 |         )
 42 |         section_2 = create_section(
 43 |             document, start_element=pdf_elem_1, end_element=pdf_elem_2
 44 |         )
 45 |         self.assertEqual(section_1, section_2)
 46 |         section_3 = create_section(
 47 |             document, start_element=pdf_elem_1, end_element=pdf_elem_3
 48 |         )
 49 |         self.assertNotEqual(section_1, section_3)
 50 | 
 51 |     def test_exceptions(self):
 52 |         elem_1 = FakePDFMinerTextElement()
 53 |         elem_2 = FakePDFMinerTextElement()
 54 |         document = create_pdf_document([elem_1, elem_2])
 55 | 
 56 |         pdf_elem_1 = self.extract_element_from_list(elem_1, document._element_list)
 57 |         pdf_elem_2 = self.extract_element_from_list(elem_2, document._element_list)
 58 |         with self.assertRaises(InvalidSectionError):
 59 |             create_section(document, start_element=pdf_elem_2, end_element=pdf_elem_1)
 60 | 
 61 |     def test_len(self):
 62 |         elem_1 = FakePDFMinerTextElement()
 63 |         elem_2 = FakePDFMinerTextElement()
 64 |         elem_3 = FakePDFMinerTextElement()
 65 |         document = create_pdf_document([elem_1, elem_2, elem_3])
 66 | 
 67 |         pdf_elem_1 = self.extract_element_from_list(elem_1, document._element_list)
 68 |         pdf_elem_2 = self.extract_element_from_list(elem_2, document._element_list)
 69 |         pdf_elem_3 = self.extract_element_from_list(elem_3, document._element_list)
 70 | 
 71 |         section = create_section(
 72 |             document,
 73 |             name="fake_section",
 74 |             start_element=pdf_elem_1,
 75 |             end_element=pdf_elem_3,
 76 |         )
 77 |         self.assertEqual(len(section), 3)
 78 | 
 79 |         # Ignoring an element should affect the length of the section.
 80 |         pdf_elem_2.ignore()
 81 |         self.assertEqual(len(section), 2)
 82 | 
 83 |     def test_repr(self):
 84 |         elem_1 = FakePDFMinerTextElement()
 85 |         elem_2 = FakePDFMinerTextElement()
 86 |         document = create_pdf_document([elem_1, elem_2])
 87 | 
 88 |         pdf_elem_1 = self.extract_element_from_list(elem_1, document._element_list)
 89 |         pdf_elem_2 = self.extract_element_from_list(elem_2, document._element_list)
 90 | 
 91 |         section = create_section(
 92 |             document,
 93 |             name="fake_section",
 94 |             unique_name="fake_section_0",
 95 |             start_element=pdf_elem_1,
 96 |             end_element=pdf_elem_2,
 97 |         )
 98 | 
 99 |         self.assertEqual(
100 |             repr(section),
101 |             (
102 |                 "<Section name: 'fake_section', unique_name: 'fake_section_0', "
103 |                 "number of elements: 2>"
104 |             ),
105 |         )
106 | 
107 |         # Ignoring an element should affect the number of elements of the section.
108 |         pdf_elem_2.ignore()
109 |         self.assertEqual(
110 |             repr(section),
111 |             (
112 |                 "<Section name: 'fake_section', unique_name: 'fake_section_0', "
113 |                 "number of elements: 1>"
114 |             ),
115 |         )
116 | 
117 | 
class TestSectioning(BaseTestCase):
    """Tests for creating sections and looking them up through Sectioning."""

    def test_create_section(self):
        """Created sections get unique names and honour include_last_element."""
        elem_1 = FakePDFMinerTextElement()
        elem_2 = FakePDFMinerTextElement()
        elem_3 = FakePDFMinerTextElement()
        document = create_pdf_document([elem_1, elem_2, elem_3])

        pdf_elem_1 = self.extract_element_from_list(elem_1, document._element_list)
        pdf_elem_2 = self.extract_element_from_list(elem_2, document._element_list)
        pdf_elem_3 = self.extract_element_from_list(elem_3, document._element_list)

        sectioning = Sectioning(document)
        sectioning.create_section(
            "fake_section", start_element=pdf_elem_1, end_element=pdf_elem_2
        )

        section_1 = create_section(
            document,
            unique_name="fake_section_0",
            start_element=pdf_elem_1,
            end_element=pdf_elem_2,
        )
        self.assertEqual(len(sectioning.sections), 1)
        self.assertIn(section_1, sectioning.sections)

        # Checks that sections with the same name get different unique names when
        # added in Sectioning
        section_2 = create_section(
            document,
            unique_name="fake_section_1",
            start_element=pdf_elem_2,
            end_element=pdf_elem_3,
        )
        sectioning.create_section(
            name="fake_section", start_element=pdf_elem_2, end_element=pdf_elem_3
        )
        self.assertEqual(len(sectioning.sections), 2)
        self.assertIn(section_1, sectioning.sections)
        self.assertIn(section_2, sectioning.sections)

        # Test with include_last_element being False
        section_3 = sectioning.create_section(
            name="test",
            start_element=pdf_elem_1,
            end_element=pdf_elem_3,
            include_last_element=False,
        )
        self.assertEqual(len(section_3.elements), 2)
        self.assertIn(pdf_elem_1, section_3.elements)
        self.assertIn(pdf_elem_2, section_3.elements)
        self.assertNotIn(pdf_elem_3, section_3.elements)

        # Start and end being the same element while excluding the last element
        # is expected to be rejected.
        with self.assertRaises(InvalidSectionError):
            sectioning.create_section(
                name="test",
                start_element=pdf_elem_1,
                end_element=pdf_elem_1,
                include_last_element=False,
            )

    def test_get_sections_with_name(self):
        """get_sections_with_name yields only the sections with the given name."""
        elem_1 = FakePDFMinerTextElement()
        elem_2 = FakePDFMinerTextElement()
        document = create_pdf_document([elem_1, elem_2])

        pdf_elem_1 = self.extract_element_from_list(elem_1, document._element_list)
        pdf_elem_2 = self.extract_element_from_list(elem_2, document._element_list)

        # assertIsInstance reports the actual type on failure, unlike
        # assertTrue(isinstance(...)).
        self.assertIsInstance(
            document.sectioning.get_sections_with_name("foo"), types.GeneratorType
        )
        self.assertEqual(list(document.sectioning.get_sections_with_name("foo")), [])

        section_1 = document.sectioning.create_section("foo", pdf_elem_1, pdf_elem_2)
        section_2 = document.sectioning.create_section("foo", pdf_elem_1, pdf_elem_2)
        document.sectioning.create_section("bar", pdf_elem_1, pdf_elem_2)

        self.assertIsInstance(
            document.sectioning.get_sections_with_name("foo"), types.GeneratorType
        )
        self.assertEqual(
            list(document.sectioning.get_sections_with_name("foo")),
            [section_1, section_2],
        )

    def test_get_section(self):
        """get_section looks a section up by unique name, raising if missing."""
        elem_1 = FakePDFMinerTextElement()
        elem_2 = FakePDFMinerTextElement()
        document = create_pdf_document([elem_1, elem_2])

        pdf_elem_1 = self.extract_element_from_list(elem_1, document._element_list)
        pdf_elem_2 = self.extract_element_from_list(elem_2, document._element_list)

        with self.assertRaises(SectionNotFoundError):
            document.sectioning.get_section("foo")

        self.assertIsInstance(
            document.sectioning.get_sections_with_name("foo"), types.GeneratorType
        )
        self.assertEqual(list(document.sectioning.get_sections_with_name("foo")), [])

        section_1 = document.sectioning.create_section("foo", pdf_elem_1, pdf_elem_2)
        section_2 = document.sectioning.create_section("foo", pdf_elem_1, pdf_elem_2)

        self.assertEqual(document.sectioning.get_section("foo_0"), section_1)
        self.assertEqual(document.sectioning.get_section("foo_1"), section_2)
230 | 


--------------------------------------------------------------------------------
/tests/test_visualise.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | from py_pdf_parser.loaders import load_file
 4 | from py_pdf_parser.visualise.main import PDFVisualiser
 5 | 
 6 | from .base import BaseVisualiseTestCase
 7 | 
 8 | 
 9 | class TestVisualise(BaseVisualiseTestCase):
10 |     def test_visualise(self):
11 |         file_path = os.path.join(
12 |             os.path.dirname(__file__), "../docs/source/example_files/tables.pdf"
13 |         )
14 | 
15 |         FONT_MAPPING = {
16 |             "BAAAAA+LiberationSerif-Bold,12.0": "header",
17 |             "CAAAAA+LiberationSerif,12.0": "table_element",
18 |         }
19 |         document = load_file(file_path, font_mapping=FONT_MAPPING)
20 | 
21 |         visualiser = PDFVisualiser(
22 |             self.root, document, show_info=True, width=1920, height=1080
23 |         )
24 | 
25 |         self.check_images(visualiser, "tables1")
26 | 
27 |         visualiser.toolbar._buttons["Next page"].invoke()
28 |         self.check_images(visualiser, "tables2")
29 | 


--------------------------------------------------------------------------------
/tests/utils.py:
--------------------------------------------------------------------------------
  1 | from typing import Callable, Dict, List, NamedTuple, Optional, Union
  2 | 
  3 | import re
  4 | 
  5 | from pdfminer.layout import LTComponent
  6 | 
  7 | from py_pdf_parser.common import BoundingBox
  8 | from py_pdf_parser.components import ElementOrdering, PDFDocument, PDFElement
  9 | from py_pdf_parser.loaders import Page
 10 | from py_pdf_parser.sectioning import Section
 11 | 
 12 | 
class FakePDFMinerCharacter(NamedTuple):
    """
    Minimal stand-in for a character object, exposing only a font name and a height.
    """

    # Font name reported by the fake character.
    fontname: str = "fake_fontname"
    # Character height; FakePDFMinerIterator fills this with its font size.
    height: float = 10
 16 | 
 17 | 
 18 | class FakePDFMinerIterator:
 19 |     def __init__(self, font_name: str = "fake_font", font_size: float = 10):
 20 |         self.finished = False
 21 |         self.font_name = font_name
 22 |         self.font_size = font_size
 23 | 
 24 |     def __next__(self):
 25 |         if self.finished:
 26 |             raise StopIteration()
 27 | 
 28 |         self.finished = True
 29 |         return [FakePDFMinerCharacter(fontname=self.font_name, height=self.font_size)]
 30 | 
 31 | 
 32 | class FakePDFMinerTextElement(LTComponent):
 33 |     """
 34 |     This is a stub to help create something which looks like a PDFMiner text element
 35 |     for use in testing.
 36 | 
 37 |     The fontname and size are detected by getting the first character of the first row
 38 |     of the contained text. This is done by iterating, hence we define an iterator which
 39 |     simply returns one list of length one and then raises StopIteration. This is the
 40 |     minimum needed to pretend to allow extraction of the first character, for which
 41 |     we use the FakeCharacter namedtuple which has fontname and height attibutes set.
 42 |     """
 43 | 
 44 |     def __init__(
 45 |         self,
 46 |         bounding_box: "BoundingBox" = BoundingBox(0, 1, 0, 1),
 47 |         text: str = "fake_text",
 48 |         font_name: str = "fake_font",
 49 |         font_size: float = 10,
 50 |     ):
 51 |         super().__init__(
 52 |             bbox=[bounding_box.x0, bounding_box.y0, bounding_box.x1, bounding_box.y1]
 53 |         )
 54 |         self.text = text
 55 |         self.font_name = font_name
 56 |         self.font_size = font_size
 57 | 
 58 |     def __iter__(self):
 59 |         return FakePDFMinerIterator(font_name=self.font_name, font_size=self.font_size)
 60 | 
 61 |     def get_text(self) -> str:
 62 |         if self.text is None:
 63 |             return ""
 64 |         return self.text
 65 | 
 66 | 
 67 | def create_pdf_element(
 68 |     bounding_box: "BoundingBox" = BoundingBox(0, 1, 0, 1),
 69 |     text: str = "fake_text",
 70 |     font_name: str = "fake_font",
 71 |     font_size: float = 10,
 72 |     font_mapping: Optional[Dict[str, str]] = None,
 73 |     font_mapping_is_regex: bool = False,
 74 |     regex_flags: Union[int, re.RegexFlag] = 0,
 75 |     font_size_precision: int = 1,
 76 | ) -> "PDFElement":
 77 |     document = create_pdf_document(
 78 |         elements=[
 79 |             FakePDFMinerTextElement(
 80 |                 bounding_box, text=text, font_name=font_name, font_size=font_size
 81 |             )
 82 |         ],
 83 |         font_mapping=font_mapping,
 84 |         font_mapping_is_regex=font_mapping_is_regex,
 85 |         regex_flags=regex_flags,
 86 |         font_size_precision=font_size_precision,
 87 |     )
 88 |     return document.elements[0]
 89 | 
 90 | 
 91 | def create_pdf_document(
 92 |     elements: Union[List[LTComponent], Dict[int, List[LTComponent]]],
 93 |     font_mapping: Optional[Dict[str, str]] = None,
 94 |     font_mapping_is_regex: bool = False,
 95 |     regex_flags: Union[int, re.RegexFlag] = 0,
 96 |     font_size_precision: int = 1,
 97 |     element_ordering: Union[
 98 |         ElementOrdering, Callable[[List], List]
 99 |     ] = ElementOrdering.LEFT_TO_RIGHT_TOP_TO_BOTTOM,
100 | ) -> "PDFDocument":
101 |     """
102 |     Creates a PDF document with the given elements.
103 |     "elements" can be a list of elements (in which case a document with a single page
104 |     will be created) or a dictionary mapping page number to its list of elements.
105 |     """
106 |     if not isinstance(elements, dict):
107 |         pages = {1: Page(elements=elements, width=100, height=100)}
108 |     else:
109 |         pages = {
110 |             page_number: Page(elements=elements_list, width=100, height=100)
111 |             for page_number, elements_list in elements.items()
112 |         }
113 | 
114 |     return PDFDocument(
115 |         pages=pages,
116 |         font_mapping=font_mapping,
117 |         font_mapping_is_regex=font_mapping_is_regex,
118 |         regex_flags=regex_flags,
119 |         font_size_precision=font_size_precision,
120 |         element_ordering=element_ordering,
121 |     )
122 | 
123 | 
def create_section(
    document: "PDFDocument",
    name: str = "fake_name",
    unique_name: str = "fake_name_1",
    start_element: Optional["PDFElement"] = None,
    end_element: Optional["PDFElement"] = None,
) -> "Section":
    """
    Creates a simple section.

    If the start or end element is not given, the first or last element of the
    document's element list is used respectively.
    """
    # The document's element list is only consulted for a bound which was omitted.
    first = document._element_list[0] if start_element is None else start_element
    last = document._element_list[-1] if end_element is None else end_element

    return Section(document, name, unique_name, first, last)
140 | 


--------------------------------------------------------------------------------