├── .flake8 ├── .github ├── ISSUE_TEMPLATE │ ├── bug.md │ └── feature.md ├── pull_request_template.md └── workflows │ ├── documentation.yml │ ├── release.yml │ └── tests.yml ├── .gitignore ├── .gitlab-ci.yml ├── .pre-commit-config.yaml ├── CITATION.cff ├── LICENSE ├── README.md ├── changelog.md ├── contributing.md ├── demo ├── app.py └── requirements.txt ├── docs ├── alternatives.md ├── assets │ ├── images │ │ ├── model-parallelism.png │ │ ├── multiprocessing.png │ │ └── transformer-windowing.svg │ ├── logo │ │ ├── aphp-blue.svg │ │ ├── aphp-white.svg │ │ ├── edspdf-blue.svg │ │ ├── edspdf-red.svg │ │ └── edspdf-white.svg │ ├── stylesheets │ │ └── extra.css │ ├── templates │ │ └── python │ │ │ └── material │ │ │ ├── class.html │ │ │ ├── docstring.html │ │ │ ├── docstring │ │ │ ├── examples.html │ │ │ └── parameters.html │ │ │ └── function.html │ └── termynal │ │ ├── termynal.css │ │ └── termynal.js ├── changelog.md ├── configuration.md ├── contributing.md ├── data-structures.md ├── index.md ├── inference.md ├── layers │ ├── box-transformer-layer.md │ ├── box-transformer.md │ ├── index.md │ ├── relative-attention.md │ ├── sinusoidal-embedding.md │ └── vocabulary.md ├── pipeline.md ├── pipes │ ├── aggregators │ │ ├── index.md │ │ └── simple-aggregator.md │ ├── box-classifiers │ │ ├── dummy.md │ │ ├── index.md │ │ ├── mask.md │ │ ├── random.md │ │ └── trainable.md │ ├── embeddings │ │ ├── box-layout-embedding.md │ │ ├── box-transformer.md │ │ ├── embedding-combiner.md │ │ ├── huggingface-embedding.md │ │ ├── index.md │ │ ├── simple-text-embedding.md │ │ └── sub-box-cnn-pooler.md │ ├── extractors │ │ ├── index.md │ │ └── pdfminer.md │ └── index.md ├── recipes │ ├── annotation.md │ ├── extension.md │ ├── index.md │ ├── resources │ │ ├── deep-learning-architecture.svg │ │ ├── lines.jpeg │ │ └── merged.jpeg │ ├── rule-based.md │ └── training.md ├── references.bib ├── roadmap.md ├── scripts │ ├── bibtex.py │ └── plugin.py ├── trainable-pipes.md └── utilities │ ├── 
alignment.md │ ├── index.md │ ├── resources │ ├── aligned-merged.jpeg │ ├── aligned.jpeg │ ├── blocs.jpeg │ ├── blocs.png │ ├── lines.jpeg │ └── merged.jpeg │ └── visualisation.md ├── edspdf ├── __init__.py ├── accelerators │ ├── __init__.py │ ├── base.py │ └── multiprocessing.py ├── data │ ├── __init__.py │ ├── base.py │ ├── converters.py │ ├── files.py │ ├── pandas.py │ └── parquet.py ├── layers │ ├── __init__.py │ ├── box_transformer.py │ ├── relative_attention.py │ ├── sinusoidal_embedding.py │ └── vocabulary.py ├── lazy_collection.py ├── pipeline.py ├── pipes │ ├── __init__.py │ ├── aggregators │ │ ├── __init__.py │ │ └── simple.py │ ├── classifiers │ │ ├── __init__.py │ │ ├── dummy.py │ │ ├── mask.py │ │ ├── random.py │ │ └── trainable.py │ ├── embeddings │ │ ├── __init__.py │ │ ├── box_layout_embedding.py │ │ ├── box_layout_preprocessor.py │ │ ├── box_transformer.py │ │ ├── embedding_combiner.py │ │ ├── huggingface_embedding.py │ │ ├── simple_text_embedding.py │ │ └── sub_box_cnn_pooler.py │ └── extractors │ │ ├── __init__.py │ │ └── pdfminer.py ├── processing │ ├── __init__.py │ ├── multiprocessing.py │ ├── simple.py │ └── utils.py ├── registry.py ├── structures.py ├── trainable_pipe.py ├── utils │ ├── __init__.py │ ├── alignment.py │ ├── collections.py │ ├── file_system.py │ ├── lazy_module.py │ ├── optimization.py │ ├── package.py │ ├── random.py │ └── torch.py └── visualization │ ├── __init__.py │ ├── annotations.py │ └── merge.py ├── mkdocs.yml ├── pyproject.toml ├── roadmap.md └── tests ├── conftest.py ├── core ├── config.cfg ├── test_data.py ├── test_pipeline.py ├── test_registry.py └── test_structures.py ├── pipes ├── aggregators │ └── test_simple.py ├── classifiers │ ├── conftest.py │ ├── test_align.py │ ├── test_dummy.py │ ├── test_mask.py │ └── test_random.py ├── embeddings │ ├── test_custom.py │ └── test_huggingface.py └── extractors │ ├── blocks_ground_truth.py │ └── test_pdfminer.py ├── recipes ├── config.cfg ├── test_markdown_aggregator.py └── 
test_train.py ├── resources ├── blank.pdf ├── distant-superscript.pdf ├── error.pdf ├── letter.pdf ├── styles.pdf └── test.pdf ├── utils.py ├── utils ├── test_package.py ├── test_py_utils.py └── test_torch_utils.py └── visualization ├── test_annotations.py └── test_merge.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 88 3 | exclude = .git,__pycache__,__init__.py,.mypy_cache,.pytest_cache,.venv,build 4 | per-file-ignores = __init__.py:F401,tests/*.py:F401,factory.py:F401 5 | ignore = W503, E203 6 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "Bug Report" 3 | about: Use this template if you came across a bug or unexpected behaviour differing from the docs. 4 | --- 5 | 6 | 7 | 8 | ## Description 9 | 10 | 11 | 12 | ## How to reproduce the bug 13 | 14 | 15 | 16 | ## Your Environment 17 | 18 | 19 | 20 | - Operating System: 21 | - Python Version Used: 22 | - EDS-PDF Version Used: 23 | - Environment Information: 24 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "Feature request" 3 | about: Use this template if you'd like EDS-PDF to add a new feature. 4 | title: "Feature request: [feature]" 5 | --- 6 | 7 | ## Feature type 8 | 9 | 10 | 11 | ## Description 12 | 13 | 14 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Description 4 | 5 | 6 | 7 | ## Checklist 8 | 9 | 10 | 11 | - [ ] If this PR is a bug fix, the bug is documented in the test suite. 
12 | - [ ] Changes were documented in the changelog (pending section). 13 | - [ ] If necessary, changes were made to the documentation. 14 | -------------------------------------------------------------------------------- /.github/workflows/documentation.yml: -------------------------------------------------------------------------------- 1 | name: Documentation 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | branches: [main] 7 | 8 | jobs: 9 | Documentation: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v2 13 | - uses: actions/setup-python@v2 14 | with: 15 | python-version: "3.10" 16 | 17 | - name: Set PY variable 18 | run: echo "PY=$(python -VV | sha256sum | cut -d' ' -f1)" >> $GITHUB_ENV 19 | 20 | - name: Install hatch 21 | run: pip install hatch 22 | 23 | - name: Set up Git 24 | run: | 25 | git config user.name ${{ github.actor }} 26 | git config user.email ${{ github.actor }}@users.noreply.github.com 27 | - name: Build documentation 28 | run: | 29 | git fetch origin gh-pages 30 | hatch -e docs run mike delete main 31 | hatch -e docs run mike deploy --push main 32 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 
8 | 9 | name: Release 10 | 11 | on: 12 | workflow_dispatch: 13 | release: 14 | types: [published] 15 | 16 | jobs: 17 | build: 18 | name: Build package 19 | runs-on: ubuntu-22.04 20 | steps: 21 | - uses: actions/checkout@v2 22 | 23 | - name: Build sdist 24 | run: pipx run build --sdist --wheel 25 | 26 | - uses: actions/upload-artifact@v4 27 | with: 28 | name: artifact 29 | path: | 30 | dist/*.tar.gz 31 | dist/*.whl 32 | 33 | pypi: 34 | name: Upload to PyPI 35 | needs: [ build ] 36 | runs-on: ubuntu-22.04 37 | permissions: 38 | id-token: write 39 | 40 | steps: 41 | - uses: actions/download-artifact@v4 42 | with: 43 | name: artifact 44 | path: dist 45 | merge-multiple: true 46 | - name: Publish package 47 | uses: pypa/gh-action-pypi-publish@release/v1 48 | 49 | documentation: 50 | name: Build documentation 51 | 52 | runs-on: ubuntu-22.04 53 | steps: 54 | - uses: actions/checkout@v2 55 | - uses: actions/setup-python@v2 56 | with: 57 | python-version: "3.10" 58 | 59 | - name: Set PY variable 60 | run: echo "PY=$(python -VV | sha256sum | cut -d' ' -f1)" >> $GITHUB_ENV 61 | 62 | - name: Install hatch 63 | run: pip install hatch 64 | 65 | - name: Set up Git 66 | run: | 67 | git config user.name ${{ github.actor }} 68 | git config user.email ${{ github.actor }}@users.noreply.github.com 69 | 70 | - name: Build documentation 71 | run: | 72 | git fetch origin gh-pages 73 | hatch -e docs run mike deploy --push --no-redirect --update-aliases $GITHUB_REF_NAME latest 74 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests and Linting 2 | 3 | on: 4 | workflow_dispatch: 5 | pull_request: 6 | push: 7 | branches: [main] 8 | 9 | jobs: 10 | Linting: 11 | runs-on: ubuntu-22.04 12 | steps: 13 | - uses: actions/checkout@v3 14 | with: 15 | # requites to grab the history of the PR 16 | fetch-depth: 0 17 | - uses: actions/setup-python@v3 
18 | with: 19 | python-version: "3.10" 20 | - uses: pre-commit/action@v3.0.0 21 | 22 | Pytest: 23 | runs-on: ubuntu-22.04 24 | strategy: 25 | fail-fast: true 26 | matrix: 27 | python-version: ["3.7", "3.8", "3.9", "3.10"] 28 | steps: 29 | - uses: actions/checkout@v2 30 | - name: Set up Python 31 | uses: actions/setup-python@v2 32 | with: 33 | python-version: ${{ matrix.python-version }} 34 | architecture: x64 35 | 36 | - name: Cache HuggingFace Models 37 | uses: actions/cache@v2 38 | id: cache-huggingface 39 | with: 40 | path: ~/.cache/huggingface/ 41 | key: ${{ matrix.python-version }}-huggingface 42 | 43 | - name: Install hatch 44 | run: pip install hatch 45 | 46 | - name: Test with Pytest on Python ${{ matrix.python-version }} 47 | run: hatch run tests 48 | 49 | - name: Upload coverage data 50 | uses: actions/upload-artifact@v4 51 | with: 52 | name: coverage-data-${{ matrix.python-version }} 53 | path: .coverage.* 54 | if-no-files-found: ignore 55 | include-hidden-files: true 56 | 57 | Coverage: 58 | name: Coverage 59 | needs: Pytest 60 | uses: aphp/foldedtensor/.github/workflows/coverage.yml@main 61 | with: 62 | base-branch: main 63 | coverage-data-pattern: coverage-data-* 64 | coverage-report: coverage.txt 65 | coverage-badge: coverage.svg 66 | coverage-branch: coverage 67 | 68 | Documentation: 69 | runs-on: ubuntu-latest 70 | steps: 71 | - uses: actions/checkout@v2 72 | - uses: actions/setup-python@v2 73 | with: 74 | python-version: "3.10" 75 | 76 | - name: Install hatch 77 | run: pip install hatch 78 | 79 | - name: Build documentation 80 | run: hatch run docs:build 81 | 82 | Installation: 83 | runs-on: ubuntu-22.04 84 | strategy: 85 | fail-fast: false 86 | matrix: 87 | python-version: ["3.7", "3.8", "3.9", "3.10"] 88 | steps: 89 | - uses: actions/checkout@v2 90 | - uses: actions/setup-python@v2 91 | with: 92 | python-version: ${{ matrix.python-version }} 93 | - name: Install library from source 94 | run: | 95 | pip install . 
96 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | __pycache__/ 3 | 4 | # DS Store 5 | .DS_Store 6 | 7 | .idea 8 | .vscode 9 | 10 | .venv 11 | 12 | # C extensions 13 | *.so 14 | *.dylib 15 | *.cpp 16 | 17 | # Distribution / packaging 18 | setup.py 19 | poetry.lock 20 | init 21 | .Python 22 | env/ 23 | build/ 24 | develop-eggs/ 25 | dist/ 26 | downloads/ 27 | eggs/ 28 | .eggs/ 29 | lib/ 30 | lib64/ 31 | parts/ 32 | sdist/ 33 | var/ 34 | *.egg-info/ 35 | .installed.cfg 36 | *.egg 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | report.xml 50 | 51 | # IPython Notebook 52 | .ipynb_checkpoints 53 | *.ipynb 54 | 55 | # Data 56 | *.csv 57 | *.xls 58 | *.xlsx 59 | *.pkl 60 | *.jpg 61 | *.png 62 | *.html 63 | *.pickle 64 | *.joblib 65 | *.pdf 66 | /data/ 67 | 68 | # MkDocs output 69 | docs/reference 70 | site/ 71 | public/ 72 | -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | image: harbor.eds.aphp.fr/public/python:3.8-slim 2 | 3 | variables: 4 | GIT_SUBMODULE_STRATEGY: recursive 5 | 6 | stages: 7 | - test 8 | - pages 9 | - package 10 | 11 | Linting: 12 | stage: test 13 | cache: 14 | - key: 15 | files: 16 | - .pre-commit-config.yaml 17 | paths: 18 | - ~/.pre-commit 19 | before_script: 20 | - apt-get update 21 | - apt-get install -y --no-install-recommends git 22 | - pip install pre-commit 23 | script: 24 | - pre-commit run --all-files 25 | only: 26 | refs: 27 | - main 28 | - merge_request 29 | 30 | Running Pytest: 31 | stage: test 32 | before_script: 33 | - pip install cython setuptools # because `poetry install` does not correctly build the 
package 34 | - pip install -e '.[dev]' 35 | script: 36 | - pytest tests --cov edspdf --junitxml=report.xml 37 | after_script: 38 | - coverage xml -o coverage.xml 39 | coverage: "/TOTAL.+ ([0-9]{1,3}%)/" 40 | artifacts: 41 | when: always 42 | paths: 43 | - coverage.xml 44 | - report.xml 45 | - ./ 46 | reports: 47 | junit: report.xml 48 | coverage_report: 49 | coverage_format: cobertura 50 | path: coverage.xml 51 | 52 | only: 53 | refs: 54 | - main 55 | - merge_request 56 | 57 | Installation: 58 | stage: test 59 | script: 60 | - pip install . 61 | only: 62 | refs: 63 | - main 64 | - merge_request 65 | 66 | Test documentation: 67 | stage: test 68 | before_script: 69 | - pip install -e '.[docs]' 70 | script: 71 | - mkdocs build --site-dir documentation 72 | artifacts: 73 | paths: 74 | - documentation 75 | only: 76 | refs: 77 | - merge_request 78 | 79 | pages: 80 | stage: pages 81 | before_script: 82 | - pip install -e '.[docs]' 83 | script: 84 | - mkdocs build --site-dir public 85 | artifacts: 86 | paths: 87 | - public 88 | only: 89 | - main 90 | 91 | Package: 92 | stage: package 93 | before_script: 94 | - pip install build twine 95 | - python -m build 96 | script: 97 | - TWINE_PASSWORD=${CI_JOB_TOKEN} TWINE_USERNAME=gitlab-ci-token python -m twine upload --repository-url ${CI_API_V4_URL}/projects/${CI_PROJECT_ID}/packages/pypi dist/* 98 | only: 99 | - tags 100 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | repos: 4 | - repo: https://github.com/pre-commit/pre-commit-hooks 5 | rev: v3.2.0 6 | hooks: 7 | - id: trailing-whitespace 8 | - id: no-commit-to-branch 9 | - id: end-of-file-fixer 10 | - id: check-yaml 11 | args: ["--unsafe"] 12 | - id: check-toml 13 | - id: check-json 14 | - id: check-symlinks 15 | - id: 
check-docstring-first 16 | - id: check-added-large-files 17 | - id: detect-private-key 18 | # ruff 19 | - repo: https://github.com/charliermarsh/ruff-pre-commit 20 | # Ruff version. 21 | rev: 'v0.0.287' 22 | hooks: 23 | - id: ruff 24 | args: ['--config', 'pyproject.toml'] 25 | - repo: https://github.com/psf/black 26 | rev: 22.3.0 27 | hooks: 28 | - id: black 29 | - repo: https://github.com/asottile/blacken-docs 30 | rev: v1.10.0 31 | hooks: 32 | - id: blacken-docs 33 | additional_dependencies: [black==20.8b1] 34 | exclude: notebooks/ 35 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | # This CITATION.cff file was generated with cffinit. 2 | # Visit https://bit.ly/cffinit to generate yours today! 3 | 4 | cff-version: 1.2.0 5 | title: >- 6 | EDS-PDF: Smart text extraction from PDF documents 7 | message: If you use EDS-PDF, please cite us as below. 8 | type: software 9 | authors: 10 | - given-names: Basile 11 | family-names: Dura 12 | orcid: "https://orcid.org/0000-0002-8315-4050" 13 | affiliation: Assistance Publique – Hôpitaux de Paris 14 | - given-names: Perceval 15 | family-names: Wajsburt 16 | affiliation: Assistance Publique – Hôpitaux de Paris 17 | - given-names: Alice 18 | family-names: Calliger 19 | affiliation: Assistance Publique – Hôpitaux de Paris 20 | - given-names: Christel 21 | family-names: Gérardin 22 | affiliation: Assistance Publique – Hôpitaux de Paris 23 | - given-names: Romain 24 | family-names: Bey 25 | affiliation: Assistance Publique – Hôpitaux de Paris 26 | repository-code: "https://github.com/aphp/edspdf" 27 | url: "https://github.com/aphp/edspdf" 28 | abstract: >- 29 | EDS-PDF provides a modular and extendable framework to extract text from PDF documents. 
30 | keywords: 31 | - PDF 32 | - extraction 33 | - python 34 | - NLP 35 | license: BSD-3-Clause 36 | year: 2022 37 | doi: 10.5281/zenodo.6902977 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2021 Assistance Publique - Hôpitaux de Paris 2 | 3 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 4 | 5 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 6 | 7 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 8 | 9 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 10 | 11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Tests](https://img.shields.io/github/actions/workflow/status/aphp/edspdf/tests.yml?branch=main&label=tests&style=flat-square) 2 | [![Documentation](https://img.shields.io/github/actions/workflow/status/aphp/edspdf/documentation.yml?branch=main&label=docs&style=flat-square)](https://aphp.github.io/edspdf/latest/) 3 | [![PyPI](https://img.shields.io/pypi/v/edspdf?color=blue&style=flat-square)](https://pypi.org/project/edspdf/) 4 | [![Coverage](https://raw.githubusercontent.com/aphp/edspdf/coverage/coverage.svg)](https://raw.githubusercontent.com/aphp/edspdf/coverage/coverage.txt) 5 | [![DOI](https://zenodo.org/badge/517726737.svg)](https://zenodo.org/badge/latestdoi/517726737) 6 | 7 | # EDS-PDF 8 | 9 | EDS-PDF provides a modular framework to extract text information from PDF documents. 10 | 11 | You can use it out-of-the-box, or extend it to fit your specific use case. 
We provide a pipeline system and various utilities for visualizing and processing PDFs, as well as multiple components to build complex models: 12 | - 📄 [Extractors](https://aphp.github.io/edspdf/latest/pipes/extractors) to parse PDFs (based on [pdfminer](https://github.com/euske/pdfminer), [mupdf](https://github.com/aphp/edspdf-mupdf) or [poppler](https://github.com/aphp/edspdf-poppler)) 13 | - 🎯 [Classifiers](https://aphp.github.io/edspdf/latest/pipes/box-classifiers) to perform text box classification, in order to segment PDFs 14 | - 🧩 [Aggregators](https://aphp.github.io/edspdf/latest/pipes/aggregators) to produce an aggregated output from the detected text boxes 15 | - 🧠 Trainable layers to incorporate machine learning in your pipeline (e.g., [embedding](https://aphp.github.io/edspdf/latest/pipes/embeddings) building blocks or a [trainable classifier](https://aphp.github.io/edspdf/latest/pipes/box-classifiers/trainable/)) 16 | 17 | Visit the [:book: documentation](https://aphp.github.io/edspdf/) for more information! 18 | 19 | ## Getting started 20 | 21 | ### Installation 22 | 23 | Install the library with pip: 24 | 25 | ```bash 26 | pip install edspdf 27 | ``` 28 | 29 | ### Extracting text 30 | 31 | Let's build a simple PDF extractor that uses a rule-based classifier. There are two 32 | ways to do this, either by using the [configuration system](#configuration) or by using 33 | the pipeline API. 34 | 35 | Create a configuration file: 36 | 37 | 
config.cfg
38 | 39 | ```ini 40 | [pipeline] 41 | pipeline = ["extractor", "classifier", "aggregator"] 42 | 43 | [components.extractor] 44 | @factory = "pdfminer-extractor" 45 | 46 | [components.classifier] 47 | @factory = "mask-classifier" 48 | x0 = 0.2 49 | x1 = 0.9 50 | y0 = 0.3 51 | y1 = 0.6 52 | threshold = 0.1 53 | 54 | [components.aggregator] 55 | @factory = "simple-aggregator" 56 | ``` 57 | 58 | and load it from Python: 59 | 60 | ```python 61 | import edspdf 62 | from pathlib import Path 63 | 64 | model = edspdf.load("config.cfg") # (1) 65 | ``` 66 | 67 | Or create a pipeline directly from Python: 68 | 69 | ```python 70 | from edspdf import Pipeline 71 | 72 | model = Pipeline() 73 | model.add_pipe("pdfminer-extractor") 74 | model.add_pipe( 75 | "mask-classifier", 76 | config=dict( 77 | x0=0.2, 78 | x1=0.9, 79 | y0=0.3, 80 | y1=0.6, 81 | threshold=0.1, 82 | ), 83 | ) 84 | model.add_pipe("simple-aggregator") 85 | ``` 86 | 87 | This pipeline can then be applied (for instance with this [PDF](https://github.com/aphp/edspdf/raw/main/tests/resources/letter.pdf)): 88 | 89 | ```python 90 | # Get a PDF 91 | pdf = Path("/Users/perceval/Development/edspdf/tests/resources/letter.pdf").read_bytes() 92 | pdf = model(pdf) 93 | 94 | body = pdf.aggregated_texts["body"] 95 | 96 | text, style = body.text, body.properties 97 | ``` 98 | 99 | See the [rule-based recipe](https://aphp.github.io/edspdf/latest/recipes/rule-based) for a step-by-step explanation of what is happening. 100 | 101 | ## Citation 102 | 103 | If you use EDS-PDF, please cite us as below. 
104 | 105 | ```bibtex 106 | @software{edspdf, 107 | author = {Dura, Basile and Wajsburt, Perceval and Calliger, Alice and Gérardin, Christel and Bey, Romain}, 108 | doi = {10.5281/zenodo.6902977}, 109 | license = {BSD-3-Clause}, 110 | title = {{EDS-PDF: Smart text extraction from PDF documents}}, 111 | url = {https://github.com/aphp/edspdf} 112 | } 113 | ``` 114 | 115 | ## Acknowledgement 116 | 117 | We would like to thank [Assistance Publique – Hôpitaux de Paris](https://www.aphp.fr/) and 118 | [AP-HP Foundation](https://fondationrechercheaphp.fr/) for funding this project. 119 | -------------------------------------------------------------------------------- /contributing.md: -------------------------------------------------------------------------------- 1 | # Contributing to EDS-PDF 2 | 3 | We welcome contributions ! There are many ways to help. For example, you can: 4 | 5 | 1. Help us track bugs by filing issues 6 | 2. Suggest and help prioritise new functionalities 7 | 3. Help us make the library as straightforward as possible, by simply asking questions on whatever does not seem clear to you. 8 | 9 | ## Development installation 10 | 11 | To be able to run the test suite and develop your own pipeline, you should clone the repo and install it locally. We use the [`hatch`](https://hatch.pypa.io/) package manager to manage the project. 12 | 13 | 14 |
15 | 16 | ```console 17 | color:gray # Clone the repository and change directory 18 | $ git clone ssh://git@github.com/aphp/edspdf.git 19 | ---> 100% 20 | 21 | color:gray # Ensure hatch is installed, preferably via pipx 22 | $ pipx install hatch 23 | 24 | $ cd edspdf 25 | 26 | color:gray # Enter a shell to develop / test the project. This will install everything required in a virtual environment. You can also `source` the path shown by hatch. 27 | $ hatch shell 28 | $ ... 29 | $ exit # when you're done 30 | ``` 31 | 32 |
33 | 34 | To make sure the pipeline will not fail because of formatting errors, we added pre-commit hooks using the `pre-commit` Python library. To use it, simply install it: 35 | 36 |
37 | 38 | ```console 39 | $ pre-commit install 40 | ``` 41 | 42 |
43 | 44 | The pre-commit hooks defined in the [configuration](https://github.com/aphp/edspdf/blob/main/.pre-commit-config.yaml) will automatically run when you commit your changes, letting you know if something went wrong. 45 | 46 | The hooks only run on staged changes. To force-run it on all files, run: 47 | 48 |
49 | 50 | ```console 51 | $ pre-commit run --all-files 52 | ---> 100% 53 | color:green All good ! 54 | ``` 55 | 56 |
57 | 58 | ## Proposing a merge request 59 | 60 | At the very least, your changes should : 61 | 62 | - Be well-documented ; 63 | - Pass every tests, and preferably implement its own ; 64 | - Follow the style guide. 65 | 66 | ### Testing your code 67 | 68 | We use the Pytest test suite. 69 | 70 | The following command will run the test suite. Writing your own tests is encouraged ! 71 | 72 | ```shell 73 | pytest 74 | ``` 75 | 76 | Should your contribution propose a bug fix, we require the bug be thoroughly tested. 77 | 78 | ### Style Guide 79 | 80 | We use [Black](https://github.com/psf/black) to reformat the code. While other formatter only enforce PEP8 compliance, Black also makes the code uniform. In short : 81 | 82 | > Black reformats entire files in place. It is not configurable. 83 | 84 | Moreover, the CI/CD pipeline enforces a number of checks on the "quality" of the code. To wit, non black-formatted code will make the test pipeline fail. We use `pre-commit` to keep our codebase clean. 85 | 86 | Refer to the [development install tutorial](#development-installation) for tips on how to format your files automatically. 87 | Most modern editors propose extensions that will format files on save. 88 | 89 | ### Documentation 90 | 91 | Make sure to document your improvements, both within the code with comprehensive docstrings, 92 | as well as in the documentation itself if need be. 93 | 94 | We use `MkDocs` for EDS-PDF's documentation. You can view your changes with 95 | 96 |
97 | 98 | ```console 99 | color:gray # Run the documentation 100 | $ hatch run docs:serve 101 | ``` 102 | 103 |
104 | 105 | Go to [`localhost:8000`](http://localhost:8000) to see your changes. MkDocs watches for changes in the documentation folder 106 | and automatically reloads the page. 107 | -------------------------------------------------------------------------------- /demo/app.py: -------------------------------------------------------------------------------- 1 | import base64 2 | 3 | import pandas as pd 4 | import streamlit as st 5 | from confit import Config 6 | 7 | import edspdf 8 | from edspdf.visualization import merge_boxes, show_annotations 9 | 10 | CONFIG = """\ 11 | [pipeline] 12 | pipeline = ["extractor", "classifier", "aggregator"] 13 | 14 | [components] 15 | 16 | [components.extractor] 17 | @factory = "pdfminer-extractor" 18 | extract_style = true 19 | 20 | [components.classifier] 21 | @factory = "mask-classifier" 22 | x0 = 0.25 23 | x1 = 0.95 24 | y0 = 0.3 25 | y1 = 0.9 26 | threshold = 0.1 27 | 28 | [components.aggregator] 29 | @factory = "simple-aggregator" 30 | """ 31 | 32 | 33 | st.set_page_config( 34 | page_title="EDS-PDF Demo", 35 | page_icon="📄", 36 | ) 37 | 38 | st.title("EDS-PDF") 39 | 40 | st.warning( 41 | "You should **not** put sensitive data in the example, as this application " 42 | "**is not secure**." 43 | ) 44 | 45 | st.sidebar.header("About") 46 | st.sidebar.markdown( 47 | "EDS-PDF is a contributive effort maintained by AP-HP's Data Science team. " 48 | "Have a look at the " 49 | "[documentation](https://aphp.github.io/edspdf/) for more information." 
50 | ) 51 | 52 | 53 | st.header("Extract a PDF") 54 | 55 | st.subheader("Configuration") 56 | config = st.text_area(label="Change the config", value=CONFIG, height=200) 57 | 58 | 59 | model_load_state = st.info("Loading model...") 60 | 61 | reader = edspdf.load(Config.from_str(config)) 62 | 63 | model_load_state.empty() 64 | 65 | st.subheader("Input") 66 | upload = st.file_uploader("PDF to analyse", accept_multiple_files=False) 67 | 68 | if upload: 69 | 70 | pdf = upload.getvalue() 71 | 72 | base64_pdf = base64.b64encode(pdf).decode("utf-8") 73 | 74 | doc = reader(pdf) 75 | 76 | body = doc.aggregated_texts["body"].text 77 | styles = doc.aggregated_texts["body"].properties 78 | 79 | pdf_display = f"""\ 80 | """ 86 | 87 | st.subheader("Output") 88 | 89 | with st.expander("Visualisation"): 90 | 91 | merged = merge_boxes(sorted(doc.text_boxes)) 92 | 93 | imgs = show_annotations(pdf=pdf, annotations=merged) 94 | 95 | page = st.selectbox("Pages", options=[i + 1 for i in range(len(imgs))]) - 1 96 | 97 | st.image(imgs[page]) 98 | 99 | # with st.expander("PDF"): 100 | # st.markdown(pdf_display, unsafe_allow_html=True) 101 | 102 | with st.expander("Text"): 103 | if body is None: 104 | st.warning( 105 | "No text detected... Are you sure this is a text-based PDF?\n\n" 106 | "There is no support for OCR within EDS-PDF (for now?)." 107 | ) 108 | else: 109 | st.markdown("```\n" + body + "\n```") 110 | 111 | with st.expander("Styles"): 112 | if styles is None: 113 | st.warning( 114 | "No text detected... Are you sure this is a text-based PDF?\n\n" 115 | "There is no support for OCR within EDS-PDF (for now?)." 
116 | ) 117 | else: 118 | st.dataframe(pd.DataFrame(styles)) 119 | -------------------------------------------------------------------------------- /demo/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/aphp/edspdf.git 2 | streamlit 3 | -------------------------------------------------------------------------------- /docs/alternatives.md: -------------------------------------------------------------------------------- 1 | # Alternatives & Comparison 2 | 3 | EDS-PDF was developed to propose a more modular and extendable approach to PDF extraction than [PDFBox](https://pdfbox.apache.org/), the legacy implementation at APHP's clinical data warehouse. 4 | 5 | EDS-PDF takes inspiration from Explosion's [spaCy](https://spacy.io) pipelining system and closely follows its API. Therefore, the core object within EDS-PDF is the Pipeline, which organises the processing of PDF documents into multiple components. However, unlike spaCy, the library is built around a single deep learning framework, pytorch, which makes model development easier. 
6 | -------------------------------------------------------------------------------- /docs/assets/images/model-parallelism.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edspdf/7177a29534181686d035c85fa667784e7d930482/docs/assets/images/model-parallelism.png -------------------------------------------------------------------------------- /docs/assets/images/multiprocessing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edspdf/7177a29534181686d035c85fa667784e7d930482/docs/assets/images/multiprocessing.png -------------------------------------------------------------------------------- /docs/assets/logo/aphp-blue.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | image/svg+xml 79 | -------------------------------------------------------------------------------- /docs/assets/logo/aphp-white.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | image/svg+xml 79 | -------------------------------------------------------------------------------- /docs/assets/logo/edspdf-red.svg: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /docs/assets/stylesheets/extra.css: -------------------------------------------------------------------------------- 1 | [data-md-color-scheme="default"] { 2 | --md-primary-fg-color: #006bb6; 3 | --md-primary-fg-color--light: #006bb6; 4 | --md-accent-fg-color: #006bb6; 5 | --md-accent-fg-color--light: #006bb6; 6 | } 7 | 8 | [data-md-color-scheme="slate"] { 9 | --md-primary-fg-color: #006bb6; 10 | --md-primary-fg-color--dark: #006bb6; 11 | --md-accent-fg-color: #006bb6; 12 | --md-accent-fg-color--light: #006bb6; 13 | } 14 | 15 | :root { 16 | --md-admonition-icon--aphp: 
url('data:image/svg+xml;charset=utf-8,'); 17 | } 18 | 19 | 20 | .md-typeset .admonition.aphp, 21 | .md-typeset details.aphp { 22 | border-color: rgb(0, 107, 182); 23 | } 24 | 25 | .md-typeset .aphp > .admonition-title, 26 | .md-typeset .aphp > summary { 27 | background-color: rgba(0, 107, 182, 0.1); 28 | border-color: rgb(0, 107, 182); 29 | } 30 | 31 | .md-typeset .aphp > .admonition-title::before, 32 | .md-typeset .aphp > summary::before { 33 | background-color: rgb(0, 107, 182); 34 | -webkit-mask-image: var(--md-admonition-icon--aphp); 35 | mask-image: var(--md-admonition-icon--aphp); 36 | } 37 | 38 | 39 | :root { 40 | --md-code-font: Consolas, Roboto Mono, Roboto; 41 | --md-code-bg-color: rgba(175, 184, 193, .2); 42 | 43 | --md-typeset-color: #24292e; 44 | } 45 | 46 | 47 | :root, [data-md-color-scheme=default] { 48 | --md-main-bg: #eef4f8; 49 | } 50 | 51 | :root, [data-md-color-scheme=slate] { 52 | --md-main-bg: hsl(232deg 15% 25%);; 53 | } 54 | 55 | html { 56 | } 57 | 58 | body, input { 59 | font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Oxygen, Ubuntu, Cantarell, "Fira Sans", "Droid Sans", "Helvetica Neue", Arial, sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol";; 60 | font-weight: 400; 61 | font-feature-settings: normal; 62 | } 63 | 64 | .md-typeset h1, .md-typeset h2, .md-typeset h3, .md-typeset h4, .md-typeset h5, .md-typeset h6 { 65 | word-wrap: normal; 66 | color: var(--md-typeset-color); 67 | font-weight: 600; 68 | scroll-margin-top: 1.25rem; 69 | letter-spacing: 0; 70 | } 71 | 72 | .md-typeset h1 { 73 | border-bottom: 1px solid #d8dee4; 74 | } 75 | 76 | .md-nav { 77 | font-size: 0.8rem; 78 | } 79 | 80 | .md-typeset code { 81 | font-size: 0.95em; 82 | } 83 | 84 | .md-typeset pre > code, .termy > [data-termynal], .highlighttable .linenos { 85 | font-size: .75rem; 86 | } 87 | 88 | .termy > [data-termynal] { 89 | font-size: 0.8rem; 90 | font-family: var(--md-code-font); 91 | padding: 45px 45px 25px; 92 | } 93 | 94 | 
.termy > [data-termynal] { 95 | 96 | } 97 | 98 | .md-typeset :is(.admonition,details) { 99 | font-size: inherit !important; 100 | } 101 | 102 | .highlight span.filename, .quote > summary { 103 | font-size: 0.85em; 104 | padding-top: 0.3em; 105 | padding-bottom: 0.3em; 106 | } 107 | 108 | .md-typeset pre > code, .highlight span.filename { 109 | border-top-left-radius: 5px; 110 | border-top-right-radius: 5px; 111 | } 112 | 113 | .md-typeset pre > code { 114 | border-bottom-left-radius: 5px; 115 | border-bottom-right-radius: 5px; 116 | } 117 | 118 | .md-main__inner { 119 | margin-top: 0; 120 | } 121 | 122 | .md-typeset__table td > a { 123 | white-space: nowrap; 124 | } 125 | 126 | @media screen and (min-width: 76.1875em) { 127 | .md-sidebar { 128 | margin-top: 1.5rem; 129 | } 130 | } 131 | 132 | @media screen and (min-width: 60em) { 133 | .md-nav--secondary .md-nav__title { 134 | background: var(--md-main-bg) !important; 135 | box-shadow: 0 0 0.4rem 0.4rem var(--md-main-bg) !important; 136 | } 137 | } 138 | 139 | @media screen and (min-width: 76.25em) { 140 | .md-nav--primary .md-nav__title, .md-nav--secondary .md-nav__title, .md-nav--lifted > .md-nav__list > .md-nav__item--active > .md-nav__link { 141 | background: var(--md-main-bg) !important; 142 | box-shadow: 0 0 0.4rem 0.4rem var(--md-main-bg) !important; 143 | } 144 | } 145 | 146 | .md-content { 147 | background: var(--md-default-bg-color); 148 | } 149 | 150 | .md-main { 151 | background: var(--md-main-bg); 152 | } 153 | 154 | .md-content__inner { 155 | margin-top: 1.5rem; 156 | } 157 | 158 | .doc td > code { 159 | word-break: normal; 160 | } 161 | -------------------------------------------------------------------------------- /docs/assets/templates/python/material/docstring.html: -------------------------------------------------------------------------------- 1 | {% if docstring_sections %} 2 | {{ log.debug("Rendering docstring") }} 3 | {% for section in docstring_sections %} 4 | {% if not 
config.only_parameters %} 5 | {% if section.kind.value == "text" %} 6 | {{ section.value|convert_markdown(heading_level, html_id) }} 7 | {% elif section.kind.value == "attributes" %} 8 | {% include "docstring/attributes.html" with context %} 9 | {% elif section.kind.value == "parameters" %} 10 | {% include "docstring/parameters.html" with context %} 11 | {% elif section.kind.value == "other parameters" %} 12 | {% include "docstring/other_parameters.html" with context %} 13 | {% elif section.kind.value == "raises" %} 14 | {% include "docstring/raises.html" with context %} 15 | {% elif section.kind.value == "warns" %} 16 | {% include "docstring/warns.html" with context %} 17 | {% elif section.kind.value == "yields" %} 18 | {% include "docstring/yields.html" with context %} 19 | {% elif section.kind.value == "receives" %} 20 | {% include "docstring/receives.html" with context %} 21 | {% elif section.kind.value == "returns" %} 22 | {% include "docstring/returns.html" with context %} 23 | {% elif section.kind.value == "examples" %} 24 | {% include "docstring/examples.html" with context %} 25 | {% elif section.kind.value == "admonition" %} 26 | {% include "docstring/admonition.html" with context %} 27 | {% endif %} 28 | {% elif section.kind.value == "parameters" %} 29 | {% include "docstring/parameters.html" with context %} 30 | {% elif section.kind.value == "attributes" %} 31 | {% include "docstring/attributes.html" with context %} 32 | {% endif %} 33 | {% endfor %} 34 | {% endif %} 35 | -------------------------------------------------------------------------------- /docs/assets/templates/python/material/docstring/examples.html: -------------------------------------------------------------------------------- 1 | {{ "# Examples\n"|convert_markdown(heading_level, html_id) }} 2 | {% for section_type, sub_section in section.value %} 3 | {% if section_type.value == "text" %} 4 | {{ sub_section|convert_markdown(heading_level, html_id) }} 5 | {% elif section_type.value == 
"examples" %} 6 | {{ sub_section|highlight(language="pycon", linenums=False) }} 7 | {% endif %} 8 | {% endfor %} 9 | -------------------------------------------------------------------------------- /docs/assets/templates/python/material/docstring/parameters.html: -------------------------------------------------------------------------------- 1 | {{ log.debug("Rendering parameters section") }} 2 | {% if is_merged_init %} 3 | {{ "# Parameters\n"|convert_markdown(heading_level, html_id) }} 4 | {% endif %} 5 | {% if config.docstring_section_style == "table" %} 6 | {% block table_style %} 7 |

{{ section.title or "Parameters:" }}

8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | {% for parameter in section.value %} 19 | {% if not config.only_parameters or parameter.name not in ("nlp", "name", "vocab", "scorer") %} 20 | 21 | 22 | 29 | 30 | 39 | 40 | {% endif %} 41 | {% endfor %} 42 | 43 |
NameTypeDescriptionDefault
{{ parameter.name }} 23 | {% if parameter.annotation %} 24 | {% with expression = parameter.annotation %} 25 | {% include "expression.html" with context %} 26 | {% endwith %} 27 | {% endif %} 28 | {{ parameter.description|convert_markdown(heading_level, html_id) }} 31 | {% if parameter.default %} 32 | {% with expression = parameter.default %} 33 | {% include "expression.html" with context %} 34 | {% endwith %} 35 | {% else %} 36 | required 37 | {% endif %} 38 |
44 | {% endblock table_style %} 45 | {% elif config.docstring_section_style == "list" %} 46 | {% block list_style %} 47 |

{{ section.title or "Parameters:" }}

48 | 63 | {% endblock list_style %} 64 | {% elif config.docstring_section_style == "spacy" %} 65 | {% block spacy_style %} 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | {% for parameter in section.value %} 75 | {% if not config.only_parameters or parameter.name not in ("nlp", "name", "vocab", "scorer") %} 76 | 77 | 78 | 99 | 100 | {% endif %} 101 | {% endfor %} 102 | 103 |
{{ (section.title or "PARAMETER").rstrip(":").upper() }}DESCRIPTION
{{ parameter.name }} 79 | {{ parameter.description|convert_markdown(heading_level, html_id) }} 80 |

81 | {% if parameter.annotation %} 82 | 83 | TYPE: 84 | {% with expression = parameter.annotation %} 85 | {% include "expression.html" with context %} 86 | {% endwith %} 87 | 88 | {% endif %} 89 | {% if parameter.default %} 90 | 91 | DEFAULT: 92 | {% with expression = parameter.default %} 93 | {% include "expression.html" with context %} 94 | {% endwith %} 95 | 96 | {% endif %} 97 |

98 |
104 | {% endblock spacy_style %} 105 | {% endif %} 106 | -------------------------------------------------------------------------------- /docs/assets/templates/python/material/function.html: -------------------------------------------------------------------------------- 1 | {{ log.debug("Rendering " + function.path) }} 2 | 3 |
4 | {% with html_id = function.path %} 5 | 6 | {% if root %} 7 | {% set show_full_path = config.show_root_full_path %} 8 | {% set root_members = True %} 9 | {% elif root_members %} 10 | {% set show_full_path = config.show_root_members_full_path or config.show_object_full_path %} 11 | {% set root_members = False %} 12 | {% else %} 13 | {% set show_full_path = config.show_object_full_path %} 14 | {% endif %} 15 | 16 | {% if not root or config.show_root_heading %} 17 | 18 | {% filter heading(heading_level, 19 | role="function", 20 | id=html_id, 21 | class="doc doc-heading", 22 | toc_label=function.name ~ "()") %} 23 | 24 | {% if config.separate_signature %} 25 | {% if show_full_path %}{{ function.path }}{% else %}{{ function.name }}{% endif %} 26 | {% else %} 27 | {% filter highlight(language="python", inline=True) %} 28 | {% if show_full_path %}{{ function.path }}{% else %}{{ function.name }}{% endif %} 29 | {% include "signature.html" with context %} 30 | {% endfilter %} 31 | {% endif %} 32 | 33 | {% with labels = function.labels %} 34 | {% include "labels.html" with context %} 35 | {% endwith %} 36 | 37 | {% endfilter %} 38 | 39 | {% if config.separate_signature %} 40 | {% filter highlight(language="python", inline=False) %} 41 | {% filter format_signature(config.line_length) %} 42 | {% if show_full_path %}{{ function.path }}{% else %}{{ function.name }}{% endif %} 43 | {% include "signature.html" with context %} 44 | {% endfilter %} 45 | {% endfilter %} 46 | {% endif %} 47 | 48 | {% else %} 49 | {% if config.show_root_toc_entry %} 50 | {% filter heading(heading_level, 51 | role="function", 52 | id=html_id, 53 | toc_label=function.path if config.show_root_full_path else function.name, 54 | hidden=True) %} 55 | {% endfilter %} 56 | {% endif %} 57 | {% set heading_level = heading_level - 1 %} 58 | {% endif %} 59 | 60 |
61 | {% with docstring_sections = function.docstring.parsed %} 62 | {% include "docstring.html" with context %} 63 | {% endwith %} 64 | 65 | {% if not config.only_parameters and config.show_source and function.source %} 66 |
67 | Source code in {{ function.relative_filepath }} 68 | {{ function.source|highlight(language="python", linestart=function.lineno, linenums=True) }} 69 |
70 | {% endif %} 71 |
72 | 73 | {% endwith %} 74 |
75 | -------------------------------------------------------------------------------- /docs/assets/termynal/termynal.css: -------------------------------------------------------------------------------- 1 | /** 2 | * termynal.js 3 | * 4 | * @author Ines Montani 5 | * @version 0.0.1 6 | * @license MIT 7 | * 8 | * Modified version from https://github.com/tiangolo/typer 9 | */ 10 | 11 | :root { 12 | --color-bg: #252a33; 13 | --color-text: #eee; 14 | --color-text-subtle: #a2a2a2; 15 | } 16 | 17 | [data-termynal] { 18 | width: auto; 19 | max-width: 100%; 20 | background: var(--color-bg); 21 | color: var(--color-text); 22 | font-size: 18px; 23 | /* font-family: 'Fira Mono', Consolas, Menlo, Monaco, 'Courier New', Courier, monospace; */ 24 | font-family: 'Roboto Mono', 'Fira Mono', Consolas, Menlo, Monaco, 'Courier New', Courier, monospace; 25 | border-radius: 4px; 26 | padding: 75px 45px 35px; 27 | position: relative; 28 | -webkit-box-sizing: border-box; 29 | box-sizing: border-box; 30 | } 31 | 32 | [data-termynal]:before { 33 | content: ''; 34 | position: absolute; 35 | top: 15px; 36 | left: 15px; 37 | display: inline-block; 38 | width: 15px; 39 | height: 15px; 40 | border-radius: 50%; 41 | /* A little hack to display the window buttons in one pseudo element. 
*/ 42 | background: #d9515d; 43 | -webkit-box-shadow: 25px 0 0 #f4c025, 50px 0 0 #3ec930; 44 | box-shadow: 25px 0 0 #f4c025, 50px 0 0 #3ec930; 45 | } 46 | 47 | [data-termynal]:after { 48 | content: 'bash'; 49 | position: absolute; 50 | color: var(--color-text-subtle); 51 | top: 5px; 52 | left: 0; 53 | width: 100%; 54 | text-align: center; 55 | } 56 | 57 | a[data-terminal-control] { 58 | text-align: right; 59 | display: block; 60 | color: #aebbff; 61 | } 62 | 63 | [data-terminal-copy] { 64 | text-align: right; 65 | position: absolute; 66 | top: 5px; 67 | right: 5px; 68 | } 69 | 70 | [data-terminal-copy].md-icon { 71 | color: #aebbff; 72 | } 73 | 74 | [data-ty] { 75 | display: block; 76 | line-height: 2; 77 | } 78 | 79 | [data-ty]:before { 80 | /* Set up defaults and ensure empty lines are displayed. */ 81 | content: ''; 82 | display: inline-block; 83 | vertical-align: middle; 84 | } 85 | 86 | [data-ty="input"]:before, 87 | [data-ty-prompt]:before { 88 | margin-right: 0.72em; 89 | color: var(--color-text-subtle); 90 | } 91 | 92 | [data-ty="input"]:before { 93 | content: '$'; 94 | } 95 | 96 | [data-ty][data-ty-prompt]:before { 97 | content: attr(data-ty-prompt); 98 | } 99 | 100 | [data-ty-cursor]:after { 101 | content: attr(data-ty-cursor); 102 | font-family: monospace; 103 | margin-left: 0.5em; 104 | -webkit-animation: blink 1s infinite; 105 | animation: blink 1s infinite; 106 | } 107 | 108 | 109 | /* Cursor animation */ 110 | 111 | @-webkit-keyframes blink { 112 | 50% { 113 | opacity: 0; 114 | } 115 | } 116 | 117 | @keyframes blink { 118 | 50% { 119 | opacity: 0; 120 | } 121 | } 122 | 123 | /* tooltip */ 124 | 125 | [data-md-state="open"] { 126 | transform: translateY(0); 127 | opacity: 1; 128 | transition: 129 | transform 400ms cubic-bezier(0.075, 0.85, 0.175, 1), 130 | opacity 400ms; 131 | pointer-events: initial; 132 | } 133 | -------------------------------------------------------------------------------- /docs/changelog.md: 
-------------------------------------------------------------------------------- 1 | ---8<--- "changelog.md" 2 | -------------------------------------------------------------------------------- /docs/configuration.md: -------------------------------------------------------------------------------- 1 | # Configuration 2 | 3 | EDS-PDF is built on top of the [`confit`](https://github.com/aphp/confit) configuration system. 4 | 5 | The following [catalogue](https://github.com/explosion/catalogue) registries are included within EDS-PDF: 6 | 7 | | Section | Description | 8 | |---------------|-------------------------------------------| 9 | | `factory` | Components factories (most often classes) | 10 | | `adapter` | Raw data preprocessing functions | 11 | 12 | EDS-PDF pipelines are meant to be reproducible and serializable, such that you can always define a pipeline through the configuration system. 13 | 14 | To wit, compare the API-based approach to the configuration-based approach (the two are strictly equivalent): 15 | 16 | === "API-based" 17 | 18 | ```python hl_lines="4-13" 19 | import edspdf 20 | from pathlib import Path 21 | 22 | model = edspdf.Pipeline() 23 | model.add_pipe("pdfminer-extractor", name="extractor") 24 | model.add_pipe("mask-classifier", name="classifier", config=dict( 25 | x0=0.2, 26 | x1=0.9, 27 | y0=0.3, 28 | y1=0.6, 29 | threshold=0.1, 30 | )) 31 | model.add_pipe("simple-aggregator", name="aggregator") 32 | 33 | # Get a PDF 34 | pdf = Path("letter.pdf").read_bytes() 35 | 36 | pdf = model(pdf) 37 | 38 | str(pdf.aggregated_texts["body"]) 39 | # Out: Cher Pr ABC, Cher DEF,\n...
40 | ``` 41 | 42 | === "Configuration-based" 43 | 44 | ```toml title="config.cfg" 45 | [pipeline] 46 | pipeline = ["extractor", "classifier", "aggregator"] 47 | 48 | [components.extractor] 49 | @factory = "pdfminer-extractor" 50 | 51 | [components.classifier] 52 | @factory = "mask-classifier" 53 | x0 = 0.2 54 | x1 = 0.9 55 | y0 = 0.3 56 | y1 = 0.6 57 | threshold = 0.1 58 | 59 | [components.aggregator] 60 | @factory = "simple-aggregator" 61 | ``` 62 | 63 | ```python hl_lines="4" 64 | import edspdf 65 | from pathlib import Path 66 | 67 | pipeline = edspdf.load("config.cfg") 68 | 69 | # Get a PDF 70 | pdf = Path("letter.pdf").read_bytes() 71 | 72 | pdf = pipeline(pdf) 73 | 74 | str(pdf.aggregated_texts["body"]) 75 | # Out: Cher Pr ABC, Cher DEF,\n... 76 | ``` 77 | 78 | The configuration-based approach strictly separates the definition of the pipeline 79 | to its application and avoids tucking away important configuration details. 80 | Changes to the pipeline are transparent as there is a single source of truth: the configuration file. 81 | -------------------------------------------------------------------------------- /docs/contributing.md: -------------------------------------------------------------------------------- 1 | ---8<--- "contributing.md" 2 | -------------------------------------------------------------------------------- /docs/data-structures.md: -------------------------------------------------------------------------------- 1 | # Data Structures 2 | 3 | 4 | EDS-PDF stores PDFs and their annotation in a custom data structures that are 5 | designed to be easy to use and manipulate. We must distinguish between: 6 | 7 | - the data models used to store the PDFs and exchange them between the 8 | different components of EDS-PDF 9 | - the tensors structures used to process the PDFs with deep learning models 10 | 11 | ## Itinerary of a PDF 12 | 13 | A PDF is first converted to a [PDFDoc][edspdf.structures.PDFDoc] object, which contains the raw PDF content. 
This task is usually performed by a [PDF extractor component](/components/extractors). Once the PDF is converted, the same object will be used and updated by the different components, and returned at the end of the pipeline. 14 | 15 | When running a trainable component, the [PDFDoc][edspdf.structures.PDFDoc] is preprocessed and converted to tensors containing relevant features for the task. This task is performed in the `preprocess` method of the component. The resulting tensors are then collated together to form a batch, in the `collate` method of the component. After running the `forward` method of the component, the tensor predictions are finally assigned as annotations to the original [PDFDoc][edspdf.structures.PDFDoc] objects in the `postprocess` method. 16 | 17 | 18 | ## Data models 19 | 20 | The main data structure is the [PDFDoc][edspdf.structures.PDFDoc], which represents a full PDF document. It contains the raw PDF content, annotations for the full document, regardless of pages. A PDF is split into [Page][edspdf.structures.Page] objects that store their number, dimension and optionally an image of the rendered page. 21 | 22 | The PDF annotations are stored in [Box][edspdf.structures.Box] objects, which represent a rectangular region of the PDF. At the moment, box can only be specialized into [TextBox][edspdf.structures.TextBox] to represent text regions, such as lines extracted by a PDF extractor. Aggregated texts are stored in [Text][edspdf.structures.Text] objects, that are not associated with a specific box. 23 | 24 | A [TextBox][edspdf.structures.TextBox] contains a list of [TextProperties][edspdf.structures.TextProperties] objects to store the style properties of the styled spans of the text. 25 | 26 | ???
note "Reference" 27 | 28 | ::: edspdf.structures 29 | options: 30 | heading_level: 3 31 | 32 | ## Tensor structure 33 | 34 | The tensors used to process PDFs with deep learning models usually contain 4 main dimensions, in addition to the standard embedding dimensions: 35 | 36 | - `samples`: one entry per PDF in the batch 37 | - `pages`: one entry per page in a PDF 38 | - `boxes`: one entry per box in a page 39 | - `token`: one entry per token in a box (only for text boxes) 40 | 41 | These tensors use a special [FoldedTensor](http://pypi.org/project/foldedtensor) format to store the data in a compact way and reshape the data depending on the requirements of a layer. 42 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | EDS-PDF provides a modular framework to extract text information from PDF documents. 4 | 5 | You can use it out-of-the-box, or extend it to fit your use-case. 6 | 7 | ## Getting started 8 | 9 | ### Installation 10 | 11 | Install the library with pip: 12 | 13 |
14 | 15 | ```console 16 | $ pip install edspdf 17 | ---> 100% 18 | color:green Installation successful 19 | ``` 20 | 21 |
22 | 23 | ### Extracting text 24 | 25 | Let's build a simple PDF extractor that uses a rule-based classifier. There are two 26 | ways to do this, either by using the [configuration system](#configuration) or by using 27 | the pipeline API. 28 | 29 | === "Configuration based pipeline" 30 | 31 | Create a configuration file: 32 | 33 | ```toml title="config.cfg" 34 | [pipeline] 35 | pipeline = ["extractor", "classifier", "aggregator"] 36 | 37 | [components.extractor] 38 | @factory = "pdfminer-extractor" 39 | 40 | [components.classifier] 41 | @factory = "mask-classifier" 42 | x0 = 0.2 43 | x1 = 0.9 44 | y0 = 0.3 45 | y1 = 0.6 46 | threshold = 0.1 47 | 48 | [components.aggregator] 49 | @factory = "simple-aggregator" 50 | ``` 51 | 52 | and load it from Python: 53 | 54 | ```python 55 | import edspdf 56 | from pathlib import Path 57 | 58 | model = edspdf.load("config.cfg") # (1) 59 | ``` 60 | 61 | === "API based pipeline" 62 | 63 | Or create a pipeline directly from Python: 64 | 65 | ```python 66 | from edspdf import Pipeline 67 | 68 | model = Pipeline() 69 | model.add_pipe("pdfminer-extractor") 70 | model.add_pipe( 71 | "mask-classifier", 72 | config=dict( 73 | x0=0.2, 74 | x1=0.9, 75 | y0=0.3, 76 | y1=0.6, 77 | threshold=0.1, 78 | ), 79 | ) 80 | model.add_pipe("simple-aggregator") 81 | ``` 82 | 83 | This pipeline can then be applied (for instance with this [PDF](https://github.com/aphp/edspdf/raw/main/tests/resources/letter.pdf)): 84 | 85 | ```python 86 | # Get a PDF 87 | pdf = Path("/Users/perceval/Development/edspdf/tests/resources/letter.pdf").read_bytes() 88 | pdf = model(pdf) 89 | 90 | body = pdf.aggregated_texts["body"] 91 | 92 | text, style = body.text, body.properties 93 | ``` 94 | 95 | See the [rule-based recipe](recipes/rule-based.md) for a step-by-step explanation of what is happening. 96 | 97 | ## Citation 98 | 99 | If you use EDS-PDF, please cite us as below. 
100 | 101 | ```bibtex 102 | @article{gerardin_wajsburt_pdf, 103 | title={Bridging Clinical PDFs and Downstream Natural Language Processing: An Efficient Neural Approach to Layout Segmentation}, 104 | author={G{\'e}rardin, Christel Ducroz and Wajsburt, Perceval and Dura, Basile and Calliger, Alice and Mouchet, Alexandre and Tannier, Xavier and Bey, Romain}, 105 | journal={Available at SSRN 4587624} 106 | } 107 | ``` 108 | 109 | ## Acknowledgement 110 | 111 | We would like to thank [Assistance Publique – Hôpitaux de Paris](https://www.aphp.fr/) and 112 | [AP-HP Foundation](https://fondationrechercheaphp.fr/) for funding this project. 113 | -------------------------------------------------------------------------------- /docs/layers/box-transformer-layer.md: -------------------------------------------------------------------------------- 1 | # BoxTransformerLayer {: #edspdf.layers.box_transformer.BoxTransformerLayer } 2 | 3 | ::: edspdf.layers.box_transformer.BoxTransformerLayer 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | -------------------------------------------------------------------------------- /docs/layers/box-transformer.md: -------------------------------------------------------------------------------- 1 | # BoxTransformerModule {: #edspdf.layers.box_transformer.BoxTransformerModule } 2 | 3 | ::: edspdf.layers.box_transformer.BoxTransformerModule 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | -------------------------------------------------------------------------------- /docs/layers/index.md: -------------------------------------------------------------------------------- 1 | # Deep learning layers 2 | 3 | EDS-PDF provides a set of specialized deep learning layers that can be used to build trainable 4 | components. These layers are built on top of the PyTorch framework and can be used in 5 | any PyTorch model. 
6 | 7 | | Layer | Description | 8 | |---------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------| 9 | | [`BoxTransformerModule`][edspdf.layers.box_transformer.BoxTransformerModule] | Contextualize box embeddings with a 2d Transformer with relative position representations | 10 | | [`BoxTransformerLayer`][edspdf.layers.box_transformer.BoxTransformerLayer] | A single layer of the above `BoxTransformerModule` layer | 11 | | [`RelativeAttention`][edspdf.layers.relative_attention.RelativeAttention] | A 2d attention layer that optionally uses relative position to compute its attention scores | 12 | | [`SinusoidalEmbedding`][edspdf.layers.sinusoidal_embedding.SinusoidalEmbedding] | A position embedding that uses trigonometric functions to encode positions | 13 | | [`Vocabulary`][edspdf.layers.vocabulary.Vocabulary] | A non deep learning layer to encode / decode vocabularies | 14 | -------------------------------------------------------------------------------- /docs/layers/relative-attention.md: -------------------------------------------------------------------------------- 1 | # RelativeAttention {: #edspdf.layers.relative_attention.RelativeAttention } 2 | 3 | 4 | ::: edspdf.layers.relative_attention.RelativeAttention 5 | options: 6 | heading_level: 2 7 | show_bases: false 8 | show_source: false 9 | -------------------------------------------------------------------------------- /docs/layers/sinusoidal-embedding.md: -------------------------------------------------------------------------------- 1 | # SinusoidalEmbedding {: #edspdf.layers.sinusoidal_embedding.SinusoidalEmbedding } 2 | 3 | ::: edspdf.layers.sinusoidal_embedding.SinusoidalEmbedding 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | -------------------------------------------------------------------------------- /docs/layers/vocabulary.md:
-------------------------------------------------------------------------------- 1 | # Vocabulary {: #edspdf.layers.vocabulary.Vocabulary } 2 | 3 | ::: edspdf.layers.vocabulary.Vocabulary 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | show_category_heading: true 9 | -------------------------------------------------------------------------------- /docs/pipeline.md: -------------------------------------------------------------------------------- 1 | # Pipeline {: #edspdf.pipeline.Pipeline } 2 | 3 | The goal of EDS-PDF is to provide a **framework** for processing PDF documents, along with some utilities and a few components, stitched together by a robust pipeline and configuration system. 4 | 5 | Processing PDFs usually involves many steps such as extracting lines, running OCR models, detecting and classifying boxes, filtering and aggregating parts of the extracted texts, etc. Organising these steps together, combining static and deep learning components, while remaining modular and efficient is a challenge. This is why EDS-PDF is built on top of a new pipelining system. 6 | 7 | 8 | !!! note "Deep learning frameworks" 9 | 10 | The EDS-PDF trainable components are built around the PyTorch framework. While you 11 | can use any technology in static components, we do not provide tools to train 12 | components built with other deep learning frameworks. 13 | 14 | ## Creating a pipeline 15 | 16 | A pipe is a processing block (like a function) that applies a transformation on its input and returns a modified object. 17 | 18 | At the moment, four types of pipes are implemented in the library: 19 | 20 | 1. **extraction** components extract lines from a raw PDF and return a [`PDFDoc`][edspdf.structures.PDFDoc] object filled with these text boxes. 21 | 2. **classification** components classify each box with labels, such as `body`, `header`, `footer`... 22 | 3. 
**aggregation** components compile the lines together according to their classes to re-create the original text. 23 | 4. **embedding** components don't directly update the annotations on the document but have specific deep-learning methods (see the [TrainablePipe][edspdf.trainable_pipe.TrainablePipe] page) that can be composed to form a machine learning model. 24 | 25 | To create your first pipeline, execute the following code: 26 | 27 | ```python 28 | from edspdf import Pipeline 29 | 30 | model = Pipeline() 31 | # will extract text lines from a document 32 | model.add_pipe( 33 | "pdfminer-extractor", 34 | config=dict( 35 | extract_style=False, 36 | ), 37 | ) 38 | # classify everything inside the `body` bounding box as `body` 39 | model.add_pipe( 40 | "mask-classifier", config=dict(body={"x0": 0.1, "y0": 0.1, "x1": 0.9, "y1": 0.9}) 41 | ) 42 | # aggregates the lines together to re-create the original text 43 | model.add_pipe("simple-aggregator") 44 | ``` 45 | 46 | This pipeline can then be run on one or more PDF documents. 47 | As the pipeline processes documents, components will be called in the order 48 | they were added to the pipeline. 49 | 50 | ```python 51 | from pathlib import Path 52 | 53 | pdf_bytes = Path("path/to/your/pdf").read_bytes() 54 | 55 | # Processing one document 56 | model(pdf_bytes) 57 | 58 | # Processing multiple documents 59 | model.pipe([pdf_bytes, ...]) 60 | ``` 61 | 62 | For more information on how to use the pipeline, refer to the [Inference](/inference) page. 63 | 64 | ### Hybrid models 65 | 66 | EDS-PDF was designed to facilitate the training and inference of hybrid models that 67 | arbitrarily chain static components or trained deep learning components. Static components are callable objects that take a PDFDoc object as input, perform arbitrary transformations over the input, and return the modified object.
[Trainable pipes][edspdf.trainable_pipe.TrainablePipe], on the other hand, allow for deep learning operations to be performed on the [PDFDoc][edspdf.structures.PDFDoc] object and must be trained to be used. 68 | 69 | ## Saving and loading a pipeline 70 | 71 | Pipelines can be saved and loaded using the `save` and `load` methods. The saved pipeline is not a pickled object but a folder containing the config file, the weights and extra resources for each pipeline. This allows for easy inspection and modification of the pipeline, and avoids the execution of arbitrary code when loading a pipeline. 72 | 73 | ```python 74 | model.save("path/to/your/model") 75 | model = edspdf.load("path/to/your/model") 76 | ``` 77 | 78 | To share the pipeline and turn it into a pip installable package, you can use the `package` method, which will use or create a pyproject.toml file, fill it accordingly, and create a wheel file. At the moment, we only support the poetry package manager. 79 | 80 | ```python 81 | model.package( 82 | name="your-package-name", # leave None to reuse name in pyproject.toml 83 | version="0.0.1", 84 | root_dir="path/to/project/root", # optional, to retrieve an existing pyproject.toml file 85 | # if you don't have a pyproject.toml, you can provide the metadata here instead 86 | metadata=dict( 87 | authors="Firstname Lastname ", 88 | description="A short description of your package", 89 | ), 90 | ) 91 | ``` 92 | 93 | This will create a wheel file in the root_dir/dist folder, which you can share and install with pip. 94 | -------------------------------------------------------------------------------- /docs/pipes/aggregators/index.md: -------------------------------------------------------------------------------- 1 | # Aggregation 2 | 3 | The aggregation step compiles extracted text blocs together according to their detected class.
4 | 5 | 6 | 7 | | Factory name | Description | 8 | |-------------------------------------------------------------------------|-------------------------------------------------------------------| 9 | | [`simple-aggregator`][edspdf.pipes.aggregators.simple.SimpleAggregator] | Returns a dictionary with one key for each detected class | 10 | 11 | 12 | -------------------------------------------------------------------------------- /docs/pipes/aggregators/simple-aggregator.md: -------------------------------------------------------------------------------- 1 | ::: edspdf.pipes.aggregators.simple 2 | options: 3 | heading_level: 1 4 | -------------------------------------------------------------------------------- /docs/pipes/box-classifiers/dummy.md: -------------------------------------------------------------------------------- 1 | # Dummy classifier {: #edspdf.pipes.classifiers.dummy.DummyClassifier } 2 | 3 | ::: edspdf.pipes.classifiers.dummy.DummyClassifier 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | -------------------------------------------------------------------------------- /docs/pipes/box-classifiers/index.md: -------------------------------------------------------------------------------- 1 | # Box classifiers 2 | 3 | We developed EDS-PDF with modularity in mind. To that end, you can choose between multiple classification methods. 4 | 5 | 6 | 7 | | Factory name | Description | 8 | |--------------------------------------------------------------------------------------------------|-----------------------------------------| 9 | | [`mask-classifier`][edspdf.pipes.classifiers.mask.simple_mask_classifier_factory] | Simple rule-based classification | 10 | | [`multi-mask-classifier`][edspdf.pipes.classifiers.mask.mask_classifier_factory] | Simple rule-based classification | 11 | | [`dummy-classifier`][edspdf.pipes.classifiers.dummy.DummyClassifier] | Dummy classifier, for testing purposes. 
| 12 | | [`random-classifier`][edspdf.pipes.classifiers.random.RandomClassifier] | To sow chaos | 13 | | [`trainable-classifier`][edspdf.pipes.classifiers.trainable.TrainableClassifier] | Trainable box classification model | 14 | 15 | 16 | -------------------------------------------------------------------------------- /docs/pipes/box-classifiers/mask.md: -------------------------------------------------------------------------------- 1 | # Mask Classification 2 | 3 | We developed a simple classifier that roughly uses the same strategy as PDFBox, namely: 4 | 5 | - define a "mask" on the PDF documents ; 6 | - keep every text bloc within that mask, tag everything else as pollution. 7 | 8 | ## Factories 9 | 10 | Two factories are available in the `classifiers` registry: `mask-classifier` and `multi-mask-classifier`. 11 | 12 | ### `mask-classifier` {: #edspdf.pipes.classifiers.mask.simple_mask_classifier_factory } 13 | 14 | ::: edspdf.pipes.classifiers.mask.simple_mask_classifier_factory 15 | options: 16 | heading_level: 4 17 | show_bases: false 18 | show_source: false 19 | 20 | --- 21 | 22 | ### `multi-mask-classifier` {: #edspdf.pipes.classifiers.mask.mask_classifier_factory } 23 | 24 | ::: edspdf.pipes.classifiers.mask.mask_classifier_factory 25 | options: 26 | heading_level: 4 27 | show_bases: false 28 | show_source: false 29 | -------------------------------------------------------------------------------- /docs/pipes/box-classifiers/random.md: -------------------------------------------------------------------------------- 1 | # Random classifier {: #edspdf.pipes.classifiers.random.RandomClassifier } 2 | 3 | ::: edspdf.pipes.classifiers.random.RandomClassifier 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | -------------------------------------------------------------------------------- /docs/pipes/box-classifiers/trainable.md: -------------------------------------------------------------------------------- 1 | # Trainable 
classifier {: #edspdf.pipes.classifiers.trainable.TrainableClassifier } 2 | 3 | ::: edspdf.pipes.classifiers.trainable.TrainableClassifier 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | -------------------------------------------------------------------------------- /docs/pipes/embeddings/box-layout-embedding.md: -------------------------------------------------------------------------------- 1 | # BoxLayoutEmbedding {: #edspdf.pipes.embeddings.box_layout_embedding.BoxLayoutEmbedding } 2 | 3 | ::: edspdf.pipes.embeddings.box_layout_embedding.BoxLayoutEmbedding 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | -------------------------------------------------------------------------------- /docs/pipes/embeddings/box-transformer.md: -------------------------------------------------------------------------------- 1 | # BoxTransformer {: #edspdf.pipes.embeddings.box_transformer.BoxTransformer } 2 | 3 | ::: edspdf.pipes.embeddings.box_transformer.BoxTransformer 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | -------------------------------------------------------------------------------- /docs/pipes/embeddings/embedding-combiner.md: -------------------------------------------------------------------------------- 1 | # EmbeddingCombiner {: #edspdf.pipes.embeddings.embedding_combiner.EmbeddingCombiner } 2 | 3 | ::: edspdf.pipes.embeddings.embedding_combiner.EmbeddingCombiner 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | -------------------------------------------------------------------------------- /docs/pipes/embeddings/huggingface-embedding.md: -------------------------------------------------------------------------------- 1 | # HuggingfaceEmbedding {: #edspdf.pipes.embeddings.huggingface_embedding.HuggingfaceEmbedding } 2 | 3 | ::: edspdf.pipes.embeddings.huggingface_embedding.HuggingfaceEmbedding 4 | options: 5 | heading_level: 
2 6 | show_bases: false 7 | show_source: false 8 | -------------------------------------------------------------------------------- /docs/pipes/embeddings/index.md: -------------------------------------------------------------------------------- 1 | # Embeddings 2 | 3 | We offer multiple embedding methods to encode the text and layout information of the PDFs. The following components can be added to a pipeline or composed together, and contain preprocessing and postprocessing logic to convert and batch documents. 4 | 5 | 6 | 7 | 12 | 13 | | Factory name | Description | 14 | |-----------------------------------------------------------------------------------------------|-------------------------------------------------------------------| 15 | | [`simple-text-embedding`][edspdf.pipes.embeddings.simple_text_embedding.SimpleTextEmbedding] | A module that embeds the textual features of the blocks. | 16 | | [`embedding-combiner`][edspdf.pipes.embeddings.embedding_combiner.EmbeddingCombiner] | Encodes boxes using a combination of multiple encoders | 17 | | [`sub-box-cnn-pooler`][edspdf.pipes.embeddings.sub_box_cnn_pooler.SubBoxCNNPooler] | Pools the output of a CNN over the elements of a box (like words) | 18 | | [`box-layout-embedding`][edspdf.pipes.embeddings.box_layout_embedding.BoxLayoutEmbedding] | Encodes the layout of the boxes | 19 | | [`box-transformer`][edspdf.pipes.embeddings.box_transformer.BoxTransformer] | Contextualizes box representations using a transformer | 20 | | [`huggingface-embedding`][edspdf.pipes.embeddings.huggingface_embedding.HuggingfaceEmbedding] | Box representations using a Huggingface multi-modal model. | 21 | 22 | 23 | 24 | !!! warning "Layers" 25 | These components are not to be confused with [`layers`](/layers), which are standard 26 | PyTorch modules that can be used to build trainable components, such as the ones 27 | described here. 
28 | -------------------------------------------------------------------------------- /docs/pipes/embeddings/simple-text-embedding.md: -------------------------------------------------------------------------------- 1 | # SimpleTextEmbedding {: #edspdf.pipes.embeddings.simple_text_embedding.SimpleTextEmbedding } 2 | 3 | ::: edspdf.pipes.embeddings.simple_text_embedding.SimpleTextEmbedding 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | -------------------------------------------------------------------------------- /docs/pipes/embeddings/sub-box-cnn-pooler.md: -------------------------------------------------------------------------------- 1 | # SubBoxCNNPooler {: #edspdf.pipes.embeddings.sub_box_cnn_pooler.SubBoxCNNPooler } 2 | 3 | ::: edspdf.pipes.embeddings.sub_box_cnn_pooler.SubBoxCNNPooler 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | -------------------------------------------------------------------------------- /docs/pipes/extractors/index.md: -------------------------------------------------------------------------------- 1 | # Extraction 2 | 3 | The extraction phase consists of reading the PDF document and gathering text blocs, along with their dimensions and position within the document. Said blocs will go on to the classification phase to separate the body from the rest. 
4 | 5 | ## Text-based PDF 6 | 7 | We provide multiple extractor architectures for text-based PDFs: 8 | 9 | 10 | 11 | | Factory name | Description | 12 | |----------------------------------------------------------------------------|-------------------------------------------------| 13 | | [`pdfminer-extractor`][edspdf.pipes.extractors.pdfminer.PdfMinerExtractor] | Extracts text lines with the `pdfminer` library | 14 | | [`mupdf-extractor`][edspdf_mupdf.MuPdfExtractor] | Extracts text lines with the `pymupdf` library | 15 | | [`poppler-extractor`][edspdf_poppler.PopplerExtractor] | Extracts text lines with the `poppler` library | 16 | 17 | 18 | 19 | ## Image-based PDF 20 | 21 | Image-based PDF documents require an OCR[^1] step, which is not natively supported by EDS-PDF. 22 | However, you can easily extend EDS-PDF by adding such a method to the registry. 23 | 24 | We plan on adding such an OCR extractor component in the future. 25 | 26 | [^1]: Optical Character Recognition, or OCR, is the process of extracting characters and words from an image. 27 | -------------------------------------------------------------------------------- /docs/pipes/extractors/pdfminer.md: -------------------------------------------------------------------------------- 1 | # PdfMiner Extractor {: #edspdf.pipes.extractors.pdfminer.PdfMinerExtractor } 2 | 3 | ::: edspdf.pipes.extractors.pdfminer.PdfMinerExtractor 4 | options: 5 | heading_level: 2 6 | show_bases: false 7 | show_source: false 8 | -------------------------------------------------------------------------------- /docs/pipes/index.md: -------------------------------------------------------------------------------- 1 | # Components overview 2 | 3 | EDS-PDF provides easy-to-use components for defining PDF processing pipelines. 
4 | 5 | 6 | 7 | === "Box extractors" 8 | 9 | --8<-- "docs/pipes/extractors/index.md:components" 10 | 11 | === "Box classifiers" 12 | 13 | --8<-- "docs/pipes/box-classifiers/index.md:components" 14 | 15 | 16 | === "Aggregators" 17 | 18 | --8<-- "docs/pipes/aggregators/index.md:components" 19 | 20 | 21 | === "Embeddings" 22 | 23 | --8<-- "docs/pipes/embeddings/index.md:components" 24 | 25 | You can add them to your EDS-PDF pipeline by simply calling `add_pipe`, for instance: 26 | 27 | 28 | 29 | ```python 30 | # ↑ Omitted code that defines the pipeline object ↑ 31 | pipeline.add_pipe("pdfminer-extractor", name="component-name", config=...) 32 | ``` 33 | -------------------------------------------------------------------------------- /docs/recipes/annotation.md: -------------------------------------------------------------------------------- 1 | # PDF Annotation 2 | 3 | In this section, we will cover one methodology to annotate PDF documents. 4 | 5 | !!! aphp "Data annotation at AP-HP's CDW" 6 | 7 | At AP-HP's CDW[^1], we recently moved away from a rule- and Java-based PDF extraction pipeline 8 | (using PDFBox) to one using EDS-PDF. Hence, EDS-PDF is used in production, helping 9 | extract text from around 100k PDF documents every day. 10 | 11 | To train our pipeline presently in production, we annotated **around 270 documents**, and reached 12 | a **f1-score of 0.98** on the body classification. 13 | 14 | ## Preparing the data for annotation 15 | 16 | We will frame the annotation phase as an image segmentation task, 17 | where annotators are asked to draw bounding boxes around the different sections. 18 | Hence, the very first step is to convert PDF documents to images. We suggest using the 19 | library `pdf2image` for that step. 20 | 21 | The following script will convert the PDF documents located in a `data/pdfs` directory 22 | to PNG images inside the `data/images` folder. 
23 | 24 | ```python 25 | import pdf2image 26 | from pathlib import Path 27 | 28 | DATA_DIR = Path("data") 29 | PDF_DIR = DATA_DIR / "pdfs" 30 | IMAGE_DIR = DATA_DIR / "images" 31 | 32 | for pdf in PDF_DIR.glob("*.pdf"): 33 | imgs = pdf2image.convert_from_path(pdf) 34 | 35 | for page, img in enumerate(imgs): 36 | path = IMAGE_DIR / f"{pdf.stem}_{page}.png" 37 | img.save(path) 38 | ``` 39 | 40 | You can use any annotation tool to annotate the images. If you're looking for a simple 41 | way to annotate from within a Jupyter Notebook, 42 | [ipyannotations](https://ipyannotations.readthedocs.io/en/latest/examples/image-landmarks.html#annotating-bounding-boxes) 43 | might be a good fit. 44 | 45 | You will need to post-process the output 46 | to convert the annotations to the following format: 47 | 48 | | Key | Description | 49 | |---------|--------------------------------------------------------------------| 50 | | `page` | Page within the PDF (0-indexed) | 51 | | `x0` | Horizontal position of the top-left corner of the bounding box | 52 | | `x1` | Horizontal position of the bottom-right corner of the bounding box | 53 | | `y0` | Vertical position of the top-left corner of the bounding box | 54 | | `y1` | Vertical position of the bottom-right corner of the bounding box | 55 | | `label` | Class of the bounding box (eg `body`, `header`...) | 56 | 57 | All dimensions should be normalised by the height and width of the page. 58 | 59 | ## Saving the dataset 60 | 61 | Once the annotation phase is complete, make sure the train/test split is performed 62 | once and for all when you create the dataset. 63 | 64 | We suggest the following structure: 65 | 66 | ```title="Directory structure" 67 | dataset/ 68 | ├── train/ 69 | │ ├── .pdf 70 | │ ├── .json 71 | │ ├── .pdf 72 | │ ├── .json 73 | │ └── ... 74 | └── test/ 75 | ├── .pdf 76 | ├── .json 77 | └── ... 
78 | ``` 79 | 80 | Where the normalised annotation resides in a JSON file living next to the related PDF, 81 | and uses the following schema: 82 | 83 | | Key | Description | 84 | | -------------- | ----------------------------------------------- | 85 | | `note_id` | Reference to the document | 86 | | `` | Optional property of the document itself | 87 | | `annotations` | List of annotations, following the schema above | 88 | 89 | This structure presents the advantage of being machine- and human-friendly. 90 | The JSON file contains annotated regions as well as any document property that 91 | could be useful to adapt the pipeline (typically for the classification step). 92 | 93 | ## Extracting annotations 94 | 95 | The following snippet extracts the annotations into a workable format: 96 | 97 | ```python 98 | from pathlib import Path 99 | import pandas as pd 100 | 101 | 102 | def get_annotations( 103 | directory: Path, 104 | ) -> pd.DataFrame: 105 | """ 106 | Read annotations from the dataset directory. 107 | 108 | Parameters 109 | ---------- 110 | directory : Path 111 | Dataset directory 112 | 113 | Returns 114 | ------- 115 | pd.DataFrame 116 | Pandas DataFrame containing the annotations. 117 | """ 118 | dfs = [] 119 | 120 | iterator = tqdm(list(directory.glob("*.json"))) 121 | 122 | for path in iterator: 123 | meta = json.loads(path.read_text()) 124 | df = pd.DataFrame.from_records(meta.pop("annotations")) 125 | 126 | for k, v in meta.items(): # (1) 127 | df[k] = v 128 | 129 | dfs.append(df) 130 | 131 | return pd.concat(dfs) 132 | 133 | 134 | train_path = Path("dataset/train") 135 | 136 | annotations = get_annotations(train_path) 137 | ``` 138 | 139 | 1. Add a column for each additional property saved in the dataset. 140 | 141 | The annotations compiled this way can be used to train a pipeline. 142 | See the [trained pipeline recipe](./training.md) for more detail. 
143 | 144 | [^1]: Greater Paris University Hospital's Clinical Data Warehouse 145 | -------------------------------------------------------------------------------- /docs/recipes/extension.md: -------------------------------------------------------------------------------- 1 | # Extending EDS-PDF 2 | 3 | EDS-PDF is organised around a function registry powered by catalogue and a custom configuration system. The result is a powerful framework that is easy to extend - and we'll see how in this section. 4 | 5 | For this recipe, let's imagine we're not entirely satisfied with the aggregation 6 | proposed by EDS-PDF. For instance, we might want an aggregator that outputs the 7 | text in Markdown format. 8 | 9 | !!! note 10 | 11 | Properly converting to markdown is no easy task. For this example, 12 | we will limit ourselves to detecting bold and italics sections. 13 | 14 | ## Developing the new aggregator 15 | 16 | Our aggregator will inherit from the [`SimpleAggregator`][edspdf.pipes.aggregators.simple.SimpleAggregator], 17 | and use the style to detect italics and bold sections. 
18 | 19 | ```python title="markdown_aggregator.py" 20 | from edspdf import registry 21 | from edspdf.pipes.aggregators.simple import SimpleAggregator 22 | from edspdf.structures import PDFDoc, Text 23 | 24 | 25 | @registry.factory.register("markdown-aggregator") # (1) 26 | class MarkdownAggregator(SimpleAggregator): 27 | def __call__(self, doc: PDFDoc) -> PDFDoc: 28 | doc = super().__call__(doc) 29 | 30 | for label in doc.aggregated_texts.keys(): 31 | text = doc.aggregated_texts[label].text 32 | 33 | fragments = [] 34 | 35 | offset = 0 36 | for s in doc.aggregated_texts[label].properties: 37 | if s.begin >= s.end: 38 | continue 39 | if offset < s.begin: 40 | fragments.append(text[offset : s.begin]) 41 | 42 | offset = s.end 43 | snippet = text[s.begin : s.end] 44 | if s.bold: 45 | snippet = f"**{snippet}**" 46 | if s.italic: 47 | snippet = f"_{snippet}_" 48 | fragments.append(snippet) 49 | 50 | if offset < len(text): 51 | fragments.append(text[offset:]) 52 | 53 | doc.aggregated_texts[label] = Text(text="".join(fragments)) 54 | 55 | return doc 56 | ``` 57 | 58 | 1. The new aggregator is registered via this line 59 | 2. The new aggregator redefines the `__call__` method. 60 | It will output a single string, corresponding to the markdown-formatted output. 61 | 62 | That's it! You can use this new aggregator with the API: 63 | 64 | ```python 65 | from edspdf import Pipeline 66 | from markdown_aggregator import MarkdownAggregator # (1) 67 | 68 | model = Pipeline() 69 | # will extract text lines from a document 70 | model.add_pipe( 71 | "pdfminer-extractor", 72 | config=dict( 73 | extract_style=False, 74 | ), 75 | ) 76 | # classify everything inside the `body` bounding box as `body` 77 | model.add_pipe("mask-classifier", config={"x0": 0.1, "y0": 0.1, "x1": 0.9, "y1": 0.9}) 78 | # aggregates the lines together to generate the markdown formatted text 79 | model.add_pipe("markdown-aggregator") 80 | ``` 81 | 82 | 1. We're importing the aggregator that we just defined. 
83 | 84 | It all works relatively smoothly! 85 | 86 | ## Making the aggregator discoverable 87 | 88 | Now, how can we instantiate the pipeline using the configuration system? 89 | The registry needs to be aware of the new function, but we shouldn't have to 90 | import `markdown_aggregator.py` just so that the module is registered as a side-effect... 91 | 92 | Catalogue solves this problem by using Python _entry points_. 93 | 94 | === "pyproject.toml" 95 | 96 | ```toml 97 | [project.entry-points."edspdf_factories"] 98 | "markdown-aggregator" = "markdown_aggregator:MarkdownAggregator" 99 | ``` 100 | 101 | === "setup.py" 102 | 103 | ```python 104 | from setuptools import setup 105 | 106 | setup( 107 | name="edspdf-markdown-aggregator", 108 | entry_points={ 109 | "edspdf_factories": [ 110 | "markdown-aggregator = markdown_aggregator:MarkdownAggregator" 111 | ] 112 | }, 113 | ) 114 | ``` 115 | 116 | By declaring the new aggregator as an entrypoint, it will become discoverable by EDS-PDF 117 | as long as it is installed in your environment! 118 | -------------------------------------------------------------------------------- /docs/recipes/index.md: -------------------------------------------------------------------------------- 1 | # EDS-PDF Recipes 2 | 3 | This section goes over a few use-cases for PDF extraction. 4 | It is meant as a more hands-on tutorial to get a grip on the library. 
5 | -------------------------------------------------------------------------------- /docs/recipes/resources/lines.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edspdf/7177a29534181686d035c85fa667784e7d930482/docs/recipes/resources/lines.jpeg -------------------------------------------------------------------------------- /docs/recipes/resources/merged.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edspdf/7177a29534181686d035c85fa667784e7d930482/docs/recipes/resources/merged.jpeg -------------------------------------------------------------------------------- /docs/recipes/rule-based.md: -------------------------------------------------------------------------------- 1 | # Rule-based extraction 2 | 3 | Let's create a rule-based extractor for PDF documents. 4 | 5 | !!! note 6 | 7 | This pipeline will likely perform poorly as soon as your PDF documents 8 | come in varied forms. In that case, even a very simple trained pipeline 9 | may give you a substantial performance boost (see [next section](training.md)). 10 | 11 | First, download this example [PDF](https://github.com/aphp/edspdf/raw/main/tests/resources/letter.pdf). 12 | 13 | We will use the following configuration: 14 | 15 | ```toml title="config.cfg" 16 | [pipeline] 17 | components = ["extractor", "classifier", "aggregator"] 18 | components_config = ${components} 19 | 20 | [components.extractor] 21 | @factory = "pdfminer-extractor" # (2) 22 | extract_style = true 23 | 24 | [components.classifier] 25 | @factory = "mask-classifier" # (3) 26 | x0 = 0.2 27 | x1 = 0.9 28 | y0 = 0.3 29 | y1 = 0.6 30 | threshold = 0.1 31 | 32 | [components.aggregator] 33 | @factory = "simple-aggregator" # (4) 34 | ``` 35 | 36 | 1. This is the top-level object, which organises the entire extraction process. 37 | 2. 
Here we use the provided text-based extractor, based on the PDFMiner library 38 | 3. This is where we define the rule-based classifier. Here, we use a "mask", 39 | meaning that every text bloc that falls within the boundaries will be assigned 40 | the `body` label, everything else will be tagged as pollution. 41 | 4. This aggregator returns a tuple of dictionaries. The first contains compiled text for each 42 | label, the second exports their style. 43 | 44 | Save the configuration as `config.cfg` and run the following snippet: 45 | 46 | ```python 47 | import edspdf 48 | import pandas as pd 49 | from pathlib import Path 50 | 51 | model = edspdf.load("config.cfg") # (1) 52 | 53 | # Get a PDF 54 | pdf = Path("/Users/perceval/Development/edspdf/tests/resources/letter.pdf").read_bytes() 55 | pdf = model(pdf) 56 | 57 | body = pdf.aggregated_texts["body"] 58 | 59 | text, style = body.text, body.properties 60 | print(text) 61 | print(pd.DataFrame(style)) 62 | ``` 63 | 64 | This code will output the following results: 65 | 66 | === "Visualisation" 67 | 68 | ![lines](resources/lines.jpeg) 69 | 70 | === "Extracted Text" 71 | 72 | ``` 73 | Cher Pr ABC, Cher DEF, 74 | 75 | Nous souhaitons remercier le CSE pour son avis favorable quant à l’accès aux données de 76 | l’Entrepôt de Données de Santé du projet n° XXXX. 77 | 78 | Nous avons bien pris connaissance des conditions requises pour cet avis favorable, c’est 79 | pourquoi nous nous engageons par la présente à : 80 | 81 | • Informer individuellement les patients concernés par la recherche, admis à l'AP-HP 82 | avant juillet 2017, sortis vivants, et non réadmis depuis. 83 | 84 | • Effectuer une demande d'autorisation à la CNIL en cas d'appariement avec d’autres 85 | cohortes. 86 | 87 | Bien cordialement, 88 | ``` 89 | 90 | === "Extracted Style" 91 | 92 | The `start` and `end` columns refer to the character indices within the extracted text. 
93 | 94 | | italic | bold | fontname | start | end | 95 | |--------|--------|----------------|-------|-----| 96 | | False | False | BCDFEE+Calibri | 0 | 22 | 97 | | False | False | BCDFEE+Calibri | 24 | 90 | 98 | | False | False | BCDHEE+Calibri | 90 | 91 | 99 | | False | False | BCDFEE+Calibri | 91 | 111 | 100 | | False | False | BCDFEE+Calibri | 112 | 113 | 101 | | False | False | BCDHEE+Calibri | 113 | 114 | 102 | | False | False | BCDFEE+Calibri | 114 | 161 | 103 | | False | False | BCDFEE+Calibri | 163 | 247 | 104 | | False | False | BCDHEE+Calibri | 247 | 248 | 105 | | False | False | BCDFEE+Calibri | 248 | 251 | 106 | | False | False | BCDFEE+Calibri | 252 | 300 | 107 | | False | False | SymbolMT | 302 | 303 | 108 | | False | False | BCDFEE+Calibri | 304 | 386 | 109 | | False | False | BCDFEE+Calibri | 387 | 445 | 110 | | False | False | SymbolMT | 447 | 448 | 111 | | False | False | BCDFEE+Calibri | 449 | 523 | 112 | | False | False | BCDHEE+Calibri | 523 | 524 | 113 | | False | False | BCDFEE+Calibri | 524 | 530 | 114 | | False | False | BCDFEE+Calibri | 531 | 540 | 115 | | False | False | BCDFEE+Calibri | 542 | 560 | 116 | -------------------------------------------------------------------------------- /docs/references.bib: -------------------------------------------------------------------------------- 1 | @article{vaswani2017attention, 2 | title={Attention is all you need}, 3 | author={Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia}, 4 | journal={Advances in neural information processing systems}, 5 | volume={30}, 6 | year={2017} 7 | } 8 | -------------------------------------------------------------------------------- /docs/roadmap.md: -------------------------------------------------------------------------------- 1 | ---8<--- "roadmap.md" 2 | -------------------------------------------------------------------------------- 
/docs/utilities/alignment.md: -------------------------------------------------------------------------------- 1 | # Alignment 2 | 3 | To simplify the annotation process, EDS-PDF provides a [utility that aligns 4 | bounding boxes][edspdf.utils.alignment.align_box_labels] with text blocs extracted from a PDF document. 5 | This is particularly useful for annotating documents. 6 | 7 | === "Blocs" 8 | 9 | ![blocs](resources/blocs.jpeg) 10 | 11 | === "Blocs + Annotation" 12 | 13 | ![blocs + annotation](resources/blocs.png) 14 | 15 | === "Aligned" 16 | 17 | ![aligned](resources/aligned.jpeg) 18 | 19 | === "Merged Blocs" 20 | 21 | ![resources](resources/aligned-merged.jpeg) 22 | -------------------------------------------------------------------------------- /docs/utilities/index.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | EDS-PDF provides a few utilities to help annotate PDF documents, and debug the output of an extraction pipeline. 
4 | -------------------------------------------------------------------------------- /docs/utilities/resources/aligned-merged.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edspdf/7177a29534181686d035c85fa667784e7d930482/docs/utilities/resources/aligned-merged.jpeg -------------------------------------------------------------------------------- /docs/utilities/resources/aligned.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edspdf/7177a29534181686d035c85fa667784e7d930482/docs/utilities/resources/aligned.jpeg -------------------------------------------------------------------------------- /docs/utilities/resources/blocs.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edspdf/7177a29534181686d035c85fa667784e7d930482/docs/utilities/resources/blocs.jpeg -------------------------------------------------------------------------------- /docs/utilities/resources/blocs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edspdf/7177a29534181686d035c85fa667784e7d930482/docs/utilities/resources/blocs.png -------------------------------------------------------------------------------- /docs/utilities/resources/lines.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edspdf/7177a29534181686d035c85fa667784e7d930482/docs/utilities/resources/lines.jpeg -------------------------------------------------------------------------------- /docs/utilities/resources/merged.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aphp/edspdf/7177a29534181686d035c85fa667784e7d930482/docs/utilities/resources/merged.jpeg 
-------------------------------------------------------------------------------- /docs/utilities/visualisation.md: -------------------------------------------------------------------------------- 1 | # Visualisation 2 | 3 | EDS-PDF provides utilities to help you visualise the output of the pipeline. 4 | 5 | ## Visualising a pipeline's output 6 | 7 | You can use EDS-PDF to overlay labelled bounding boxes on top of a PDF document. 8 | 9 | ```python 10 | import edspdf 11 | from confit import Config 12 | from pathlib import Path 13 | from edspdf.visualization import show_annotations 14 | 15 | config = """ 16 | [pipeline] 17 | pipeline = ["extractor", "classifier"] 18 | 19 | [components] 20 | 21 | [components.extractor] 22 | @factory = "pdfminer-extractor" 23 | extract_style = true 24 | 25 | [components.classifier] 26 | @factory = "mask-classifier" 27 | x0 = 0.25 28 | x1 = 0.95 29 | y0 = 0.3 30 | y1 = 0.9 31 | threshold = 0.1 32 | """ 33 | 34 | model = edspdf.load(Config.from_str(config)) 35 | 36 | # Get a PDF 37 | pdf = Path("/Users/perceval/Development/edspdf/tests/resources/letter.pdf").read_bytes() 38 | 39 | # Construct the DataFrame of blocs 40 | doc = model(pdf) 41 | 42 | # Compute an image representation of each page of the PDF 43 | # overlaid with the predicted bounding boxes 44 | imgs = show_annotations(pdf=pdf, annotations=doc.text_boxes) 45 | 46 | imgs[0] 47 | ``` 48 | 49 | If you run this code in a Jupyter notebook, you'll see the following: 50 | 51 | ![lines](resources/lines.jpeg) 52 | 53 | ## Merging blocs together 54 | 55 | To help debug a pipeline (or a labelled dataset), you might want to 56 | merge blocs together according to their labels. EDS-PDF provides a `merge_lines` method 57 | that does just that. 
58 | 59 | ```python 60 | # ↑ Omitted code above ↑ 61 | from edspdf.visualization import merge_boxes, show_annotations 62 | 63 | merged = merge_boxes(doc.text_boxes) 64 | 65 | imgs = show_annotations(pdf=pdf, annotations=merged) 66 | imgs[0] 67 | ``` 68 | 69 | See the difference: 70 | 71 | === "Original" 72 | 73 | ![lines](resources/lines.jpeg) 74 | 75 | === "Merged" 76 | 77 | ![lines](resources/merged.jpeg) 78 | 79 | The `merge_boxes` method uses the notion of maximal cliques to compute merges. 80 | It forbids the combined blocs from overlapping with any bloc from another label. 81 | -------------------------------------------------------------------------------- /edspdf/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa: F401 2 | from .trainable_pipe import TrainablePipe 3 | from .pipeline import Pipeline, load 4 | from .registry import registry 5 | from .structures import Box, Page, PDFDoc, Text, TextBox, TextProperties 6 | from . import data 7 | 8 | from . 
@registry.accelerator.register("multiprocessing")
class MultiprocessingAccelerator(Accelerator):
    """
    Deprecated: Use `docs.map_pipeline(model).set_processing(...)` instead

    A plain settings container: it only records the multiprocessing
    configuration; the actual scheduling logic lives elsewhere.

    Parameters
    ----------
    batch_size: int
        Number of documents processed per batch
    num_cpu_workers: Optional[int]
        Number of CPU workers
    num_gpu_workers: Optional[int]
        Number of GPU workers
    gpu_pipe_names: Optional[List[str]]
        Names of the pipes that should run on a GPU worker
    gpu_worker_devices: Optional[List[Union[torch.device, str]]]
        Devices assigned to the GPU workers
    cpu_worker_devices: Optional[List[Union[torch.device, str]]]
        Devices assigned to the CPU workers
    """

    def __init__(
        self,
        batch_size: int,
        num_cpu_workers: Optional[int] = None,
        num_gpu_workers: Optional[int] = None,
        gpu_pipe_names: Optional[List[str]] = None,
        gpu_worker_devices: Optional[List[Union[torch.device, str]]] = None,
        cpu_worker_devices: Optional[List[Union[torch.device, str]]] = None,
    ):
        # Simply store every setting as-is; consumers read these attributes.
        self.batch_size = batch_size
        self.num_cpu_workers = num_cpu_workers
        self.num_gpu_workers: Optional[int] = num_gpu_workers
        self.gpu_pipe_names = gpu_pipe_names
        self.gpu_worker_devices = gpu_worker_devices
        self.cpu_worker_devices = cpu_worker_devices
def validate_kwargs(converter, kwargs):
    """
    Validate ``kwargs`` against the signature of ``converter`` with pydantic.

    Returns the dict of validated/coerced keyword arguments to pass to the
    converter at call time. The converter's first positional parameter (the
    document/dict being converted) is excluded from validation since it is
    supplied later, per item.
    """
    # Shallow-copy the function so the annotation/default mutations below do
    # not leak into the caller's converter object.
    converter: FunctionType = copy(converter)
    spec = inspect.getfullargspec(converter)
    first = spec.args[0]
    # Relax the first parameter: give it an Optional[Any] annotation and a None
    # default so pydantic does not require it among the kwargs.
    converter.__annotations__[first] = Optional[Any]
    converter.__defaults__ = (None, *(spec.defaults or ())[-len(spec.args) + 1 :])
    vd = ValidatedFunction(converter, {"arbitrary_types_allowed": True})
    model = vd.init_model_instance(**kwargs)
    # Keep only declared fields.
    # NOTE(review): the right-hand side of the `or` indexes __fields__ with a
    # key the `in` test just rejected, which would raise KeyError if ever
    # reached — presumably dead in practice; confirm against pydantic's
    # ValidatedFunction model layout.
    d = {
        k: v
        for k, v in model._iter()
        if (k in model.__fields__ or model.__fields__[k].default_factory)
    }
    d.pop("v__duplicate_kwargs", None)  # see pydantic ValidatedFunction code
    d.pop(vd.v_args_name, None)
    d.pop(first, None)
    # Flatten the catch-all **kwargs field back into the top-level dict.
    return {**(d.pop(vd.v_kwargs_name, None) or {}), **d}
def get_doc2dict_converter(converter: Callable, kwargs) -> Tuple[Callable, Dict]:
    """
    Prepare a doc -> dict (writer) converter.

    Validates ``kwargs`` against the converter's signature (see
    ``validate_kwargs``) and returns the converter together with the validated
    keyword arguments to pass at call time.

    Parameters
    ----------
    converter: Callable
        The converter callable. Registry lookup by name is currently disabled
        (kept below for reference).
    kwargs: Dict
        Keyword arguments intended for the converter.

    Returns
    -------
    Tuple[Callable, Dict]
    """
    # Registry-based lookup of named converters, kept for reference:
    # if not callable(converter):
    #     available = edspdf.registry.factory.get_available()
    #     try:
    #         filtered = [
    #             name
    #             for name in available
    #             if converter == name or (converter in name and "doc2dict" in name)
    #         ]
    #         converter = edspdf.registry.factory.get(filtered[0])
    #         converter = converter(**kwargs).instantiate(nlp=None)
    #         kwargs = {}
    #         return converter, kwargs
    #     except (KeyError, IndexError):
    #         available = [v for v in available if "doc2dict" in v]
    #         raise ValueError(
    #             f"Cannot find converter for format {converter}. "
    #             f"Available converters are {', '.join(available)}"
    #         )
    return converter, validate_kwargs(converter, kwargs)
@registry.writers.register("pandas")
def to_pandas(
    data: Union[Any, LazyCollection],
    converter: Optional[Union[str, Callable]] = None,
    dtypes: Optional[dict] = None,
    **kwargs,
) -> pd.DataFrame:
    """
    `edspdf.data.to_pandas` writes a list of documents as a pandas table.

    Example
    -------
    ```{ .python .no-check }

    import edspdf

    nlp = edspdf.blank("eds")
    nlp.add_pipe(...)

    doc = nlp("My document with entities")

    edspdf.data.to_pandas([doc], converter="omop")
    ```

    Parameters
    ----------
    data: Union[Any, LazyCollection]
        The data to write (either a list of documents or a LazyCollection).
    converter: Optional[Union[str, Callable]]
        Converter to use to convert the documents to dictionary objects before
        storing them in the dataframe. If None, documents are written as-is.
    dtypes: Optional[dict]
        Dictionary of column names to dtypes. This is passed to `pd.DataFrame.astype`.
    kwargs:
        Additional keyword arguments passed to the converter. These are documented
        on the [Data schemas](/data/schemas) page.

    Returns
    -------
    pd.DataFrame
    """
    data = LazyCollection.ensure_lazy(data)
    # Only map through a converter when one was given; the writer accepts
    # already-dict-like items otherwise.
    if converter:
        converter, kwargs = get_doc2dict_converter(converter, kwargs)
        data = data.map(converter, kwargs=kwargs)

    return data.write(PandasWriter(dtypes))
class SinusoidalEmbedding(torch.nn.Module):
    """
    Fixed (non-trainable) sinusoidal position-embedding lookup table.

    Channel 2k of an embedding stores ``sin(pos * f_k)`` and channel 2k + 1
    stores ``cos(pos * f_k)``, where the frequencies ``f_k`` decay
    geometrically at a rate set by ``temperature``. Any input position at or
    above ``num_embeddings`` is capped to ``num_embeddings - 1``.
    """

    def __init__(
        self,
        num_embeddings: int,
        embedding_dim: int,
        temperature: float = 10000.0,
    ):
        """
        Parameters
        ----------
        num_embeddings: int
            The maximum number of position embeddings stored in this table
        embedding_dim: int
            The embedding size
        temperature: float
            Controls the range of frequencies used by each channel pair
        """
        super().__init__()

        self.num_embeddings = num_embeddings
        self.embedding_dim = embedding_dim
        self.temperature = temperature

        # Column positions 0, 1, ..., N-1 as a (N, 1) float tensor
        positions = torch.arange(0, num_embeddings, dtype=torch.float).unsqueeze(1)
        # Geometrically decaying frequencies, one per (sin, cos) channel pair
        inv_freq = torch.exp(
            torch.arange(0, embedding_dim, 2).float()
            * (-math.log(temperature) / embedding_dim)
        )
        table = torch.zeros(num_embeddings, embedding_dim)
        table[:, 0::2] = torch.sin(positions * inv_freq)
        table[:, 1::2] = torch.cos(positions * inv_freq)
        # Buffer (not a Parameter): moves with the module but is not trained
        self.register_buffer("weight", table)

    def extra_repr(self) -> str:
        return f"{self.num_embeddings}, {self.embedding_dim}"

    def forward(self, indices: torch.LongTensor):
        """
        Look up the position embeddings for ``indices``.

        Parameters
        ----------
        indices: torch.LongTensor
            Shape: any

        Returns
        -------
        torch.FloatTensor
            Shape: `(*input_shape, embedding_dim)`
        """
        capped = indices.clamp(0, self.num_embeddings - 1)
        return F.embedding(capped, self.weight)
class Vocabulary(torch.nn.Module, Generic[T]):
    """
    Bidirectional mapping between hashable items and integer indices.

    Subclassing `torch.nn.Module` is purely cosmetic: it makes vocabulary
    instances appear when printing a model; this is not meant to be used as a
    regular module.
    """

    def __init__(self, items: Sequence[T] = None, default: int = -100):
        """
        Parameters
        ----------
        items: Sequence[InputT]
            Optional initial vocabulary elements. Their position fixes their
            index, which is handy to pin special elements (padding, unk, ...)
            to known slots.
        default: int
            Index returned for out-of-vocabulary elements outside of
            initialization mode. Defaults to -100.
        """
        super().__init__()
        if items is None:
            self.indices = {}
        else:
            self.indices = dict(zip(items, range(len(items))))
        self.initialized = True
        self.default = default

    def __len__(self):
        return len(self.indices)

    @contextlib.contextmanager
    def initialization(self):
        """
        Context manager enabling initialization mode: unknown elements are
        added to the vocabulary instead of being mapped to the default index.
        """
        self.initialized = False
        yield
        self.initialized = True

    def encode(self, item):
        """
        Return the vocabulary index of ``item``.

        In initialization mode (`with vocab.initialization(): ...`), unseen
        items are assigned the next free index. Otherwise, unseen items map to
        ``self.default``.

        Parameters
        ----------
        item: InputT

        Returns
        -------
        int
        """
        if not self.initialized:
            return self.indices.setdefault(item, len(self.indices))
        return self.indices.get(item, self.default)

    def decode(self, idx):
        """
        Return the element stored at index ``idx`` (None for negative indices).

        Parameters
        ----------
        idx: int

        Returns
        -------
        InputT
        """
        if idx < 0:
            return None
        return list(self.indices.keys())[idx]

    def extra_repr(self):
        return f"n={len(self.indices)}"
@registry.factory.register("dummy-classifier")
class DummyClassifier:
    """
    Dummy classifier that assigns one constant label to every line.

    Parameters
    ----------
    pipeline: Pipeline
        The pipeline object.
    name: str
        The name of the component.
    label: str
        The label to assign to each line.
    """

    def __init__(
        self,
        label: str,
        pipeline: Pipeline = None,
        name: str = "dummy-classifier",
    ) -> None:
        self.name = name
        self.label = label

    def __call__(self, doc: PDFDoc) -> PDFDoc:
        # Every content box gets the same constant label.
        constant = self.label
        for box in doc.content_boxes:
            box.label = constant
        return doc
@registry.factory.register("random-classifier")
class RandomClassifier:
    """
    Random classifier, for chaos purposes. Classifies each box to a random element.

    Parameters
    ----------
    pipeline: Pipeline
        The pipeline object.
    name: str
        The name of the component.
    labels: Union[List[str], Dict[str, float]]
        The labels to assign to each line. If a list is passed, each label is
        assigned with equal probability. If a dict is passed, the keys are the
        labels and the values are the (unnormalized) probabilities.
    seed: Optional[int]
        Seed of the random number generator.
    """

    def __init__(
        self,
        pipeline: Pipeline,
        labels: Union[List[str], Dict[str, float]],
        seed: Optional[int] = 0,
        name: str = "random-classifier",
    ) -> None:
        super().__init__()

        # A plain list means uniform weights
        if isinstance(labels, list):
            labels = {label: 1 for label in labels}

        # Normalize weights into a probability distribution
        total = sum(labels.values())
        self.labels = {label: weight / total for label, weight in labels.items()}

        self.rgn = np.random.default_rng(seed=seed)

    def __call__(self, doc: PDFDoc) -> PDFDoc:
        boxes = doc.content_boxes
        # Draw one label per box in a single vectorized call
        drawn = self.rgn.choice(
            list(self.labels.keys()),
            p=list(self.labels.values()),
            size=len(boxes),
        )
        for box, label in zip(boxes, drawn):
            box.label = label
        return doc
@registry.factory.register("box-layout-embedding")
class BoxLayoutEmbedding(TrainablePipe[EmbeddingOutput]):
    """
    This component encodes the geometrical features of a box, as extracted by the
    BoxLayoutPreprocessor module, into an embedding. For position modes, use:

    - `"sin"` to embed positions with a fixed
      [SinusoidalEmbedding][edspdf.layers.sinusoidal_embedding.SinusoidalEmbedding]
    - `"learned"` to embed positions using a learned standard pytorch embedding layer

    Each produced embedding is the concatenation of the box width, height and the top,
    left, bottom and right coordinates, each embedded depending on the `*_mode` param.

    Parameters
    ----------
    size: int
        Size of the output box embedding
    n_positions: int
        Number of position embeddings stored in the PositionEmbedding module
    x_mode: Literal["sin", "learned"]
        Position embedding mode of the x coordinates
    y_mode: Literal["sin", "learned"]
        Position embedding mode of the y coordinates
    w_mode: Literal["sin", "learned"]
        Position embedding mode of the width features
    h_mode: Literal["sin", "learned"]
        Position embedding mode of the height features
    """

    def __init__(
        self,
        n_positions: int,
        size: int,
        x_mode: Literal["sin", "learned"] = "sin",
        y_mode: Literal["sin", "learned"] = "sin",
        w_mode: Literal["sin", "learned"] = "sin",
        h_mode: Literal["sin", "learned"] = "sin",
        pipeline: Pipeline = None,
        name: str = "box-layout-embedding",
    ):
        super().__init__(pipeline, name)

        # Six features (x0, y0, x1, y1, w, h) each get size // 6 channels; the
        # factor 12 guarantees size // 6 is even, as required by the sin/cos
        # channel pairing of SinusoidalEmbedding.
        assert size % 12 == 0, "Size must be a multiple of 12"

        self.n_positions = n_positions
        self.output_size = size

        self.x_embedding = self._make_embed(n_positions, size // 6, x_mode)
        self.y_embedding = self._make_embed(n_positions, size // 6, y_mode)
        self.w_embedding = self._make_embed(n_positions, size // 6, w_mode)
        self.h_embedding = self._make_embed(n_positions, size // 6, h_mode)
        # Learned markers added to every box of the first / last page
        self.first_page_embedding = torch.nn.Parameter(torch.randn(size))
        self.last_page_embedding = torch.nn.Parameter(torch.randn(size))

        self.box_preprocessor = BoxLayoutPreprocessor(pipeline, "box_preprocessor")

    def preprocess(self, doc):
        # Feature extraction is delegated to the shared BoxLayoutPreprocessor
        return self.box_preprocessor.preprocess(doc)

    def collate(self, batch) -> BoxLayoutBatch:
        # Collation is delegated to the shared BoxLayoutPreprocessor
        return self.box_preprocessor.collate(batch)

    @classmethod
    def _make_embed(cls, n_positions, size, mode):
        # Factory for one positional table: fixed sinusoidal or learned lookup
        if mode == "sin":
            return SinusoidalEmbedding(n_positions, size)
        else:
            return torch.nn.Embedding(n_positions, size)

    def forward(self, batch: BoxLayoutBatch) -> EmbeddingOutput:
        # Coordinates are scaled to integer bucket indices in
        # [0, n_positions - 1] before each table lookup.
        # NOTE(review): "height" is scaled by an extra factor of 5 while
        # "width" is not — presumably because text lines are much wider than
        # tall; confirm this constant.
        # fmt: off
        embedding = (
            torch.cat(
                [
                    self.x_embedding((batch["xmin"] * self.n_positions).clamp(max=self.n_positions - 1).long()),  # noqa: E501
                    self.y_embedding((batch["ymin"] * self.n_positions).clamp(max=self.n_positions - 1).long()),  # noqa: E501
                    self.x_embedding((batch["xmax"] * self.n_positions).clamp(max=self.n_positions - 1).long()),  # noqa: E501
                    self.y_embedding((batch["ymax"] * self.n_positions).clamp(max=self.n_positions - 1).long()),  # noqa: E501
                    self.w_embedding((batch["width"] * self.n_positions).clamp(max=self.n_positions - 1).long()),  # noqa: E501
                    self.h_embedding((batch["height"] * 5 * self.n_positions).clamp(max=self.n_positions - 1).long()),  # noqa: E501
                ],
                dim=-1,
            )
            + self.first_page_embedding * batch["first_page"][..., None]
            + self.last_page_embedding * batch["last_page"][..., None]
        )
        # fmt: on
        return {"embeddings": embedding}
@registry.factory.register("box-layout-preprocessor")
class BoxLayoutPreprocessor(TrainablePipe[BoxLayoutBatch]):
    """
    The box preprocessor is a singleton since it is not configurable.
    The following features of each text box of an input PDFDoc document are
    encoded as per-page lists (then collated into FoldedTensors):

    - `xmin`: left position of the box
    - `ymin`: top position of the box
    - `xmax`: right position of the box
    - `ymax`: bottom position of the box
    - `width`: width of the box
    - `height`: height of the box
    - `first_page`: is the box on the first page
    - `last_page`: is the box on the last page
    """

    INSTANCE = None

    def __new__(cls, *args, **kwargs):
        # Singleton: every component shares the same preprocessor instance,
        # since there is nothing to configure.
        if BoxLayoutPreprocessor.INSTANCE is None:
            BoxLayoutPreprocessor.INSTANCE = super().__new__(cls)
        return BoxLayoutPreprocessor.INSTANCE

    def __init__(
        self,
        pipeline: Pipeline = None,
        name: str = "box-layout-preprocessor",
    ):
        super().__init__(pipeline, name)

    def preprocess(self, doc: PDFDoc, supervision: bool = False):
        """
        Extract nested (per page, per box) lists of layout features from `doc`.

        Parameters
        ----------
        doc: PDFDoc
            The document to preprocess
        supervision: bool
            Unused; kept for the TrainablePipe preprocess interface

        Returns
        -------
        Dict[str, List[List]]
        """
        pages = doc.pages
        last_p = doc.num_pages - 1
        return {
            "xmin": [[b.x0 for b in p.text_boxes] for p in pages],
            "ymin": [[b.y0 for b in p.text_boxes] for p in pages],
            "xmax": [[b.x1 for b in p.text_boxes] for p in pages],
            "ymax": [[b.y1 for b in p.text_boxes] for p in pages],
            "width": [[(b.x1 - b.x0) for b in p.text_boxes] for p in pages],
            "height": [[(b.y1 - b.y0) for b in p.text_boxes] for p in pages],
            "first_page": [[b.page_num == 0 for b in p.text_boxes] for p in pages],
            "last_page": [[b.page_num == last_p for b in p.text_boxes] for p in pages],
        }

    def collate(self, batch) -> BoxLayoutBatch:
        """Collate the nested feature lists into FoldedTensors indexed by line."""
        kw = {
            "full_names": ["sample", "page", "line"],
            "data_dims": ["line"],
        }

        return {
            "xmin": as_folded_tensor(batch["xmin"], dtype=torch.float, **kw),
            "ymin": as_folded_tensor(batch["ymin"], dtype=torch.float, **kw),
            "xmax": as_folded_tensor(batch["xmax"], dtype=torch.float, **kw),
            "ymax": as_folded_tensor(batch["ymax"], dtype=torch.float, **kw),
            "width": as_folded_tensor(batch["width"], dtype=torch.float, **kw),
            "height": as_folded_tensor(batch["height"], dtype=torch.float, **kw),
            "first_page": as_folded_tensor(batch["first_page"], dtype=torch.bool, **kw),
            "last_page": as_folded_tensor(batch["last_page"], dtype=torch.bool, **kw),
        }

    def forward(self, *args, **kwargs) -> Dict[str, Any]:
        # This pipe only preprocesses/collates; it has no forward computation.
        raise NotImplementedError()
@registry.factory.register("box-transformer")
class BoxTransformer(TrainablePipe[EmbeddingOutput]):
    """
    BoxTransformer using
    [BoxTransformerModule][edspdf.layers.box_transformer.BoxTransformerModule]
    under the hood.

    !!! note

        This module is a [TrainablePipe][edspdf.trainable_pipe.TrainablePipe]
        and can be used in a [Pipeline][edspdf.pipeline.Pipeline], while
        [BoxTransformerModule][edspdf.layers.box_transformer.BoxTransformerModule]
        is a standard PyTorch module, which does not take care of the
        preprocessing, collating, etc. of the input documents.

    Parameters
    ----------
    embedding: TrainablePipe[EmbeddingOutput]
        Embedding pipe producing the input box embeddings; its `output_size`
        fixes the transformer's input (and output) size
    pipeline: Pipeline
        Pipeline instance
    name: str
        Name of the component
    num_heads: int
        Number of attention heads in the attention layers
    n_relative_positions: int
        Maximum range of embeddable relative positions between boxes (further
        distances are capped to ±n_relative_positions // 2)
    dropout_p: float
        Dropout probability both for the attention layers and embedding projections
    head_size: int
        Head sizes of the attention layers
    activation: ActivationFunction
        Activation function used in the linear->activation->linear transformations
    init_resweight: float
        Initial weight of the residual gates.
        At 0, the layer acts (initially) as an identity function, and at 1 as
        a standard Transformer layer.
        Initializing with a value close to 0 can help the training converge.
    attention_mode: Sequence[RelativeAttentionMode]
        Mode of relative position infused attention layer.
        See the [relative attention][edspdf.layers.relative_attention.RelativeAttention]
        documentation for more information.
    n_layers: int
        Number of layers in the Transformer
    """

    def __init__(
        self,
        embedding: TrainablePipe[EmbeddingOutput],
        num_heads: int = 2,
        dropout_p: float = 0.0,
        head_size: Optional[int] = None,
        activation: ActivationFunction = "gelu",
        init_resweight: float = 0.0,
        n_relative_positions: Optional[int] = None,
        attention_mode: Sequence[Literal["c2c", "c2p", "p2c"]] = ("c2c", "c2p", "p2c"),
        n_layers: int = 2,
        pipeline: Pipeline = None,
        name: str = "box-transformer",
    ):
        super().__init__(pipeline, name)
        self.embedding = embedding
        # The inner torch module does the actual attention computation
        self.transformer = BoxTransformerModule(
            input_size=embedding.output_size,
            num_heads=num_heads,
            dropout_p=dropout_p,
            head_size=head_size,
            activation=activation,
            init_resweight=init_resweight,
            n_relative_positions=n_relative_positions,
            attention_mode=attention_mode,
            n_layers=n_layers,
        )
        # Residual architecture: output size equals the input embedding size
        self.output_size = embedding.output_size
        self.box_prep = BoxLayoutPreprocessor(pipeline, f"{name}.box_prep")

    def forward(
        self,
        batch: BoxTransformerEmbeddingInputBatch,
    ) -> EmbeddingOutput:
        # First run the sub-embedding pipe, then contextualize its output
        # with the transformer using the box layout features.
        res = self.embedding.module_forward(batch["embedding"])
        assert (
            "lengths" not in res
        ), "You must pool a SubBoxEmbedding output before using BoxTransformer"
        return {
            "embeddings": self.transformer(res["embeddings"], batch["box_prep"]),
        }
def __init__(
    self,
    dropout_p: float = 0.0,
    mode: Literal["sum", "cat"] = "sum",
    pipeline: Pipeline = None,
    name: str = "embedding-combiner",
    **encoders: TrainablePipe[EmbeddingOutput],
):
    """
    Encodes boxes using a combination of multiple encoders

    Parameters
    ----------
    pipeline: Pipeline
        The pipeline object
    name: str
        The name of the pipe
    mode: Literal["sum", "cat"]
        The mode to use to combine the encoders:

        - `sum`: Sum the outputs of the encoders
        - `cat`: Concatenate the outputs of the encoders
    dropout_p: float
        Dropout probability used on the output of the box and textual encoders
    encoders: Dict[str, TrainablePipe[EmbeddingOutput]]
        The encoders to use. The keys are the names of the encoders and the values
        are the encoders themselves.
    """
    super().__init__(pipeline, name)

    # Register each encoder as a submodule under its keyword name.
    # Fix: use a dedicated loop variable instead of reusing (and shadowing)
    # the `name` parameter, as the original code did.
    for encoder_name, encoder in encoders.items():
        setattr(self, encoder_name, encoder)

    self.mode = mode

    # Summing only makes sense if every encoder emits vectors of the same size.
    assert (
        mode != "sum"
        or len(set(encoder.output_size for encoder in encoders.values())) == 1
    ), (
        "All encoders must have the same output size when using 'sum' "
        "combination:\n{}".format(
            "\n".join(
                "- {}: {}".format(encoder_name, encoder.output_size)
                for encoder_name, encoder in encoders.items()
            )
        )
    )

    self.dropout = torch.nn.Dropout(dropout_p)
    # "cat" concatenates along the feature dim, so sizes add up; "sum"
    # preserves the (shared) encoder output size.
    self.output_size = (
        sum(encoder.output_size for encoder in encoders.values())
        if mode == "cat"
        else next(iter(encoders.values())).output_size
    )
torch.cat(all_embeds, dim=-1) 76 | ) 77 | return {"embeddings": embeddings} # type: ignore 78 | -------------------------------------------------------------------------------- /edspdf/pipes/embeddings/sub_box_cnn_pooler.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Optional, Sequence 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | from foldedtensor import as_folded_tensor 6 | 7 | from edspdf.pipeline import Pipeline 8 | from edspdf.pipes.embeddings import EmbeddingOutput, TrainablePipe 9 | from edspdf.registry import registry 10 | from edspdf.utils.torch import ActivationFunction, get_activation_function 11 | 12 | 13 | @registry.factory.register("sub-box-cnn-pooler") 14 | class SubBoxCNNPooler(TrainablePipe[EmbeddingOutput]): 15 | """ 16 | One dimension CNN encoding multi-kernel layer. 17 | Input embeddings are convoluted using linear kernels each parametrized with 18 | a (window) size of `kernel_size[kernel_i]` 19 | The output of the kernels are concatenated together, max-pooled and finally 20 | projected to a size of `output_size`. 
def __init__(
    self,
    embedding: TrainablePipe[EmbeddingOutput],
    pipeline: Pipeline = None,
    name: str = "sub-box-cnn-pooler",
    output_size: Optional[int] = None,
    out_channels: Optional[int] = None,
    kernel_sizes: Sequence[int] = (3, 4, 5),
    activation: ActivationFunction = "relu",
):
    """
    One dimension CNN encoding multi-kernel layer.

    Input embeddings are convoluted using linear kernels, each parametrized
    with a (window) size of `kernel_size[kernel_i]`. The outputs of the
    kernels are concatenated together, max-pooled and finally projected to
    a size of `output_size`.

    Parameters
    ----------
    embedding: TrainablePipe[EmbeddingOutput]
        The word-level embedding to pool over
    pipeline: Pipeline
        Pipeline instance
    name: str
        Name of the component
    output_size: Optional[int]
        Size of the output embeddings; defaults to `input_size`
    out_channels: Optional[int]
        Number of channels; defaults to `input_size`
    kernel_sizes: Sequence[int]
        Window size of each kernel
    activation: ActivationFunction
        Activation function to use
    """
    super().__init__(pipeline, name)

    self.activation_fn = get_activation_function(activation)

    self.embedding = embedding
    input_size = self.embedding.output_size
    out_channels = input_size if out_channels is None else out_channels
    # BUG FIX: the original read
    #   output_size = input_size if output_size is None else input_size
    # which silently ignored a user-provided `output_size`.
    output_size = input_size if output_size is None else output_size

    self.convolutions = torch.nn.ModuleList(
        torch.nn.Conv1d(
            in_channels=input_size,
            out_channels=out_channels,
            kernel_size=kernel_size,
            padding=0,
        )
        for kernel_size in kernel_sizes
    )
    # Kernel outputs are concatenated before the projection, hence
    # in_features = out_channels * number of kernels.
    self.linear = torch.nn.Linear(
        in_features=out_channels * len(kernel_sizes),
        out_features=output_size,
    )
    self.output_size = output_size

def forward(self, batch: Any) -> EmbeddingOutput:
    """
    Pool word-level embeddings into one vector per line.

    Returns
    -------
    EmbeddingOutput
        Folded tensor of line embeddings, flattened on the "line" dim.
    """
    embeddings = self.embedding.module_forward(batch["embedding"])[
        "embeddings"
    ].refold("line", "word")
    # Empty batch: return a correctly-shaped empty folded tensor instead of
    # running the convolutions on a zero-sized input.
    if 0 in embeddings.shape:
        return {
            "embeddings": as_folded_tensor(
                data=torch.zeros(0, self.output_size, device=embeddings.device),
                lengths=embeddings.lengths[:-1],  # pooled on the last dim
                data_dims=["line"],  # fully flattened
                full_names=["sample", "page", "line"],
            )
        }

    # sample word dim -> sample dim word (Conv1d expects channels second)
    box_token_embeddings = embeddings.as_tensor().permute(0, 2, 1)
    box_token_embeddings = torch.cat(
        [
            self.activation_fn(
                conv(
                    # pad by the appropriate amount on both sides of each
                    # sentence so the output length matches the input length
                    F.pad(
                        box_token_embeddings,
                        pad=[
                            conv.kernel_size[0] // 2,
                            (conv.kernel_size[0] - 1) // 2,
                        ],
                    )
                )
                .permute(0, 2, 1)
                # zero-out positions that are padding, so max-pooling below
                # cannot pick values computed from padding
                .masked_fill(~embeddings.mask.unsqueeze(-1), 0)
            )
            for conv in self.convolutions
        ],
        dim=2,
    )
    # Max-pool over the word dimension, then project to output_size
    pooled = box_token_embeddings.max(1).values
    pooled = self.linear(pooled)

    return {
        "embeddings": as_folded_tensor(
            data=pooled,
            lengths=embeddings.lengths[:-1],  # pooled on the last dim
            data_dims=["line"],  # fully flattened
            full_names=["sample", "page", "line"],
        )
    }
def execute_simple_backend(
    lc: LazyCollection,
):
    """
    This is the default execution mode which batches the documents and processes each
    batch on the current process in a sequential manner.

    Documents are read from ``lc.reader``, grouped into chunks of
    ``lc.chunk_size``, optionally sorted within each chunk, re-batched
    according to ``lc.batch_by`` and run through the pipeline. Results are
    either written through ``lc.writer`` or yielded (flattened) to the caller.
    """
    # Use torch.no_grad if torch has already been imported somewhere in the
    # process; otherwise fall back to a no-op context so torch stays optional.
    try:
        no_grad = sys.modules["torch"].no_grad
    except (KeyError, AttributeError):
        no_grad = nullcontext
    reader = lc.reader
    writer = lc.writer
    show_progress = lc.show_progress

    # Decide where the pipeline is split between "chunk" components (applied
    # on whole chunks) and "batch" components (applied batch by batch). If not
    # set explicitly, split before the first named component whenever batching
    # is not plain doc-count batching or chunks must be sorted.
    split_into_batches_after = lc.split_into_batches_after
    if split_into_batches_after is None and (lc.batch_by != "docs" or lc.sort_chunks):
        split_into_batches_after = next(
            (s[0] for s in lc.pipeline if s[0] is not None), None
        )
    # `None` heads the list so that names.index(None) -> 0 (split at the start)
    names = [None] + [step[0] for step in lc.pipeline]
    chunk_components = lc.pipeline[: names.index(split_into_batches_after)]
    batch_components = lc.pipeline[names.index(split_into_batches_after) :]

    def process():
        # Progress bar is a no-op context unless requested
        bar = nullcontext()
        if show_progress:
            from tqdm import tqdm

            bar = tqdm(smoothing=0.1, mininterval=5.0)

        with bar, lc.eval():
            # Expand each main-reader task through the worker reader, then
            # group the resulting docs into chunks of `chunk_size`.
            # NOTE(review): the per-task `count` from read_main is ignored here.
            for docs in batchify(
                (
                    subtask
                    for task, count in reader.read_main()
                    for subtask in reader.read_worker([task])
                ),
                batch_size=lc.chunk_size,
            ):
                docs = apply_basic_pipes(docs, chunk_components)

                # Sorting a chunk by doc size yields more homogeneous batches
                # (useful for padding efficiency in deep-learning pipes)
                if lc.sort_chunks:
                    docs.sort(
                        key=doc_size_fns.get(
                            lc.sort_chunks, doc_size_fns["content_boxes"]
                        )
                    )

                for batch in batchify_fns[lc.batch_by](docs, lc.batch_size):
                    count = len(batch)
                    # Run the heavy components without gradients and with the
                    # pipeline cache enabled for the duration of the batch
                    with no_grad(), lc.cache():
                        batch = apply_basic_pipes(batch, batch_components)

                    if writer is not None:
                        # The writer may transform the batch and report how
                        # many items were actually written
                        result, count = writer.write_worker(batch)
                        if show_progress:
                            bar.update(count)
                        yield result
                    else:
                        if show_progress:
                            bar.update(count)
                        yield batch
            # Flush any buffered output once all batches are done
            if writer is not None:
                result, count = writer.finalize()
                if show_progress:
                    bar.update(count)
                if count:
                    yield result

    gen = process()
    # Without a writer, hand the documents back as a flat iterable; with one,
    # let the writer aggregate the worker results.
    return flatten(gen) if writer is None else writer.write_main(gen)
def _align_box_labels_on_page(
    src_boxes: Sequence[Box],
    dst_boxes: Sequence[Box],
    threshold: float = 0.0001,
    pollution_label: Any = None,
):
    # Assign each dst box the label of the src box that best overlaps it
    # (single page). Returns a list of relabelled copies of dst_boxes.
    if len(src_boxes) == 0 or len(dst_boxes) == 0:
        return []

    # Factorize labels into integer codes. The pollution label is appended so
    # that the sentinel box added below maps to it.
    src_labels, label_vocab = list_factorize(
        [b.label for b in src_boxes] + [pollution_label]
    )
    src_labels = np.asarray(src_labels)

    # Coordinates are transposed to broadcastable column/row vectors. A
    # virtual sentinel box covering (-INF, INF) on both axes is appended to
    # src so that every dst box overlaps at least one src box.
    src_x0, src_x1, src_y0, src_y1 = np.asarray(
        [(b.x0, b.x1, b.y0, b.y1) for b in src_boxes] + [(-INF, INF, -INF, INF)]
    ).T[:, :, None]
    dst_x0, dst_x1, dst_y0, dst_y1 = np.asarray(
        [(b.x0, b.x1, b.y0, b.y1) for b in dst_boxes]
    ).T[:, None, :]

    # src_x0 has shape (n_src_boxes, 1)
    # dst_x0 has shape (1, n_dst_boxes)

    dx = np.minimum(src_x1, dst_x1) - np.maximum(src_x0, dst_x0)  # shape: n_src, n_dst
    dy = np.minimum(src_y1, dst_y1) - np.maximum(src_y0, dst_y0)  # shape: n_src, n_dst

    # Negative dx/dy mean no overlap on that axis, hence the clipping
    overlap = np.clip(dx, 0, None) * np.clip(dy, 0, None)  # shape: n_src, n_dst
    src_area = (src_x1 - src_x0) * (src_y1 - src_y0)  # shape: n_src
    dst_area = (dst_x1 - dst_x0) * (dst_y1 - dst_y0)  # shape: n_dst

    # To remove errors for 0 divisions
    src_area[src_area == 0] = 1
    dst_area[dst_area == 0] = 1

    covered_src_ratio = overlap / src_area  # shape: n_src, n_dst
    covered_dst_ratio = overlap / dst_area  # shape: n_src, n_dst

    # Score each (src, dst) pair by how much of the src box is covered, but
    # discard pairs that cover less than `threshold` of the dst box.
    score = covered_src_ratio
    score[covered_dst_ratio < threshold] = 0.0

    # For each dst box, pick the best-scoring src box. When every real src
    # score was zeroed out, the sentinel's tiny-but-positive score wins and
    # the dst box receives `pollution_label`.
    src_indices = score.argmax(0)
    dst_labels = src_labels[src_indices]

    new_dst_boxes = [
        b.evolve(label=label_vocab[label_idx])
        for b, label_idx in zip(dst_boxes, dst_labels)
        # if label_vocab[label_idx] != "__pollution__"
    ]
    return new_dst_boxes
def walk_match(
    fs: FileSystem,
    root: str,
    file_pattern: str,
) -> list:
    """
    Recursively walk `root` on the given filesystem and return the full paths
    of all files whose basename matches `file_pattern`.
    """
    pattern = re.compile(file_pattern)
    matched = []
    for dirpath, _dirnames, filenames in fs.walk(root):
        for filename in filenames:
            # `match` anchors at the start of the basename, as the original did
            if pattern.match(filename):
                matched.append(os.path.join(dirpath, filename))
    return matched
def lazify():
    # Turns the calling module into a lazy module: names listed in its
    # `if TYPE_CHECKING:` imports are only actually imported on first access,
    # via a module-level __getattr__ (PEP 562).
    def _get_module_paths(file):
        """
        Reads the content of the current file, parses it with ast and store the
        import path for future potential imports. This is useful to only import
        the module that is requested and avoid loading all the modules at once, since
        some of them are quite heavy, or contain dependencies that are not always
        available.

        For instance:
        > from .trainable.span_qualifier.factory import create_component as
        span_qualifier is stored in the cache as:
        > module_paths["span_qualifier"] = "trainable.span_qualifier.factory"

        Returns
        -------
        Dict[str, Tuple[str, str]]
            Mapping from each lazily exported name to the (module path,
            original attribute name) pair it should be imported from.
        """
        module_path = os.path.abspath(file)
        with open(module_path, "r") as f:
            module_content = f.read()
        module_ast = ast.parse(module_content)
        module_paths = {}
        for node in module_ast.body:
            # Lookup TYPE_CHECKING guards only: `if TYPE_CHECKING:` or
            # `if typing.TYPE_CHECKING:` — everything else is skipped
            if not (
                isinstance(node, ast.If)
                and (
                    (
                        isinstance(node.test, ast.Name)
                        and node.test.id == "TYPE_CHECKING"
                    )
                    or (
                        isinstance(node.test, ast.Attribute)
                        and node.test.attr == "TYPE_CHECKING"
                    )
                )
            ):
                continue
            for import_node in node.body:
                if isinstance(import_node, ast.ImportFrom):
                    for name in import_node.names:
                        # Respect `as` aliases: the exported key is the alias
                        module_paths[name.asname or name.name] = (
                            import_node.module,
                            name.name,
                        )

        return module_paths

    def __getattr__(name):
        """
        Imports the actual module if it is in the module_paths dict.

        Parameters
        ----------
        name: str
            Attribute requested on the lazified module.

        Returns
        -------
        Any
            The lazily imported attribute; raises AttributeError otherwise.
        """
        if name in module_paths:
            module_path, module_name = module_paths[name]
            # level=1 performs a relative import from the lazified package
            # (module_globals supplies the package context)
            result = getattr(
                importlib.__import__(
                    module_path,
                    fromlist=[module_name],
                    globals=module_globals,
                    level=1,
                ),
                module_name,
            )
            # Cache the resolved attribute so __getattr__ is only hit once
            module_globals[name] = result
            return result
        raise AttributeError(f"module {__name__} has no attribute {name}")

    def __dir__():
        """
        Returns the list of available modules.

        Returns
        -------
        List[str]
        """
        return __all__

    # Access upper frame: the globals of the module that called lazify()
    module_globals = inspect.currentframe().f_back.f_globals

    module_paths = _get_module_paths(module_globals["__file__"])

    __all__ = list(module_paths.keys())

    # Install the lazy hooks into the calling module (PEP 562)
    module_globals.update(
        {
            "__getattr__": __getattr__,
            "__dir__": __dir__,
            "__all__": __all__,
        }
    )
class LinearSchedule:
    """
    Linearly warms a parameter of an optimizer group up from `start_value` to
    `max_value`, then decays it linearly down to 0 over the remaining steps.

    Parameters
    ----------
    total_steps: int
        Total number of scheduled steps (warmup + decay).
    max_value: Optional[float]
        Peak value. If None, it is read from the group's current value at
        `path` on the first `step` call.
    start_value: float
        Value at the very first warmup step.
    path: str
        Dotted path of the scheduled entry inside the param group (e.g. "lr").
    warmup: bool
        Whether to apply the warmup phase. Fix: the original accepted this
        flag but silently ignored it; it is now honored (the default `True`
        preserves the previous behavior).
    warmup_rate: float
        Fraction of `total_steps` spent warming up.
    """

    def __init__(
        self,
        total_steps,
        max_value=None,
        start_value=0.0,
        path="lr",
        warmup=True,
        warmup_rate=0.1,
    ):
        self.path = path
        self.start_value = start_value
        self.max_value = max_value
        self.warmup = warmup
        self.warmup_rate = warmup_rate
        self.total_steps = total_steps
        self.idx = 0

    def state_dict(self):
        # Only the step counter is stateful; everything else is configuration.
        return {
            "idx": self.idx,
        }

    def load_state_dict(self, state):
        self.idx = state["idx"]

    def step(self, group, closure=None):
        # Lazily capture the peak value from the group on the first call
        if self.max_value is None:
            self.max_value = get_deep_attr(group, self.path)
        # A disabled warmup is equivalent to a zero-length warmup phase
        warmup_steps = self.total_steps * self.warmup_rate if self.warmup else 0
        if self.idx < warmup_steps:
            # Linear ramp from start_value up to max_value
            progress = self.idx / warmup_steps
            value = self.start_value + (self.max_value - self.start_value) * progress
        else:
            # Linear decay from max_value down to 0
            progress = (self.idx - warmup_steps) / (self.total_steps - warmup_steps)
            value = self.max_value + (0 - self.max_value) * progress
        self.idx += 1
        set_deep_attr(group, self.path, value)
RandomGeneratorState = namedtuple(
    "RandomGeneratorState", ["random", "torch", "numpy", "torch_cuda"]
)


def get_random_generator_state(cuda=torch.cuda.is_available()):
    """
    Snapshot the `random`, `torch` and `numpy` random generator states.

    Parameters
    ----------
    cuda: bool
        Also snapshot the CUDA generator states

    Returns
    -------
    RandomGeneratorState
    """
    cuda_state = torch.cuda.get_rng_state_all() if cuda else None
    return RandomGeneratorState(
        random.getstate(),
        torch.random.get_rng_state(),
        np.random.get_state(),
        cuda_state,
    )


def set_random_generator_state(state):
    """
    Restore the `random`, `torch` and `numpy` generator states captured by
    `get_random_generator_state`.

    Parameters
    ----------
    state: RandomGeneratorState
    """
    random.setstate(state.random)
    torch.random.set_rng_state(state.torch)
    np.random.set_state(state.numpy)
    cuda_state = state.torch_cuda
    # Only restore CUDA states when they were captured on a machine with the
    # same number of visible devices
    if (
        cuda_state is not None
        and torch.cuda.is_available()
        and len(cuda_state) == torch.cuda.device_count()
    ):  # pragma: no cover
        torch.cuda.set_rng_state_all(cuda_state)
def pad_2d(data, pad=0, device=None):
    """
    Right-pad a list of lists with `pad` so every row has the same length,
    and return the result as a 2d tensor on `device`.
    """
    width = max((len(row) for row in data), default=0)
    rectangular = [[*row, *([pad] * (width - len(row)))] for row in data]
    return torch.as_tensor(rectangular, device=device)
class ActivationFunction(str, Enum):
    # String-valued enum: members compare equal to their plain-string names,
    # so they can be passed directly to getattr below (or given as raw
    # strings in configs).
    relu = "relu"
    gelu = "gelu"
    glu = "glu"


def get_activation_function(activation: ActivationFunction):
    # Resolve the activation name to the corresponding torch.nn.functional
    # callable (e.g. "relu" -> torch.nn.functional.relu).
    return getattr(torch.nn.functional, activation)
def show_annotations(
    pdf: bytes,
    annotations: Sequence[Box],
    colors: Optional[Union[Dict[str, str], List[str]]] = None,
) -> List[PpmImageFile]:
    """
    Show Box annotations on a PDF document.

    Parameters
    ----------
    pdf: bytes
        Bytes content of the PDF document
    annotations: Sequence[Box]
        List of Box annotations to show
    colors: Optional[Union[Dict[str, str], List[str]]]
        Colors to use for each label. If a list is provided, it will be used to
        color the first `len(colors)` unique labels. If a dictionary is
        provided, it will be used to color the labels in the dictionary. If
        None, a default color scheme will be used.

    Returns
    -------
    List[PpmImageFile]
        List of PIL images with the annotations. You can display them in a
        notebook with `display(*pages)`.
    """
    document = pdfium.PdfDocument(pdf)
    rendered_pages = [page.render(scale=2).to_pil() for page in document]

    # Labels in first-appearance order, duplicates removed
    unique_labels = list(dict.fromkeys(box.label for box in annotations))

    if colors is None:
        colors = dict(zip(unique_labels, CATEGORY20))
    elif isinstance(colors, list):
        colors = dict(zip(unique_labels, colors))

    for page_num, image in enumerate(rendered_pages):
        img_w, img_h = image.size
        draw = ImageDraw.Draw(image)

        for box in annotations:
            if box.page_num != page_num:
                continue
            # Box coordinates are page-relative (0..1): scale them to pixels
            draw.rectangle(
                [(box.x0 * img_w, box.y0 * img_h), (box.x1 * img_w, box.y1 * img_h)],
                outline=colors[box.label],
                width=3,
            )

    return rendered_pages
114 | """ 115 | if colors is None: 116 | colors = { 117 | **dict.fromkeys([b.label for b in pred]), 118 | **dict.fromkeys([b.label for b in gold]), 119 | } 120 | 121 | pages_pred = show_annotations(pdf, pred, colors) 122 | pages_gold = show_annotations(pdf, gold, colors) 123 | 124 | pages = [] 125 | 126 | for page_pred, page_gold in zip(pages_pred, pages_gold): 127 | array = np.hstack((np.asarray(page_pred), np.asarray(page_gold))) 128 | pages.append(Image.fromarray(array)) 129 | 130 | return pages 131 | -------------------------------------------------------------------------------- /edspdf/visualization/merge.py: -------------------------------------------------------------------------------- 1 | from typing import List, Sequence 2 | 3 | import networkx as nx 4 | import numpy as np 5 | 6 | from edspdf.structures import Box 7 | 8 | INF = 1000000 9 | 10 | 11 | def merge_boxes( 12 | boxes: Sequence[Box], 13 | ) -> List[Box]: 14 | """ 15 | Recursively merge boxes that have the same label to form larger non-overlapping 16 | boxes. 
17 | 18 | Parameters 19 | ---------- 20 | boxes: Sequence[Box] 21 | List of boxes to merge 22 | 23 | Returns 24 | ------- 25 | List[Box] 26 | List of merged boxes 27 | """ 28 | labels = np.asarray([b.label for b in boxes]) 29 | 30 | coords = np.asarray([(b.x0, b.x1, b.y0, b.y1) for b in boxes]) 31 | 32 | # Key that determines if two boxes can be merged, initialized from the box labels 33 | merge_keys = np.unique(labels, return_inverse=True)[1] 34 | 35 | # For each page 36 | while True: 37 | adj = np.zeros((len(boxes), len(boxes)), dtype=bool) 38 | 39 | # Split boxes between those that belong to a label (and could be merged), 40 | # and those that do not belong to that label and will prevent the mergers 41 | for key in np.unique(merge_keys): 42 | key_filter = merge_keys == key 43 | 44 | x0, x1, y0, y1 = coords[key_filter].T 45 | obs_x0, obs_x1, obs_y0, obs_y1 = coords[~key_filter].T 46 | 47 | A = (slice(None), None, None) 48 | B = (None, slice(None), None) 49 | 50 | # Find the bbox of the hypothetical merged boxes 51 | merged_x0 = np.minimum(x0[A], x0[B]) 52 | merged_x1 = np.maximum(x1[A], x1[B]) 53 | merged_y0 = np.minimum(y0[A], y0[B]) 54 | merged_y1 = np.maximum(y1[A], y1[B]) 55 | 56 | # And detect if it overlaps existing box of a different label 57 | dx = np.minimum(merged_x1, obs_x1) - np.maximum(merged_x0, obs_x0) 58 | dy = np.minimum(merged_y1, obs_y1) - np.maximum(merged_y0, obs_y0) 59 | merged_overlap_with_other = (dx > 0) & (dy > 0) 60 | no_box_inbetween = (~merged_overlap_with_other).all(-1) 61 | 62 | # Update the adjacency matrix to 1 if two boxes can be merged 63 | # (ie no box of a different label lie inbetween) 64 | adj_indices = np.flatnonzero(key_filter) 65 | adj[adj_indices[:, None], adj_indices[None, :]] = no_box_inbetween 66 | 67 | # Build the cliques of boxes that can be merged 68 | cliques = nx.find_cliques(nx.from_numpy_array(adj)) 69 | 70 | # These cliques of mergeable boxes can be overlapping: think of a cross 71 | # like this= 72 | # *** 
--- *** 73 | # --- --- --- 74 | # *** --- *** 75 | # for which the two (-) labelled cliques would be the two axis of the cross 76 | # For each box, we change its label to its first clique number, so the cross 77 | # looks like this (symbols between the 2 figures don't map to the same indices) 78 | # *** --- *** 79 | # ooo ooo ooo 80 | # *** --- *** 81 | # and rerun the above process until there is no conflict 82 | 83 | conflicting_cliques = False 84 | seen = set() 85 | for clique_idx, clique_box_indices in enumerate(cliques): 86 | for box_idx in clique_box_indices: 87 | if box_idx in seen: 88 | # print("Already seen", box_idx) 89 | conflicting_cliques = True 90 | else: 91 | seen.add(box_idx) 92 | merge_keys[box_idx] = clique_idx 93 | 94 | if not conflicting_cliques: 95 | break 96 | 97 | x0, x1, y0, y1 = coords.T.reshape((4, -1)) 98 | 99 | # Finally, compute the bbox of the sets of mergeable boxes (same `key`) 100 | merged_boxes = [] 101 | for group_key in dict.fromkeys(merge_keys): 102 | indices = [i for i, key in enumerate(merge_keys) if group_key == key] 103 | first_box = boxes[indices[0]] 104 | merged_boxes.append( 105 | first_box.evolve( 106 | x0=min(x0[i] for i in indices), 107 | y0=min(y0[i] for i in indices), 108 | x1=max(x1[i] for i in indices), 109 | y1=max(y1[i] for i in indices), 110 | ) 111 | ) 112 | 113 | return merged_boxes 114 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: EDS-PDF 2 | 3 | repo_url: https://github.com/aphp/edspdf 4 | # repo_name: algorithms/pseudonymisation 5 | 6 | theme: 7 | name: material 8 | palette: 9 | - scheme: default 10 | toggle: 11 | icon: material/brightness-4 12 | name: Switch to dark mode 13 | - scheme: slate 14 | toggle: 15 | icon: material/brightness-7 16 | name: Switch to light mode 17 | logo: assets/logo/edspdf-white.svg 18 | favicon: assets/logo/edspdf-blue.svg 19 | features: 
20 | - navigation.tracking 21 | - navigation.instant 22 | - navigation.indexes 23 | - navigation.prune 24 | - navigation.top 25 | - content.code.annotate 26 | 27 | nav: 28 | - index.md 29 | - Demo 🚀: https://aphp.github.io/edspdf/demo" target="_blank 30 | - pipeline.md 31 | - configuration.md 32 | - data-structures.md 33 | - trainable-pipes.md 34 | - inference.md 35 | - Recipes: 36 | - recipes/index.md 37 | - recipes/rule-based.md 38 | - recipes/training.md 39 | - recipes/extension.md 40 | - recipes/annotation.md 41 | - Pipes: 42 | - pipes/index.md 43 | - Embeddings: 44 | - pipes/embeddings/index.md 45 | - pipes/embeddings/simple-text-embedding.md 46 | - pipes/embeddings/embedding-combiner.md 47 | - pipes/embeddings/sub-box-cnn-pooler.md 48 | - pipes/embeddings/box-layout-embedding.md 49 | - pipes/embeddings/box-transformer.md 50 | - pipes/embeddings/huggingface-embedding.md 51 | - Extractors: 52 | - pipes/extractors/index.md 53 | - pipes/extractors/pdfminer.md 54 | - MuPDF Extractor: https://aphp.github.io/edspdf-mupdf/latest 55 | - Poppler Extractor: https://aphp.github.io/edspdf-poppler/latest 56 | - Classifiers: 57 | - pipes/box-classifiers/index.md 58 | - pipes/box-classifiers/trainable.md 59 | - pipes/box-classifiers/mask.md 60 | - pipes/box-classifiers/dummy.md 61 | - pipes/box-classifiers/random.md 62 | - Aggregators: 63 | - pipes/aggregators/index.md 64 | - pipes/aggregators/simple-aggregator.md 65 | - Layers: 66 | - layers/index.md 67 | - layers/box-transformer.md 68 | - layers/box-transformer-layer.md 69 | - layers/relative-attention.md 70 | - layers/sinusoidal-embedding.md 71 | - layers/vocabulary.md 72 | - Utilities: 73 | - utilities/index.md 74 | - utilities/visualisation.md 75 | - utilities/alignment.md 76 | - Code Reference: reference/edspdf/ 77 | - alternatives.md 78 | - contributing.md 79 | - changelog.md 80 | - roadmap.md 81 | 82 | extra_css: 83 | - assets/stylesheets/extra.css 84 | - assets/termynal/termynal.css 85 | 86 | extra_javascript: 87 | 
- https://cdn.jsdelivr.net/npm/vega@5 88 | - https://cdn.jsdelivr.net/npm/vega-lite@5 89 | - https://cdn.jsdelivr.net/npm/vega-embed@6 90 | - assets/termynal/termynal.js 91 | 92 | watch: 93 | - contributing.md 94 | - roadmap.md 95 | - changelog.md 96 | - edspdf 97 | - docs/scripts 98 | 99 | 100 | extra: 101 | version: 102 | provider: mike 103 | 104 | hooks: 105 | - docs/scripts/plugin.py 106 | 107 | plugins: 108 | - search 109 | - autorefs: 110 | priority: 111 | - '*' 112 | - reference 113 | 114 | - mkdocstrings: 115 | enable_inventory: true 116 | custom_templates: docs/assets/templates 117 | handlers: 118 | python: 119 | import: 120 | - https://aphp.github.io/edspdf-poppler/latest/objects.inv 121 | - https://aphp.github.io/edspdf-mupdf/latest/objects.inv 122 | options: 123 | docstring_style: numpy 124 | docstring_section_style: spacy 125 | heading_level: 2 126 | members_order: source 127 | show_root_toc_entry: false 128 | show_signature: false 129 | merge_init_into_class: true 130 | - glightbox: 131 | touchNavigation: true 132 | loop: false 133 | effect: none 134 | width: 100% 135 | height: auto 136 | zoomable: true 137 | draggable: true 138 | - bibtex: 139 | bibtex_file: "docs/references.bib" 140 | 141 | - mike 142 | 143 | markdown_extensions: 144 | - admonition 145 | - pymdownx.superfences 146 | - pymdownx.highlight 147 | - pymdownx.inlinehilite 148 | - pymdownx.snippets 149 | - pymdownx.tabbed: 150 | alternate_style: true 151 | - footnotes 152 | - md_in_html 153 | - attr_list 154 | - pymdownx.details 155 | - pymdownx.tasklist: 156 | custom_checkbox: true 157 | - pymdownx.emoji: 158 | emoji_index: !!python/name:materialx.emoji.twemoji 159 | emoji_generator: !!python/name:materialx.emoji.to_svg 160 | 161 | validation: 162 | absolute_links: ignore 163 | -------------------------------------------------------------------------------- /roadmap.md: -------------------------------------------------------------------------------- 1 | # Roadmap 2 | 3 | - [x] Style 
extraction 4 | - [x] Custom hybrid torch-based pipeline & configuration system 5 | - [x] Drop pandas DataFrame in favour of a ~~Cython~~ [attr](https://www.attrs.org/en/stable/) wrapper around PDF documents? 6 | - [x] Add training capabilities with a CLI to automate the annotation/preparation/training loop. 7 | Again, draw inspiration from spaCy, and maybe add the notion of a `TrainableClassifier`... 8 | - [ ] Add complete serialisation capabilities, to save a full pipeline to disk. 9 | Draw inspiration from spaCy, which took great care to solve these issues: 10 | add `save` and `load` methods to every pipeline component 11 | - [ ] Multiple-column extraction 12 | - [ ] Table detector 13 | - [ ] Integrate third-party OCR module 14 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | import pytest 5 | from datasets import Dataset 6 | from pytest import fixture 7 | from utils import nested_approx 8 | 9 | from edspdf import Pipeline 10 | from edspdf.utils.collections import ld_to_dl 11 | 12 | pytest.nested_approx = nested_approx 13 | 14 | TEST_DIR = Path(__file__).parent 15 | 16 | 17 | @pytest.fixture 18 | def change_test_dir(request): 19 | os.chdir(request.fspath.dirname) 20 | yield 21 | os.chdir(request.config.invocation_dir) 22 | 23 | 24 | @fixture(scope="session") 25 | def pdf(): 26 | path = TEST_DIR / "resources" / "test.pdf" 27 | return path.read_bytes() 28 | 29 | 30 | @fixture(scope="session") 31 | def blank_pdf(): 32 | path = TEST_DIR / "resources" / "blank.pdf" 33 | return path.read_bytes() 34 | 35 | 36 | @fixture(scope="session") 37 | def styles_pdf(): 38 | path = TEST_DIR / "resources" / "styles.pdf" 39 | return path.read_bytes() 40 | 41 | 42 | @fixture(scope="session") 43 | def letter_pdf(): 44 | path = TEST_DIR / "resources" / "letter.pdf" 45 | return path.read_bytes() 46 | 47 
| 48 | @fixture(scope="session") 49 | def distant_superscript_pdf(): 50 | path = TEST_DIR / "resources" / "distant-superscript.pdf" 51 | return path.read_bytes() 52 | 53 | 54 | @fixture(scope="session") 55 | def error_pdf(): 56 | path = TEST_DIR / "resources" / "error.pdf" 57 | return path.read_bytes() 58 | 59 | 60 | @fixture(scope="session") 61 | def dummy_dataset(tmpdir_factory, pdf): 62 | tmp_path = tmpdir_factory.mktemp("datasets") 63 | dataset_path = str(tmp_path / "pdf-dataset.hf") 64 | 65 | ds = Dataset.from_dict( 66 | ld_to_dl( 67 | [ 68 | { 69 | "id": str(i), 70 | "content": pdf, 71 | "bboxes": [ 72 | { 73 | "page": 0, 74 | "x0": 0.1, 75 | "y0": 0.1, 76 | "x1": 0.9, 77 | "y1": 0.5, 78 | "label": "first", 79 | "page_width": 20, 80 | "page_height": 30, 81 | }, 82 | { 83 | "page": 0, 84 | "x0": 0.1, 85 | "y0": 0.6, 86 | "x1": 0.9, 87 | "y1": 0.9, 88 | "label": "second", 89 | "page_width": 20, 90 | "page_height": 30, 91 | }, 92 | ], # top half part of the page with margin 93 | } 94 | for i in range(8) 95 | ] 96 | ) 97 | ) 98 | ds.save_to_disk(dataset_path) 99 | return dataset_path 100 | 101 | 102 | @pytest.fixture(scope="session") 103 | def frozen_pipeline(): 104 | model = Pipeline() 105 | model.add_pipe("pdfminer-extractor", name="extractor") 106 | model.add_pipe( 107 | "trainable-classifier", 108 | name="classifier", 109 | config=dict( 110 | embedding={ 111 | "@factory": "box-layout-embedding", 112 | "n_positions": 32, 113 | "size": "48", 114 | }, 115 | labels=["first", "second"], 116 | ), 117 | ) 118 | model.add_pipe("simple-aggregator") 119 | model.post_init([]) 120 | return model 121 | -------------------------------------------------------------------------------- /tests/core/config.cfg: -------------------------------------------------------------------------------- 1 | [pipeline] 2 | pipeline = ["extractor", "classifier"] 3 | disabled = [] 4 | components = ${components} 5 | 6 | [components] 7 | 8 | [components.extractor] 9 | @factory = 
"pdfminer-extractor" 10 | 11 | [components.classifier] 12 | @factory = "trainable-classifier" 13 | labels = [] 14 | 15 | [components.classifier.embedding] 16 | @factory = "box-transformer" 17 | num_heads = 4 18 | dropout_p = 0.1 19 | head_size = 16 20 | activation = "gelu" 21 | init_resweight = 0.01 22 | n_relative_positions = 64 23 | attention_mode = ["c2c", "c2p", "p2c"] 24 | n_layers = 1 25 | 26 | [components.classifier.embedding.embedding] 27 | @factory = "embedding-combiner" 28 | dropout_p = 0.1 29 | 30 | [components.classifier.embedding.embedding.layout_encoder] 31 | @factory = "box-layout-embedding" 32 | n_positions = 64 33 | size = 72 34 | x_mode = "learned" 35 | y_mode = "learned" 36 | w_mode = "learned" 37 | h_mode = "learned" 38 | 39 | [components.classifier.embedding.embedding.text_encoder] 40 | @factory = "sub-box-cnn-pooler" 41 | out_channels = 64 42 | kernel_sizes = [3, 4, 5] 43 | 44 | [components.classifier.embedding.embedding.text_encoder.embedding] 45 | @factory = "simple-text-embedding" 46 | size = 72 47 | -------------------------------------------------------------------------------- /tests/core/test_registry.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from edspdf.pipeline import Pipeline 4 | from edspdf.registry import CurriedFactory, registry 5 | 6 | 7 | def test_misc_register_decorator(): 8 | @registry.misc.register("test-1") 9 | def test_function(param: int = 3): 10 | pass 11 | 12 | assert test_function is registry.misc.get("test-1") 13 | 14 | 15 | def test_misc_register_call(): 16 | def test_function(param: int = 3): 17 | pass 18 | 19 | test_function_2 = registry.misc.register("test", func=test_function) 20 | assert test_function_2 is registry.misc.get("test") 21 | 22 | 23 | def test_factory_default_config(): 24 | @registry.factory.register("custom-test-component-1", default_config={"value": 5}) 25 | class CustomComponent: 26 | def __init__(self, pipeline: "Pipeline", name: 
str, value: int = 3): 27 | self.name = name 28 | self.value = value 29 | 30 | def __call__(self, *args, **kwargs): 31 | return self.value 32 | 33 | registry_result = registry.factory.get("custom-test-component-1")() 34 | assert isinstance(registry_result, CurriedFactory) 35 | 36 | pipeline = Pipeline() 37 | pipeline.add_pipe("custom-test-component-1") 38 | 39 | assert pipeline.get_pipe("custom-test-component-1").value == 5 40 | 41 | 42 | def test_factory_required_arguments(): 43 | with pytest.raises(ValueError) as exc_info: 44 | 45 | @registry.factory.register("custom-test-component-2") 46 | class CustomComponent: 47 | def __init__(self, value: int = 3): 48 | self.value = value 49 | 50 | def __call__(self, *args, **kwargs): 51 | return self.value 52 | 53 | assert "Factory functions must accept pipeline and name as arguments." in str( 54 | exc_info.value 55 | ) 56 | 57 | 58 | def test_missing_component(): 59 | pipeline = Pipeline() 60 | 61 | with pytest.raises(ValueError) as exc_info: 62 | pipeline.add_pipe("missing_custom_test_component") 63 | 64 | assert ( 65 | "Can't find 'missing_custom_test_component' in registry edspdf -> factories." 
66 | in str(exc_info.value) 67 | ) 68 | -------------------------------------------------------------------------------- /tests/core/test_structures.py: -------------------------------------------------------------------------------- 1 | def test_repr(styles_pdf): 2 | from edspdf.pipes.extractors.pdfminer import PdfMinerExtractor 3 | 4 | doc = PdfMinerExtractor(extract_style=True)(styles_pdf) 5 | doc.id = "test" 6 | 7 | for b in doc.content_boxes: 8 | b.x0 = round(b.x0, 2) 9 | b.y0 = round(b.y0, 2) 10 | b.x1 = round(b.x1, 2) 11 | b.y1 = round(b.y1, 2) 12 | 13 | assert repr(doc) == ( 14 | "PDFDoc(content=39476 bytes, id='test', num_pages=0, pages=[Page(page_num=0, " 15 | "width=612, height=792, image=None)], error=False, " 16 | "content_boxes=[TextBox(x0=0.12, x1=0.65, y0=0.09, y1=0.11, label=None, " 17 | "page_num=0, text='This is a test to check EDS-PDF’s ability to detect " 18 | "changing styles.', props=[TextProperties(italic=False, bold=False, begin=0, " 19 | "end=9, fontname='AAAAAA+ArialMT'), TextProperties(italic=False, bold=True, " 20 | "begin=10, end=14, fontname='BAAAAA+Arial-BoldMT'), " 21 | "TextProperties(italic=False, bold=False, begin=15, end=33, " 22 | "fontname='AAAAAA+ArialMT'), TextProperties(italic=True, bold=False, " 23 | "begin=34, end=41, fontname='CAAAAA+Arial-ItalicMT'), " 24 | "TextProperties(italic=False, bold=False, begin=42, end=68, " 25 | "fontname='AAAAAA+ArialMT')]), TextBox(x0=0.12, x1=0.73, y0=0.11, y1=0.13, " 26 | "label=None, page_num=0, text='Let’s up the stakes, with intra-word change. 
" 27 | "Or better yet, this might be hard.', props=[TextProperties(italic=False, " 28 | "bold=False, begin=0, end=25, fontname='AAAAAA+ArialMT'), " 29 | "TextProperties(italic=True, bold=False, begin=26, end=31, " 30 | "fontname='CAAAAA+Arial-ItalicMT'), TextProperties(italic=False, bold=False, " 31 | "begin=31, end=59, fontname='AAAAAA+ArialMT'), TextProperties(italic=False, " 32 | "bold=True, begin=60, end=67, fontname='BAAAAA+Arial-BoldMT'), " 33 | "TextProperties(italic=False, bold=False, begin=67, end=79, " 34 | "fontname='AAAAAA+ArialMT')])], aggregated_texts={})" 35 | ) 36 | -------------------------------------------------------------------------------- /tests/pipes/aggregators/test_simple.py: -------------------------------------------------------------------------------- 1 | from itertools import cycle 2 | 3 | import edspdf 4 | from edspdf.pipes.aggregators.simple import SimpleAggregator 5 | from edspdf.pipes.extractors.pdfminer import PdfMinerExtractor 6 | from edspdf.structures import Page, PDFDoc, TextBox 7 | 8 | 9 | def test_no_style(): 10 | doc = PDFDoc( 11 | content=b"", 12 | pages=[], 13 | ) 14 | doc.pages = [ 15 | Page(doc=doc, page_num=0, width=1, height=1), 16 | Page(doc=doc, page_num=1, width=1, height=1), 17 | ] 18 | doc.content_boxes = [ 19 | TextBox( 20 | doc=doc, 21 | page_num=0, 22 | x0=0.1, 23 | y0=0.1, 24 | x1=0.5, 25 | y1=0.2, 26 | label="body", 27 | text="Begin", 28 | ), 29 | TextBox( 30 | doc=doc, 31 | page_num=0, 32 | x0=0.6, 33 | y0=0.1, 34 | x1=0.7, 35 | y1=0.2, 36 | label="body", 37 | text="and", 38 | ), 39 | TextBox( 40 | doc=doc, 41 | page_num=0, 42 | x0=0.8, 43 | y0=0.1, 44 | x1=0.9, 45 | y1=0.2, 46 | label="body", 47 | text="end.", 48 | ), 49 | TextBox( 50 | doc=doc, 51 | page_num=1, 52 | x0=0.8, 53 | y0=0.1, 54 | x1=0.9, 55 | y1=0.2, 56 | label="body", 57 | text="New page", 58 | ), 59 | ] 60 | 61 | aggregator = SimpleAggregator() 62 | assert aggregator(doc).aggregated_texts["body"].text == "Begin and end.\n\nNew page" 63 | 64 
| 65 | def test_styled_pdfminer_aggregation(styles_pdf): 66 | extractor = PdfMinerExtractor(extract_style=True) 67 | aggregator = SimpleAggregator( 68 | sort=True, 69 | label_map={ 70 | "header": ["header"], 71 | "body": "body", 72 | }, 73 | ) 74 | 75 | doc = extractor(styles_pdf) 76 | for b, label in zip(doc.text_boxes, cycle(["header", "body"])): 77 | b.label = label 78 | doc = aggregator(doc) 79 | texts = {k: v.text for k, v in doc.aggregated_texts.items()} 80 | props = {k: v.properties for k, v in doc.aggregated_texts.items()} 81 | 82 | assert set(texts.keys()) == {"body", "header"} 83 | assert isinstance(props["body"], list) 84 | 85 | for value in props.values(): 86 | assert value[0].begin == 0 87 | 88 | pairs = set() 89 | for label in texts.keys(): 90 | for prop in props[label]: 91 | pairs.add( 92 | ( 93 | texts[label][prop.begin : prop.end], 94 | " ".join( 95 | filter( 96 | bool, 97 | ( 98 | ("italic" if prop.italic else ""), 99 | ("bold" if prop.bold else ""), 100 | ), 101 | ) 102 | ), 103 | ) 104 | ) 105 | 106 | assert pairs == { 107 | ("This is a", ""), 108 | ("test", "bold"), 109 | ("to check EDS-PDF’s", ""), 110 | ("ability", "italic"), 111 | ("to detect changing styles.", ""), 112 | ("Let’s up the stakes, with", ""), 113 | ("intra", "italic"), 114 | ("-word change. 
Or better yet,", ""), 115 | ("this mi", "bold"), 116 | ("ght be hard.", ""), 117 | } 118 | 119 | 120 | def test_styled_pdfminer_aggregation_letter(letter_pdf): 121 | extractor = PdfMinerExtractor(extract_style=True) 122 | aggregator = SimpleAggregator() 123 | 124 | doc = extractor(letter_pdf) 125 | for b, label in zip(doc.content_boxes, cycle(["header", "body"])): 126 | b.label = label 127 | doc = aggregator(doc) 128 | texts = {k: v.text for k, v in doc.aggregated_texts.items()} 129 | props = {k: v.properties for k, v in doc.aggregated_texts.items()} 130 | 131 | assert set(texts.keys()) == {"body", "header"} 132 | assert isinstance(props["body"], list) 133 | 134 | for value in props.values(): 135 | assert value[0].begin == 0 136 | 137 | pairs = set() 138 | for label in texts.keys(): 139 | for prop in props[label]: 140 | pairs.add( 141 | ( 142 | texts[label][prop.begin : prop.end], 143 | " ".join( 144 | filter( 145 | bool, 146 | ( 147 | ("italic" if prop.italic else ""), 148 | ("bold" if prop.bold else ""), 149 | ), 150 | ) 151 | ), 152 | ) 153 | ) 154 | 155 | 156 | def test_distant_superscript(distant_superscript_pdf): 157 | pipeline = edspdf.Pipeline() 158 | pipeline.add_pipe("poppler-extractor") 159 | pipeline.add_pipe("dummy-classifier", config={"label": "body"}) 160 | pipeline.add_pipe("simple-aggregator") 161 | doc = pipeline(distant_superscript_pdf) 162 | assert doc.aggregated_texts["body"].text == "3 test line" 163 | -------------------------------------------------------------------------------- /tests/pipes/classifiers/conftest.py: -------------------------------------------------------------------------------- 1 | from pytest import fixture 2 | 3 | from edspdf.structures import Page, PDFDoc, TextBox 4 | 5 | 6 | @fixture 7 | def single_page_doc() -> PDFDoc: 8 | doc = PDFDoc(id="doc", content=b"", pages=[]) 9 | doc.pages = [Page(doc=doc, page_num=0, width=1.0, height=1.0)] 10 | doc.content_boxes = [ 11 | TextBox(doc=doc, page_num=0, text="foo", x0=0.1, 
y0=0.1, x1=0.9, y1=0.2), 12 | TextBox(doc=doc, page_num=0, text="foo", x0=0.1, y0=0.6, x1=0.4, y1=0.7), 13 | TextBox(doc=doc, page_num=0, text="foo", x0=0.1, y0=0.6, x1=0.9, y1=0.7), 14 | ] 15 | return doc 16 | 17 | 18 | @fixture 19 | def multi_page_doc() -> PDFDoc: 20 | doc = PDFDoc(id="doc", content=b"") 21 | doc.pages = [ 22 | Page(doc=doc, page_num=0, width=1.0, height=1.0), 23 | Page(doc=doc, page_num=1, width=1.0, height=1.0), 24 | ] 25 | doc.content_boxes = [ 26 | TextBox(doc=doc, page_num=0, text="foo", x0=0.1, y0=0.1, x1=0.9, y1=0.2), 27 | TextBox(doc=doc, page_num=0, text="foo", x0=0.1, y0=0.6, x1=0.4, y1=0.7), 28 | TextBox(doc=doc, page_num=0, text="foo", x0=0.1, y0=0.6, x1=0.9, y1=0.7), 29 | TextBox(doc=doc, page_num=1, text="foo", x0=0.1, y0=0.1, x1=0.9, y1=0.2), 30 | TextBox(doc=doc, page_num=1, text="foo", x0=0.1, y0=0.6, x1=0.4, y1=0.7), 31 | TextBox(doc=doc, page_num=1, text="foo", x0=0.1, y0=0.6, x1=0.9, y1=0.7), 32 | ] 33 | 34 | return doc 35 | -------------------------------------------------------------------------------- /tests/pipes/classifiers/test_align.py: -------------------------------------------------------------------------------- 1 | from edspdf.structures import Box 2 | from edspdf.utils.alignment import align_box_labels 3 | 4 | 5 | def test_align_multi_page(multi_page_doc): 6 | annotations = [ 7 | Box(x0=0.0, y0=0.0, x1=1.0, y1=1.0, page_num=0, label="big"), 8 | Box(x0=0.1, y0=0.1, x1=0.9, y1=0.9, page_num=1, label="small"), 9 | ] 10 | 11 | labelled = align_box_labels(annotations, multi_page_doc.text_boxes) 12 | assert [b.label for b in labelled] == [ 13 | "big", 14 | "big", 15 | "big", 16 | "small", 17 | "small", 18 | "small", 19 | ] 20 | 21 | 22 | def test_align_cross_page(multi_page_doc): 23 | annotations = [ 24 | Box(x0=0.0, y0=0.0, x1=1.0, y1=1.0, label="big"), 25 | Box(x0=0.1, y0=0.1, x1=0.9, y1=0.9, label="small"), 26 | ] 27 | 28 | labelled = align_box_labels(annotations, multi_page_doc.text_boxes) 29 | assert [b.label for b 
in labelled] == [ 30 | "small", 31 | "small", 32 | "small", 33 | "small", 34 | "small", 35 | "small", 36 | ] 37 | -------------------------------------------------------------------------------- /tests/pipes/classifiers/test_dummy.py: -------------------------------------------------------------------------------- 1 | from edspdf.pipes.classifiers.dummy import DummyClassifier 2 | 3 | 4 | def test_dummy(single_page_doc): 5 | classifier = DummyClassifier(label="body") 6 | 7 | single_page_doc = classifier(single_page_doc) 8 | 9 | p1, p2, p3 = [b.label for b in single_page_doc.text_boxes] 10 | 11 | assert p1 == "body" 12 | assert p2 == "body" 13 | assert p3 == "body" 14 | -------------------------------------------------------------------------------- /tests/pipes/classifiers/test_mask.py: -------------------------------------------------------------------------------- 1 | from confit import Config 2 | 3 | import edspdf 4 | 5 | configuration = """ 6 | [pipeline] 7 | pipeline = ["classifier"] 8 | components = ${components} 9 | 10 | [components.classifier] 11 | @factory = "mask-classifier" 12 | x0 = 0 13 | y0 = 0.5 14 | x1 = 0.5 15 | y1 = 1 16 | threshold = 0.4 17 | """ 18 | 19 | configuration_custom = """ 20 | [pipeline] 21 | pipeline = ["classifier"] 22 | components = ${components} 23 | 24 | [components.classifier] 25 | @factory = "multi-mask-classifier" 26 | threshold = 0.9 27 | 28 | [components.classifier.body] 29 | label = "body" 30 | x0 = 0 31 | y0 = 0.5 32 | x1 = 0.5 33 | y1 = 1 34 | """ 35 | 36 | 37 | def test_simple_mask(single_page_doc): 38 | model = edspdf.load(Config.from_str(configuration)) 39 | 40 | single_page_doc = model(single_page_doc) 41 | 42 | p1, p2, p3 = [b.label for b in single_page_doc.text_boxes] 43 | 44 | assert p1 == "pollution" 45 | assert p2 == "body" 46 | assert p3 == "body" 47 | 48 | 49 | def test_custom_mask(single_page_doc): 50 | model = edspdf.load(Config.from_str(configuration_custom)) 51 | 52 | single_page_doc = model(single_page_doc) 
53 | 54 | p1, p2, p3 = [b.label for b in single_page_doc.text_boxes] 55 | 56 | assert p1 == "pollution" 57 | assert p2 == "body" 58 | assert p3 == "pollution" 59 | -------------------------------------------------------------------------------- /tests/pipes/classifiers/test_random.py: -------------------------------------------------------------------------------- 1 | from confit import Config 2 | 3 | import edspdf 4 | from edspdf.structures import PDFDoc 5 | 6 | configuration = """ 7 | [pipeline] 8 | pipeline = ["classifier"] 9 | components = ${components} 10 | 11 | [components.classifier] 12 | @factory = "random-classifier" 13 | labels = [ "body", "header" ] 14 | """ 15 | 16 | 17 | def test_random_classifier(single_page_doc: PDFDoc): 18 | model = edspdf.load(Config.from_str(configuration)) 19 | 20 | single_page_doc = model(single_page_doc) 21 | 22 | assert set(b.label for b in single_page_doc.text_boxes) == {"body", "header"} 23 | -------------------------------------------------------------------------------- /tests/pipes/embeddings/test_custom.py: -------------------------------------------------------------------------------- 1 | from edspdf.pipes.embeddings.box_layout_embedding import BoxLayoutEmbedding 2 | from edspdf.pipes.embeddings.box_transformer import BoxTransformer 3 | from edspdf.pipes.embeddings.embedding_combiner import EmbeddingCombiner 4 | from edspdf.pipes.embeddings.simple_text_embedding import SimpleTextEmbedding 5 | from edspdf.pipes.embeddings.sub_box_cnn_pooler import SubBoxCNNPooler 6 | from edspdf.pipes.extractors.pdfminer import PdfMinerExtractor 7 | 8 | 9 | def test_custom_embedding(pdf, error_pdf, tmp_path): 10 | embedding = BoxTransformer( 11 | num_heads=4, 12 | dropout_p=0.1, 13 | activation="gelu", 14 | init_resweight=0.01, 15 | head_size=16, 16 | attention_mode=["c2c", "c2p", "p2c"], 17 | n_layers=1, 18 | n_relative_positions=64, 19 | embedding=EmbeddingCombiner( 20 | dropout_p=0.1, 21 | text_encoder=SubBoxCNNPooler( 22 | 
out_channels=64, 23 | kernel_sizes=(3, 4, 5), 24 | embedding=SimpleTextEmbedding( 25 | size=72, 26 | ), 27 | ), 28 | layout_encoder=BoxLayoutEmbedding( 29 | n_positions=64, 30 | x_mode="sin", 31 | y_mode="sin", 32 | w_mode="learned", 33 | h_mode="learned", 34 | size=72, 35 | ), 36 | ), 37 | ) 38 | str(embedding) 39 | 40 | extractor = PdfMinerExtractor(render_pages=True) 41 | pdfdoc = extractor(pdf) 42 | pdfdoc.text_boxes[0].text = "Very long word of 150 letters : " + "x" * 150 43 | embedding.post_init([pdfdoc], set()) 44 | embedding(pdfdoc) 45 | embedding.save_extra_data(tmp_path, set()) 46 | embedding.load_extra_data(tmp_path, set()) 47 | 48 | # Test empty document 49 | embedding(extractor(error_pdf)) 50 | -------------------------------------------------------------------------------- /tests/pipes/embeddings/test_huggingface.py: -------------------------------------------------------------------------------- 1 | from edspdf.pipes.embeddings.huggingface_embedding import HuggingfaceEmbedding 2 | from edspdf.pipes.extractors.pdfminer import PdfMinerExtractor 3 | 4 | 5 | def test_huggingface_embedding(pdf, error_pdf): 6 | embedding = HuggingfaceEmbedding( 7 | pipeline=None, 8 | name="huggingface", 9 | model="hf-tiny-model-private/tiny-random-LayoutLMv3Model", 10 | window=32, 11 | stride=16, 12 | use_image=True, 13 | ) 14 | # Patch the faulty size in the tiny-random-LayoutLMv3Model 15 | embedding.image_processor.size = { 16 | "height": embedding.hf_model.config.input_size, 17 | "width": embedding.hf_model.config.input_size, 18 | } 19 | 20 | extractor = PdfMinerExtractor(render_pages=True) 21 | embedding(extractor(pdf)) 22 | embedding(extractor(error_pdf)) 23 | -------------------------------------------------------------------------------- /tests/pipes/extractors/test_pdfminer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from blocks_ground_truth import blank_blocks, pdf_blocks, styles_blocks 3 | from 
pdfminer.pdfparser import PDFSyntaxError 4 | 5 | from edspdf.pipes.extractors.pdfminer import PdfMinerExtractor 6 | 7 | 8 | def test_pdfminer(pdf, styles_pdf, blank_pdf): 9 | extractor = PdfMinerExtractor(extract_style=False) 10 | 11 | pytest.nested_approx(extractor(pdf).text_boxes, pdf_blocks, abs=5e-2) 12 | pytest.nested_approx(extractor(styles_pdf).text_boxes, styles_blocks, abs=5e-2) 13 | pytest.nested_approx(extractor(blank_pdf).text_boxes, blank_blocks, abs=5e-2) 14 | 15 | 16 | def test_pdfminer_image(pdf, styles_pdf, blank_pdf): 17 | extractor = PdfMinerExtractor(extract_style=False, render_pages=True) 18 | 19 | assert extractor(pdf).pages[0].image.shape == (2339, 1654, 3) 20 | assert extractor(styles_pdf).pages[0].image.shape == (2200, 1700, 3) 21 | assert extractor(blank_pdf).pages[0].image.shape == (2339, 1654, 3) 22 | 23 | 24 | def test_pdfminer_error(error_pdf): 25 | extractor = PdfMinerExtractor(extract_style=False, raise_on_error=True) 26 | 27 | with pytest.raises(PDFSyntaxError): 28 | extractor(error_pdf) 29 | 30 | extractor.raise_on_error = False 31 | result = extractor(error_pdf) 32 | assert len(result.text_boxes) == 0 33 | assert result.error is True 34 | -------------------------------------------------------------------------------- /tests/recipes/config.cfg: -------------------------------------------------------------------------------- 1 | [train] 2 | model = ${pipeline} 3 | max_steps = 20 4 | lr = 8e-4 5 | seed = 43 6 | 7 | [train.train_data] 8 | @adapter = segmentation-adapter 9 | 10 | [train.val_data] 11 | @adapter = segmentation-adapter 12 | 13 | [pipeline] 14 | pipeline = ["extractor", "embedding", "classifier"] 15 | disabled = [] 16 | components = ${components} 17 | 18 | [components] 19 | 20 | [components.extractor] 21 | @factory = "pdfminer-extractor" 22 | 23 | [components.embedding] 24 | @factory = "box-transformer" 25 | num_heads = 4 26 | dropout_p = 0.1 27 | head_size = 16 28 | activation = "gelu" 29 | init_resweight = 0.01 30 | 
from edspdf import registry
from edspdf.pipes.aggregators.simple import SimpleAggregator
from edspdf.structures import PDFDoc, Text


@registry.factory.register("markdown-aggregator")
class MarkdownAggregator(SimpleAggregator):
    """Aggregator that re-renders bold/italic style spans as Markdown markup."""

    def __call__(self, doc: PDFDoc) -> PDFDoc:
        # Let the base aggregator build the plain per-label texts first.
        doc = super().__call__(doc)

        for label, aggregated in doc.aggregated_texts.items():
            text = aggregated.text
            pieces = []
            cursor = 0

            for span in aggregated.properties:
                if span.begin >= span.end:
                    # Skip degenerate (empty or inverted) style spans.
                    continue
                if cursor < span.begin:
                    # Unstyled gap between the previous span and this one.
                    pieces.append(text[cursor : span.begin])

                chunk = text[span.begin : span.end]
                if span.bold:
                    chunk = f"**{chunk}**"
                if span.italic:
                    chunk = f"_{chunk}_"
                pieces.append(chunk)
                cursor = span.end

            if cursor < len(text):
                pieces.append(text[cursor:])

            # Replace the styled text with its Markdown rendering.
            doc.aggregated_texts[label] = Text(text="".join(pieces))

        return doc


def test_markdown_aggregator(styles_pdf):
    from edspdf import Pipeline

    model = Pipeline()
    # will extract text lines (with style information) from a document
    model.add_pipe(
        "pdfminer-extractor",
        config=dict(
            extract_style=True,
        ),
    )
    # classify everything inside the `body` bounding box as `body`
    model.add_pipe(
        "mask-classifier",
        config={"x0": 0.1, "y0": 0.1, "x1": 0.9, "y1": 0.9},
    )
    # aggregates the lines together, rendering styles as markdown
    model.add_pipe("markdown-aggregator")

    assert model(styles_pdf).aggregated_texts["body"].text == (
        "Let’s up the stakes, with _intra_-word change. Or better yet, **this mi**ght "
        "be hard."
    )
import attr
from pytest import approx as pytest_approx

from edspdf.structures import BaseModel


def is_primitive(x):
    """Return True for None and plain scalar values (int, float, str, bool)."""
    return x is None or type(x) in (int, float, str, bool)


pytest_plugins = ["helpers_namespace"]


def nested_approx(A, B, abs=1e-6, rel=1e-6, enforce_same_type=False):
    """Recursively compare two nested structures for approximate equality.

    Parameters
    ----------
    A, B : Any
        Structures to compare: `BaseModel` instances, sets, dicts,
        lists/tuples, or primitive/numeric leaves.
    abs, rel : float
        Absolute and relative tolerances forwarded to ``pytest.approx``.
    enforce_same_type : bool
        When True, non-primitive values must share the exact same type.

    Returns
    -------
    bool
    """
    if enforce_same_type and type(A) is not type(B) and not is_primitive(A):
        # `not is_primitive(A)` restricts the same-type constraint to data
        # structures only.
        return False
    if isinstance(A, BaseModel):
        names = [field.name for field in attr.fields(type(A)) if field.eq]
        return type(A) is type(B) and nested_approx(
            A.dict(filter=lambda a, v: a.name in names),
            B.dict(filter=lambda a, v: a.name in names),
            abs=abs,
            rel=rel,
        )

    elif isinstance(A, set) or isinstance(B, set):
        # Compare sets element-wise through a sorted view. Sets of different
        # sizes can never match (the previous check compared each length
        # against itself after sorting, which never changes a length).
        if len(A) != len(B):
            return False
        return all(
            nested_approx(a, b, abs, rel) for a, b in zip(sorted(A), sorted(B))
        )
    elif isinstance(A, dict) and isinstance(B, dict):
        # Both dicts must have exactly the same key set: previously a key
        # missing from B raised KeyError and extra keys in B were silently
        # ignored, making unequal dicts compare equal.
        if A.keys() != B.keys():
            return False
        return all(nested_approx(A[k], B[k], abs, rel) for k in A)
    elif isinstance(A, (list, tuple)) and isinstance(B, (list, tuple)):
        # Sequences of different lengths never match: previously extra items
        # in B were ignored and extra items in A raised IndexError.
        if len(A) != len(B):
            return False
        return all(nested_approx(a, b, abs, rel) for a, b in zip(A, B))
    else:
        # Leaf comparison with tolerance; incomparable types are unequal.
        try:
            return A == pytest_approx(B, rel=rel, abs=abs)
        except TypeError:
            return False
import torch

from edspdf.utils.torch import pad_2d


def test_pad_2d():
    """pad_2d right-pads ragged rows to the longest row with the pad value."""
    ragged = [
        [0, 1, 2, 3, 4],
        [0, 1, 2, 3, 4, 5, 6],
    ]
    expected = torch.tensor(
        [
            [0, 1, 2, 3, 4, -1, -1],
            [0, 1, 2, 3, 4, 5, 6],
        ]
    )
    torch.testing.assert_close(pad_2d(ragged, pad=-1), expected)

    # An empty input yields an empty tensor on the requested device.
    torch.testing.assert_close(
        pad_2d([], pad=-1, device=torch.device("cpu")),
        torch.tensor([]),
    )


from edspdf.pipes.classifiers.mask import simple_mask_classifier_factory
from edspdf.pipes.extractors.pdfminer import PdfMinerExtractor
from edspdf.visualization import compare_results, merge_boxes, show_annotations


def test_pipeline(pdf):
    """Visualization helpers produce one image per page of the test PDF."""
    extractor = PdfMinerExtractor()
    classifier = simple_mask_classifier_factory(
        x0=0.1, y0=0.4, x1=0.5, y1=0.9, threshold=0.1
    )

    doc = classifier(extractor(pdf))
    merged = merge_boxes(doc.lines)

    assert len(show_annotations(pdf, merged)) == 1
    assert len(compare_results(pdf, doc.lines, merged)) == 1
from edspdf.pipeline import Pipeline
from edspdf.structures import Box
from edspdf.visualization.merge import merge_boxes


def test_merge():
    """Adjacent boxes with the same label and page are merged into one box."""
    lines = [
        Box(page_num=0, x0=0, x1=1, y0=0, y1=0.1, label="body"),
        Box(page_num=0, x0=0, x1=1, y0=0.1, y1=0.2, label="body"),
        Box(page_num=0, x0=0, x1=0.4, y0=0.2, y1=0.3, label="body"),
        Box(page_num=0, x0=0.6, x1=1, y0=0.2, y1=0.3, label="other"),
        Box(page_num=1, x0=0.6, x1=1, y0=0.2, y1=0.3, label="body"),
    ]

    expected = [
        Box(page_num=0, x0=0.0, x1=1.0, y0=0.0, y1=0.2, label="body"),
        Box(page_num=0, x0=0.0, x1=0.4, y0=0.2, y1=0.3, label="body"),
        Box(page_num=0, x0=0.6, x1=1.0, y0=0.2, y1=0.3, label="other"),
        Box(page_num=1, x0=0.6, x1=1.0, y0=0.2, y1=0.3, label="body"),
    ]

    merged = merge_boxes(lines)

    assert len(merged) == 4
    assert merged == expected


def test_pipeline(pdf, blank_pdf):
    """Merging per-page text boxes yields 7 boxes for the test PDF, 0 for a blank one."""
    model = Pipeline()
    model.add_pipe("pdfminer-extractor")
    model.add_pipe(
        "mask-classifier", config=dict(x0=0.1, y0=0.4, x1=0.5, y1=0.9, threshold=0.1)
    )

    def count_merged(document):
        # Merge within each page, then count boxes over the whole document.
        return sum(len(merge_boxes(page.text_boxes)) for page in document.pages)

    assert count_merged(model(pdf)) == 7
    assert count_merged(model(blank_pdf)) == 0