├── .flake8
├── .github
├── ISSUE_TEMPLATE
│ ├── bug.md
│ └── feature.md
├── pull_request_template.md
└── workflows
│ ├── documentation.yml
│ ├── release.yml
│ └── tests.yml
├── .gitignore
├── .gitlab-ci.yml
├── .pre-commit-config.yaml
├── CITATION.cff
├── LICENSE
├── README.md
├── changelog.md
├── contributing.md
├── demo
├── app.py
└── requirements.txt
├── docs
├── alternatives.md
├── assets
│ ├── images
│ │ ├── model-parallelism.png
│ │ ├── multiprocessing.png
│ │ └── transformer-windowing.svg
│ ├── logo
│ │ ├── aphp-blue.svg
│ │ ├── aphp-white.svg
│ │ ├── edspdf-blue.svg
│ │ ├── edspdf-red.svg
│ │ └── edspdf-white.svg
│ ├── stylesheets
│ │ └── extra.css
│ ├── templates
│ │ └── python
│ │ │ └── material
│ │ │ ├── class.html
│ │ │ ├── docstring.html
│ │ │ ├── docstring
│ │ │ ├── examples.html
│ │ │ └── parameters.html
│ │ │ └── function.html
│ └── termynal
│ │ ├── termynal.css
│ │ └── termynal.js
├── changelog.md
├── configuration.md
├── contributing.md
├── data-structures.md
├── index.md
├── inference.md
├── layers
│ ├── box-transformer-layer.md
│ ├── box-transformer.md
│ ├── index.md
│ ├── relative-attention.md
│ ├── sinusoidal-embedding.md
│ └── vocabulary.md
├── pipeline.md
├── pipes
│ ├── aggregators
│ │ ├── index.md
│ │ └── simple-aggregator.md
│ ├── box-classifiers
│ │ ├── dummy.md
│ │ ├── index.md
│ │ ├── mask.md
│ │ ├── random.md
│ │ └── trainable.md
│ ├── embeddings
│ │ ├── box-layout-embedding.md
│ │ ├── box-transformer.md
│ │ ├── embedding-combiner.md
│ │ ├── huggingface-embedding.md
│ │ ├── index.md
│ │ ├── simple-text-embedding.md
│ │ └── sub-box-cnn-pooler.md
│ ├── extractors
│ │ ├── index.md
│ │ └── pdfminer.md
│ └── index.md
├── recipes
│ ├── annotation.md
│ ├── extension.md
│ ├── index.md
│ ├── resources
│ │ ├── deep-learning-architecture.svg
│ │ ├── lines.jpeg
│ │ └── merged.jpeg
│ ├── rule-based.md
│ └── training.md
├── references.bib
├── roadmap.md
├── scripts
│ ├── bibtex.py
│ └── plugin.py
├── trainable-pipes.md
└── utilities
│ ├── alignment.md
│ ├── index.md
│ ├── resources
│ ├── aligned-merged.jpeg
│ ├── aligned.jpeg
│ ├── blocs.jpeg
│ ├── blocs.png
│ ├── lines.jpeg
│ └── merged.jpeg
│ └── visualisation.md
├── edspdf
├── __init__.py
├── accelerators
│ ├── __init__.py
│ ├── base.py
│ └── multiprocessing.py
├── data
│ ├── __init__.py
│ ├── base.py
│ ├── converters.py
│ ├── files.py
│ ├── pandas.py
│ └── parquet.py
├── layers
│ ├── __init__.py
│ ├── box_transformer.py
│ ├── relative_attention.py
│ ├── sinusoidal_embedding.py
│ └── vocabulary.py
├── lazy_collection.py
├── pipeline.py
├── pipes
│ ├── __init__.py
│ ├── aggregators
│ │ ├── __init__.py
│ │ └── simple.py
│ ├── classifiers
│ │ ├── __init__.py
│ │ ├── dummy.py
│ │ ├── mask.py
│ │ ├── random.py
│ │ └── trainable.py
│ ├── embeddings
│ │ ├── __init__.py
│ │ ├── box_layout_embedding.py
│ │ ├── box_layout_preprocessor.py
│ │ ├── box_transformer.py
│ │ ├── embedding_combiner.py
│ │ ├── huggingface_embedding.py
│ │ ├── simple_text_embedding.py
│ │ └── sub_box_cnn_pooler.py
│ └── extractors
│ │ ├── __init__.py
│ │ └── pdfminer.py
├── processing
│ ├── __init__.py
│ ├── multiprocessing.py
│ ├── simple.py
│ └── utils.py
├── registry.py
├── structures.py
├── trainable_pipe.py
├── utils
│ ├── __init__.py
│ ├── alignment.py
│ ├── collections.py
│ ├── file_system.py
│ ├── lazy_module.py
│ ├── optimization.py
│ ├── package.py
│ ├── random.py
│ └── torch.py
└── visualization
│ ├── __init__.py
│ ├── annotations.py
│ └── merge.py
├── mkdocs.yml
├── pyproject.toml
├── roadmap.md
└── tests
├── conftest.py
├── core
├── config.cfg
├── test_data.py
├── test_pipeline.py
├── test_registry.py
└── test_structures.py
├── pipes
├── aggregators
│ └── test_simple.py
├── classifiers
│ ├── conftest.py
│ ├── test_align.py
│ ├── test_dummy.py
│ ├── test_mask.py
│ └── test_random.py
├── embeddings
│ ├── test_custom.py
│ └── test_huggingface.py
└── extractors
│ ├── blocks_ground_truth.py
│ └── test_pdfminer.py
├── recipes
├── config.cfg
├── test_markdown_aggregator.py
└── test_train.py
├── resources
├── blank.pdf
├── distant-superscript.pdf
├── error.pdf
├── letter.pdf
├── styles.pdf
└── test.pdf
├── utils.py
├── utils
├── test_package.py
├── test_py_utils.py
└── test_torch_utils.py
└── visualization
├── test_annotations.py
└── test_merge.py
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 88
3 | exclude = .git,__pycache__,__init__.py,.mypy_cache,.pytest_cache,.venv,build
4 | per-file-ignores = __init__.py:F401,tests/*.py:F401,factory.py:F401
5 | ignore = W503, E203
6 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: "Bug Report"
3 | about: Use this template if you came across a bug or unexpected behaviour differing from the docs.
4 | ---
5 |
6 |
7 |
8 | ## Description
9 |
10 |
11 |
12 | ## How to reproduce the bug
13 |
14 |
15 |
16 | ## Your Environment
17 |
18 |
19 |
20 | - Operating System:
21 | - Python Version Used:
22 | - EDS-PDF Version Used:
23 | - Environment Information:
24 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: "Feature request"
3 | about: Use this template if you'd like EDS-PDF to add a new feature.
4 | title: "Feature request: [feature]"
5 | ---
6 |
7 | ## Feature type
8 |
9 |
10 |
11 | ## Description
12 |
13 |
14 |
--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ## Description
4 |
5 |
6 |
7 | ## Checklist
8 |
9 |
10 |
11 | - [ ] If this PR is a bug fix, the bug is documented in the test suite.
12 | - [ ] Changes were documented in the changelog (pending section).
13 | - [ ] If necessary, changes were made to the documentation.
14 |
--------------------------------------------------------------------------------
/.github/workflows/documentation.yml:
--------------------------------------------------------------------------------
1 | name: Documentation
2 |
3 | on:
4 | workflow_dispatch:
5 | push:
6 | branches: [main]
7 |
8 | jobs:
9 | Documentation:
10 | runs-on: ubuntu-latest
11 | steps:
12 | - uses: actions/checkout@v2
13 | - uses: actions/setup-python@v2
14 | with:
15 | python-version: "3.10"
16 |
17 | - name: Set PY variable
18 | run: echo "PY=$(python -VV | sha256sum | cut -d' ' -f1)" >> $GITHUB_ENV
19 |
20 | - name: Install hatch
21 | run: pip install hatch
22 |
23 | - name: Set up Git
24 | run: |
25 | git config user.name ${{ github.actor }}
26 | git config user.email ${{ github.actor }}@users.noreply.github.com
27 | - name: Build documentation
28 | run: |
29 | git fetch origin gh-pages
30 | hatch -e docs run mike delete main
31 | hatch -e docs run mike deploy --push main
32 |
--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package using Twine when a release is created
2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
3 |
4 | # This workflow uses actions that are not certified by GitHub.
5 | # They are provided by a third-party and are governed by
6 | # separate terms of service, privacy policy, and support
7 | # documentation.
8 |
9 | name: Release
10 |
11 | on:
12 | workflow_dispatch:
13 | release:
14 | types: [published]
15 |
16 | jobs:
17 | build:
18 | name: Build package
19 | runs-on: ubuntu-22.04
20 | steps:
21 | - uses: actions/checkout@v2
22 |
23 | - name: Build sdist
24 | run: pipx run build --sdist --wheel
25 |
26 | - uses: actions/upload-artifact@v4
27 | with:
28 | name: artifact
29 | path: |
30 | dist/*.tar.gz
31 | dist/*.whl
32 |
33 | pypi:
34 | name: Upload to PyPI
35 | needs: [ build ]
36 | runs-on: ubuntu-22.04
37 | permissions:
38 | id-token: write
39 |
40 | steps:
41 | - uses: actions/download-artifact@v4
42 | with:
43 | name: artifact
44 | path: dist
45 | merge-multiple: true
46 | - name: Publish package
47 | uses: pypa/gh-action-pypi-publish@release/v1
48 |
49 | documentation:
50 | name: Build documentation
51 |
52 | runs-on: ubuntu-22.04
53 | steps:
54 | - uses: actions/checkout@v2
55 | - uses: actions/setup-python@v2
56 | with:
57 | python-version: "3.10"
58 |
59 | - name: Set PY variable
60 | run: echo "PY=$(python -VV | sha256sum | cut -d' ' -f1)" >> $GITHUB_ENV
61 |
62 | - name: Install hatch
63 | run: pip install hatch
64 |
65 | - name: Set up Git
66 | run: |
67 | git config user.name ${{ github.actor }}
68 | git config user.email ${{ github.actor }}@users.noreply.github.com
69 |
70 | - name: Build documentation
71 | run: |
72 | git fetch origin gh-pages
73 | hatch -e docs run mike deploy --push --no-redirect --update-aliases $GITHUB_REF_NAME latest
74 |
--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
1 | name: Tests and Linting
2 |
3 | on:
4 | workflow_dispatch:
5 | pull_request:
6 | push:
7 | branches: [main]
8 |
9 | jobs:
10 | Linting:
11 | runs-on: ubuntu-22.04
12 | steps:
13 | - uses: actions/checkout@v3
14 | with:
15 |         # required to grab the history of the PR
16 | fetch-depth: 0
17 | - uses: actions/setup-python@v3
18 | with:
19 | python-version: "3.10"
20 | - uses: pre-commit/action@v3.0.0
21 |
22 | Pytest:
23 | runs-on: ubuntu-22.04
24 | strategy:
25 | fail-fast: true
26 | matrix:
27 | python-version: ["3.7", "3.8", "3.9", "3.10"]
28 | steps:
29 | - uses: actions/checkout@v2
30 | - name: Set up Python
31 | uses: actions/setup-python@v2
32 | with:
33 | python-version: ${{ matrix.python-version }}
34 | architecture: x64
35 |
36 | - name: Cache HuggingFace Models
37 | uses: actions/cache@v2
38 | id: cache-huggingface
39 | with:
40 | path: ~/.cache/huggingface/
41 | key: ${{ matrix.python-version }}-huggingface
42 |
43 | - name: Install hatch
44 | run: pip install hatch
45 |
46 | - name: Test with Pytest on Python ${{ matrix.python-version }}
47 | run: hatch run tests
48 |
49 | - name: Upload coverage data
50 | uses: actions/upload-artifact@v4
51 | with:
52 | name: coverage-data-${{ matrix.python-version }}
53 | path: .coverage.*
54 | if-no-files-found: ignore
55 | include-hidden-files: true
56 |
57 | Coverage:
58 | name: Coverage
59 | needs: Pytest
60 | uses: aphp/foldedtensor/.github/workflows/coverage.yml@main
61 | with:
62 | base-branch: main
63 | coverage-data-pattern: coverage-data-*
64 | coverage-report: coverage.txt
65 | coverage-badge: coverage.svg
66 | coverage-branch: coverage
67 |
68 | Documentation:
69 | runs-on: ubuntu-latest
70 | steps:
71 | - uses: actions/checkout@v2
72 | - uses: actions/setup-python@v2
73 | with:
74 | python-version: "3.10"
75 |
76 | - name: Install hatch
77 | run: pip install hatch
78 |
79 | - name: Build documentation
80 | run: hatch run docs:build
81 |
82 | Installation:
83 | runs-on: ubuntu-22.04
84 | strategy:
85 | fail-fast: false
86 | matrix:
87 | python-version: ["3.7", "3.8", "3.9", "3.10"]
88 | steps:
89 | - uses: actions/checkout@v2
90 | - uses: actions/setup-python@v2
91 | with:
92 | python-version: ${{ matrix.python-version }}
93 | - name: Install library from source
94 | run: |
95 | pip install .
96 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Python
2 | __pycache__/
3 |
4 | # DS Store
5 | .DS_Store
6 |
7 | .idea
8 | .vscode
9 |
10 | .venv
11 |
12 | # C extensions
13 | *.so
14 | *.dylib
15 | *.cpp
16 |
17 | # Distribution / packaging
18 | setup.py
19 | poetry.lock
20 | init
21 | .Python
22 | env/
23 | build/
24 | develop-eggs/
25 | dist/
26 | downloads/
27 | eggs/
28 | .eggs/
29 | lib/
30 | lib64/
31 | parts/
32 | sdist/
33 | var/
34 | *.egg-info/
35 | .installed.cfg
36 | *.egg
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *,cover
47 | .hypothesis/
48 | .pytest_cache/
49 | report.xml
50 |
51 | # IPython Notebook
52 | .ipynb_checkpoints
53 | *.ipynb
54 |
55 | # Data
56 | *.csv
57 | *.xls
58 | *.xlsx
59 | *.pkl
60 | *.jpg
61 | *.png
62 | *.html
63 | *.pickle
64 | *.joblib
65 | *.pdf
66 | /data/
67 |
68 | # MkDocs output
69 | docs/reference
70 | site/
71 | public/
72 |
--------------------------------------------------------------------------------
/.gitlab-ci.yml:
--------------------------------------------------------------------------------
1 | image: harbor.eds.aphp.fr/public/python:3.8-slim
2 |
3 | variables:
4 | GIT_SUBMODULE_STRATEGY: recursive
5 |
6 | stages:
7 | - test
8 | - pages
9 | - package
10 |
11 | Linting:
12 | stage: test
13 | cache:
14 | - key:
15 | files:
16 | - .pre-commit-config.yaml
17 | paths:
18 | - ~/.pre-commit
19 | before_script:
20 | - apt-get update
21 | - apt-get install -y --no-install-recommends git
22 | - pip install pre-commit
23 | script:
24 | - pre-commit run --all-files
25 | only:
26 | refs:
27 | - main
28 | - merge_request
29 |
30 | Running Pytest:
31 | stage: test
32 | before_script:
33 | - pip install cython setuptools # because `poetry install` does not correctly build the package
34 | - pip install -e '.[dev]'
35 | script:
36 | - pytest tests --cov edspdf --junitxml=report.xml
37 | after_script:
38 | - coverage xml -o coverage.xml
39 | coverage: "/TOTAL.+ ([0-9]{1,3}%)/"
40 | artifacts:
41 | when: always
42 | paths:
43 | - coverage.xml
44 | - report.xml
45 | - ./
46 | reports:
47 | junit: report.xml
48 | coverage_report:
49 | coverage_format: cobertura
50 | path: coverage.xml
51 |
52 | only:
53 | refs:
54 | - main
55 | - merge_request
56 |
57 | Installation:
58 | stage: test
59 | script:
60 | - pip install .
61 | only:
62 | refs:
63 | - main
64 | - merge_request
65 |
66 | Test documentation:
67 | stage: test
68 | before_script:
69 | - pip install -e '.[docs]'
70 | script:
71 | - mkdocs build --site-dir documentation
72 | artifacts:
73 | paths:
74 | - documentation
75 | only:
76 | refs:
77 | - merge_request
78 |
79 | pages:
80 | stage: pages
81 | before_script:
82 | - pip install -e '.[docs]'
83 | script:
84 | - mkdocs build --site-dir public
85 | artifacts:
86 | paths:
87 | - public
88 | only:
89 | - main
90 |
91 | Package:
92 | stage: package
93 | before_script:
94 | - pip install build twine
95 | - python -m build
96 | script:
97 | - TWINE_PASSWORD=${CI_JOB_TOKEN} TWINE_USERNAME=gitlab-ci-token python -m twine upload --repository-url ${CI_API_V4_URL}/projects/${CI_PROJECT_ID}/packages/pypi dist/*
98 | only:
99 | - tags
100 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | # See https://pre-commit.com for more information
2 | # See https://pre-commit.com/hooks.html for more hooks
3 | repos:
4 | - repo: https://github.com/pre-commit/pre-commit-hooks
5 | rev: v3.2.0
6 | hooks:
7 | - id: trailing-whitespace
8 | - id: no-commit-to-branch
9 | - id: end-of-file-fixer
10 | - id: check-yaml
11 | args: ["--unsafe"]
12 | - id: check-toml
13 | - id: check-json
14 | - id: check-symlinks
15 | - id: check-docstring-first
16 | - id: check-added-large-files
17 | - id: detect-private-key
18 | # ruff
19 | - repo: https://github.com/charliermarsh/ruff-pre-commit
20 | # Ruff version.
21 | rev: 'v0.0.287'
22 | hooks:
23 | - id: ruff
24 | args: ['--config', 'pyproject.toml']
25 | - repo: https://github.com/psf/black
26 | rev: 22.3.0
27 | hooks:
28 | - id: black
29 | - repo: https://github.com/asottile/blacken-docs
30 | rev: v1.10.0
31 | hooks:
32 | - id: blacken-docs
33 | additional_dependencies: [black==20.8b1]
34 | exclude: notebooks/
35 |
--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
1 | # This CITATION.cff file was generated with cffinit.
2 | # Visit https://bit.ly/cffinit to generate yours today!
3 |
4 | cff-version: 1.2.0
5 | title: >-
6 | EDS-PDF: Smart text extraction from PDF documents
7 | message: If you use EDS-PDF, please cite us as below.
8 | type: software
9 | authors:
10 | - given-names: Basile
11 | family-names: Dura
12 | orcid: "https://orcid.org/0000-0002-8315-4050"
13 | affiliation: Assistance Publique – Hôpitaux de Paris
14 | - given-names: Perceval
15 | family-names: Wajsburt
16 | affiliation: Assistance Publique – Hôpitaux de Paris
17 | - given-names: Alice
18 | family-names: Calliger
19 | affiliation: Assistance Publique – Hôpitaux de Paris
20 | - given-names: Christel
21 | family-names: Gérardin
22 | affiliation: Assistance Publique – Hôpitaux de Paris
23 | - given-names: Romain
24 | family-names: Bey
25 | affiliation: Assistance Publique – Hôpitaux de Paris
26 | repository-code: "https://github.com/aphp/edspdf"
27 | url: "https://github.com/aphp/edspdf"
28 | abstract: >-
29 | EDS-PDF provides a modular and extendable framework to extract text from PDF documents.
30 | keywords:
31 | - PDF
32 | - extraction
33 | - python
34 | - NLP
35 | license: BSD-3-Clause
36 | year: 2022
37 | doi: 10.5281/zenodo.6902977
38 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright 2021 Assistance Publique - Hôpitaux de Paris
2 |
3 | Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
4 |
5 | 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
6 |
7 | 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
8 |
9 | 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
10 |
11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
12 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 | [](https://aphp.github.io/edspdf/latest/)
3 | [](https://pypi.org/project/edspdf/)
4 | [](https://raw.githubusercontent.com/aphp/edspdf/coverage/coverage.txt)
5 | [](https://zenodo.org/badge/latestdoi/517726737)
6 |
7 | # EDS-PDF
8 |
9 | EDS-PDF provides a modular framework to extract text information from PDF documents.
10 |
11 | You can use it out-of-the-box, or extend it to fit your specific use case. We provide a pipeline system and various utilities for visualizing and processing PDFs, as well as multiple components to build complex models:
12 | - 📄 [Extractors](https://aphp.github.io/edspdf/latest/pipes/extractors) to parse PDFs (based on [pdfminer](https://github.com/euske/pdfminer), [mupdf](https://github.com/aphp/edspdf-mupdf) or [poppler](https://github.com/aphp/edspdf-poppler))
13 | - 🎯 [Classifiers](https://aphp.github.io/edspdf/latest/pipes/box-classifiers) to perform text box classification, in order to segment PDFs
14 | - 🧩 [Aggregators](https://aphp.github.io/edspdf/latest/pipes/aggregators) to produce an aggregated output from the detected text boxes
15 | - 🧠 Trainable layers to incorporate machine learning in your pipeline (e.g., [embedding](https://aphp.github.io/edspdf/latest/pipes/embeddings) building blocks or a [trainable classifier](https://aphp.github.io/edspdf/latest/pipes/box-classifiers/trainable/))
16 |
17 | Visit the [:book: documentation](https://aphp.github.io/edspdf/) for more information!
18 |
19 | ## Getting started
20 |
21 | ### Installation
22 |
23 | Install the library with pip:
24 |
25 | ```bash
26 | pip install edspdf
27 | ```
28 |
29 | ### Extracting text
30 |
31 | Let's build a simple PDF extractor that uses a rule-based classifier. There are two
32 | ways to do this, either by using the [configuration system](#configuration) or by using
33 | the pipeline API.
34 |
35 | Create a configuration file:
36 |
37 |
config.cfg
38 |
39 | ```ini
40 | [pipeline]
41 | pipeline = ["extractor", "classifier", "aggregator"]
42 |
43 | [components.extractor]
44 | @factory = "pdfminer-extractor"
45 |
46 | [components.classifier]
47 | @factory = "mask-classifier"
48 | x0 = 0.2
49 | x1 = 0.9
50 | y0 = 0.3
51 | y1 = 0.6
52 | threshold = 0.1
53 |
54 | [components.aggregator]
55 | @factory = "simple-aggregator"
56 | ```
57 |
58 | and load it from Python:
59 |
60 | ```python
61 | import edspdf
62 | from pathlib import Path
63 |
64 | model = edspdf.load("config.cfg") # (1)
65 | ```
66 |
67 | Or create a pipeline directly from Python:
68 |
69 | ```python
70 | from edspdf import Pipeline
71 |
72 | model = Pipeline()
73 | model.add_pipe("pdfminer-extractor")
74 | model.add_pipe(
75 | "mask-classifier",
76 | config=dict(
77 | x0=0.2,
78 | x1=0.9,
79 | y0=0.3,
80 | y1=0.6,
81 | threshold=0.1,
82 | ),
83 | )
84 | model.add_pipe("simple-aggregator")
85 | ```
86 |
87 | This pipeline can then be applied (for instance with this [PDF](https://github.com/aphp/edspdf/raw/main/tests/resources/letter.pdf)):
88 |
89 | ```python
90 | # Get a PDF
91 | pdf = Path("/Users/perceval/Development/edspdf/tests/resources/letter.pdf").read_bytes()
92 | pdf = model(pdf)
93 |
94 | body = pdf.aggregated_texts["body"]
95 |
96 | text, style = body.text, body.properties
97 | ```
98 |
99 | See the [rule-based recipe](https://aphp.github.io/edspdf/latest/recipes/rule-based) for a step-by-step explanation of what is happening.
100 |
101 | ## Citation
102 |
103 | If you use EDS-PDF, please cite us as below.
104 |
105 | ```bibtex
106 | @software{edspdf,
107 | author = {Dura, Basile and Wajsburt, Perceval and Calliger, Alice and Gérardin, Christel and Bey, Romain},
108 | doi = {10.5281/zenodo.6902977},
109 | license = {BSD-3-Clause},
110 | title = {{EDS-PDF: Smart text extraction from PDF documents}},
111 | url = {https://github.com/aphp/edspdf}
112 | }
113 | ```
114 |
115 | ## Acknowledgement
116 |
117 | We would like to thank [Assistance Publique – Hôpitaux de Paris](https://www.aphp.fr/) and
118 | [AP-HP Foundation](https://fondationrechercheaphp.fr/) for funding this project.
119 |
--------------------------------------------------------------------------------
/contributing.md:
--------------------------------------------------------------------------------
1 | # Contributing to EDS-PDF
2 |
3 | We welcome contributions ! There are many ways to help. For example, you can:
4 |
5 | 1. Help us track bugs by filing issues
6 | 2. Suggest and help prioritise new functionalities
7 | 3. Help us make the library as straightforward as possible, by simply asking questions on whatever does not seem clear to you.
8 |
9 | ## Development installation
10 |
11 | To be able to run the test suite and develop your own pipeline, you should clone the repo and install it locally. We use the [`hatch`](https://hatch.pypa.io/) package manager to manage the project.
12 |
13 |
14 |
15 |
16 | ```console
17 | color:gray # Clone the repository and change directory
18 | $ git clone ssh://git@github.com/aphp/edspdf.git
19 | ---> 100%
20 |
21 | color:gray # Ensure hatch is installed, preferably via pipx
22 | $ pipx install hatch
23 |
24 | $ cd edspdf
25 |
26 | color:gray # Enter a shell to develop / test the project. This will install everything required in a virtual environment. You can also `source` the path shown by hatch.
27 | $ hatch shell
28 | $ ...
29 | $ exit # when you're done
30 | ```
31 |
32 |
33 |
34 | To make sure the pipeline will not fail because of formatting errors, we added pre-commit hooks using the `pre-commit` Python library. To use it, simply install it:
35 |
36 |
37 |
38 | ```console
39 | $ pre-commit install
40 | ```
41 |
42 |
43 |
44 | The pre-commit hooks defined in the [configuration](https://github.com/aphp/edspdf/blob/main/.pre-commit-config.yaml) will automatically run when you commit your changes, letting you know if something went wrong.
45 |
46 | The hooks only run on staged changes. To force-run them on all files, run:
47 |
48 |
49 |
50 | ```console
51 | $ pre-commit run --all-files
52 | ---> 100%
53 | color:green All good !
54 | ```
55 |
56 |
57 |
58 | ## Proposing a merge request
59 |
60 | At the very least, your changes should :
61 |
62 | - Be well-documented ;
63 | - Pass every test, and preferably implement their own ;
64 | - Follow the style guide.
65 |
66 | ### Testing your code
67 |
68 | We use the Pytest test suite.
69 |
70 | The following command will run the test suite. Writing your own tests is encouraged !
71 |
72 | ```shell
73 | pytest
74 | ```
75 |
76 | Should your contribution propose a bug fix, we require the bug be thoroughly tested.
77 |
78 | ### Style Guide
79 |
80 | We use [Black](https://github.com/psf/black) to reformat the code. While other formatters only enforce PEP8 compliance, Black also makes the code uniform. In short :
81 |
82 | > Black reformats entire files in place. It is not configurable.
83 |
84 | Moreover, the CI/CD pipeline enforces a number of checks on the "quality" of the code. To wit, non black-formatted code will make the test pipeline fail. We use `pre-commit` to keep our codebase clean.
85 |
86 | Refer to the [development install tutorial](#development-installation) for tips on how to format your files automatically.
87 | Most modern editors propose extensions that will format files on save.
88 |
89 | ### Documentation
90 |
91 | Make sure to document your improvements, both within the code with comprehensive docstrings,
92 | as well as in the documentation itself if need be.
93 |
94 | We use `MkDocs` for EDS-PDF's documentation. You can view your changes with
95 |
96 |
97 |
98 | ```console
99 | color:gray # Run the documentation
100 | $ hatch run docs:serve
101 | ```
102 |
103 |
104 |
105 | Go to [`localhost:8000`](http://localhost:8000) to see your changes. MkDocs watches for changes in the documentation folder
106 | and automatically reloads the page.
107 |
--------------------------------------------------------------------------------
/demo/app.py:
--------------------------------------------------------------------------------
1 | import base64
2 |
3 | import pandas as pd
4 | import streamlit as st
5 | from confit import Config
6 |
7 | import edspdf
8 | from edspdf.visualization import merge_boxes, show_annotations
9 |
10 | CONFIG = """\
11 | [pipeline]
12 | pipeline = ["extractor", "classifier", "aggregator"]
13 |
14 | [components]
15 |
16 | [components.extractor]
17 | @factory = "pdfminer-extractor"
18 | extract_style = true
19 |
20 | [components.classifier]
21 | @factory = "mask-classifier"
22 | x0 = 0.25
23 | x1 = 0.95
24 | y0 = 0.3
25 | y1 = 0.9
26 | threshold = 0.1
27 |
28 | [components.aggregator]
29 | @factory = "simple-aggregator"
30 | """
31 |
32 |
33 | st.set_page_config(
34 | page_title="EDS-PDF Demo",
35 | page_icon="📄",
36 | )
37 |
38 | st.title("EDS-PDF")
39 |
40 | st.warning(
41 | "You should **not** put sensitive data in the example, as this application "
42 | "**is not secure**."
43 | )
44 |
45 | st.sidebar.header("About")
46 | st.sidebar.markdown(
47 | "EDS-PDF is a contributive effort maintained by AP-HP's Data Science team. "
48 | "Have a look at the "
49 | "[documentation](https://aphp.github.io/edspdf/) for more information."
50 | )
51 |
52 |
53 | st.header("Extract a PDF")
54 |
55 | st.subheader("Configuration")
56 | config = st.text_area(label="Change the config", value=CONFIG, height=200)
57 |
58 |
59 | model_load_state = st.info("Loading model...")
60 |
61 | reader = edspdf.load(Config.from_str(config))
62 |
63 | model_load_state.empty()
64 |
65 | st.subheader("Input")
66 | upload = st.file_uploader("PDF to analyse", accept_multiple_files=False)
67 |
68 | if upload:
69 |
70 | pdf = upload.getvalue()
71 |
72 | base64_pdf = base64.b64encode(pdf).decode("utf-8")
73 |
74 | doc = reader(pdf)
75 |
76 | body = doc.aggregated_texts["body"].text
77 | styles = doc.aggregated_texts["body"].properties
78 |
79 | pdf_display = f"""\
80 | """
86 |
87 | st.subheader("Output")
88 |
89 | with st.expander("Visualisation"):
90 |
91 | merged = merge_boxes(sorted(doc.text_boxes))
92 |
93 | imgs = show_annotations(pdf=pdf, annotations=merged)
94 |
95 | page = st.selectbox("Pages", options=[i + 1 for i in range(len(imgs))]) - 1
96 |
97 | st.image(imgs[page])
98 |
99 | # with st.expander("PDF"):
100 | # st.markdown(pdf_display, unsafe_allow_html=True)
101 |
102 | with st.expander("Text"):
103 | if body is None:
104 | st.warning(
105 | "No text detected... Are you sure this is a text-based PDF?\n\n"
106 | "There is no support for OCR within EDS-PDF (for now?)."
107 | )
108 | else:
109 | st.markdown("```\n" + body + "\n```")
110 |
111 | with st.expander("Styles"):
112 | if styles is None:
113 | st.warning(
114 | "No text detected... Are you sure this is a text-based PDF?\n\n"
115 | "There is no support for OCR within EDS-PDF (for now?)."
116 | )
117 | else:
118 | st.dataframe(pd.DataFrame(styles))
119 |
--------------------------------------------------------------------------------
/demo/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/aphp/edspdf.git
2 | streamlit
3 |
--------------------------------------------------------------------------------
/docs/alternatives.md:
--------------------------------------------------------------------------------
1 | # Alternatives & Comparison
2 |
3 | EDS-PDF was developed to propose a more modular and extendable approach to PDF extraction than [PDFBox](https://pdfbox.apache.org/), the legacy implementation at APHP's clinical data warehouse.
4 |
5 | EDS-PDF takes inspiration from Explosion's [spaCy](https://spacy.io) pipelining system and closely follows its API. Therefore, the core object within EDS-PDF is the Pipeline, which organises the processing of PDF documents into multiple components. However, unlike spaCy, the library is built around a single deep learning framework, pytorch, which makes model development easier.
6 |
--------------------------------------------------------------------------------
/docs/assets/images/model-parallelism.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aphp/edspdf/7177a29534181686d035c85fa667784e7d930482/docs/assets/images/model-parallelism.png
--------------------------------------------------------------------------------
/docs/assets/images/multiprocessing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aphp/edspdf/7177a29534181686d035c85fa667784e7d930482/docs/assets/images/multiprocessing.png
--------------------------------------------------------------------------------
/docs/assets/logo/aphp-blue.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | image/svg+xml
79 |
--------------------------------------------------------------------------------
/docs/assets/logo/aphp-white.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | image/svg+xml
79 |
--------------------------------------------------------------------------------
/docs/assets/logo/edspdf-red.svg:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/docs/assets/stylesheets/extra.css:
--------------------------------------------------------------------------------
1 | [data-md-color-scheme="default"] {
2 | --md-primary-fg-color: #006bb6;
3 | --md-primary-fg-color--light: #006bb6;
4 | --md-accent-fg-color: #006bb6;
5 | --md-accent-fg-color--light: #006bb6;
6 | }
7 |
8 | [data-md-color-scheme="slate"] {
9 | --md-primary-fg-color: #006bb6;
10 | --md-primary-fg-color--dark: #006bb6;
11 | --md-accent-fg-color: #006bb6;
12 | --md-accent-fg-color--light: #006bb6;
13 | }
14 |
15 | :root {
16 | --md-admonition-icon--aphp: url('data:image/svg+xml;charset=utf-8, ');
17 | }
18 |
19 |
20 | .md-typeset .admonition.aphp,
21 | .md-typeset details.aphp {
22 | border-color: rgb(0, 107, 182);
23 | }
24 |
25 | .md-typeset .aphp > .admonition-title,
26 | .md-typeset .aphp > summary {
27 | background-color: rgba(0, 107, 182, 0.1);
28 | border-color: rgb(0, 107, 182);
29 | }
30 |
31 | .md-typeset .aphp > .admonition-title::before,
32 | .md-typeset .aphp > summary::before {
33 | background-color: rgb(0, 107, 182);
34 | -webkit-mask-image: var(--md-admonition-icon--aphp);
35 | mask-image: var(--md-admonition-icon--aphp);
36 | }
37 |
38 |
39 | :root {
40 | --md-code-font: Consolas, Roboto Mono, Roboto;
41 | --md-code-bg-color: rgba(175, 184, 193, .2);
42 |
43 | --md-typeset-color: #24292e;
44 | }
45 |
46 |
47 | :root, [data-md-color-scheme=default] {
48 | --md-main-bg: #eef4f8;
49 | }
50 |
51 | :root, [data-md-color-scheme=slate] {
52 | --md-main-bg: hsl(232deg 15% 25%);;
53 | }
54 |
55 | html {
56 | }
57 |
58 | body, input {
59 | font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Oxygen, Ubuntu, Cantarell, "Fira Sans", "Droid Sans", "Helvetica Neue", Arial, sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol";;
60 | font-weight: 400;
61 | font-feature-settings: normal;
62 | }
63 |
64 | .md-typeset h1, .md-typeset h2, .md-typeset h3, .md-typeset h4, .md-typeset h5, .md-typeset h6 {
65 | word-wrap: normal;
66 | color: var(--md-typeset-color);
67 | font-weight: 600;
68 | scroll-margin-top: 1.25rem;
69 | letter-spacing: 0;
70 | }
71 |
72 | .md-typeset h1 {
73 | border-bottom: 1px solid #d8dee4;
74 | }
75 |
76 | .md-nav {
77 | font-size: 0.8rem;
78 | }
79 |
80 | .md-typeset code {
81 | font-size: 0.95em;
82 | }
83 |
84 | .md-typeset pre > code, .termy > [data-termynal], .highlighttable .linenos {
85 | font-size: .75rem;
86 | }
87 |
88 | .termy > [data-termynal] {
89 | font-size: 0.8rem;
90 | font-family: var(--md-code-font);
91 | padding: 45px 45px 25px;
92 | }
93 |
94 | .termy > [data-termynal] {
95 |
96 | }
97 |
98 | .md-typeset :is(.admonition,details) {
99 | font-size: inherit !important;
100 | }
101 |
102 | .highlight span.filename, .quote > summary {
103 | font-size: 0.85em;
104 | padding-top: 0.3em;
105 | padding-bottom: 0.3em;
106 | }
107 |
108 | .md-typeset pre > code, .highlight span.filename {
109 | border-top-left-radius: 5px;
110 | border-top-right-radius: 5px;
111 | }
112 |
113 | .md-typeset pre > code {
114 | border-bottom-left-radius: 5px;
115 | border-bottom-right-radius: 5px;
116 | }
117 |
118 | .md-main__inner {
119 | margin-top: 0;
120 | }
121 |
122 | .md-typeset__table td > a {
123 | white-space: nowrap;
124 | }
125 |
126 | @media screen and (min-width: 76.1875em) {
127 | .md-sidebar {
128 | margin-top: 1.5rem;
129 | }
130 | }
131 |
132 | @media screen and (min-width: 60em) {
133 | .md-nav--secondary .md-nav__title {
134 | background: var(--md-main-bg) !important;
135 | box-shadow: 0 0 0.4rem 0.4rem var(--md-main-bg) !important;
136 | }
137 | }
138 |
139 | @media screen and (min-width: 76.25em) {
140 | .md-nav--primary .md-nav__title, .md-nav--secondary .md-nav__title, .md-nav--lifted > .md-nav__list > .md-nav__item--active > .md-nav__link {
141 | background: var(--md-main-bg) !important;
142 | box-shadow: 0 0 0.4rem 0.4rem var(--md-main-bg) !important;
143 | }
144 | }
145 |
146 | .md-content {
147 | background: var(--md-default-bg-color);
148 | }
149 |
150 | .md-main {
151 | background: var(--md-main-bg);
152 | }
153 |
154 | .md-content__inner {
155 | margin-top: 1.5rem;
156 | }
157 |
158 | .doc td > code {
159 | word-break: normal;
160 | }
161 |
--------------------------------------------------------------------------------
/docs/assets/templates/python/material/docstring.html:
--------------------------------------------------------------------------------
1 | {% if docstring_sections %}
2 | {{ log.debug("Rendering docstring") }}
3 | {% for section in docstring_sections %}
4 | {% if not config.only_parameters %}
5 | {% if section.kind.value == "text" %}
6 | {{ section.value|convert_markdown(heading_level, html_id) }}
7 | {% elif section.kind.value == "attributes" %}
8 | {% include "docstring/attributes.html" with context %}
9 | {% elif section.kind.value == "parameters" %}
10 | {% include "docstring/parameters.html" with context %}
11 | {% elif section.kind.value == "other parameters" %}
12 | {% include "docstring/other_parameters.html" with context %}
13 | {% elif section.kind.value == "raises" %}
14 | {% include "docstring/raises.html" with context %}
15 | {% elif section.kind.value == "warns" %}
16 | {% include "docstring/warns.html" with context %}
17 | {% elif section.kind.value == "yields" %}
18 | {% include "docstring/yields.html" with context %}
19 | {% elif section.kind.value == "receives" %}
20 | {% include "docstring/receives.html" with context %}
21 | {% elif section.kind.value == "returns" %}
22 | {% include "docstring/returns.html" with context %}
23 | {% elif section.kind.value == "examples" %}
24 | {% include "docstring/examples.html" with context %}
25 | {% elif section.kind.value == "admonition" %}
26 | {% include "docstring/admonition.html" with context %}
27 | {% endif %}
28 | {% elif section.kind.value == "parameters" %}
29 | {% include "docstring/parameters.html" with context %}
30 | {% elif section.kind.value == "attributes" %}
31 | {% include "docstring/attributes.html" with context %}
32 | {% endif %}
33 | {% endfor %}
34 | {% endif %}
35 |
--------------------------------------------------------------------------------
/docs/assets/templates/python/material/docstring/examples.html:
--------------------------------------------------------------------------------
1 | {{ "# Examples\n"|convert_markdown(heading_level, html_id) }}
2 | {% for section_type, sub_section in section.value %}
3 | {% if section_type.value == "text" %}
4 | {{ sub_section|convert_markdown(heading_level, html_id) }}
5 | {% elif section_type.value == "examples" %}
6 | {{ sub_section|highlight(language="pycon", linenums=False) }}
7 | {% endif %}
8 | {% endfor %}
9 |
--------------------------------------------------------------------------------
/docs/assets/templates/python/material/docstring/parameters.html:
--------------------------------------------------------------------------------
1 | {{ log.debug("Rendering parameters section") }}
2 | {% if is_merged_init %}
3 | {{ "# Parameters\n"|convert_markdown(heading_level, html_id) }}
4 | {% endif %}
5 | {% if config.docstring_section_style == "table" %}
6 | {% block table_style %}
7 | {{ section.title or "Parameters:" }}
8 |
9 |
10 |
11 | Name
12 | Type
13 | Description
14 | Default
15 |
16 |
17 |
18 | {% for parameter in section.value %}
19 | {% if not config.only_parameters or parameter.name not in ("nlp", "name", "vocab", "scorer") %}
20 |
21 | {{ parameter.name }}
22 |
23 | {% if parameter.annotation %}
24 | {% with expression = parameter.annotation %}
25 | {% include "expression.html" with context %}
26 | {% endwith %}
27 | {% endif %}
28 |
29 | {{ parameter.description|convert_markdown(heading_level, html_id) }}
30 |
31 | {% if parameter.default %}
32 | {% with expression = parameter.default %}
33 | {% include "expression.html" with context %}
34 | {% endwith %}
35 | {% else %}
36 | required
37 | {% endif %}
38 |
39 |
40 | {% endif %}
41 | {% endfor %}
42 |
43 |
44 | {% endblock table_style %}
45 | {% elif config.docstring_section_style == "list" %}
46 | {% block list_style %}
47 | {{ section.title or "Parameters:" }}
48 |
49 | {% for parameter in section.value %}
50 | {% if not config.only_parameters or parameter.name not in ("nlp", "name", "vocab", "scorer") %}
51 |
52 | {{ parameter.name }}
53 | {% if parameter.annotation %}
54 | {% with expression = parameter.annotation %}
55 | ({% include "expression.html" with context %}
)
56 | {% endwith %}
57 | {% endif %}
58 | – {{ parameter.description|convert_markdown(heading_level, html_id) }}
59 |
60 | {% endif %}
61 | {% endfor %}
62 |
63 | {% endblock list_style %}
64 | {% elif config.docstring_section_style == "spacy" %}
65 | {% block spacy_style %}
66 |
67 |
68 |
69 | {{ (section.title or "PARAMETER").rstrip(":").upper() }}
70 | DESCRIPTION
71 |
72 |
73 |
74 | {% for parameter in section.value %}
75 | {% if not config.only_parameters or parameter.name not in ("nlp", "name", "vocab", "scorer") %}
76 |
77 | {{ parameter.name }}
78 |
79 | {{ parameter.description|convert_markdown(heading_level, html_id) }}
80 |
81 | {% if parameter.annotation %}
82 |
83 | TYPE:
84 | {% with expression = parameter.annotation %}
85 | {% include "expression.html" with context %}
86 | {% endwith %}
87 |
88 | {% endif %}
89 | {% if parameter.default %}
90 |
91 | DEFAULT:
92 | {% with expression = parameter.default %}
93 | {% include "expression.html" with context %}
94 | {% endwith %}
95 |
96 | {% endif %}
97 |
98 |
99 |
100 | {% endif %}
101 | {% endfor %}
102 |
103 |
104 | {% endblock spacy_style %}
105 | {% endif %}
106 |
--------------------------------------------------------------------------------
/docs/assets/templates/python/material/function.html:
--------------------------------------------------------------------------------
1 | {{ log.debug("Rendering " + function.path) }}
2 |
3 |
4 | {% with html_id = function.path %}
5 |
6 | {% if root %}
7 | {% set show_full_path = config.show_root_full_path %}
8 | {% set root_members = True %}
9 | {% elif root_members %}
10 | {% set show_full_path = config.show_root_members_full_path or config.show_object_full_path %}
11 | {% set root_members = False %}
12 | {% else %}
13 | {% set show_full_path = config.show_object_full_path %}
14 | {% endif %}
15 |
16 | {% if not root or config.show_root_heading %}
17 |
18 | {% filter heading(heading_level,
19 | role="function",
20 | id=html_id,
21 | class="doc doc-heading",
22 | toc_label=function.name ~ "()") %}
23 |
24 | {% if config.separate_signature %}
25 |
{% if show_full_path %}{{ function.path }}{% else %}{{ function.name }}{% endif %}
26 | {% else %}
27 | {% filter highlight(language="python", inline=True) %}
28 | {% if show_full_path %}{{ function.path }}{% else %}{{ function.name }}{% endif %}
29 | {% include "signature.html" with context %}
30 | {% endfilter %}
31 | {% endif %}
32 |
33 | {% with labels = function.labels %}
34 | {% include "labels.html" with context %}
35 | {% endwith %}
36 |
37 | {% endfilter %}
38 |
39 | {% if config.separate_signature %}
40 | {% filter highlight(language="python", inline=False) %}
41 | {% filter format_signature(config.line_length) %}
42 | {% if show_full_path %}{{ function.path }}{% else %}{{ function.name }}{% endif %}
43 | {% include "signature.html" with context %}
44 | {% endfilter %}
45 | {% endfilter %}
46 | {% endif %}
47 |
48 | {% else %}
49 | {% if config.show_root_toc_entry %}
50 | {% filter heading(heading_level,
51 | role="function",
52 | id=html_id,
53 | toc_label=function.path if config.show_root_full_path else function.name,
54 | hidden=True) %}
55 | {% endfilter %}
56 | {% endif %}
57 | {% set heading_level = heading_level - 1 %}
58 | {% endif %}
59 |
60 |
61 | {% with docstring_sections = function.docstring.parsed %}
62 | {% include "docstring.html" with context %}
63 | {% endwith %}
64 |
65 | {% if not config.only_parameters and config.show_source and function.source %}
66 |
67 | Source code in {{ function.relative_filepath }}
68 | {{ function.source|highlight(language="python", linestart=function.lineno, linenums=True) }}
69 |
70 | {% endif %}
71 |
72 |
73 | {% endwith %}
74 |
75 |
--------------------------------------------------------------------------------
/docs/assets/termynal/termynal.css:
--------------------------------------------------------------------------------
1 | /**
2 | * termynal.js
3 | *
4 | * @author Ines Montani
5 | * @version 0.0.1
6 | * @license MIT
7 | *
8 | * Modified version from https://github.com/tiangolo/typer
9 | */
10 |
11 | :root {
12 | --color-bg: #252a33;
13 | --color-text: #eee;
14 | --color-text-subtle: #a2a2a2;
15 | }
16 |
17 | [data-termynal] {
18 | width: auto;
19 | max-width: 100%;
20 | background: var(--color-bg);
21 | color: var(--color-text);
22 | font-size: 18px;
23 | /* font-family: 'Fira Mono', Consolas, Menlo, Monaco, 'Courier New', Courier, monospace; */
24 | font-family: 'Roboto Mono', 'Fira Mono', Consolas, Menlo, Monaco, 'Courier New', Courier, monospace;
25 | border-radius: 4px;
26 | padding: 75px 45px 35px;
27 | position: relative;
28 | -webkit-box-sizing: border-box;
29 | box-sizing: border-box;
30 | }
31 |
32 | [data-termynal]:before {
33 | content: '';
34 | position: absolute;
35 | top: 15px;
36 | left: 15px;
37 | display: inline-block;
38 | width: 15px;
39 | height: 15px;
40 | border-radius: 50%;
41 | /* A little hack to display the window buttons in one pseudo element. */
42 | background: #d9515d;
43 | -webkit-box-shadow: 25px 0 0 #f4c025, 50px 0 0 #3ec930;
44 | box-shadow: 25px 0 0 #f4c025, 50px 0 0 #3ec930;
45 | }
46 |
47 | [data-termynal]:after {
48 | content: 'bash';
49 | position: absolute;
50 | color: var(--color-text-subtle);
51 | top: 5px;
52 | left: 0;
53 | width: 100%;
54 | text-align: center;
55 | }
56 |
57 | a[data-terminal-control] {
58 | text-align: right;
59 | display: block;
60 | color: #aebbff;
61 | }
62 |
63 | [data-terminal-copy] {
64 | text-align: right;
65 | position: absolute;
66 | top: 5px;
67 | right: 5px;
68 | }
69 |
70 | [data-terminal-copy].md-icon {
71 | color: #aebbff;
72 | }
73 |
74 | [data-ty] {
75 | display: block;
76 | line-height: 2;
77 | }
78 |
79 | [data-ty]:before {
80 | /* Set up defaults and ensure empty lines are displayed. */
81 | content: '';
82 | display: inline-block;
83 | vertical-align: middle;
84 | }
85 |
86 | [data-ty="input"]:before,
87 | [data-ty-prompt]:before {
88 | margin-right: 0.72em;
89 | color: var(--color-text-subtle);
90 | }
91 |
92 | [data-ty="input"]:before {
93 | content: '$';
94 | }
95 |
96 | [data-ty][data-ty-prompt]:before {
97 | content: attr(data-ty-prompt);
98 | }
99 |
100 | [data-ty-cursor]:after {
101 | content: attr(data-ty-cursor);
102 | font-family: monospace;
103 | margin-left: 0.5em;
104 | -webkit-animation: blink 1s infinite;
105 | animation: blink 1s infinite;
106 | }
107 |
108 |
109 | /* Cursor animation */
110 |
111 | @-webkit-keyframes blink {
112 | 50% {
113 | opacity: 0;
114 | }
115 | }
116 |
117 | @keyframes blink {
118 | 50% {
119 | opacity: 0;
120 | }
121 | }
122 |
123 | /* tooltip */
124 |
125 | [data-md-state="open"] {
126 | transform: translateY(0);
127 | opacity: 1;
128 | transition:
129 | transform 400ms cubic-bezier(0.075, 0.85, 0.175, 1),
130 | opacity 400ms;
131 | pointer-events: initial;
132 | }
133 |
--------------------------------------------------------------------------------
/docs/changelog.md:
--------------------------------------------------------------------------------
1 | ---8<--- "changelog.md"
2 |
--------------------------------------------------------------------------------
/docs/configuration.md:
--------------------------------------------------------------------------------
1 | # Configuration
2 |
3 | EDS-PDF is built on top of the [`confit`](https://github.com/aphp/confit) configuration system.
4 |
5 | The following [catalogue](https://github.com/explosion/catalogue) registries are included within EDS-PDF:
6 |
7 | | Section | Description |
8 | |---------------|-------------------------------------------|
9 | | `factory` | Components factories (most often classes) |
10 | | `adapter` | Raw data preprocessing functions |
11 |
12 | EDS-PDF pipelines are meant to be reproducible and serializable, such that you can always define a pipeline through the configuration system.
13 |
14 | To wit, compare the API-based approach to the configuration-based approach (the two are strictly equivalent):
15 |
16 | === "API-based"
17 |
18 | ```python hl_lines="4-13"
19 | import edspdf
20 | from pathlib import Path
21 |
22 | model = edspdf.Pipeline()
23 | model.add_pipe("pdfminer-extractor", name="extractor")
24 | model.add_pipe("mask-classifier", name="classifier", config=dict(
25 | x0=0.2,
26 | x1=0.9,
27 | y0=0.3,
28 | y1=0.6,
29 | threshold=0.1,
30 | )
31 | model.add_pipe("simple-aggregator", name="aggregator")
32 |
33 | # Get a PDF
34 | pdf = Path("letter.pdf").read_bytes()
35 |
36 | pdf = model(pdf)
37 |
38 | str(pdf.aggregated_texts["body"])
39 | # Out: Cher Pr ABC, Cher DEF,\n...
40 | ```
41 |
42 | === "Configuration-based"
43 |
44 | ```toml title="config.cfg"
45 | [pipeline]
46 | pipeline = ["extractor", "classifier", "aggregator"]
47 |
48 | [components.extractor]
49 | @factory = "pdfminer-extractor"
50 |
51 | [components.classifier]
52 | @factory = "mask-classifier"
53 | x0 = 0.2
54 | x1 = 0.9
55 | y0 = 0.3
56 | y1 = 0.6
57 | threshold = 0.1
58 |
59 | [components.aggregator]
60 | @factory = "simple-aggregator"
61 | ```
62 |
63 | ```python hl_lines="4"
64 | import edspdf
65 | from pathlib import Path
66 |
67 | pipeline = edspdf.load("config.cfg")
68 |
69 | # Get a PDF
70 | pdf = Path("letter.pdf").read_bytes()
71 |
72 | pdf = pipeline(pdf)
73 |
74 | str(pdf.aggregated_texts["body"])
75 | # Out: Cher Pr ABC, Cher DEF,\n...
76 | ```
77 |
78 | The configuration-based approach strictly separates the definition of the pipeline
 79 | from its application and avoids tucking away important configuration details.
80 | Changes to the pipeline are transparent as there is a single source of truth: the configuration file.
81 |
--------------------------------------------------------------------------------
/docs/contributing.md:
--------------------------------------------------------------------------------
1 | ---8<--- "contributing.md"
2 |
--------------------------------------------------------------------------------
/docs/data-structures.md:
--------------------------------------------------------------------------------
1 | # Data Structures
2 |
3 |
4 | EDS-PDF stores PDFs and their annotation in a custom data structures that are
5 | designed to be easy to use and manipulate. We must distinguish between:
6 |
7 | - the data models used to store the PDFs and exchange them between the
8 | different components of EDS-PDF
9 | - the tensors structures used to process the PDFs with deep learning models
10 |
11 | ## Itinerary of a PDF
12 |
 13 | A PDF is first converted to a [PDFDoc][edspdf.structures.PDFDoc] object, which contains the raw PDF content. This task is usually performed by a [PDF extractor component](/components/extractors). Once the PDF is converted, the same object will be used and updated by the different components, and returned at the end of the pipeline.
14 |
15 | When running a trainable component, the [PDFDoc][edspdf.structures.PDFDoc] is preprocessed and converted to tensors containing relevant features for the task. This task is performed in the `preprocess` method of the component. The resulting tensors are then collated together to form a batch, in the `collate` method of the component. After running the `forward` method of the component, the tensor predictions are finally assigned as annotations to original [PDFDoc][edspdf.structures.PDFDoc] objects in the `postprocess` method.
16 |
17 |
18 | ## Data models
19 |
 20 | The main data structure is the [PDFDoc][edspdf.structures.PDFDoc], which represents a full PDF document. It contains the raw PDF content, annotations for the full document, regardless of pages. A PDF is split into [Page][edspdf.structures.Page] objects that store their number, dimension and optionally an image of the rendered page.
21 |
22 | The PDF annotations are stored in [Box][edspdf.structures.Box] objects, which represent a rectangular region of the PDF. At the moment, box can only be specialized into [TextBox][edspdf.structures.TextBox] to represent text regions, such as lines extracted by a PDF extractor. Aggregated texts are stored in [Text][edspdf.structures.Text] objects, that are not associated with a specific box.
23 |
 24 | A [TextBox][edspdf.structures.TextBox] contains a list of [TextProperties][edspdf.structures.TextProperties] objects to store the style properties of the styled spans of the text.
25 |
26 | ??? note "Reference"
27 |
28 | ::: edspdf.structures
29 | options:
30 | heading_level: 3
31 |
32 | ## Tensor structure
33 |
34 | The tensors used to process PDFs with deep learning models usually contain 4 main dimensions, in addition to the standard embedding dimensions:
35 |
36 | - `samples`: one entry per PDF in the batch
37 | - `pages`: one entry per page in a PDF
38 | - `boxes`: one entry per box in a page
39 | - `token`: one entry per token in a box (only for text boxes)
40 |
41 | These tensors use a special [FoldedTensor](http://pypi.org/project/foldedtensor) format to store the data in a compact way and reshape the data depending on the requirements of a layer.
42 |
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | # Overview
2 |
 3 | EDS-PDF provides a modular framework to extract text information from PDF documents.
4 |
5 | You can use it out-of-the-box, or extend it to fit your use-case.
6 |
7 | ## Getting started
8 |
9 | ### Installation
10 |
11 | Install the library with pip:
12 |
13 |
14 |
15 | ```console
16 | $ pip install edspdf
17 | ---> 100%
18 | color:green Installation successful
19 | ```
20 |
21 |
22 |
23 | ### Extracting text
24 |
25 | Let's build a simple PDF extractor that uses a rule-based classifier. There are two
26 | ways to do this, either by using the [configuration system](#configuration) or by using
27 | the pipeline API.
28 |
29 | === "Configuration based pipeline"
30 |
31 | Create a configuration file:
32 |
33 | ```toml title="config.cfg"
34 | [pipeline]
35 | pipeline = ["extractor", "classifier", "aggregator"]
36 |
37 | [components.extractor]
38 | @factory = "pdfminer-extractor"
39 |
40 | [components.classifier]
41 | @factory = "mask-classifier"
42 | x0 = 0.2
43 | x1 = 0.9
44 | y0 = 0.3
45 | y1 = 0.6
46 | threshold = 0.1
47 |
48 | [components.aggregator]
49 | @factory = "simple-aggregator"
50 | ```
51 |
52 | and load it from Python:
53 |
54 | ```python
55 | import edspdf
56 | from pathlib import Path
57 |
58 | model = edspdf.load("config.cfg") # (1)
59 | ```
60 |
61 | === "API based pipeline"
62 |
63 | Or create a pipeline directly from Python:
64 |
65 | ```python
66 | from edspdf import Pipeline
67 |
68 | model = Pipeline()
69 | model.add_pipe("pdfminer-extractor")
70 | model.add_pipe(
71 | "mask-classifier",
72 | config=dict(
73 | x0=0.2,
74 | x1=0.9,
75 | y0=0.3,
76 | y1=0.6,
77 | threshold=0.1,
78 | ),
79 | )
80 | model.add_pipe("simple-aggregator")
81 | ```
82 |
83 | This pipeline can then be applied (for instance with this [PDF](https://github.com/aphp/edspdf/raw/main/tests/resources/letter.pdf)):
84 |
85 | ```python
86 | # Get a PDF
87 | pdf = Path("/Users/perceval/Development/edspdf/tests/resources/letter.pdf").read_bytes()
88 | pdf = model(pdf)
89 |
90 | body = pdf.aggregated_texts["body"]
91 |
92 | text, style = body.text, body.properties
93 | ```
94 |
95 | See the [rule-based recipe](recipes/rule-based.md) for a step-by-step explanation of what is happening.
96 |
97 | ## Citation
98 |
99 | If you use EDS-PDF, please cite us as below.
100 |
101 | ```bibtex
102 | @article{gerardin_wajsburt_pdf,
103 | title={Bridging Clinical PDFs and Downstream Natural Language Processing: An Efficient Neural Approach to Layout Segmentation},
104 | author={G{\'e}rardin, Christel Ducroz and Wajsburt, Perceval and Dura, Basile and Calliger, Alice and Mouchet, Alexandre and Tannier, Xavier and Bey, Romain},
105 | journal={Available at SSRN 4587624}
106 | }
107 | ```
108 |
109 | ## Acknowledgement
110 |
111 | We would like to thank [Assistance Publique – Hôpitaux de Paris](https://www.aphp.fr/) and
112 | [AP-HP Foundation](https://fondationrechercheaphp.fr/) for funding this project.
113 |
--------------------------------------------------------------------------------
/docs/layers/box-transformer-layer.md:
--------------------------------------------------------------------------------
1 | # BoxTransformerLayer {: #edspdf.layers.box_transformer.BoxTransformerLayer }
2 |
3 | ::: edspdf.layers.box_transformer.BoxTransformerLayer
4 | options:
5 | heading_level: 2
6 | show_bases: false
7 | show_source: false
8 |
--------------------------------------------------------------------------------
/docs/layers/box-transformer.md:
--------------------------------------------------------------------------------
1 | # BoxTransformerModule {: #edspdf.layers.box_transformer.BoxTransformerModule }
2 |
3 | ::: edspdf.layers.box_transformer.BoxTransformerModule
4 | options:
5 | heading_level: 2
6 | show_bases: false
7 | show_source: false
8 |
--------------------------------------------------------------------------------
/docs/layers/index.md:
--------------------------------------------------------------------------------
1 | # Deep learning layers
2 |
3 | EDS-PDF provides a set of specialized deep learning layers that can be used to build trainable
4 | components. These layers are built on top of the PyTorch framework and can be used in
5 | any PyTorch model.
6 |
7 | | Layer | Description |
8 | |---------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------|
9 | | [`BoxTransformerModule`][edspdf.layers.box_transformer.BoxTransformerModule] | Contextualize box embeddings with a 2d Transformer with relative position representations |
10 | | [`BoxTransformerLayer`][edspdf.layers.box_transformer.BoxTransformerLayer] | A single layer of the above `BoxTransformerModule` layer |
11 | | [`RelativeAttention`][edspdf.layers.relative_attention.RelativeAttention] | A 2d attention layer that optionally uses relative position to compute its attention scores |
12 | | [`SinusoidalEmbedding`][edspdf.layers.sinusoidal_embedding.SinusoidalEmbedding] | A position embedding that uses trigonometric functions to encode positions |
 13 | | [`Vocabulary`][edspdf.layers.vocabulary.Vocabulary]                             | A non-deep-learning layer to encode / decode vocabularies                                    |
14 |
--------------------------------------------------------------------------------
/docs/layers/relative-attention.md:
--------------------------------------------------------------------------------
1 | # RelativeAttention {: #edspdf.layers.relative_attention.RelativeAttention }
2 |
3 |
4 | ::: edspdf.layers.relative_attention.RelativeAttention
5 | options:
6 | heading_level: 2
7 | show_bases: false
8 | show_source: false
9 |
--------------------------------------------------------------------------------
/docs/layers/sinusoidal-embedding.md:
--------------------------------------------------------------------------------
1 | # SinusoidalEmbedding {: #edspdf.layers.sinusoidal_embedding.SinusoidalEmbedding }
2 |
3 | ::: edspdf.layers.sinusoidal_embedding.SinusoidalEmbedding
4 | options:
5 | heading_level: 2
6 | show_bases: false
7 | show_source: false
8 |
--------------------------------------------------------------------------------
/docs/layers/vocabulary.md:
--------------------------------------------------------------------------------
1 | # Vocabulary {: #edspdf.layers.vocabulary.Vocabulary }
2 |
3 | ::: edspdf.layers.vocabulary.Vocabulary
4 | options:
5 | heading_level: 2
6 | show_bases: false
7 | show_source: false
8 | show_category_heading: true
9 |
--------------------------------------------------------------------------------
/docs/pipeline.md:
--------------------------------------------------------------------------------
1 | # Pipeline {: #edspdf.pipeline.Pipeline }
2 |
3 | The goal of EDS-PDF is to provide a **framework** for processing PDF documents, along with some utilities and a few components, stitched together by a robust pipeline and configuration system.
4 |
5 | Processing PDFs usually involves many steps such as extracting lines, running OCR models, detecting and classifying boxes, filtering and aggregating parts of the extracted texts, etc. Organising these steps together, combining static and deep learning components, while remaining modular and efficient is a challenge. This is why EDS-PDF is built on top of a new pipelining system.
6 |
7 |
8 | !!! note "Deep learning frameworks"
9 |
10 | The EDS-PDF trainable components are built around the PyTorch framework. While you
11 | can use any technology in static components, we do not provide tools to train
12 | components built with other deep learning frameworks.
13 |
14 | ## Creating a pipeline
15 |
16 | A pipe is a processing block (like a function) that applies a transformation on its input and returns a modified object.
17 |
18 | At the moment, four types of pipes are implemented in the library:
19 |
20 | 1. **extraction** components extract lines from a raw PDF and return a [`PDFDoc`][edspdf.structures.PDFDoc] object filled with these text boxes.
21 | 2. **classification** components classify each box with labels, such as `body`, `header`, `footer`...
 22 | 3. **aggregation** components compile the lines together according to their classes to re-create the original text.
23 | 4. **embedding** components don't directly update the annotations on the document but have specific deep-learning methods (see the [TrainablePipe][edspdf.trainable_pipe.TrainablePipe] page) that can be composed to form a machine learning model.
24 |
25 | To create your first pipeline, execute the following code:
26 |
27 | ```python
28 | from edspdf import Pipeline
29 |
30 | model = Pipeline()
31 | # will extract text lines from a document
32 | model.add_pipe(
33 | "pdfminer-extractor",
34 | config=dict(
35 | extract_style=False,
36 | ),
37 | )
38 | # classify everything inside the `body` bounding box as `body`
39 | model.add_pipe(
40 | "mask-classifier", config=dict(body={"x0": 0.1, "y0": 0.1, "x1": 0.9, "y1": 0.9})
41 | )
42 | # aggregates the lines together to re-create the original text
43 | model.add_pipe("simple-aggregator")
44 | ```
45 |
46 | This pipeline can then be run on one or more PDF documents.
47 | As the pipeline process documents, components will be called in the order
48 | they were added to the pipeline.
49 |
50 | ```python
51 | from pathlib import Path
52 |
53 | pdf_bytes = Path("path/to/your/pdf").read_bytes()
54 |
55 | # Processing one document
56 | model(pdf_bytes)
57 |
58 | # Processing multiple documents
59 | model.pipe([pdf_bytes, ...])
60 | ```
61 |
62 | For more information on how to use the pipeline, refer to the [Inference](/inference) page.
63 |
64 | ### Hybrid models
65 |
66 | EDS-PDF was designed to facilitate the training and inference of hybrid models that
67 | arbitrarily chain static components or trained deep learning components. Static components are callable objects that take a PDFDoc object as input, perform arbitrary transformations over the input, and return the modified object. [Trainable pipes][edspdf.trainable_pipe.TrainablePipe], on the other hand, allow for deep learning operations to be performed on the [PDFDoc][edspdf.structures.PDFDoc] object and must be trained to be used.
68 |
69 | ## Saving and loading a pipeline
70 |
71 | Pipelines can be saved and loaded using the `save` and `load` methods. The saved pipeline is not a pickled object but a folder containing the config file, the weights and extra resources for each pipeline. This allows for easy inspection and modification of the pipeline, and avoids the execution of arbitrary code when loading a pipeline.
72 |
73 | ```python
74 | model.save("path/to/your/model")
75 | model = edspdf.load("path/to/your/model")
76 | ```
77 |
78 | To share the pipeline and turn it into a pip installable package, you can use the `package` method, which will use or create a pyproject.toml file, fill it accordingly, and create a wheel file. At the moment, we only support the poetry package manager.
79 |
80 | ```python
81 | model.package(
82 | name="your-package-name", # leave None to reuse name in pyproject.toml
83 | version="0.0.1",
84 | root_dir="path/to/project/root", # optional, to retrieve an existing pyproject.toml file
85 | # if you don't have a pyproject.toml, you can provide the metadata here instead
86 | metadata=dict(
87 |         authors="Firstname Lastname <firstname.lastname@example.com>",
88 | description="A short description of your package",
89 | ),
90 | )
91 | ```
92 |
93 | This will create a wheel file in the root_dir/dist folder, which you can share and install with pip
94 |
--------------------------------------------------------------------------------
/docs/pipes/aggregators/index.md:
--------------------------------------------------------------------------------
1 | # Aggregation
2 |
3 | The aggregation step compiles extracted text blocs together according to their detected class.
4 |
5 |
6 |
7 | | Factory name | Description |
8 | |-------------------------------------------------------------------------|-------------------------------------------------------------------|
9 | | [`simple-aggregator`][edspdf.pipes.aggregators.simple.SimpleAggregator] | Returns a dictionary with one key for each detected class |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/docs/pipes/aggregators/simple-aggregator.md:
--------------------------------------------------------------------------------
1 | ::: edspdf.pipes.aggregators.simple
2 | options:
3 | heading_level: 1
4 |
--------------------------------------------------------------------------------
/docs/pipes/box-classifiers/dummy.md:
--------------------------------------------------------------------------------
1 | # Dummy classifier {: #edspdf.pipes.classifiers.dummy.DummyClassifier }
2 |
3 | ::: edspdf.pipes.classifiers.dummy.DummyClassifier
4 | options:
5 | heading_level: 2
6 | show_bases: false
7 | show_source: false
8 |
--------------------------------------------------------------------------------
/docs/pipes/box-classifiers/index.md:
--------------------------------------------------------------------------------
1 | # Box classifiers
2 |
3 | We developed EDS-PDF with modularity in mind. To that end, you can choose between multiple classification methods.
4 |
5 |
6 |
7 | | Factory name | Description |
8 | |--------------------------------------------------------------------------------------------------|-----------------------------------------|
9 | | [`mask-classifier`][edspdf.pipes.classifiers.mask.simple_mask_classifier_factory] | Simple rule-based classification |
10 | | [`multi-mask-classifier`][edspdf.pipes.classifiers.mask.mask_classifier_factory] | Simple rule-based classification |
11 | | [`dummy-classifier`][edspdf.pipes.classifiers.dummy.DummyClassifier] | Dummy classifier, for testing purposes. |
12 | | [`random-classifier`][edspdf.pipes.classifiers.random.RandomClassifier] | To sow chaos |
13 | | [`trainable-classifier`][edspdf.pipes.classifiers.trainable.TrainableClassifier] | Trainable box classification model |
14 |
15 |
16 |
--------------------------------------------------------------------------------
/docs/pipes/box-classifiers/mask.md:
--------------------------------------------------------------------------------
1 | # Mask Classification
2 |
3 | We developed a simple classifier that roughly uses the same strategy as PDFBox, namely:
4 |
5 | - define a "mask" on the PDF documents ;
6 | - keep every text bloc within that mask, tag everything else as pollution.
7 |
8 | ## Factories
9 |
10 | Two factories are available in the `classifiers` registry: `mask-classifier` and `multi-mask-classifier`.
11 |
12 | ### `mask-classifier` {: #edspdf.pipes.classifiers.mask.simple_mask_classifier_factory }
13 |
14 | ::: edspdf.pipes.classifiers.mask.simple_mask_classifier_factory
15 | options:
16 | heading_level: 4
17 | show_bases: false
18 | show_source: false
19 |
20 | ---
21 |
22 | ### `multi-mask-classifier` {: #edspdf.pipes.classifiers.mask.mask_classifier_factory }
23 |
24 | ::: edspdf.pipes.classifiers.mask.mask_classifier_factory
25 | options:
26 | heading_level: 4
27 | show_bases: false
28 | show_source: false
29 |
--------------------------------------------------------------------------------
/docs/pipes/box-classifiers/random.md:
--------------------------------------------------------------------------------
1 | # Random classifier {: #edspdf.pipes.classifiers.random.RandomClassifier }
2 |
3 | ::: edspdf.pipes.classifiers.random.RandomClassifier
4 | options:
5 | heading_level: 2
6 | show_bases: false
7 | show_source: false
8 |
--------------------------------------------------------------------------------
/docs/pipes/box-classifiers/trainable.md:
--------------------------------------------------------------------------------
1 | # Trainable classifier {: #edspdf.pipes.classifiers.trainable.TrainableClassifier }
2 |
3 | ::: edspdf.pipes.classifiers.trainable.TrainableClassifier
4 | options:
5 | heading_level: 2
6 | show_bases: false
7 | show_source: false
8 |
--------------------------------------------------------------------------------
/docs/pipes/embeddings/box-layout-embedding.md:
--------------------------------------------------------------------------------
1 | # BoxLayoutEmbedding {: #edspdf.pipes.embeddings.box_layout_embedding.BoxLayoutEmbedding }
2 |
3 | ::: edspdf.pipes.embeddings.box_layout_embedding.BoxLayoutEmbedding
4 | options:
5 | heading_level: 2
6 | show_bases: false
7 | show_source: false
8 |
--------------------------------------------------------------------------------
/docs/pipes/embeddings/box-transformer.md:
--------------------------------------------------------------------------------
1 | # BoxTransformer {: #edspdf.pipes.embeddings.box_transformer.BoxTransformer }
2 |
3 | ::: edspdf.pipes.embeddings.box_transformer.BoxTransformer
4 | options:
5 | heading_level: 2
6 | show_bases: false
7 | show_source: false
8 |
--------------------------------------------------------------------------------
/docs/pipes/embeddings/embedding-combiner.md:
--------------------------------------------------------------------------------
1 | # EmbeddingCombiner {: #edspdf.pipes.embeddings.embedding_combiner.EmbeddingCombiner }
2 |
3 | ::: edspdf.pipes.embeddings.embedding_combiner.EmbeddingCombiner
4 | options:
5 | heading_level: 2
6 | show_bases: false
7 | show_source: false
8 |
--------------------------------------------------------------------------------
/docs/pipes/embeddings/huggingface-embedding.md:
--------------------------------------------------------------------------------
1 | # HuggingfaceEmbedding {: #edspdf.pipes.embeddings.huggingface_embedding.HuggingfaceEmbedding }
2 |
3 | ::: edspdf.pipes.embeddings.huggingface_embedding.HuggingfaceEmbedding
4 | options:
5 | heading_level: 2
6 | show_bases: false
7 | show_source: false
8 |
--------------------------------------------------------------------------------
/docs/pipes/embeddings/index.md:
--------------------------------------------------------------------------------
1 | # Embeddings
2 |
3 | We offer multiple embedding methods to encode the text and layout information of the PDFs. The following components can be added to a pipeline or composed together, and contain preprocessing and postprocessing logic to convert and batch documents.
4 |
5 |
6 |
7 |
12 |
13 | | Factory name | Description |
14 | |-----------------------------------------------------------------------------------------------|-------------------------------------------------------------------|
15 | | [`simple-text-embedding`][edspdf.pipes.embeddings.simple_text_embedding.SimpleTextEmbedding] | A module that embeds the textual features of the blocks. |
16 | | [`embedding-combiner`][edspdf.pipes.embeddings.embedding_combiner.EmbeddingCombiner] | Encodes boxes using a combination of multiple encoders |
17 | | [`sub-box-cnn-pooler`][edspdf.pipes.embeddings.sub_box_cnn_pooler.SubBoxCNNPooler] | Pools the output of a CNN over the elements of a box (like words) |
18 | | [`box-layout-embedding`][edspdf.pipes.embeddings.box_layout_embedding.BoxLayoutEmbedding] | Encodes the layout of the boxes |
19 | | [`box-transformer`][edspdf.pipes.embeddings.box_transformer.BoxTransformer] | Contextualizes box representations using a transformer |
20 | | [`huggingface-embedding`][edspdf.pipes.embeddings.huggingface_embedding.HuggingfaceEmbedding] | Box representations using a Huggingface multi-modal model. |
21 |
22 |
23 |
24 | !!! warning "Layers"
25 | These components are not to be confused with [`layers`](/layers), which are standard
26 | PyTorch modules that can be used to build trainable components, such as the ones
27 | described here.
28 |
--------------------------------------------------------------------------------
/docs/pipes/embeddings/simple-text-embedding.md:
--------------------------------------------------------------------------------
1 | # SimpleTextEmbedding {: #edspdf.pipes.embeddings.simple_text_embedding.SimpleTextEmbedding }
2 |
3 | ::: edspdf.pipes.embeddings.simple_text_embedding.SimpleTextEmbedding
4 | options:
5 | heading_level: 2
6 | show_bases: false
7 | show_source: false
8 |
--------------------------------------------------------------------------------
/docs/pipes/embeddings/sub-box-cnn-pooler.md:
--------------------------------------------------------------------------------
1 | # SubBoxCNNPooler {: #edspdf.pipes.embeddings.sub_box_cnn_pooler.SubBoxCNNPooler }
2 |
3 | ::: edspdf.pipes.embeddings.sub_box_cnn_pooler.SubBoxCNNPooler
4 | options:
5 | heading_level: 2
6 | show_bases: false
7 | show_source: false
8 |
--------------------------------------------------------------------------------
/docs/pipes/extractors/index.md:
--------------------------------------------------------------------------------
1 | # Extraction
2 |
3 | The extraction phase consists of reading the PDF document and gathering text blocs, along with their dimensions and position within the document. Said blocs will go on to the classification phase to separate the body from the rest.
4 |
5 | ## Text-based PDF
6 |
7 | We provide multiple extractor architectures for text-based PDFs:
8 |
9 |
10 |
11 | | Factory name | Description |
12 | |----------------------------------------------------------------------------|-------------------------------------------------|
13 | | [`pdfminer-extractor`][edspdf.pipes.extractors.pdfminer.PdfMinerExtractor] | Extracts text lines with the `pdfminer` library |
14 | | [`mupdf-extractor`][edspdf_mupdf.MuPdfExtractor] | Extracts text lines with the `pymupdf` library |
15 | | [`poppler-extractor`][edspdf_poppler.PopplerExtractor] | Extracts text lines with the `poppler` library |
16 |
17 |
18 |
19 | ## Image-based PDF
20 |
21 | Image-based PDF documents require an OCR[^1] step, which is not natively supported by EDS-PDF.
22 | However, you can easily extend EDS-PDF by adding such a method to the registry.
23 |
24 | We plan on adding such an OCR extractor component in the future.
25 |
26 | [^1]: Optical Character Recognition, or OCR, is the process of extracting characters and words from an image.
27 |
--------------------------------------------------------------------------------
/docs/pipes/extractors/pdfminer.md:
--------------------------------------------------------------------------------
1 | # PdfMiner Extractor {: #edspdf.pipes.extractors.pdfminer.PdfMinerExtractor }
2 |
3 | ::: edspdf.pipes.extractors.pdfminer.PdfMinerExtractor
4 | options:
5 | heading_level: 2
6 | show_bases: false
7 | show_source: false
8 |
--------------------------------------------------------------------------------
/docs/pipes/index.md:
--------------------------------------------------------------------------------
1 | # Components overview
2 |
3 | EDS-PDF provides easy-to-use components for defining PDF processing pipelines.
4 |
5 |
6 |
7 | === "Box extractors"
8 |
9 | --8<-- "docs/pipes/extractors/index.md:components"
10 |
11 | === "Box classifiers"
12 |
13 | --8<-- "docs/pipes/box-classifiers/index.md:components"
14 |
15 |
16 | === "Aggregators"
17 |
18 | --8<-- "docs/pipes/aggregators/index.md:components"
19 |
20 |
21 | === "Embeddings"
22 |
23 | --8<-- "docs/pipes/embeddings/index.md:components"
24 |
25 | You can add them to your EDS-PDF pipeline by simply calling `add_pipe`, for instance:
26 |
27 |
28 |
29 | ```python
30 | # ↑ Omitted code that defines the pipeline object ↑
31 | pipeline.add_pipe("pdfminer-extractor", name="component-name", config=...)
32 | ```
33 |
--------------------------------------------------------------------------------
/docs/recipes/annotation.md:
--------------------------------------------------------------------------------
1 | # PDF Annotation
2 |
3 | In this section, we will cover one methodology to annotate PDF documents.
4 |
5 | !!! aphp "Data annotation at AP-HP's CDW"
6 |
7 | At AP-HP's CDW[^1], we recently moved away from a rule- and Java-based PDF extraction pipeline
8 | (using PDFBox) to one using EDS-PDF. Hence, EDS-PDF is used in production, helping
9 | extract text from around 100k PDF documents every day.
10 |
11 |     To train our pipeline presently in production, we annotated **around 270 documents**, and reached
12 |     an **f1-score of 0.98** on the body classification.
13 |
14 | ## Preparing the data for annotation
15 |
16 | We will frame the annotation phase as an image segmentation task,
17 | where annotators are asked to draw bounding boxes around the different sections.
18 | Hence, the very first step is to convert PDF documents to images. We suggest using the
19 | library `pdf2image` for that step.
20 |
21 | The following script will convert the PDF documents located in a `data/pdfs` directory
22 | to PNG images inside the `data/images` folder.
23 |
24 | ```python
25 | import pdf2image
26 | from pathlib import Path
27 |
28 | DATA_DIR = Path("data")
29 | PDF_DIR = DATA_DIR / "pdfs"
30 | IMAGE_DIR = DATA_DIR / "images"
31 |
32 | for pdf in PDF_DIR.glob("*.pdf"):
33 |     imgs = pdf2image.convert_from_path(pdf)
34 |
35 | for page, img in enumerate(imgs):
36 | path = IMAGE_DIR / f"{pdf.stem}_{page}.png"
37 | img.save(path)
38 | ```
39 |
40 | You can use any annotation tool to annotate the images. If you're looking for a simple
41 | way to annotate from within a Jupyter Notebook,
42 | [ipyannotations](https://ipyannotations.readthedocs.io/en/latest/examples/image-landmarks.html#annotating-bounding-boxes)
43 | might be a good fit.
44 |
45 | You will need to post-process the output
46 | to convert the annotations to the following format:
47 |
48 | | Key | Description |
49 | |---------|--------------------------------------------------------------------|
50 | | `page` | Page within the PDF (0-indexed) |
51 | | `x0` | Horizontal position of the top-left corner of the bounding box |
52 | | `x1` | Horizontal position of the bottom-right corner of the bounding box |
53 | | `y0` | Vertical position of the top-left corner of the bounding box |
54 | | `y1` | Vertical position of the bottom-right corner of the bounding box |
55 | | `label` | Class of the bounding box (eg `body`, `header`...) |
56 |
57 | All dimensions should be normalised by the height and width of the page.
58 |
59 | ## Saving the dataset
60 |
61 | Once the annotation phase is complete, make sure the train/test split is performed
62 | once and for all when you create the dataset.
63 |
64 | We suggest the following structure:
65 |
66 | ```title="Directory structure"
67 | dataset/
68 | ├── train/
69 | │   ├── <note_id_1>.pdf
70 | │   ├── <note_id_1>.json
71 | │   ├── <note_id_2>.pdf
72 | │   ├── <note_id_2>.json
73 | │   └── ...
74 | └── test/
75 |     ├── <note_id_n>.pdf
76 |     ├── <note_id_n>.json
77 |     └── ...
78 | ```
79 |
80 | Where the normalised annotation resides in a JSON file living next to the related PDF,
81 | and uses the following schema:
82 |
83 | | Key | Description |
84 | | -------------- | ----------------------------------------------- |
85 | | `note_id` | Reference to the document |
86 | | `<property>`   | Optional property of the document itself        |
87 | | `annotations` | List of annotations, following the schema above |
88 |
89 | This structure presents the advantage of being machine- and human-friendly.
90 | The JSON file contains annotated regions as well as any document property that
91 | could be useful to adapt the pipeline (typically for the classification step).
92 |
93 | ## Extracting annotations
94 |
95 | The following snippet extracts the annotations into a workable format:
96 |
97 | ```python
98 | import json
99 | from pathlib import Path
100 | import pandas as pd
101 | from tqdm import tqdm
100 |
101 |
102 | def get_annotations(
103 | directory: Path,
104 | ) -> pd.DataFrame:
105 | """
106 | Read annotations from the dataset directory.
107 |
108 | Parameters
109 | ----------
110 | directory : Path
111 | Dataset directory
112 |
113 | Returns
114 | -------
115 | pd.DataFrame
116 | Pandas DataFrame containing the annotations.
117 | """
118 | dfs = []
119 |
120 | iterator = tqdm(list(directory.glob("*.json")))
121 |
122 | for path in iterator:
123 | meta = json.loads(path.read_text())
124 | df = pd.DataFrame.from_records(meta.pop("annotations"))
125 |
126 | for k, v in meta.items(): # (1)
127 | df[k] = v
128 |
129 | dfs.append(df)
130 |
131 | return pd.concat(dfs)
132 |
133 |
134 | train_path = Path("dataset/train")
135 |
136 | annotations = get_annotations(train_path)
137 | ```
138 |
139 | 1. Add a column for each additional property saved in the dataset.
140 |
141 | The annotations compiled this way can be used to train a pipeline.
142 | See the [trained pipeline recipe](./training.md) for more detail.
143 |
144 | [^1]: Greater Paris University Hospital's Clinical Data Warehouse
145 |
--------------------------------------------------------------------------------
/docs/recipes/extension.md:
--------------------------------------------------------------------------------
1 | # Extending EDS-PDF
2 |
3 | EDS-PDF is organised around a function registry powered by catalogue and a custom configuration system. The result is a powerful framework that is easy to extend - and we'll see how in this section.
4 |
5 | For this recipe, let's imagine we're not entirely satisfied with the aggregation
6 | proposed by EDS-PDF. For instance, we might want an aggregator that outputs the
7 | text in Markdown format.
8 |
9 | !!! note
10 |
11 | Properly converting to markdown is no easy task. For this example,
12 | we will limit ourselves to detecting bold and italics sections.
13 |
14 | ## Developing the new aggregator
15 |
16 | Our aggregator will inherit from the [`SimpleAggregator`][edspdf.pipes.aggregators.simple.SimpleAggregator],
17 | and use the style to detect italics and bold sections.
18 |
19 | ```python title="markdown_aggregator.py"
20 | from edspdf import registry
21 | from edspdf.pipes.aggregators.simple import SimpleAggregator
22 | from edspdf.structures import PDFDoc, Text
23 |
24 |
25 | @registry.factory.register("markdown-aggregator") # (1)
26 | class MarkdownAggregator(SimpleAggregator):
27 | def __call__(self, doc: PDFDoc) -> PDFDoc:
28 | doc = super().__call__(doc)
29 |
30 | for label in doc.aggregated_texts.keys():
31 | text = doc.aggregated_texts[label].text
32 |
33 | fragments = []
34 |
35 | offset = 0
36 | for s in doc.aggregated_texts[label].properties:
37 | if s.begin >= s.end:
38 | continue
39 | if offset < s.begin:
40 | fragments.append(text[offset : s.begin])
41 |
42 | offset = s.end
43 | snippet = text[s.begin : s.end]
44 | if s.bold:
45 | snippet = f"**{snippet}**"
46 | if s.italic:
47 | snippet = f"_{snippet}_"
48 | fragments.append(snippet)
49 |
50 | if offset < len(text):
51 | fragments.append(text[offset:])
52 |
53 | doc.aggregated_texts[label] = Text(text="".join(fragments))
54 |
55 | return doc
56 | ```
57 |
58 | 1. The new aggregator is registered via this line
59 | 2. The new aggregator redefines the `__call__` method.
60 | It will output a single string, corresponding to the markdown-formatted output.
61 |
62 | That's it! You can use this new aggregator with the API:
63 |
64 | ```python
65 | from edspdf import Pipeline
66 | from markdown_aggregator import MarkdownAggregator # (1)
67 |
68 | model = Pipeline()
69 | # will extract text lines from a document
70 | model.add_pipe(
71 | "pdfminer-extractor",
72 | config=dict(
73 | extract_style=False,
74 | ),
75 | )
76 | # classify everything inside the `body` bounding box as `body`
77 | model.add_pipe("mask-classifier", config={"x0": 0.1, "y0": 0.1, "x1": 0.9, "y1": 0.9})
78 | # aggregates the lines together to generate the markdown formatted text
79 | model.add_pipe("markdown-aggregator")
80 | ```
81 |
82 | 1. We're importing the aggregator that we just defined.
83 |
84 | It all works relatively smoothly!
85 |
86 | ## Making the aggregator discoverable
87 |
88 | Now, how can we instantiate the pipeline using the configuration system?
89 | The registry needs to be aware of the new function, but we shouldn't have to
90 | import `markdown_aggregator.py` just so that the module is registered as a side-effect...
91 |
92 | Catalogue solves this problem by using Python _entry points_.
93 |
94 | === "pyproject.toml"
95 |
96 | ```toml
97 | [project.entry-points."edspdf_factories"]
98 | "markdown-aggregator" = "markdown_aggregator:MarkdownAggregator"
99 | ```
100 |
101 | === "setup.py"
102 |
103 | ```python
104 | from setuptools import setup
105 |
106 | setup(
107 | name="edspdf-markdown-aggregator",
108 | entry_points={
109 | "edspdf_factories": [
110 | "markdown-aggregator = markdown_aggregator:MarkdownAggregator"
111 | ]
112 | },
113 | )
114 | ```
115 |
116 | By declaring the new aggregator as an entrypoint, it will become discoverable by EDS-PDF
117 | as long as it is installed in your environment!
118 |
--------------------------------------------------------------------------------
/docs/recipes/index.md:
--------------------------------------------------------------------------------
1 | # EDS-PDF Recipes
2 |
3 | This section goes over a few use-cases for PDF extraction.
4 | It is meant as a more hands-on tutorial to get a grip on the library.
5 |
--------------------------------------------------------------------------------
/docs/recipes/resources/lines.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aphp/edspdf/7177a29534181686d035c85fa667784e7d930482/docs/recipes/resources/lines.jpeg
--------------------------------------------------------------------------------
/docs/recipes/resources/merged.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aphp/edspdf/7177a29534181686d035c85fa667784e7d930482/docs/recipes/resources/merged.jpeg
--------------------------------------------------------------------------------
/docs/recipes/rule-based.md:
--------------------------------------------------------------------------------
1 | # Rule-based extraction
2 |
3 | Let's create a rule-based extractor for PDF documents.
4 |
5 | !!! note
6 |
7 | This pipeline will likely perform poorly as soon as your PDF documents
8 | come in varied forms. In that case, even a very simple trained pipeline
9 | may give you a substantial performance boost (see [next section](training.md)).
10 |
11 | First, download this example [PDF](https://github.com/aphp/edspdf/raw/main/tests/resources/letter.pdf).
12 |
13 | We will use the following configuration:
14 |
15 | ```toml title="config.cfg"
16 | [pipeline]
17 | components = ["extractor", "classifier", "aggregator"]
18 | components_config = ${components}
19 |
20 | [components.extractor]
21 | @factory = "pdfminer-extractor" # (2)
22 | extract_style = true
23 |
24 | [components.classifier]
25 | @factory = "mask-classifier" # (3)
26 | x0 = 0.2
27 | x1 = 0.9
28 | y0 = 0.3
29 | y1 = 0.6
30 | threshold = 0.1
31 |
32 | [components.aggregator]
33 | @factory = "simple-aggregator" # (4)
34 | ```
35 |
36 | 1. This is the top-level object, which organises the entire extraction process.
37 | 2. Here we use the provided text-based extractor, based on the PDFMiner library
38 | 3. This is where we define the rule-based classifier. Here, we use a "mask",
39 | meaning that every text bloc that falls within the boundaries will be assigned
40 | the `body` label, everything else will be tagged as pollution.
41 | 4. This aggregator returns a tuple of dictionaries. The first contains compiled text for each
42 | label, the second exports their style.
43 |
44 | Save the configuration as `config.cfg` and run the following snippet:
45 |
46 | ```python
47 | import edspdf
48 | import pandas as pd
49 | from pathlib import Path
50 |
51 | model = edspdf.load("config.cfg") # (1)
52 |
53 | # Get a PDF
54 | pdf = Path("/Users/perceval/Development/edspdf/tests/resources/letter.pdf").read_bytes()
55 | pdf = model(pdf)
56 |
57 | body = pdf.aggregated_texts["body"]
58 |
59 | text, style = body.text, body.properties
60 | print(text)
61 | print(pd.DataFrame(style))
62 | ```
63 |
64 | This code will output the following results:
65 |
66 | === "Visualisation"
67 |
68 | 
69 |
70 | === "Extracted Text"
71 |
72 | ```
73 | Cher Pr ABC, Cher DEF,
74 |
75 | Nous souhaitons remercier le CSE pour son avis favorable quant à l’accès aux données de
76 | l’Entrepôt de Données de Santé du projet n° XXXX.
77 |
78 | Nous avons bien pris connaissance des conditions requises pour cet avis favorable, c’est
79 | pourquoi nous nous engageons par la présente à :
80 |
81 | • Informer individuellement les patients concernés par la recherche, admis à l'AP-HP
82 | avant juillet 2017, sortis vivants, et non réadmis depuis.
83 |
84 | • Effectuer une demande d'autorisation à la CNIL en cas d'appariement avec d’autres
85 | cohortes.
86 |
87 | Bien cordialement,
88 | ```
89 |
90 | === "Extracted Style"
91 |
92 | The `start` and `end` columns refer to the character indices within the extracted text.
93 |
94 | | italic | bold | fontname | start | end |
95 | |--------|--------|----------------|-------|-----|
96 | | False | False | BCDFEE+Calibri | 0 | 22 |
97 | | False | False | BCDFEE+Calibri | 24 | 90 |
98 | | False | False | BCDHEE+Calibri | 90 | 91 |
99 | | False | False | BCDFEE+Calibri | 91 | 111 |
100 | | False | False | BCDFEE+Calibri | 112 | 113 |
101 | | False | False | BCDHEE+Calibri | 113 | 114 |
102 | | False | False | BCDFEE+Calibri | 114 | 161 |
103 | | False | False | BCDFEE+Calibri | 163 | 247 |
104 | | False | False | BCDHEE+Calibri | 247 | 248 |
105 | | False | False | BCDFEE+Calibri | 248 | 251 |
106 | | False | False | BCDFEE+Calibri | 252 | 300 |
107 | | False | False | SymbolMT | 302 | 303 |
108 | | False | False | BCDFEE+Calibri | 304 | 386 |
109 | | False | False | BCDFEE+Calibri | 387 | 445 |
110 | | False | False | SymbolMT | 447 | 448 |
111 | | False | False | BCDFEE+Calibri | 449 | 523 |
112 | | False | False | BCDHEE+Calibri | 523 | 524 |
113 | | False | False | BCDFEE+Calibri | 524 | 530 |
114 | | False | False | BCDFEE+Calibri | 531 | 540 |
115 | | False | False | BCDFEE+Calibri | 542 | 560 |
116 |
--------------------------------------------------------------------------------
/docs/references.bib:
--------------------------------------------------------------------------------
1 | @article{vaswani2017attention,
2 | title={Attention is all you need},
3 | author={Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia},
4 | journal={Advances in neural information processing systems},
5 | volume={30},
6 | year={2017}
7 | }
8 |
--------------------------------------------------------------------------------
/docs/roadmap.md:
--------------------------------------------------------------------------------
1 | ---8<--- "roadmap.md"
2 |
--------------------------------------------------------------------------------
/docs/utilities/alignment.md:
--------------------------------------------------------------------------------
1 | # Alignment
2 |
3 | To simplify the annotation process, EDS-PDF provides a [utility that aligns
4 | bounding boxes][edspdf.utils.alignment.align_box_labels] with text blocs extracted from a PDF document.
5 | This is particularly useful for annotating documents.
6 |
7 | === "Blocs"
8 |
9 | 
10 |
11 | === "Blocs + Annotation"
12 |
13 | 
14 |
15 | === "Aligned"
16 |
17 | 
18 |
19 | === "Merged Blocs"
20 |
21 | 
22 |
--------------------------------------------------------------------------------
/docs/utilities/index.md:
--------------------------------------------------------------------------------
1 | # Overview
2 |
3 | EDS-PDF provides a few utilities to help annotate PDF documents, and debug the output of an extraction pipeline.
4 |
--------------------------------------------------------------------------------
/docs/utilities/resources/aligned-merged.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aphp/edspdf/7177a29534181686d035c85fa667784e7d930482/docs/utilities/resources/aligned-merged.jpeg
--------------------------------------------------------------------------------
/docs/utilities/resources/aligned.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aphp/edspdf/7177a29534181686d035c85fa667784e7d930482/docs/utilities/resources/aligned.jpeg
--------------------------------------------------------------------------------
/docs/utilities/resources/blocs.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aphp/edspdf/7177a29534181686d035c85fa667784e7d930482/docs/utilities/resources/blocs.jpeg
--------------------------------------------------------------------------------
/docs/utilities/resources/blocs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aphp/edspdf/7177a29534181686d035c85fa667784e7d930482/docs/utilities/resources/blocs.png
--------------------------------------------------------------------------------
/docs/utilities/resources/lines.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aphp/edspdf/7177a29534181686d035c85fa667784e7d930482/docs/utilities/resources/lines.jpeg
--------------------------------------------------------------------------------
/docs/utilities/resources/merged.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aphp/edspdf/7177a29534181686d035c85fa667784e7d930482/docs/utilities/resources/merged.jpeg
--------------------------------------------------------------------------------
/docs/utilities/visualisation.md:
--------------------------------------------------------------------------------
1 | # Visualisation
2 |
3 | EDS-PDF provides utilities to help you visualise the output of the pipeline.
4 |
5 | ## Visualising a pipeline's output
6 |
7 | You can use EDS-PDF to overlay labelled bounding boxes on top of a PDF document.
8 |
9 | ```python
10 | import edspdf
11 | from confit import Config
12 | from pathlib import Path
13 | from edspdf.visualization import show_annotations
14 |
15 | config = """
16 | [pipeline]
17 | pipeline = ["extractor", "classifier"]
18 |
19 | [components]
20 |
21 | [components.extractor]
22 | @factory = "pdfminer-extractor"
23 | extract_style = true
24 |
25 | [components.classifier]
26 | @factory = "mask-classifier"
27 | x0 = 0.25
28 | x1 = 0.95
29 | y0 = 0.3
30 | y1 = 0.9
31 | threshold = 0.1
32 | """
33 |
34 | model = edspdf.load(Config.from_str(config))
35 |
36 | # Get a PDF
37 | pdf = Path("/Users/perceval/Development/edspdf/tests/resources/letter.pdf").read_bytes()
38 |
39 | # Construct the DataFrame of blocs
40 | doc = model(pdf)
41 |
42 | # Compute an image representation of each page of the PDF
43 | # overlaid with the predicted bounding boxes
44 | imgs = show_annotations(pdf=pdf, annotations=doc.text_boxes)
45 |
46 | imgs[0]
47 | ```
48 |
49 | If you run this code in a Jupyter notebook, you'll see the following:
50 |
51 | 
52 |
53 | ## Merging blocs together
54 |
55 | To help debug a pipeline (or a labelled dataset), you might want to
  56 | merge blocs together according to their labels. EDS-PDF provides a `merge_boxes` method
57 | that does just that.
58 |
59 | ```python
60 | # ↑ Omitted code above ↑
61 | from edspdf.visualization import merge_boxes, show_annotations
62 |
63 | merged = merge_boxes(doc.text_boxes)
64 |
65 | imgs = show_annotations(pdf=pdf, annotations=merged)
66 | imgs[0]
67 | ```
68 |
69 | See the difference:
70 |
71 | === "Original"
72 |
73 | 
74 |
75 | === "Merged"
76 |
77 | 
78 |
79 | The `merge_boxes` method uses the notion of maximal cliques to compute merges.
80 | It forbids the combined blocs from overlapping with any bloc from another label.
81 |
--------------------------------------------------------------------------------
/edspdf/__init__.py:
--------------------------------------------------------------------------------
1 | # ruff: noqa: F401
2 | from .trainable_pipe import TrainablePipe
3 | from .pipeline import Pipeline, load
4 | from .registry import registry
5 | from .structures import Box, Page, PDFDoc, Text, TextBox, TextProperties
6 | from . import data
7 |
8 | from . import utils # isort:skip
9 |
10 | __version__ = "0.10.0"
11 |
--------------------------------------------------------------------------------
/edspdf/accelerators/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aphp/edspdf/7177a29534181686d035c85fa667784e7d930482/edspdf/accelerators/__init__.py
--------------------------------------------------------------------------------
/edspdf/accelerators/base.py:
--------------------------------------------------------------------------------
class Accelerator:
    """Base class for accelerator implementations; carries no behaviour of its own."""
3 |
--------------------------------------------------------------------------------
/edspdf/accelerators/multiprocessing.py:
--------------------------------------------------------------------------------
1 | from typing import List, Optional, Union
2 |
3 | import torch
4 |
5 | from ..registry import registry
6 | from .base import Accelerator
7 |
8 |
@registry.accelerator.register("multiprocessing")
class MultiprocessingAccelerator(Accelerator):
    """
    Deprecated: Use `docs.map_pipeline(model).set_processing(...)` instead

    This class only stores the multiprocessing configuration; it performs no
    work itself.
    """

    def __init__(
        self,
        batch_size: int,
        num_cpu_workers: Optional[int] = None,
        num_gpu_workers: Optional[int] = None,
        gpu_pipe_names: Optional[List[str]] = None,
        gpu_worker_devices: Optional[List[Union[torch.device, str]]] = None,
        cpu_worker_devices: Optional[List[Union[torch.device, str]]] = None,
    ):
        # Number of documents processed per batch
        self.batch_size = batch_size
        # Worker counts (None lets the consumer decide)
        self.num_cpu_workers = num_cpu_workers
        self.num_gpu_workers: Optional[int] = num_gpu_workers
        # Names of the pipes that should run on GPU workers
        self.gpu_pipe_names = gpu_pipe_names
        # Explicit device assignments for GPU / CPU workers
        self.gpu_worker_devices = gpu_worker_devices
        self.cpu_worker_devices = cpu_worker_devices
30 |
--------------------------------------------------------------------------------
/edspdf/data/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import TYPE_CHECKING
2 | from edspdf.utils.lazy_module import lazify
3 |
4 | lazify()
5 |
6 | if TYPE_CHECKING:
7 | from .base import from_iterable, to_iterable
8 | from .files import read_files, write_files
9 | from .parquet import read_parquet, write_parquet
10 | from .pandas import from_pandas, to_pandas
11 | from .converters import get_dict2doc_converter, get_doc2dict_converter
12 |
--------------------------------------------------------------------------------
/edspdf/data/converters.py:
--------------------------------------------------------------------------------
1 | """
2 | Converters are used to convert documents between python dictionaries and Doc objects.
3 | There are two types of converters: readers and writers. Readers convert dictionaries to
4 | Doc objects, and writers convert Doc objects to dictionaries.
5 | """
6 | import inspect
7 | from copy import copy
8 | from types import FunctionType
9 | from typing import (
10 | Any,
11 | Callable,
12 | Dict,
13 | Optional,
14 | Tuple,
15 | )
16 |
17 | from confit.registry import ValidatedFunction
18 |
19 | FILENAME = "__FILENAME__"
20 | CONTENT = "__CONTENT__"
21 |
22 | SCHEMA = {}
23 |
24 |
def validate_kwargs(converter, kwargs):
    """
    Validate `kwargs` against the signature of `converter` (minus its first
    parameter, which receives the document at call time) and return the
    validated keyword arguments.

    Parameters
    ----------
    converter: FunctionType
        The converter function whose signature drives the validation.
    kwargs: Dict
        Keyword arguments supplied by the caller.

    Returns
    -------
    Dict
        The validated keyword arguments, with pydantic bookkeeping entries
        removed.
    """
    # BUGFIX: `copy.copy` returns the *same* object for functions (functions
    # are treated as atomic by the copy module), so the previous code mutated
    # the caller's converter in place. Build a genuine clone before patching
    # its annotations and defaults.
    clone = FunctionType(
        converter.__code__,
        converter.__globals__,
        name=converter.__name__,
        argdefs=converter.__defaults__,
        closure=converter.__closure__,
    )
    clone.__dict__.update(converter.__dict__)
    clone.__kwdefaults__ = copy(converter.__kwdefaults__)
    clone.__annotations__ = dict(converter.__annotations__)
    spec = inspect.getfullargspec(clone)
    first = spec.args[0]
    # Make the first (document) parameter optional so validation of the
    # remaining kwargs can run without providing it.
    clone.__annotations__[first] = Optional[Any]
    clone.__defaults__ = (None, *(spec.defaults or ())[-len(spec.args) + 1 :])
    vd = ValidatedFunction(clone, {"arbitrary_types_allowed": True})
    model = vd.init_model_instance(**kwargs)
    d = {
        k: v
        for k, v in model._iter()
        if (k in model.__fields__ or model.__fields__[k].default_factory)
    }
    d.pop("v__duplicate_kwargs", None)  # see pydantic ValidatedFunction code
    d.pop(vd.v_args_name, None)
    d.pop(first, None)
    return {**(d.pop(vd.v_kwargs_name, None) or {}), **d}
42 |
43 |
def get_dict2doc_converter(converter: Callable, kwargs) -> Tuple[Callable, Dict]:
    """
    Resolve a dict-to-doc converter together with its validated keyword
    arguments.

    (Registry-based lookup of string converter names used to live here and is
    currently disabled.)

    Parameters
    ----------
    converter: Callable
        The converter callable.
    kwargs: Dict
        Keyword arguments to validate against the converter's signature.

    Returns
    -------
    Tuple[Callable, Dict]
        The converter and the validated kwargs.
    """
    return converter, validate_kwargs(converter, kwargs)
67 |
68 |
def get_doc2dict_converter(converter: Callable, kwargs) -> Tuple[Callable, Dict]:
    """
    Resolve a doc-to-dict converter together with its validated keyword
    arguments.

    (Registry-based lookup of string converter names used to live here and is
    currently disabled.)

    Parameters
    ----------
    converter: Callable
        The converter callable.
    kwargs: Dict
        Keyword arguments to validate against the converter's signature.

    Returns
    -------
    Tuple[Callable, Dict]
        The converter and the validated kwargs.
    """
    return converter, validate_kwargs(converter, kwargs)
89 |
--------------------------------------------------------------------------------
/edspdf/data/pandas.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | from typing import Any, Callable, Iterable, Optional, Tuple, Union
4 |
5 | import pandas as pd
6 |
7 | from edspdf import registry
8 | from edspdf.data.base import BaseReader, BaseWriter
9 | from edspdf.data.converters import (
10 | FILENAME,
11 | get_dict2doc_converter,
12 | get_doc2dict_converter,
13 | )
14 | from edspdf.lazy_collection import LazyCollection
15 | from edspdf.utils.collections import dl_to_ld, flatten, ld_to_dl
16 |
17 |
class PandasReader(BaseReader):
    """Reader that yields the rows of a pandas DataFrame as task dicts."""

    DATA_FIELDS = ("data",)

    def __init__(
        self,
        data: pd.DataFrame,
        **kwargs,
    ):
        # Only genuine DataFrames are supported
        assert isinstance(data, pd.DataFrame)
        self.data = data

        super().__init__(**kwargs)

    def read_main(self) -> Iterable[Tuple[Any, int]]:
        # Convert the column-oriented frame into row dicts; each row is one task
        rows = dl_to_ld(dict(self.data))
        return ((row, 1) for row in rows)

    def read_worker(self, fragments):
        # Rows need no further processing in the worker
        return list(fragments)
36 |
37 |
@registry.readers.register("pandas")
def from_pandas(
    data,
    converter: Union[str, Callable],
    **kwargs,
) -> LazyCollection:
    """
    The PandasReader (or `edspdf.data.from_pandas`) handles reading from a table and
    yields documents. At the moment, only entities and attributes are loaded. Relations
    and events are not supported.

    Example
    -------
    ```{ .python .no-check }

    import edspdf

    nlp = edspdf.blank("eds")
    nlp.add_pipe(...)
    doc_iterator = edspdf.data.from_pandas(df, nlp=nlp, converter="omop")
    annotated_docs = nlp.pipe(doc_iterator)
    ```

    !!! note "Generator vs list"

        `edspdf.data.from_pandas` returns a
        [LazyCollection][edspdf.core.lazy_collection.LazyCollection].
        To iterate over the documents multiple times efficiently or to access them by
        index, you must convert it to a list

        ```{ .python .no-check }
        docs = list(edspdf.data.from_pandas(df, converter="omop"))
        ```

    Parameters
    ----------
    data: pd.DataFrame
        Pandas object
    converter: Optional[Union[str, Callable]]
        Converter to use to convert the rows of the DataFrame to Doc objects
    kwargs:
        Additional keyword arguments passed to the converter. These are documented
        on the [Data schemas](/data/schemas) page.

    Returns
    -------
    LazyCollection
    """
    collection = LazyCollection(reader=PandasReader(data))
    if converter:
        conv_fn, conv_kwargs = get_dict2doc_converter(converter, kwargs)
        collection = collection.map(conv_fn, kwargs=conv_kwargs)
    return collection
92 |
93 |
class PandasWriter(BaseWriter):
    """Writer that collects record dicts into a single pandas DataFrame."""

    def __init__(self, dtypes: Optional[dict] = None):
        # Optional column -> dtype mapping applied to the final frame
        self.dtypes = dtypes

    def write_worker(self, records):
        # If write as jsonl, we will perform the actual writing in the `write` method
        for record in records:
            if isinstance(record, dict):
                record.pop(FILENAME, None)
        return records, len(records)

    def write_main(self, fragments):
        import pandas as pd

        # Merge all fragments into one column-oriented dict, then build a frame
        frame = pd.DataFrame(ld_to_dl(flatten(fragments)))
        if self.dtypes:
            frame = frame.astype(self.dtypes)
        return frame
111 |
112 |
@registry.writers.register("pandas")
def to_pandas(
    data: Union[Any, LazyCollection],
    converter: Optional[Union[str, Callable]],
    dtypes: Optional[dict] = None,
    **kwargs,
) -> pd.DataFrame:
    """
    `edspdf.data.to_pandas` writes a list of documents as a pandas table.

    Example
    -------
    ```{ .python .no-check }

    import edspdf

    nlp = edspdf.blank("eds")
    nlp.add_pipe(...)

    doc = nlp("My document with entities")

    edspdf.data.to_pandas([doc], converter="omop")
    ```

    Parameters
    ----------
    data: Union[Any, LazyCollection],
        The data to write (either a list of documents or a LazyCollection).
    converter: Optional[Union[str, Callable]]
        Converter to use to convert the documents to dictionary objects before storing
        them in the dataframe.
    dtypes: Optional[dict]
        Dictionary of column names to dtypes. This is passed to `pd.DataFrame.astype`.
    kwargs:
        Additional keyword arguments passed to the converter. These are documented
        on the [Data schemas](/data/schemas) page.
    """
    collection = LazyCollection.ensure_lazy(data)
    if converter:
        conv_fn, conv_kwargs = get_doc2dict_converter(converter, kwargs)
        collection = collection.map(conv_fn, kwargs=conv_kwargs)

    return collection.write(PandasWriter(dtypes))
156 |
--------------------------------------------------------------------------------
/edspdf/layers/__init__.py:
--------------------------------------------------------------------------------
1 | from . import box_transformer, relative_attention, sinusoidal_embedding, vocabulary
2 |
--------------------------------------------------------------------------------
/edspdf/layers/sinusoidal_embedding.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | import torch
4 | import torch.nn.functional as F
5 |
6 |
class SinusoidalEmbedding(torch.nn.Module):
    """
    A position embedding lookup table that stores embeddings for a fixed number
    of positions.
    The value of each of the `embedding_dim` channels of the generated embedding
    is generated according to a trigonometric function (sin for even channels,
    cos for odd channels).
    The frequency of the signal in each pair of channels varies according to the
    temperature parameter.

    Any input position above the maximum value `num_embeddings` will be capped to
    `num_embeddings - 1`.

    Odd values of `embedding_dim` are supported: the last sin channel then has
    no cos counterpart.
    """

    def __init__(
        self,
        num_embeddings: int,
        embedding_dim: int,
        temperature: float = 10000.0,
    ):
        """
        Parameters
        ----------
        num_embeddings: int
            The maximum number of position embeddings store in this table
        embedding_dim: int
            The embedding size
        temperature: float
            The temperature controls the range of frequencies used by each
            channel of the embedding
        """
        super().__init__()

        self.embedding_dim = embedding_dim
        self.num_embeddings = num_embeddings
        self.temperature = temperature

        weight = torch.zeros(self.num_embeddings, self.embedding_dim)
        position = torch.arange(0, self.num_embeddings, dtype=torch.float).unsqueeze(1)
        # One frequency per (sin, cos) channel pair, decaying geometrically
        # with the channel index.
        div_term = torch.exp(
            torch.arange(0, self.embedding_dim, 2).float()
            * (-math.log(self.temperature) / self.embedding_dim)
        )
        weight[:, 0::2] = torch.sin(position * div_term)
        # FIX: for odd embedding sizes the cos half has one channel fewer than
        # div_term, so slice div_term accordingly (no-op for even sizes).
        weight[:, 1::2] = torch.cos(position * div_term[: self.embedding_dim // 2])
        # Buffer (not a Parameter): the table is fixed and should not be trained,
        # but still moves with the module across devices.
        self.register_buffer("weight", weight)

    def extra_repr(self) -> str:
        return f"{self.num_embeddings}, {self.embedding_dim}"

    def forward(self, indices: torch.LongTensor):
        """
        Forward pass of the SinusoidalEmbedding module

        Parameters
        ----------
        indices: torch.LongTensor
            Shape: any

        Returns
        -------
        torch.FloatTensor
            Shape: `(*input_shape, embedding_dim)`
        """
        # Out-of-range positions are clamped into the table instead of raising
        res = F.embedding(indices.clamp(0, len(self.weight) - 1), self.weight)
        return res
73 |
--------------------------------------------------------------------------------
/edspdf/layers/vocabulary.py:
--------------------------------------------------------------------------------
1 | import contextlib
2 | from typing import Generic, Sequence, TypeVar
3 |
4 | import torch
5 |
T = TypeVar("T")


class Vocabulary(torch.nn.Module, Generic[T]):
    """
    Vocabulary layer mapping hashable items to integer indices.
    This is not meant to be used as a `torch.nn.Module` but subclassing
    `torch.nn.Module` makes the instances appear when printing a model, which is nice.
    """

    def __init__(self, items: Sequence[T] = None, default: int = -100):
        """
        Parameters
        ----------
        items: Sequence[InputT]
            Initial vocabulary elements if any.
            Specific elements such as padding and unk can be set here to enforce their
            index in the vocabulary.
        default: int
            Default index to use for out of vocabulary elements
            Defaults to -100
        """
        super().__init__()
        if items is None:
            self.indices = {}
        else:
            self.indices = {item: idx for idx, item in enumerate(items)}
        self.initialized = True
        self.default = default

    def __len__(self):
        return len(self.indices)

    @contextlib.contextmanager
    def initialization(self):
        """
        Enters the initialization mode.
        Out of vocabulary elements will be assigned an index.
        """
        self.initialized = False
        yield
        self.initialized = True

    def encode(self, item):
        """
        Converts an element into its vocabulary index.
        If the layer is in its initialization mode (`with vocab.initialization(): ...`),
        and the element is out of vocabulary, a new index will be created and returned.
        Otherwise, any oov element will be encoded with the `default` index.

        Parameters
        ----------
        item: InputT

        Returns
        -------
        int
        """
        if not self.initialized:
            # Growing mode: unseen items get the next available index
            return self.indices.setdefault(item, len(self.indices))
        return self.indices.get(item, self.default)

    def decode(self, idx):
        """
        Converts an index into its original value.
        Negative indices (e.g. the oov default) decode to None.

        Parameters
        ----------
        idx: int

        Returns
        -------
        InputT
        """
        if idx < 0:
            return None
        # Dict preserves insertion order, so position == index
        return list(self.indices)[idx]

    def extra_repr(self):
        return f"n={len(self.indices)}"
86 |
--------------------------------------------------------------------------------
/edspdf/pipes/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aphp/edspdf/7177a29534181686d035c85fa667784e7d930482/edspdf/pipes/__init__.py
--------------------------------------------------------------------------------
/edspdf/pipes/aggregators/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aphp/edspdf/7177a29534181686d035c85fa667784e7d930482/edspdf/pipes/aggregators/__init__.py
--------------------------------------------------------------------------------
/edspdf/pipes/classifiers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aphp/edspdf/7177a29534181686d035c85fa667784e7d930482/edspdf/pipes/classifiers/__init__.py
--------------------------------------------------------------------------------
/edspdf/pipes/classifiers/dummy.py:
--------------------------------------------------------------------------------
1 | from edspdf.pipeline import Pipeline
2 | from edspdf.registry import registry
3 | from edspdf.structures import PDFDoc
4 |
5 |
@registry.factory.register("dummy-classifier")
class DummyClassifier:
    """
    Dummy classifier that assigns the same fixed label to every content box.

    Parameters
    ----------
    label: str
        The label to assign to each line.
    pipeline: Pipeline
        The pipeline object.
    name: str
        The name of the component.
    """

    def __init__(
        self,
        label: str,
        pipeline: Pipeline = None,
        name: str = "dummy-classifier",
    ) -> None:
        self.name = name
        self.label = label

    def __call__(self, doc: PDFDoc) -> PDFDoc:
        """Label every content box of `doc` and return the document."""
        target = self.label
        for box in doc.content_boxes:
            box.label = target
        return doc
35 |
--------------------------------------------------------------------------------
/edspdf/pipes/classifiers/random.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, List, Optional, Union
2 |
3 | import numpy as np
4 |
5 | from edspdf import PDFDoc, Pipeline, registry
6 |
7 |
@registry.factory.register("random-classifier")
class RandomClassifier:
    """
    Random classifier, for chaos purposes. Classifies each box to a random element.

    Parameters
    ----------
    pipeline: Pipeline
        The pipeline object.
    labels: Union[List[str], Dict[str, float]]
        The labels to assign to each line. If a list is passed, each label is assigned
        with equal probability. If a dict is passed, the keys are the labels and the
        values are the (unnormalized) probabilities.
    seed: Optional[int]
        Seed for the random number generator, for reproducible predictions.
    name: str
        The name of the component.
    """

    def __init__(
        self,
        pipeline: Pipeline,
        labels: Union[List[str], Dict[str, float]],
        seed: Optional[int] = 0,
        name: str = "random-classifier",
    ) -> None:
        super().__init__()

        if isinstance(labels, list):
            labels = {c: 1 for c in labels}

        # Normalize the weights once; the previous version recomputed
        # sum(labels.values()) for every label (quadratic in label count).
        total = sum(labels.values())
        self.labels = {c: w / total for c, w in labels.items()}

        self.rgn = np.random.default_rng(seed=seed)

    def __call__(self, doc: PDFDoc) -> PDFDoc:
        """Draw a label for each content box according to the weights."""
        lines = doc.content_boxes
        prediction = self.rgn.choice(
            list(self.labels.keys()),
            p=list(self.labels.values()),
            size=len(lines),
        )
        for b, label in zip(lines, prediction):
            b.label = label

        return doc
52 |
--------------------------------------------------------------------------------
/edspdf/pipes/embeddings/__init__.py:
--------------------------------------------------------------------------------
1 | from foldedtensor import FoldedTensor
2 | from typing_extensions import TypedDict
3 |
4 | from edspdf import TrainablePipe
5 |
# Output contract shared by embedding pipes: a single "embeddings" entry
# holding a FoldedTensor of per-box embedding vectors.
EmbeddingOutput = TypedDict(
    "EmbeddingOutput",
    {
        "embeddings": FoldedTensor,
    },
)
12 |
--------------------------------------------------------------------------------
/edspdf/pipes/embeddings/box_layout_embedding.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from typing_extensions import Literal
3 |
4 | from edspdf.layers.sinusoidal_embedding import SinusoidalEmbedding
5 | from edspdf.pipeline import Pipeline
6 | from edspdf.pipes.embeddings import EmbeddingOutput
7 | from edspdf.pipes.embeddings.box_layout_preprocessor import (
8 | BoxLayoutBatch,
9 | BoxLayoutPreprocessor,
10 | )
11 | from edspdf.registry import registry
12 | from edspdf.trainable_pipe import TrainablePipe
13 |
14 |
@registry.factory.register("box-layout-embedding")
class BoxLayoutEmbedding(TrainablePipe[EmbeddingOutput]):
    """
    This component encodes the geometrical features of a box, as extracted by the
    BoxLayoutPreprocessor module, into an embedding. For position modes, use:

    - `"sin"` to embed positions with a fixed
      [SinusoidalEmbedding][edspdf.layers.sinusoidal_embedding.SinusoidalEmbedding]
    - `"learned"` to embed positions using a learned standard pytorch embedding layer

    Each produced embedding is the concatenation of the box width, height and the
    top, left, bottom and right coordinates, each embedded depending on the
    `*_mode` param.

    Parameters
    ----------
    size: int
        Size of the output box embedding
    n_positions: int
        Number of position embeddings stored in the PositionEmbedding module
    x_mode: Literal["sin", "learned"]
        Position embedding mode of the x coordinates
    y_mode: Literal["sin", "learned"]
        Position embedding mode of the y coordinates
    w_mode: Literal["sin", "learned"]
        Position embedding mode of the width features
    h_mode: Literal["sin", "learned"]
        Position embedding mode of the height features
    """

    def __init__(
        self,
        n_positions: int,
        size: int,
        x_mode: Literal["sin", "learned"] = "sin",
        y_mode: Literal["sin", "learned"] = "sin",
        w_mode: Literal["sin", "learned"] = "sin",
        h_mode: Literal["sin", "learned"] = "sin",
        pipeline: Pipeline = None,
        name: str = "box-layout-embedding",
    ):
        super().__init__(pipeline, name)

        # Six features are concatenated in forward (xmin, ymin, xmax, ymax,
        # width, height), each embedded with size // 6 channels; the sinusoidal
        # table additionally needs an even channel count, hence multiple of 12.
        assert size % 12 == 0, "Size must be a multiple of 12"

        self.n_positions = n_positions
        self.output_size = size

        self.x_embedding = self._make_embed(n_positions, size // 6, x_mode)
        self.y_embedding = self._make_embed(n_positions, size // 6, y_mode)
        self.w_embedding = self._make_embed(n_positions, size // 6, w_mode)
        self.h_embedding = self._make_embed(n_positions, size // 6, h_mode)
        # Learned markers added to every box located on the first / last page
        self.first_page_embedding = torch.nn.Parameter(torch.randn(size))
        self.last_page_embedding = torch.nn.Parameter(torch.randn(size))

        self.box_preprocessor = BoxLayoutPreprocessor(pipeline, "box_preprocessor")

    def preprocess(self, doc):
        # Feature extraction is delegated to the shared BoxLayoutPreprocessor
        return self.box_preprocessor.preprocess(doc)

    def collate(self, batch) -> BoxLayoutBatch:
        return self.box_preprocessor.collate(batch)

    @classmethod
    def _make_embed(cls, n_positions, size, mode):
        # "sin" -> fixed sinusoidal table; anything else -> learned table
        if mode == "sin":
            return SinusoidalEmbedding(n_positions, size)
        else:
            return torch.nn.Embedding(n_positions, size)

    def forward(self, batch: BoxLayoutBatch) -> EmbeddingOutput:
        # Each feature is scaled to a bucket index then clamped below
        # n_positions; presumably coordinates are normalized to [0, 1] —
        # TODO confirm against the extractor's output.
        # fmt: off
        embedding = (
            torch.cat(
                [
                    self.x_embedding((batch["xmin"] * self.n_positions).clamp(max=self.n_positions - 1).long()), # noqa: E501
                    self.y_embedding((batch["ymin"] * self.n_positions).clamp(max=self.n_positions - 1).long()), # noqa: E501
                    self.x_embedding((batch["xmax"] * self.n_positions).clamp(max=self.n_positions - 1).long()), # noqa: E501
                    self.y_embedding((batch["ymax"] * self.n_positions).clamp(max=self.n_positions - 1).long()), # noqa: E501
                    self.w_embedding((batch["width"] * self.n_positions).clamp(max=self.n_positions - 1).long()), # noqa: E501
                    self.h_embedding((batch["height"] * 5 * self.n_positions).clamp(max=self.n_positions - 1).long()), # noqa: E501
                ],
                dim=-1,
            )
            + self.first_page_embedding * batch["first_page"][..., None]
            + self.last_page_embedding * batch["last_page"][..., None]
        )
        # fmt: on
        # NOTE(review): heights get an extra factor of 5 that widths do not —
        # possibly because line heights are small page fractions; confirm
        # this asymmetry is intentional.
        return {"embeddings": embedding}
103 |
--------------------------------------------------------------------------------
/edspdf/pipes/embeddings/box_layout_preprocessor.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Dict
2 |
3 | import torch
4 | from foldedtensor import FoldedTensor, as_folded_tensor
5 | from typing_extensions import TypedDict
6 |
7 | from edspdf import Pipeline, TrainablePipe, registry
8 | from edspdf.structures import PDFDoc
9 |
# Batch of per-box layout features produced by BoxLayoutPreprocessor.collate:
# each entry is a FoldedTensor with dims ("sample", "page", "line"),
# folded over lines.
BoxLayoutBatch = TypedDict(
    "BoxLayoutBatch",
    {
        "xmin": FoldedTensor,
        "ymin": FoldedTensor,
        "xmax": FoldedTensor,
        "ymax": FoldedTensor,
        "width": FoldedTensor,
        "height": FoldedTensor,
        "first_page": FoldedTensor,
        "last_page": FoldedTensor,
    },
)
23 |
24 |
@registry.factory.register("box-layout-preprocessor")
class BoxLayoutPreprocessor(TrainablePipe[BoxLayoutBatch]):
    """
    The box preprocessor is a singleton since it is not configurable.

    For every text box of each page of an input PDFDoc document, the following
    layout features are extracted (as nested per-page lists, later collated
    into FoldedTensors):

    - `xmin`: left position of the box
    - `ymin`: bottom position of the box
    - `xmax`: right position of the box
    - `ymax`: top position of the box
    - `width`: width of the box (x1 - x0)
    - `height`: height of the box (y1 - y0)
    - `first_page`: whether the box lies on the document's first page
    - `last_page`: whether the box lies on the document's last page
    """

    # Shared singleton instance (see __new__)
    INSTANCE = None

    def __new__(cls, *args, **kwargs):
        # The preprocessor has no configuration, so a single shared instance
        # serves every pipeline / component.
        if BoxLayoutPreprocessor.INSTANCE is None:
            BoxLayoutPreprocessor.INSTANCE = super().__new__(cls)
        return BoxLayoutPreprocessor.INSTANCE

    def __init__(
        self,
        pipeline: Pipeline = None,
        name: str = "box-layout-preprocessor",
    ):
        super().__init__(pipeline, name)

    def preprocess(self, doc: PDFDoc, supervision: bool = False):
        """Extract per-page layout features from `doc` as nested lists."""
        pages = doc.pages
        last_p = doc.num_pages - 1
        # NOTE: a leftover statement that built and immediately discarded the
        # per-page list of page numbers was removed here.
        return {
            "xmin": [[b.x0 for b in p.text_boxes] for p in pages],
            "ymin": [[b.y0 for b in p.text_boxes] for p in pages],
            "xmax": [[b.x1 for b in p.text_boxes] for p in pages],
            "ymax": [[b.y1 for b in p.text_boxes] for p in pages],
            "width": [[(b.x1 - b.x0) for b in p.text_boxes] for p in pages],
            "height": [[(b.y1 - b.y0) for b in p.text_boxes] for p in pages],
            "first_page": [[b.page_num == 0 for b in p.text_boxes] for p in pages],
            "last_page": [[b.page_num == last_p for b in p.text_boxes] for p in pages],
        }

    def collate(self, batch) -> BoxLayoutBatch:
        """Stack the nested lists into FoldedTensors folded over lines."""
        kw = {
            "full_names": ["sample", "page", "line"],
            "data_dims": ["line"],
        }

        return {
            "xmin": as_folded_tensor(batch["xmin"], dtype=torch.float, **kw),
            "ymin": as_folded_tensor(batch["ymin"], dtype=torch.float, **kw),
            "xmax": as_folded_tensor(batch["xmax"], dtype=torch.float, **kw),
            "ymax": as_folded_tensor(batch["ymax"], dtype=torch.float, **kw),
            "width": as_folded_tensor(batch["width"], dtype=torch.float, **kw),
            "height": as_folded_tensor(batch["height"], dtype=torch.float, **kw),
            "first_page": as_folded_tensor(batch["first_page"], dtype=torch.bool, **kw),
            "last_page": as_folded_tensor(batch["last_page"], dtype=torch.bool, **kw),
        }

    def forward(self, *args, **kwargs) -> Dict[str, Any]:
        # This pipe only preprocesses/collates; it has no forward computation.
        raise NotImplementedError()
96 |
--------------------------------------------------------------------------------
/edspdf/pipes/embeddings/box_transformer.py:
--------------------------------------------------------------------------------
1 | from typing import Optional, Sequence
2 |
3 | from typing_extensions import Literal, TypedDict
4 |
5 | from edspdf import TrainablePipe
6 | from edspdf.layers.box_transformer import BoxTransformerModule
7 | from edspdf.pipeline import Pipeline
8 | from edspdf.pipes.embeddings import EmbeddingOutput
9 | from edspdf.pipes.embeddings.box_layout_preprocessor import (
10 | BoxLayoutBatch,
11 | BoxLayoutPreprocessor,
12 | )
13 | from edspdf.registry import registry
14 | from edspdf.utils.torch import ActivationFunction
15 |
16 | BoxTransformerEmbeddingInputBatch = TypedDict(
17 | "BoxTransformerEmbeddingInputBatch",
18 | {
19 | "embedding": EmbeddingOutput,
20 | "box_prep": BoxLayoutBatch,
21 | },
22 | )
23 |
24 |
@registry.factory.register("box-transformer")
class BoxTransformer(TrainablePipe[EmbeddingOutput]):
    """
    BoxTransformer using
    [BoxTransformerModule][edspdf.layers.box_transformer.BoxTransformerModule]
    under the hood.

    !!! note

        This module is a [TrainablePipe][edspdf.trainable_pipe.TrainablePipe]
        and can be used in a [Pipeline][edspdf.pipeline.Pipeline], while
        [BoxTransformerModule][edspdf.layers.box_transformer.BoxTransformerModule]
        is a standard PyTorch module, which does not take care of the
        preprocessing, collating, etc. of the input documents.

    Parameters
    ----------
    pipeline: Pipeline
        Pipeline instance
    name: str
        Name of the component
    embedding: TrainablePipe[EmbeddingOutput]
        Embedding pipe producing the box embeddings that this transformer
        contextualizes; its `output_size` also fixes this pipe's output size
    num_heads: int
        Number of attention heads in the attention layers
    n_relative_positions: int
        Maximum range of embeddable relative positions between boxes (further
        distances are capped to ±n_relative_positions // 2)
    dropout_p: float
        Dropout probability both for the attention layers and embedding projections
    head_size: int
        Head sizes of the attention layers
    activation: ActivationFunction
        Activation function used in the linear->activation->linear transformations
    init_resweight: float
        Initial weight of the residual gates.
        At 0, the layer acts (initially) as an identity function, and at 1 as
        a standard Transformer layer.
        Initializing with a value close to 0 can help the training converge.
    attention_mode: Sequence[RelativeAttentionMode]
        Mode of relative position infused attention layer.
        See the [relative attention][edspdf.layers.relative_attention.RelativeAttention]
        documentation for more information.
    n_layers: int
        Number of layers in the Transformer
    """

    def __init__(
        self,
        embedding: TrainablePipe[EmbeddingOutput],
        num_heads: int = 2,
        dropout_p: float = 0.0,
        head_size: Optional[int] = None,
        activation: ActivationFunction = "gelu",
        init_resweight: float = 0.0,
        n_relative_positions: Optional[int] = None,
        attention_mode: Sequence[Literal["c2c", "c2p", "p2c"]] = ("c2c", "c2p", "p2c"),
        n_layers: int = 2,
        pipeline: Pipeline = None,
        name: str = "box-transformer",
    ):
        super().__init__(pipeline, name)
        self.embedding = embedding
        # The actual transformer module contextualizing the box embeddings
        self.transformer = BoxTransformerModule(
            input_size=embedding.output_size,
            num_heads=num_heads,
            dropout_p=dropout_p,
            head_size=head_size,
            activation=activation,
            init_resweight=init_resweight,
            n_relative_positions=n_relative_positions,
            attention_mode=attention_mode,
            n_layers=n_layers,
        )
        # The transformer preserves the embedding dimension
        self.output_size = embedding.output_size
        # Sub-pipe producing the layout batch ("box_prep") consumed in forward
        self.box_prep = BoxLayoutPreprocessor(pipeline, f"{name}.box_prep")

    def forward(
        self,
        batch: BoxTransformerEmbeddingInputBatch,
    ) -> EmbeddingOutput:
        # Compute the sub-embeddings, then contextualize them with the
        # transformer using the precomputed box layout features
        res = self.embedding.module_forward(batch["embedding"])
        assert (
            "lengths" not in res
        ), "You must pool a SubBoxEmbedding output before using BoxTransformer"
        return {
            "embeddings": self.transformer(res["embeddings"], batch["box_prep"]),
        }
111 |
--------------------------------------------------------------------------------
/edspdf/pipes/embeddings/embedding_combiner.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from typing_extensions import Literal
3 |
4 | from edspdf import Pipeline, registry
5 | from edspdf.pipes.embeddings import EmbeddingOutput, TrainablePipe
6 |
7 |
@registry.factory.register("embedding-combiner")
class EmbeddingCombiner(TrainablePipe[EmbeddingOutput]):
    def __init__(
        self,
        dropout_p: float = 0.0,
        mode: Literal["sum", "cat"] = "sum",
        pipeline: Pipeline = None,
        name: str = "embedding-combiner",
        **encoders: TrainablePipe[EmbeddingOutput],
    ):
        """
        Encodes boxes using a combination of multiple encoders

        Parameters
        ----------
        pipeline: Pipeline
            The pipeline object
        name: str
            The name of the pipe
        mode: Literal["sum", "cat"]
            The mode to use to combine the encoders:

            - `sum`: Sum the outputs of the encoders
            - `cat`: Concatenate the outputs of the encoders
        dropout_p: float
            Dropout probability used on the output of the box and textual encoders
        encoders: Dict[str, TrainablePipe[EmbeddingOutput]]
            The encoders to use. The keys are the names of the encoders and the values
            are the encoders themselves.
        """
        super().__init__(pipeline, name)

        # Register each encoder as an attribute (and thus as a child pipe)
        # under its keyword name. NOTE(review): this loop rebinds the local
        # `name` variable; harmless since super().__init__ ran above.
        for name, encoder in encoders.items():
            setattr(self, name, encoder)

        self.mode = mode

        # "sum" requires all encoders to share one output size (note: the
        # assertion also fires when no encoder at all is provided)
        assert (
            mode != "sum"
            or len(set(encoder.output_size for encoder in encoders.values())) == 1
        ), (
            "All encoders must have the same output size when using 'sum' "
            "combination:\n{}".format(
                "\n".join(
                    "- {}: {}".format(name, encoder.output_size)
                    for name, encoder in encoders.items()
                )
            )
        )

        self.dropout = torch.nn.Dropout(dropout_p)
        # "cat" concatenates features so sizes add up; "sum" keeps the
        # (shared) encoder output size
        self.output_size = (
            sum(encoder.output_size for encoder in encoders.values())
            if mode == "cat"
            else next(iter(encoders.values())).output_size
        )

    def forward(self, batch) -> EmbeddingOutput:
        # Run every registered encoder on its own sub-batch
        results = [
            encoder.module_forward(batch[name])
            for name, encoder in self.named_component_children()
        ]
        # Refold every output to the first encoder's data dims so the tensors
        # line up before being combined
        all_embeds = [
            self.dropout(res["embeddings"].refold(results[0]["embeddings"].data_dims))
            for res in results
        ]
        embeddings = (
            sum(all_embeds) if self.mode == "sum" else torch.cat(all_embeds, dim=-1)
        )
        return {"embeddings": embeddings}  # type: ignore
78 |
--------------------------------------------------------------------------------
/edspdf/pipes/embeddings/sub_box_cnn_pooler.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Optional, Sequence
2 |
3 | import torch
4 | import torch.nn.functional as F
5 | from foldedtensor import as_folded_tensor
6 |
7 | from edspdf.pipeline import Pipeline
8 | from edspdf.pipes.embeddings import EmbeddingOutput, TrainablePipe
9 | from edspdf.registry import registry
10 | from edspdf.utils.torch import ActivationFunction, get_activation_function
11 |
12 |
@registry.factory.register("sub-box-cnn-pooler")
class SubBoxCNNPooler(TrainablePipe[EmbeddingOutput]):
    """
    One dimension CNN encoding multi-kernel layer.
    Input embeddings are convoluted using linear kernels each parametrized with
    a (window) size of `kernel_size[kernel_i]`
    The output of the kernels are concatenated together, max-pooled and finally
    projected to a size of `output_size`.

    Parameters
    ----------
    pipeline: Pipeline
        Pipeline instance
    name: str
        Name of the component
    embedding: TrainablePipe[EmbeddingOutput]
        Sub-box (word-level) embedding pipe whose outputs are pooled per line
    output_size: Optional[int]
        Size of the output embeddings
        Defaults to the `input_size`
    out_channels: int
        Number of channels
        Defaults to the `input_size`
    kernel_sizes: Sequence[int]
        Window size of each kernel
    activation: str
        Activation function to use
    """

    def __init__(
        self,
        embedding: TrainablePipe[EmbeddingOutput],
        pipeline: Pipeline = None,
        name: str = "sub-box-cnn-pooler",
        output_size: Optional[int] = None,
        out_channels: Optional[int] = None,
        kernel_sizes: Sequence[int] = (3, 4, 5),
        activation: ActivationFunction = "relu",
    ):
        super().__init__(pipeline, name)

        self.activation_fn = get_activation_function(activation)

        self.embedding = embedding
        input_size = self.embedding.output_size
        out_channels = input_size if out_channels is None else out_channels
        # Fix: this previously read `... else input_size`, which silently
        # discarded any user-provided `output_size`.
        output_size = input_size if output_size is None else output_size

        # One Conv1d per kernel size; their outputs are concatenated in forward
        self.convolutions = torch.nn.ModuleList(
            torch.nn.Conv1d(
                in_channels=self.embedding.output_size,
                out_channels=out_channels,
                kernel_size=kernel_size,
                padding=0,
            )
            for kernel_size in kernel_sizes
        )
        # Projects the concatenated channels down/up to `output_size`
        self.linear = torch.nn.Linear(
            in_features=out_channels * len(kernel_sizes),
            out_features=output_size,
        )
        self.output_size = output_size

    def forward(self, batch: Any) -> EmbeddingOutput:
        # Word-level embeddings refolded to (line, word) so each line acts as
        # one convolution "sentence"
        embeddings = self.embedding.module_forward(batch["embedding"])[
            "embeddings"
        ].refold("line", "word")
        if 0 in embeddings.shape:
            # Empty batch: return an empty, correctly-shaped folded tensor
            return {
                "embeddings": as_folded_tensor(
                    data=torch.zeros(0, self.output_size, device=embeddings.device),
                    lengths=embeddings.lengths[:-1],  # pooled on the last dim
                    data_dims=["line"],  # fully flattened
                    full_names=["sample", "page", "line"],
                )
            }

        # line word dim -> line dim word (Conv1d expects channels second)
        box_token_embeddings = embeddings.as_tensor().permute(0, 2, 1)
        box_token_embeddings = torch.cat(
            [
                self.activation_fn(
                    conv(
                        # pad by the appropriate amount on both sides of each sentence
                        F.pad(
                            box_token_embeddings,
                            pad=[
                                conv.kernel_size[0] // 2,
                                (conv.kernel_size[0] - 1) // 2,
                            ],
                        )
                    )
                    .permute(0, 2, 1)
                    # zero out positions that correspond to padding words
                    .masked_fill(~embeddings.mask.unsqueeze(-1), 0)
                )
                for conv in self.convolutions
            ],
            dim=2,
        )
        # Max-pool over the word dimension, then project to output_size
        pooled = box_token_embeddings.max(1).values
        pooled = self.linear(pooled)

        return {
            "embeddings": as_folded_tensor(
                data=pooled,
                lengths=embeddings.lengths[:-1],  # pooled on the last dim
                data_dims=["line"],  # fully flattened
                full_names=["sample", "page", "line"],
            )
        }
121 |
--------------------------------------------------------------------------------
/edspdf/pipes/extractors/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aphp/edspdf/7177a29534181686d035c85fa667784e7d930482/edspdf/pipes/extractors/__init__.py
--------------------------------------------------------------------------------
/edspdf/processing/__init__.py:
--------------------------------------------------------------------------------
from typing import TYPE_CHECKING

from edspdf.utils.lazy_module import lazify

# Install module-level __getattr__/__dir__ hooks so that the names declared
# under TYPE_CHECKING below are only imported on first access instead of at
# package import time.
lazify()

if TYPE_CHECKING:
    from .simple import execute_simple_backend
    from .multiprocessing import execute_multiprocessing_backend
10 |
--------------------------------------------------------------------------------
/edspdf/processing/simple.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import sys
4 | from contextlib import nullcontext
5 | from typing import TYPE_CHECKING
6 |
7 | from edspdf.utils.collections import batchify, flatten
8 |
9 | from .utils import apply_basic_pipes, batchify_fns
10 |
11 | if TYPE_CHECKING:
12 | from edspdf.lazy_collection import LazyCollection
13 |
# Functions estimating a document's "size" when sorting chunks; keys match
# the possible values of ``lc.sort_chunks``.
doc_size_fns = {
    "content_boxes": lambda doc: len(doc.content_boxes),
}
17 |
18 |
def execute_simple_backend(
    lc: LazyCollection,
):
    """
    This is the default execution mode which batches the documents and processes each
    batch on the current process in a sequential manner.
    """
    # Disable gradient tracking if torch has already been imported; fall back
    # to a no-op context so torch never becomes a hard dependency here.
    try:
        no_grad = sys.modules["torch"].no_grad
    except (KeyError, AttributeError):
        no_grad = nullcontext
    reader = lc.reader
    writer = lc.writer
    show_progress = lc.show_progress

    # Split the pipeline into "chunk" components (applied before sorting and
    # batching) and "batch" components (applied per batch). When unset and the
    # batching is not by docs (or chunks are sorted), split at the first named
    # step.
    split_into_batches_after = lc.split_into_batches_after
    if split_into_batches_after is None and (lc.batch_by != "docs" or lc.sort_chunks):
        split_into_batches_after = next(
            (s[0] for s in lc.pipeline if s[0] is not None), None
        )
    names = [None] + [step[0] for step in lc.pipeline]
    chunk_components = lc.pipeline[: names.index(split_into_batches_after)]
    batch_components = lc.pipeline[names.index(split_into_batches_after) :]

    def process():
        # Optional progress bar, updated with per-batch document counts
        bar = nullcontext()
        if show_progress:
            from tqdm import tqdm

            bar = tqdm(smoothing=0.1, mininterval=5.0)

        with bar, lc.eval():
            # Read documents chunk by chunk (chunk_size docs at a time)
            for docs in batchify(
                (
                    subtask
                    for task, count in reader.read_main()
                    for subtask in reader.read_worker([task])
                ),
                batch_size=lc.chunk_size,
            ):
                docs = apply_basic_pipes(docs, chunk_components)

                # Sort docs within the chunk by the configured size function
                # (defaults to the number of content boxes)
                if lc.sort_chunks:
                    docs.sort(
                        key=doc_size_fns.get(
                            lc.sort_chunks, doc_size_fns["content_boxes"]
                        )
                    )

                for batch in batchify_fns[lc.batch_by](docs, lc.batch_size):
                    count = len(batch)
                    with no_grad(), lc.cache():
                        batch = apply_basic_pipes(batch, batch_components)

                    if writer is not None:
                        result, count = writer.write_worker(batch)
                        if show_progress:
                            bar.update(count)
                        yield result
                    else:
                        if show_progress:
                            bar.update(count)
                        yield batch
            # Flush anything the writer may still be buffering
            if writer is not None:
                result, count = writer.finalize()
                if show_progress:
                    bar.update(count)
                if count:
                    yield result

    gen = process()
    return flatten(gen) if writer is None else writer.write_main(gen)
91 |
--------------------------------------------------------------------------------
/edspdf/processing/utils.py:
--------------------------------------------------------------------------------
1 | import types
2 | from typing import Iterable, List, TypeVar
3 |
4 | from edspdf.utils.collections import batchify
5 |
6 |
def apply_basic_pipes(docs, pipes):
    """
    Sequentially apply (name, pipe, kwargs) steps to a batch of documents.

    Pipes exposing a ``batch_process`` method receive the whole batch at once;
    every other pipe is called one document at a time, and generator results
    are flattened into the output batch.
    """
    for _, pipe, pipe_kwargs in pipes:
        if hasattr(pipe, "batch_process"):
            docs = pipe.batch_process(docs)
            continue
        processed = []
        for doc in docs:
            output = pipe(doc, **pipe_kwargs)
            if isinstance(output, types.GeneratorType):
                processed.extend(output)
            else:
                processed.append(output)
        docs = processed
    return docs
21 |
22 |
23 | T = TypeVar("T")
24 |
25 |
def batchify_with_counts(
    iterable,
    batch_size,
):
    """
    Group (item, count) pairs into batches whose summed counts stay within
    ``batch_size``, yielding (items, total_count) tuples. A single item whose
    count exceeds ``batch_size`` still forms its own batch.
    """
    current, current_total = [], 0
    for item, count in iterable:
        if current and current_total + count > batch_size:
            yield current, current_total
            current, current_total = [], 0
        current.append(item)
        current_total += count
    if current:
        yield current, current_total
41 |
42 |
def batchify_by_content_boxes(
    iterable: Iterable[T],
    batch_size: int,
    drop_last: bool = False,
) -> Iterable[List[T]]:
    """
    Yield batches of items whose summed number of ``content_boxes`` stays
    within ``batch_size`` (an oversized single item still gets its own batch).
    The trailing partial batch is kept unless ``drop_last`` is True.
    """
    pending: List[T] = []
    boxes_in_pending = 0
    for item in iterable:
        item_boxes = len(item.content_boxes)
        if pending and boxes_in_pending + item_boxes > batch_size:
            yield pending
            pending, boxes_in_pending = [], 0
        pending.append(item)
        boxes_in_pending += item_boxes
    if pending and not drop_last:
        yield pending
60 |
61 |
def batchify_by_pages(
    iterable: Iterable[T],
    batch_size: int,
    drop_last: bool = False,
) -> Iterable[List[T]]:
    """
    Yield batches of items whose summed number of ``pages`` stays within
    ``batch_size`` (an oversized single item still gets its own batch).
    The trailing partial batch is kept unless ``drop_last`` is True.
    """
    pending: List[T] = []
    pages_in_pending = 0
    for item in iterable:
        item_pages = len(item.pages)
        if pending and pages_in_pending + item_pages > batch_size:
            yield pending
            pending, pages_in_pending = [], 0
        pending.append(item)
        pages_in_pending += item_pages
    if pending and not drop_last:
        yield pending
79 |
80 |
# Mapping from the ``batch_by`` setting to the batching strategy:
# "content_boxes" and "pages" bound the summed size per batch, while "docs"
# simply counts documents.
batchify_fns = {
    "content_boxes": batchify_by_content_boxes,
    "pages": batchify_by_pages,
    "docs": batchify,
}
86 |
--------------------------------------------------------------------------------
/edspdf/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aphp/edspdf/7177a29534181686d035c85fa667784e7d930482/edspdf/utils/__init__.py
--------------------------------------------------------------------------------
/edspdf/utils/alignment.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Sequence, TypeVar
2 |
3 | import numpy as np
4 |
5 | from ..structures import Box
6 | from .collections import list_factorize
7 |
8 | INF = 100000
9 |
10 | T = TypeVar("T", bound=Box)
11 |
12 |
def _align_box_labels_on_page(
    src_boxes: Sequence[Box],
    dst_boxes: Sequence[Box],
    threshold: float = 0.0001,
    pollution_label: Any = None,
):
    """
    Label the ``dst_boxes`` of a single page from the labelled ``src_boxes``.

    Each destination box receives the label of the source box maximizing the
    overlap / source-area ratio, among source boxes whose overlap covers at
    least ``threshold`` of the destination box. A virtual source box spanning
    the whole plane, labelled ``pollution_label``, is appended so destination
    boxes matched by no real source box fall back to ``pollution_label``.
    """
    if len(src_boxes) == 0 or len(dst_boxes) == 0:
        return []

    # Factorize labels into integer ids; the trailing id is the pollution label
    src_labels, label_vocab = list_factorize(
        [b.label for b in src_boxes] + [pollution_label]
    )
    src_labels = np.asarray(src_labels)

    # Trailing (-INF, INF) box is the whole-plane pollution fallback
    src_x0, src_x1, src_y0, src_y1 = np.asarray(
        [(b.x0, b.x1, b.y0, b.y1) for b in src_boxes] + [(-INF, INF, -INF, INF)]
    ).T[:, :, None]
    dst_x0, dst_x1, dst_y0, dst_y1 = np.asarray(
        [(b.x0, b.x1, b.y0, b.y1) for b in dst_boxes]
    ).T[:, None, :]

    # src_x0 has shape (n_src_boxes, 1)
    # dst_x0 has shape (1, n_dst_boxes)

    dx = np.minimum(src_x1, dst_x1) - np.maximum(src_x0, dst_x0)  # shape: n_src, n_dst
    dy = np.minimum(src_y1, dst_y1) - np.maximum(src_y0, dst_y0)  # shape: n_src, n_dst

    overlap = np.clip(dx, 0, None) * np.clip(dy, 0, None)  # shape: n_src, n_dst
    src_area = (src_x1 - src_x0) * (src_y1 - src_y0)  # shape: n_src
    dst_area = (dst_x1 - dst_x0) * (dst_y1 - dst_y0)  # shape: n_dst

    # To remove errors for 0 divisions
    src_area[src_area == 0] = 1
    dst_area[dst_area == 0] = 1

    covered_src_ratio = overlap / src_area  # shape: n_src, n_dst
    covered_dst_ratio = overlap / dst_area  # shape: n_src, n_dst

    # Discard candidates covering too little of the destination box
    score = covered_src_ratio
    score[covered_dst_ratio < threshold] = 0.0

    # Best-scoring source box for each destination box
    src_indices = score.argmax(0)
    dst_labels = src_labels[src_indices]

    new_dst_boxes = [
        b.evolve(label=label_vocab[label_idx])
        for b, label_idx in zip(dst_boxes, dst_labels)
        # if label_vocab[label_idx] != "__pollution__"
    ]
    return new_dst_boxes
63 |
64 |
def align_box_labels(
    src_boxes: Sequence[Box],
    dst_boxes: Sequence[T],
    threshold: float = 0.0001,
    pollution_label: Any = None,
) -> Sequence[T]:
    """
    Align lines with possibly overlapping (and non-exhaustive) labels.

    Boxes are matched page by page: each destination box receives the label of
    the source box that scores highest (overlap relative to the source box's
    area), among the source boxes whose overlap covers at least `threshold`
    of the destination box.

    Parameters
    ----------
    src_boxes: Sequence[Box]
        The labelled boxes that will be used to determine the label of the dst_boxes
    dst_boxes: Sequence[T]
        The non-labelled boxes that will be assigned a label
    threshold : float, default 0.0001
        Minimum fraction of a destination box that a source box must cover for
        its label to be considered
    pollution_label : Any
        The label to use for boxes that are not covered by any of the source boxes

    Returns
    -------
    List[Box]
        A copy of the boxes, with the labels mapped from the source boxes
    """

    return [
        b
        for page in sorted(set((b.page_num for b in dst_boxes)))
        for b in _align_box_labels_on_page(
            # Boxes without a page_num are considered present on every page
            src_boxes=[
                b
                for b in src_boxes
                if page is None or b.page_num is None or b.page_num == page
            ],
            dst_boxes=[
                b
                for b in dst_boxes
                if page is None or b.page_num is None or b.page_num == page
            ],
            threshold=threshold,
            pollution_label=pollution_label,
        )
    ]
112 |
--------------------------------------------------------------------------------
/edspdf/utils/file_system.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | from pathlib import Path
4 | from typing import Optional, Tuple, Union
5 |
6 | import fsspec.implementations.local
7 | import pyarrow.fs
8 | from fsspec import AbstractFileSystem
9 | from fsspec import __version__ as fsspec_version
10 | from fsspec.implementations.arrow import ArrowFSWrapper
11 |
# Either an fsspec filesystem or a pyarrow filesystem (the latter is wrapped
# into an fsspec-compatible interface where needed).
FileSystem = Union[AbstractFileSystem, pyarrow.fs.FileSystem]

# NOTE(review): this is a lexicographic string comparison, not a true version
# comparison — e.g. "2023.10.0" < "2023.3.0" is True, so some newer fsspec
# releases may wrongly take this branch. Consider packaging.version — verify.
if fsspec_version < "2023.3.0":
    # Ugly hack to make fsspec's arrow implementation work in python 3.7
    # since arrow requires files to be seekable, and the default fsspec
    # open(..., seekable) parameter is False
    # See https://github.com/fsspec/filesystem_spec/pull/1186
    ArrowFSWrapper._open.__wrapped__.__defaults__ = ("rb", None, True)
20 |
21 |
def walk_match(
    fs: FileSystem,
    root: str,
    file_pattern: str,
) -> list:
    """
    Recursively walk ``root`` on ``fs`` and return the full paths of every
    file whose basename matches ``file_pattern`` (anchored at the start, as
    with ``re.match``).
    """
    matcher = re.compile(file_pattern).match
    matched = []
    for dirpath, _dirnames, filenames in fs.walk(root):
        matched.extend(
            os.path.join(dirpath, filename)
            for filename in filenames
            if matcher(filename)
        )
    return matched
33 |
34 |
def normalize_fs_path(
    filesystem: Optional[FileSystem],
    path: Union[str, Path],
) -> Tuple[AbstractFileSystem, str]:
    """
    Normalize a (filesystem, path) pair into an fsspec filesystem and a
    protocol-less path string.

    If ``filesystem`` is None, or ``path`` embeds an explicit protocol
    (e.g. "s3://..."), the filesystem is inferred from the path with fsspec;
    when both are given their protocols must agree. pyarrow filesystems are
    wrapped in ``ArrowFSWrapper`` so the returned object always exposes the
    fsspec interface.

    Parameters
    ----------
    filesystem: Optional[FileSystem]
        Explicit filesystem (fsspec or pyarrow), if any.
    path: Union[str, Path]
        Path, possibly prefixed with a "protocol://".

    Returns
    -------
    Tuple[AbstractFileSystem, str]
        The resolved filesystem and the path stripped of its protocol.
    """
    has_protocol = isinstance(path, str) and "://" in path
    # Wrap pyarrow filesystems so the fsspec API can be used below
    filesystem = (
        ArrowFSWrapper(filesystem)
        if isinstance(filesystem, pyarrow.fs.FileSystem)
        else filesystem
    )

    # We need to detect the fs from the path
    if filesystem is None or has_protocol:
        uri: str = path if has_protocol else f"file://{os.path.abspath(path)}"
        inferred_fs, fs_path = fsspec.core.url_to_fs(uri)
        inferred_fs: fsspec.AbstractFileSystem
        filesystem = filesystem or inferred_fs
        assert inferred_fs.protocol == filesystem.protocol, (
            f"Protocol {inferred_fs.protocol} in path does not match "
            f"filesystem {filesystem.protocol}"
        )
        path = fs_path  # path without protocol

    return (
        ArrowFSWrapper(filesystem)
        if isinstance(filesystem, pyarrow.fs.FileSystem)
        else filesystem
    ), str(path)
63 |
--------------------------------------------------------------------------------
/edspdf/utils/lazy_module.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa: F811
2 | import ast
3 | import importlib
4 | import inspect
5 | import os
6 |
7 |
def lazify():
    """
    Make the calling module lazy: imports listed under its ``if TYPE_CHECKING:``
    block are resolved on first attribute access (PEP 562 module
    ``__getattr__``/``__dir__`` hooks) instead of at import time.
    """

    def _get_module_paths(file):
        """
        Reads the content of the current file, parses it with ast and store the
        import path for future potential imports. This is useful to only import
        the module that is requested and avoid loading all the modules at once, since
        some of them are quite heavy, or contain dependencies that are not always
        available.

        For instance:
        > from .trainable.span_qualifier.factory import create_component as
        span_qualifier is stored in the cache as:
        > module_paths["span_qualifier"] = "trainable.span_qualifier.factory"

        Parameters
        ----------
        file: str
            Path of the module to scan (usually the caller's ``__file__``).

        Returns
        -------
        Dict[str, Tuple[str, str]]
            Mapping from each lazily importable name to a
            (module path, original name) pair.
        """
        module_path = os.path.abspath(file)
        with open(module_path, "r") as f:
            module_content = f.read()
        module_ast = ast.parse(module_content)
        module_paths = {}
        for node in module_ast.body:
            # Lookup TYPE_CHECKING
            # Only "if TYPE_CHECKING:" / "if typing.TYPE_CHECKING:" blocks
            # are scanned for lazy imports
            if not (
                isinstance(node, ast.If)
                and (
                    (
                        isinstance(node.test, ast.Name)
                        and node.test.id == "TYPE_CHECKING"
                    )
                    or (
                        isinstance(node.test, ast.Attribute)
                        and node.test.attr == "TYPE_CHECKING"
                    )
                )
            ):
                continue
            for import_node in node.body:
                if isinstance(import_node, ast.ImportFrom):
                    for name in import_node.names:
                        # Honor "as" aliases when present
                        module_paths[name.asname or name.name] = (
                            import_node.module,
                            name.name,
                        )

        return module_paths

    def __getattr__(name):
        """
        Imports the actual module if it is in the module_paths dict.

        Parameters
        ----------
        name: str
            Attribute requested on the lazified module.

        Returns
        -------
        Any
            The imported object, cached in the module globals so the import
            only runs once per name.
        """
        if name in module_paths:
            module_path, module_name = module_paths[name]
            result = getattr(
                importlib.__import__(
                    module_path,
                    fromlist=[module_name],
                    globals=module_globals,
                    level=1,
                ),
                module_name,
            )
            module_globals[name] = result
            return result
        # NOTE(review): __name__ resolves in *this* module's globals, so the
        # message may report edspdf.utils.lazy_module instead of the lazified
        # module — verify intent.
        raise AttributeError(f"module {__name__} has no attribute {name}")

    def __dir__():
        """
        Returns the list of available modules.

        Returns
        -------
        List[str]
            The lazily importable names collected from TYPE_CHECKING imports.
        """
        return __all__

    # Access upper frame
    # (i.e. the globals of the module that called lazify())
    module_globals = inspect.currentframe().f_back.f_globals

    module_paths = _get_module_paths(module_globals["__file__"])

    __all__ = list(module_paths.keys())

    module_globals.update(
        {
            "__getattr__": __getattr__,
            "__dir__": __dir__,
            "__all__": __all__,
        }
    )
109 |
--------------------------------------------------------------------------------
/edspdf/utils/optimization.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 |
3 | import torch
4 |
5 | from edspdf.utils.collections import get_deep_attr, set_deep_attr
6 |
7 |
class ScheduledOptimizer(torch.optim.Optimizer):
    """
    Wrapper around a torch optimizer that drives the "schedules" stored in its
    param groups: each schedule is stepped once at construction (to set the
    groups' initial values) and after every optimizer step.

    Parameters
    ----------
    optim: torch.optim.Optimizer
        The optimizer to wrap. Param groups may carry a "schedules" entry
        containing a schedule or list of schedules (objects exposing
        ``step(group)``, ``state_dict()`` and ``load_state_dict(state)``).
    """

    # NOTE(review): torch.optim.Optimizer.__init__ is deliberately not called;
    # every Optimizer API used here is re-implemented by delegation to the
    # wrapped optimizer.
    def __init__(self, optim):
        self.optim = optim
        # (Fix: a dead `schedule_to_groups` defaultdict that was filled but
        # never read has been removed from this loop.)
        for group in self.optim.param_groups:
            if "schedules" in group:
                # Normalize "schedules" to a fresh list
                if not isinstance(group["schedules"], list):
                    group["schedules"] = [group["schedules"]]
                group["schedules"] = list(group["schedules"])
                # Step each schedule once so the group starts at its
                # scheduled initial value
                for schedule in group["schedules"]:
                    schedule.step(group)

    def zero_grad(self):
        """Delegate gradient zeroing to the wrapped optimizer."""
        return self.optim.zero_grad()

    @property
    def param_groups(self):
        # Param groups live on the wrapped optimizer
        return self.optim.param_groups

    @param_groups.setter
    def param_groups(self, value):
        self.optim.param_groups = value

    @property
    def state(self):
        # Optimizer state lives on the wrapped optimizer
        return self.optim.state

    @state.setter
    def state(self, value):
        self.optim.state = value

    def state_dict(self):
        """
        Serialize the wrapped optimizer plus, per group, the current "lr" and
        the state of each schedule. The live schedule objects are stripped
        from the serialized param groups.
        """
        state = {
            "optim": self.optim.state_dict(),
            "lr": [group.get("lr") for group in self.optim.param_groups],
            "schedules": [
                [schedule.state_dict() for schedule in group.get("schedules", ())]
                for group in self.optim.param_groups
            ],
        }
        for group in state["optim"]["param_groups"]:
            if "schedules" in group:
                del group["schedules"]
        return state

    def load_state_dict(self, state):
        """
        Restore a state produced by ``state_dict``: the live schedule objects
        (kept from the current param groups) are re-attached, their state is
        restored, and each group's "lr" is set back last (after
        ``optim.load_state_dict`` has run).
        """
        optim_schedules = [
            group.get("schedules", ()) for group in self.optim.param_groups
        ]
        self.optim.load_state_dict(state["optim"])
        for group, group_schedule, group_schedules_state, lr in zip(
            self.optim.param_groups, optim_schedules, state["schedules"], state["lr"]
        ):
            group["schedules"] = group_schedule
            for schedule, schedule_state in zip(
                group["schedules"], group_schedules_state
            ):
                schedule.load_state_dict(schedule_state)
            group["lr"] = lr

    def step(self, closure=None):
        """Step the wrapped optimizer, then every schedule of every group."""
        self.optim.step(closure=closure)
        for group in self.optim.param_groups:
            if "schedules" in group:
                for schedule in group["schedules"]:
                    schedule.step(group)
75 |
76 |
class LinearSchedule:
    """
    Linear warmup / linear decay schedule for a param-group attribute
    (``path``, "lr" by default).

    During the first ``total_steps * warmup_rate`` steps (when ``warmup`` is
    True), the value rises linearly from ``start_value`` to ``max_value``;
    afterwards it decays linearly from ``max_value`` down to 0 at
    ``total_steps``.

    Parameters
    ----------
    total_steps: int
        Total number of scheduled steps.
    max_value: Optional[float]
        Peak value; if None it is read from the group at the first step.
    start_value: float
        Value at step 0 when warming up.
    path: str
        Attribute path updated in the group (via set_deep_attr).
    warmup: bool
        Whether to apply the warmup phase. (Fix: this flag was previously
        accepted but silently ignored.)
    warmup_rate: float
        Fraction of ``total_steps`` spent warming up.
    """

    def __init__(
        self,
        total_steps,
        max_value=None,
        start_value=0.0,
        path="lr",
        warmup=True,
        warmup_rate=0.1,
    ):
        self.path = path
        self.start_value = start_value
        self.max_value = max_value
        self.warmup = warmup
        self.warmup_rate = warmup_rate
        self.total_steps = total_steps
        self.idx = 0

    def state_dict(self):
        """Serialize the schedule progress (current step index)."""
        return {
            "idx": self.idx,
        }

    def load_state_dict(self, state):
        """Restore progress saved by ``state_dict``."""
        self.idx = state["idx"]

    def step(self, group, closure=None):
        """Compute the value for the current step, write it into ``group`` at
        ``self.path``, and advance the step counter."""
        if self.max_value is None:
            # Lazily read the peak value from the group on first use
            self.max_value = get_deep_attr(group, self.path)
        # Honor the warmup flag: without warmup, decay starts immediately
        warmup_steps = self.total_steps * self.warmup_rate if self.warmup else 0
        if self.idx < warmup_steps:
            progress = self.idx / warmup_steps
            value = self.start_value + (self.max_value - self.start_value) * progress
        else:
            progress = (self.idx - warmup_steps) / (self.total_steps - warmup_steps)
            value = self.max_value + (0 - self.max_value) * progress
        self.idx += 1
        set_deep_attr(group, self.path, value)
114 |
--------------------------------------------------------------------------------
/edspdf/utils/random.py:
--------------------------------------------------------------------------------
1 | import random
2 | from collections import namedtuple
3 |
4 | import numpy as np
5 | import torch
6 |
7 | RandomGeneratorState = namedtuple(
8 | "RandomGeneratorState", ["random", "torch", "numpy", "torch_cuda"]
9 | )
10 |
11 |
def get_random_generator_state(cuda=torch.cuda.is_available()):
    """
    Get the `torch`, `numpy` and `random` random generator state.

    Parameters
    ----------
    cuda: bool
        Saves the cuda random states too

    Returns
    -------
    RandomGeneratorState
    """
    cuda_states = torch.cuda.get_rng_state_all() if cuda else None
    return RandomGeneratorState(
        random=random.getstate(),
        torch=torch.random.get_rng_state(),
        numpy=np.random.get_state(),
        torch_cuda=cuda_states,
    )
30 |
31 |
def set_random_generator_state(state):
    """
    Set the `torch`, `numpy` and `random` random generator state.

    Parameters
    ----------
    state: RandomGeneratorState
        State previously captured with ``get_random_generator_state``.
    """
    random.setstate(state.random)
    torch.random.set_rng_state(state.torch)
    np.random.set_state(state.numpy)
    cuda_states = state.torch_cuda
    restorable = (
        cuda_states is not None
        and torch.cuda.is_available()
        and len(cuda_states) == torch.cuda.device_count()
    )
    if restorable:  # pragma: no cover
        torch.cuda.set_rng_state_all(cuda_states)
48 |
49 |
class set_seed:
    def __init__(self, seed, cuda=torch.cuda.is_available()):
        """
        Set seed values for random generators.
        If used as a context, restore the random state
        used before entering the context.

        Parameters
        ----------
        seed: Union[int, bool, None]
            Value used as a seed. If True, a fresh random seed is drawn;
            if None, generators are left untouched (the previous state is
            still captured for restoration on context exit).
        cuda: bool
            Saves the cuda random states too
        """
        seed = random.randint(1, 2**16) if seed is True else seed
        # Capture the current state so __exit__ can restore it
        self.state = get_random_generator_state(cuda)
        if seed is not None:
            random.seed(seed)
            torch.manual_seed(seed)
            np.random.seed(seed)
            if cuda:  # pragma: no cover
                torch.cuda.manual_seed(seed)
                torch.cuda.manual_seed_all(seed)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Restore the generator state captured in __init__
        set_random_generator_state(self.state)
81 |
--------------------------------------------------------------------------------
/edspdf/utils/torch.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 | from typing import TypeVar
3 |
4 | import torch
5 |
6 | Args = TypeVar("Args")
7 |
8 |
def pad_2d(data, pad=0, device=None):
    """
    Right-pad a ragged list of lists with ``pad`` up to the longest row and
    return the result as a 2d tensor (an empty input yields an empty tensor).
    """
    width = max((len(row) for row in data), default=0)
    padded_rows = [row + [pad] * (width - len(row)) for row in data]
    return torch.as_tensor(padded_rows, device=device)
13 |
14 |
def compute_pdf_relative_positions(x0, y0, x1, y1, width, height, n_relative_positions):
    """
    Compute relative positions between boxes.
    Input boxes must be split between pages with the shape n_pages * n_boxes

    Parameters
    ----------
    x0: torch.FloatTensor
    y0: torch.FloatTensor
    x1: torch.FloatTensor
    y1: torch.FloatTensor
    width: torch.FloatTensor
    height: torch.FloatTensor
    n_relative_positions: int
        Maximum range of embeddable relative positions between boxes (further
        distances will be capped to ±n_relative_positions // 2)

    Returns
    -------
    torch.LongTensor
        Shape: n_pages * n_boxes * n_boxes * 2

    Notes
    -----
    NOTE(review): no capping to ±n_relative_positions // 2 is applied inside
    this function — presumably performed by the consumer; verify.
    """
    # Horizontal offsets between box starts, scaled to integer buckets
    # (truncated toward zero by .long())
    dx = x0[:, None, :] - x0[:, :, None]  # B begin -> A begin
    dx = (dx * n_relative_positions).long()

    dy = y0[:, None, :] - y0[:, :, None]
    # If query above (dy > 0) key, use query height
    ref_height = (dy >= 0).float() * height.float()[:, :, None] + (
        dy < 0
    ).float() * height[:, None, :]
    dy0 = y1[:, None, :] - y0[:, :, None]  # A begin -> B end
    dy1 = y0[:, None, :] - y1[:, :, None]  # A end -> B begin
    offset = 0.5
    # Vertical offset expressed in units of the reference box height; 0 when
    # neither "fully above" nor "fully below" applies (vertical overlap)
    dy = torch.where(
        # where A fully above B (dy0 and dy1 > 0), dy is min distance
        ((dy0 + offset).sign() > 0) & ((dy1 + offset).sign() > 0),
        (torch.minimum(dy0, dy1) / ref_height + offset).ceil(),
        # where A fully below B (dy0 and dy1 < 0), dy is -(min -distances)
        torch.where(
            ((dy0 - offset).sign() < 0) & ((dy1 - offset).sign() < 0),
            (torch.maximum(dy0, dy1) / ref_height - offset).floor(),
            0,
        ),
    )
    # Round magnitudes up while preserving sign, then convert to integers
    dy = (dy.abs().ceil() * dy.sign()).long()

    # Stack (dx, dy) as the trailing dimension
    relative_positions = torch.stack([dx, dy], dim=-1)

    return relative_positions
64 |
65 |
class ActivationFunction(str, Enum):
    # Names of the supported activations. Values match attribute names in
    # torch.nn.functional, and the str base lets members be used directly as
    # strings (e.g. in getattr lookups).
    relu = "relu"
    gelu = "gelu"
    glu = "glu"
70 |
71 |
def get_activation_function(activation: ActivationFunction):
    """Resolve an activation name to the matching callable in
    ``torch.nn.functional``."""
    functional_namespace = torch.nn.functional
    return getattr(functional_namespace, activation)
74 |
--------------------------------------------------------------------------------
/edspdf/visualization/__init__.py:
--------------------------------------------------------------------------------
1 | from .annotations import compare_results, show_annotations
2 | from .merge import merge_boxes
3 |
--------------------------------------------------------------------------------
/edspdf/visualization/annotations.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, List, Optional, Sequence, Union
2 |
3 | import numpy as np
4 | import pypdfium2 as pdfium
5 | from PIL import Image, ImageDraw
6 | from PIL.PpmImagePlugin import PpmImageFile
7 |
8 | from edspdf.structures import Box
9 |
# Qualitative color palette (d3/Vega "category20", with two light shades
# commented out) used as the default label -> color cycle for annotations.
CATEGORY20 = [
    "#1f77b4",
    # "#aec7e8",
    "#ff7f0e",
    # "#ffbb78",
    "#2ca02c",
    "#98df8a",
    "#d62728",
    "#ff9896",
    "#9467bd",
    "#c5b0d5",
    "#8c564b",
    "#c49c94",
    "#e377c2",
    "#f7b6d2",
    "#7f7f7f",
    "#c7c7c7",
    "#bcbd22",
    "#dbdb8d",
    "#17becf",
    "#9edae5",
]
32 |
33 |
def show_annotations(
    pdf: bytes,
    annotations: Sequence[Box],
    colors: Optional[Union[Dict[str, str], List[str]]] = None,
) -> List[PpmImageFile]:
    """
    Show Box annotations on a PDF document.

    Parameters
    ----------
    pdf: bytes
        Bytes content of the PDF document
    annotations: Sequence[Box]
        List of Box annotations to show
    colors: Optional[Union[Dict[str, str], List[str]]]
        Colors to use for each label. If a list is provided, it will be used to color
        the first `len(colors)` unique labels. If a dictionary is provided, it will be
        used to color the labels in the dictionary. If None, a default color scheme will
        be used.

    Returns
    -------
    List[PpmImageFile]
        List of PIL images with the annotations. You can display them in a notebook
        with `display(*pages)`.
    """

    pdf_doc = pdfium.PdfDocument(pdf)
    # Rasterize each page at 2x scale before drawing boxes on top
    pages = [page.render(scale=2).to_pil() for page in pdf_doc]
    # Unique labels, keeping first-occurrence order
    unique_labels = list(dict.fromkeys(box.label for box in annotations))

    if colors is None:
        colors = dict(zip(unique_labels, CATEGORY20))
    elif isinstance(colors, list):
        colors = dict(zip(unique_labels, colors))

    for page_num, img in enumerate(pages):
        w, h = img.size
        draw = ImageDraw.Draw(img)

        for bloc in annotations:
            if bloc.page_num == page_num:
                # Box coordinates are normalized in [0, 1]: scale to pixels
                draw.rectangle(
                    [(bloc.x0 * w, bloc.y0 * h), (bloc.x1 * w, bloc.y1 * h)],
                    outline=colors[bloc.label],
                    width=3,
                )

    return pages
84 |
85 |
def compare_results(
    pdf: bytes,
    pred: Sequence[Box],
    gold: Sequence[Box],
    colors: Optional[Union[Dict[str, str], List[str]]] = None,
) -> List[PpmImageFile]:
    """
    Compare two sets of annotations on a PDF document.

    Parameters
    ----------
    pdf: bytes
        Bytes content of the PDF document
    pred: Sequence[Box]
        List of Box annotations to show on the left side
    gold: Sequence[Box]
        List of Box annotations to show on the right side
    colors: Optional[Union[Dict[str, str], List[str]]]
        Colors to use for each label. If a list is provided, it will be used to color
        the first `len(colors)` unique labels. If a dictionary is provided, it will be
        used to color the labels in the dictionary. If None, a default color scheme will
        be used.

    Returns
    -------
    List[PpmImageFile]
        List of PIL images with the annotations. You can display them in a notebook
        with `display(*pages)`.
    """
    if colors is None:
        # Build one shared label -> color mapping over the union of labels
        # (first-occurrence order) so a label is drawn with the same color on
        # both sides. The previous code passed a dict with None values
        # (dict.fromkeys), which made PIL draw no outline at all.
        labels = {
            **dict.fromkeys(b.label for b in pred),
            **dict.fromkeys(b.label for b in gold),
        }
        colors = dict(zip(labels, CATEGORY20))

    pages_pred = show_annotations(pdf, pred, colors)
    pages_gold = show_annotations(pdf, gold, colors)

    # Stitch each predicted page side by side with its gold counterpart
    pages = []
    for page_pred, page_gold in zip(pages_pred, pages_gold):
        array = np.hstack((np.asarray(page_pred), np.asarray(page_gold)))
        pages.append(Image.fromarray(array))

    return pages
131 |
--------------------------------------------------------------------------------
/edspdf/visualization/merge.py:
--------------------------------------------------------------------------------
1 | from typing import List, Sequence
2 |
3 | import networkx as nx
4 | import numpy as np
5 |
6 | from edspdf.structures import Box
7 |
8 | INF = 1000000
9 |
10 |
def merge_boxes(
    boxes: Sequence[Box],
) -> List[Box]:
    """
    Recursively merge boxes that have the same label to form larger non-overlapping
    boxes.

    The algorithm iterates until a fixpoint: (1) mark pairs of same-key boxes
    whose merged bounding box would not overlap any box of a different key,
    (2) group mutually mergeable boxes into cliques; whenever a box belongs to
    several cliques, boxes are re-keyed by their first clique and the process
    restarts.

    Parameters
    ----------
    boxes: Sequence[Box]
        List of boxes to merge

    Returns
    -------
    List[Box]
        List of merged boxes
    """
    labels = np.asarray([b.label for b in boxes])

    # One row per box — note the (x0, x1, y0, y1) column order
    coords = np.asarray([(b.x0, b.x1, b.y0, b.y1) for b in boxes])

    # Key that determines if two boxes can be merged, initialized from the box labels
    merge_keys = np.unique(labels, return_inverse=True)[1]

    # Iterate until the cliques of mergeable boxes are disjoint (see below)
    while True:
        # adj[i, j] is True when boxes i and j share a key and may be merged
        adj = np.zeros((len(boxes), len(boxes)), dtype=bool)

        # Split boxes between those that belong to a label (and could be merged),
        # and those that do not belong to that label and will prevent the mergers
        for key in np.unique(merge_keys):
            key_filter = merge_keys == key

            x0, x1, y0, y1 = coords[key_filter].T
            obs_x0, obs_x1, obs_y0, obs_y1 = coords[~key_filter].T

            # Broadcasting helpers: axis A is the first box of a candidate
            # pair, axis B the second; the last axis enumerates the obstacles
            A = (slice(None), None, None)
            B = (None, slice(None), None)

            # Find the bbox of the hypothetical merged boxes
            merged_x0 = np.minimum(x0[A], x0[B])
            merged_x1 = np.maximum(x1[A], x1[B])
            merged_y0 = np.minimum(y0[A], y0[B])
            merged_y1 = np.maximum(y1[A], y1[B])

            # And detect if it overlaps existing box of a different label
            # (strictly positive intersection extent on both axes)
            dx = np.minimum(merged_x1, obs_x1) - np.maximum(merged_x0, obs_x0)
            dy = np.minimum(merged_y1, obs_y1) - np.maximum(merged_y0, obs_y0)
            merged_overlap_with_other = (dx > 0) & (dy > 0)
            no_box_inbetween = (~merged_overlap_with_other).all(-1)

            # Update the adjacency matrix to 1 if two boxes can be merged
            # (ie no box of a different label lie inbetween)
            adj_indices = np.flatnonzero(key_filter)
            adj[adj_indices[:, None], adj_indices[None, :]] = no_box_inbetween

        # Build the cliques of boxes that can be merged
        cliques = nx.find_cliques(nx.from_numpy_array(adj))

        # These cliques of mergeable boxes can be overlapping: think of a cross
        # like this=
        # *** --- ***
        # --- --- ---
        # *** --- ***
        # for which the two (-) labelled cliques would be the two axis of the cross
        # For each box, we change its label to its first clique number, so the cross
        # looks like this (symbols between the 2 figures don't map to the same indices)
        # *** --- ***
        # ooo ooo ooo
        # *** --- ***
        # and rerun the above process until there is no conflict

        conflicting_cliques = False
        seen = set()
        for clique_idx, clique_box_indices in enumerate(cliques):
            for box_idx in clique_box_indices:
                if box_idx in seen:
                    # Box already claimed by an earlier clique: keep that key
                    # and trigger another pass
                    conflicting_cliques = True
                else:
                    seen.add(box_idx)
                    merge_keys[box_idx] = clique_idx

        if not conflicting_cliques:
            break

    # Flatten to four 1D arrays; the reshape keeps this working when `boxes`
    # is empty (coords is then a 1D array with zero elements)
    x0, x1, y0, y1 = coords.T.reshape((4, -1))

    # Finally, compute the bbox of the sets of mergeable boxes (same `key`)
    merged_boxes = []
    for group_key in dict.fromkeys(merge_keys):
        indices = [i for i, key in enumerate(merge_keys) if group_key == key]
        # The merged box inherits every non-coordinate attribute from its
        # first member
        first_box = boxes[indices[0]]
        merged_boxes.append(
            first_box.evolve(
                x0=min(x0[i] for i in indices),
                y0=min(y0[i] for i in indices),
                x1=max(x1[i] for i in indices),
                y1=max(y1[i] for i in indices),
            )
        )

    return merged_boxes
114 |
--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: EDS-PDF
2 |
3 | repo_url: https://github.com/aphp/edspdf
4 | # repo_name: algorithms/pseudonymisation
5 |
6 | theme:
7 | name: material
8 | palette:
9 | - scheme: default
10 | toggle:
11 | icon: material/brightness-4
12 | name: Switch to dark mode
13 | - scheme: slate
14 | toggle:
15 | icon: material/brightness-7
16 | name: Switch to light mode
17 | logo: assets/logo/edspdf-white.svg
18 | favicon: assets/logo/edspdf-blue.svg
19 | features:
20 | - navigation.tracking
21 | - navigation.instant
22 | - navigation.indexes
23 | - navigation.prune
24 | - navigation.top
25 | - content.code.annotate
26 |
27 | nav:
28 | - index.md
29 | - Demo 🚀: https://aphp.github.io/edspdf/demo" target="_blank
30 | - pipeline.md
31 | - configuration.md
32 | - data-structures.md
33 | - trainable-pipes.md
34 | - inference.md
35 | - Recipes:
36 | - recipes/index.md
37 | - recipes/rule-based.md
38 | - recipes/training.md
39 | - recipes/extension.md
40 | - recipes/annotation.md
41 | - Pipes:
42 | - pipes/index.md
43 | - Embeddings:
44 | - pipes/embeddings/index.md
45 | - pipes/embeddings/simple-text-embedding.md
46 | - pipes/embeddings/embedding-combiner.md
47 | - pipes/embeddings/sub-box-cnn-pooler.md
48 | - pipes/embeddings/box-layout-embedding.md
49 | - pipes/embeddings/box-transformer.md
50 | - pipes/embeddings/huggingface-embedding.md
51 | - Extractors:
52 | - pipes/extractors/index.md
53 | - pipes/extractors/pdfminer.md
54 | - MuPDF Extractor: https://aphp.github.io/edspdf-mupdf/latest
55 | - Poppler Extractor: https://aphp.github.io/edspdf-poppler/latest
56 | - Classifiers:
57 | - pipes/box-classifiers/index.md
58 | - pipes/box-classifiers/trainable.md
59 | - pipes/box-classifiers/mask.md
60 | - pipes/box-classifiers/dummy.md
61 | - pipes/box-classifiers/random.md
62 | - Aggregators:
63 | - pipes/aggregators/index.md
64 | - pipes/aggregators/simple-aggregator.md
65 | - Layers:
66 | - layers/index.md
67 | - layers/box-transformer.md
68 | - layers/box-transformer-layer.md
69 | - layers/relative-attention.md
70 | - layers/sinusoidal-embedding.md
71 | - layers/vocabulary.md
72 | - Utilities:
73 | - utilities/index.md
74 | - utilities/visualisation.md
75 | - utilities/alignment.md
76 | - Code Reference: reference/edspdf/
77 | - alternatives.md
78 | - contributing.md
79 | - changelog.md
80 | - roadmap.md
81 |
82 | extra_css:
83 | - assets/stylesheets/extra.css
84 | - assets/termynal/termynal.css
85 |
86 | extra_javascript:
87 | - https://cdn.jsdelivr.net/npm/vega@5
88 | - https://cdn.jsdelivr.net/npm/vega-lite@5
89 | - https://cdn.jsdelivr.net/npm/vega-embed@6
90 | - assets/termynal/termynal.js
91 |
92 | watch:
93 | - contributing.md
94 | - roadmap.md
95 | - changelog.md
96 | - edspdf
97 | - docs/scripts
98 |
99 |
100 | extra:
101 | version:
102 | provider: mike
103 |
104 | hooks:
105 | - docs/scripts/plugin.py
106 |
107 | plugins:
108 | - search
109 | - autorefs:
110 | priority:
111 | - '*'
112 | - reference
113 |
114 | - mkdocstrings:
115 | enable_inventory: true
116 | custom_templates: docs/assets/templates
117 | handlers:
118 | python:
119 | import:
120 | - https://aphp.github.io/edspdf-poppler/latest/objects.inv
121 | - https://aphp.github.io/edspdf-mupdf/latest/objects.inv
122 | options:
123 | docstring_style: numpy
124 | docstring_section_style: spacy
125 | heading_level: 2
126 | members_order: source
127 | show_root_toc_entry: false
128 | show_signature: false
129 | merge_init_into_class: true
130 | - glightbox:
131 | touchNavigation: true
132 | loop: false
133 | effect: none
134 | width: 100%
135 | height: auto
136 | zoomable: true
137 | draggable: true
138 | - bibtex:
139 | bibtex_file: "docs/references.bib"
140 |
141 | - mike
142 |
143 | markdown_extensions:
144 | - admonition
145 | - pymdownx.superfences
146 | - pymdownx.highlight
147 | - pymdownx.inlinehilite
148 | - pymdownx.snippets
149 | - pymdownx.tabbed:
150 | alternate_style: true
151 | - footnotes
152 | - md_in_html
153 | - attr_list
154 | - pymdownx.details
155 | - pymdownx.tasklist:
156 | custom_checkbox: true
157 | - pymdownx.emoji:
158 | emoji_index: !!python/name:materialx.emoji.twemoji
159 | emoji_generator: !!python/name:materialx.emoji.to_svg
160 |
161 | validation:
162 | absolute_links: ignore
163 |
--------------------------------------------------------------------------------
/roadmap.md:
--------------------------------------------------------------------------------
1 | # Roadmap
2 |
3 | - [x] Style extraction
4 | - [x] Custom hybrid torch-based pipeline & configuration system
5 | - [x] Drop pandas DataFrame in favour of a ~~Cython~~ [attr](https://www.attrs.org/en/stable/) wrapper around PDF documents?
6 | - [x] Add training capabilities with a CLI to automate the annotation/preparation/training loop.
7 | Again, draw inspiration from spaCy, and maybe add the notion of a `TrainableClassifier`...
8 | - [ ] Add complete serialisation capabilities, to save a full pipeline to disk.
9 | Draw inspiration from spaCy, which took great care to solve these issues:
10 | add `save` and `load` methods to every pipeline component
11 | - [ ] Multiple-column extraction
12 | - [ ] Table detector
13 | - [ ] Integrate third-party OCR module
14 |
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 |
4 | import pytest
5 | from datasets import Dataset
6 | from pytest import fixture
7 | from utils import nested_approx
8 |
9 | from edspdf import Pipeline
10 | from edspdf.utils.collections import ld_to_dl
11 |
# Expose the nested_approx helper on the pytest namespace so test modules can
# call pytest.nested_approx(...) without importing utils themselves
pytest.nested_approx = nested_approx

# Directory containing this conftest, used to locate PDF test resources
TEST_DIR = Path(__file__).parent
15 |
16 |
@pytest.fixture
def change_test_dir(request):
    # Run the test from the directory containing its test file, restoring the
    # original invocation directory afterwards.
    # NOTE(review): `request.fspath` and `config.invocation_dir` are legacy
    # pytest APIs (superseded by `request.path` / `config.invocation_params.dir`)
    # — check the pinned pytest version before migrating.
    os.chdir(request.fspath.dirname)
    yield
    os.chdir(request.config.invocation_dir)
22 |
23 |
@fixture(scope="session")
def pdf():
    """Bytes of the main test PDF resource."""
    return (TEST_DIR / "resources" / "test.pdf").read_bytes()
28 |
29 |
@fixture(scope="session")
def blank_pdf():
    """Bytes of a blank (empty-page) PDF resource."""
    return (TEST_DIR / "resources" / "blank.pdf").read_bytes()
34 |
35 |
@fixture(scope="session")
def styles_pdf():
    """Bytes of a PDF with mixed text styles (bold/italic runs)."""
    return (TEST_DIR / "resources" / "styles.pdf").read_bytes()
40 |
41 |
@fixture(scope="session")
def letter_pdf():
    """Bytes of the letter-shaped sample PDF resource."""
    return (TEST_DIR / "resources" / "letter.pdf").read_bytes()
46 |
47 |
@fixture(scope="session")
def distant_superscript_pdf():
    """Bytes of a PDF with a superscript far from its base line."""
    return (TEST_DIR / "resources" / "distant-superscript.pdf").read_bytes()
52 |
53 |
@fixture(scope="session")
def error_pdf():
    """Bytes of a PDF that extractors fail to parse."""
    return (TEST_DIR / "resources" / "error.pdf").read_bytes()
58 |
59 |
@fixture(scope="session")
def dummy_dataset(tmpdir_factory, pdf):
    """Build a small HF dataset of 8 identical annotated PDFs; return its path."""
    tmp_path = tmpdir_factory.mktemp("datasets")
    dataset_path = str(tmp_path / "pdf-dataset.hf")

    def bbox(y0, y1, label):
        # Wide box on page 0 spanning the page horizontally with a 0.1 margin
        return {
            "page": 0,
            "x0": 0.1,
            "y0": y0,
            "x1": 0.9,
            "y1": y1,
            "label": label,
            "page_width": 20,
            "page_height": 30,
        }

    records = [
        {
            "id": str(i),
            "content": pdf,
            # Top half ("first") and bottom half ("second") of the page
            "bboxes": [bbox(0.1, 0.5, "first"), bbox(0.6, 0.9, "second")],
        }
        for i in range(8)
    ]
    ds = Dataset.from_dict(ld_to_dl(records))
    ds.save_to_disk(dataset_path)
    return dataset_path
100 |
101 |
@pytest.fixture(scope="session")
def frozen_pipeline():
    """Session-scoped pipeline: pdfminer extractor -> trainable classifier -> aggregator."""
    model = Pipeline()
    model.add_pipe("pdfminer-extractor", name="extractor")
    # NOTE(review): "size" is deliberately a string here, presumably to
    # exercise config casting — confirm before "fixing" it to an int.
    embedding_config = {
        "@factory": "box-layout-embedding",
        "n_positions": 32,
        "size": "48",
    }
    model.add_pipe(
        "trainable-classifier",
        name="classifier",
        config={"embedding": embedding_config, "labels": ["first", "second"]},
    )
    model.add_pipe("simple-aggregator")
    model.post_init([])
    return model
121 |
--------------------------------------------------------------------------------
/tests/core/config.cfg:
--------------------------------------------------------------------------------
1 | [pipeline]
2 | pipeline = ["extractor", "classifier"]
3 | disabled = []
4 | components = ${components}
5 |
6 | [components]
7 |
8 | [components.extractor]
9 | @factory = "pdfminer-extractor"
10 |
11 | [components.classifier]
12 | @factory = "trainable-classifier"
13 | labels = []
14 |
15 | [components.classifier.embedding]
16 | @factory = "box-transformer"
17 | num_heads = 4
18 | dropout_p = 0.1
19 | head_size = 16
20 | activation = "gelu"
21 | init_resweight = 0.01
22 | n_relative_positions = 64
23 | attention_mode = ["c2c", "c2p", "p2c"]
24 | n_layers = 1
25 |
26 | [components.classifier.embedding.embedding]
27 | @factory = "embedding-combiner"
28 | dropout_p = 0.1
29 |
30 | [components.classifier.embedding.embedding.layout_encoder]
31 | @factory = "box-layout-embedding"
32 | n_positions = 64
33 | size = 72
34 | x_mode = "learned"
35 | y_mode = "learned"
36 | w_mode = "learned"
37 | h_mode = "learned"
38 |
39 | [components.classifier.embedding.embedding.text_encoder]
40 | @factory = "sub-box-cnn-pooler"
41 | out_channels = 64
42 | kernel_sizes = [3, 4, 5]
43 |
44 | [components.classifier.embedding.embedding.text_encoder.embedding]
45 | @factory = "simple-text-embedding"
46 | size = 72
47 |
--------------------------------------------------------------------------------
/tests/core/test_registry.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from edspdf.pipeline import Pipeline
4 | from edspdf.registry import CurriedFactory, registry
5 |
6 |
def test_misc_register_decorator():
    """Registering via the decorator form keeps the function retrievable."""

    @registry.misc.register("test-1")
    def test_function(param: int = 3):
        pass

    registered = registry.misc.get("test-1")
    assert registered is test_function
13 |
14 |
def test_misc_register_call():
    """Registering via a direct call returns a function retrievable by name."""

    def test_function(param: int = 3):
        pass

    returned = registry.misc.register("test", func=test_function)
    assert registry.misc.get("test") is returned
21 |
22 |
def test_factory_default_config():
    """default_config values should override the constructor defaults."""

    @registry.factory.register("custom-test-component-1", default_config={"value": 5})
    class CustomComponent:
        def __init__(self, pipeline: "Pipeline", name: str, value: int = 3):
            self.name = name
            self.value = value

        def __call__(self, *args, **kwargs):
            return self.value

    # Calling the factory outside a pipeline yields a curried (deferred) object
    curried = registry.factory.get("custom-test-component-1")()
    assert isinstance(curried, CurriedFactory)

    pipe_line = Pipeline()
    pipe_line.add_pipe("custom-test-component-1")
    assert pipe_line.get_pipe("custom-test-component-1").value == 5
40 |
41 |
def test_factory_required_arguments():
    """Factories whose __init__ lacks pipeline/name parameters are rejected."""
    with pytest.raises(ValueError) as exc_info:

        @registry.factory.register("custom-test-component-2")
        class CustomComponent:
            def __init__(self, value: int = 3):
                self.value = value

            def __call__(self, *args, **kwargs):
                return self.value

    message = str(exc_info.value)
    assert "Factory functions must accept pipeline and name as arguments." in message
56 |
57 |
def test_missing_component():
    """Adding an unregistered pipe name raises a helpful ValueError."""
    pipeline = Pipeline()

    with pytest.raises(ValueError) as exc_info:
        pipeline.add_pipe("missing_custom_test_component")

    expected = (
        "Can't find 'missing_custom_test_component' in registry edspdf -> factories."
    )
    assert expected in str(exc_info.value)
68 |
--------------------------------------------------------------------------------
/tests/core/test_structures.py:
--------------------------------------------------------------------------------
def test_repr(styles_pdf):
    """Pin the exact repr of a PdfMinerExtractor output on the styles PDF."""
    # Imported lazily so collecting this module does not require pdfminer
    from edspdf.pipes.extractors.pdfminer import PdfMinerExtractor

    doc = PdfMinerExtractor(extract_style=True)(styles_pdf)
    doc.id = "test"

    # Round the coordinates so the expected string below is stable across
    # floating-point extraction noise
    for b in doc.content_boxes:
        b.x0 = round(b.x0, 2)
        b.y0 = round(b.y0, 2)
        b.x1 = round(b.x1, 2)
        b.y1 = round(b.y1, 2)

    assert repr(doc) == (
        "PDFDoc(content=39476 bytes, id='test', num_pages=0, pages=[Page(page_num=0, "
        "width=612, height=792, image=None)], error=False, "
        "content_boxes=[TextBox(x0=0.12, x1=0.65, y0=0.09, y1=0.11, label=None, "
        "page_num=0, text='This is a test to check EDS-PDF’s ability to detect "
        "changing styles.', props=[TextProperties(italic=False, bold=False, begin=0, "
        "end=9, fontname='AAAAAA+ArialMT'), TextProperties(italic=False, bold=True, "
        "begin=10, end=14, fontname='BAAAAA+Arial-BoldMT'), "
        "TextProperties(italic=False, bold=False, begin=15, end=33, "
        "fontname='AAAAAA+ArialMT'), TextProperties(italic=True, bold=False, "
        "begin=34, end=41, fontname='CAAAAA+Arial-ItalicMT'), "
        "TextProperties(italic=False, bold=False, begin=42, end=68, "
        "fontname='AAAAAA+ArialMT')]), TextBox(x0=0.12, x1=0.73, y0=0.11, y1=0.13, "
        "label=None, page_num=0, text='Let’s up the stakes, with intra-word change. "
        "Or better yet, this might be hard.', props=[TextProperties(italic=False, "
        "bold=False, begin=0, end=25, fontname='AAAAAA+ArialMT'), "
        "TextProperties(italic=True, bold=False, begin=26, end=31, "
        "fontname='CAAAAA+Arial-ItalicMT'), TextProperties(italic=False, bold=False, "
        "begin=31, end=59, fontname='AAAAAA+ArialMT'), TextProperties(italic=False, "
        "bold=True, begin=60, end=67, fontname='BAAAAA+Arial-BoldMT'), "
        "TextProperties(italic=False, bold=False, begin=67, end=79, "
        "fontname='AAAAAA+ArialMT')])], aggregated_texts={})"
    )
36 |
--------------------------------------------------------------------------------
/tests/pipes/aggregators/test_simple.py:
--------------------------------------------------------------------------------
1 | from itertools import cycle
2 |
3 | import edspdf
4 | from edspdf.pipes.aggregators.simple import SimpleAggregator
5 | from edspdf.pipes.extractors.pdfminer import PdfMinerExtractor
6 | from edspdf.structures import Page, PDFDoc, TextBox
7 |
8 |
def test_no_style():
    """Aggregation without styles joins boxes with spaces, pages with blank lines."""
    doc = PDFDoc(
        content=b"",
        pages=[],
    )
    doc.pages = [
        Page(doc=doc, page_num=0, width=1, height=1),
        Page(doc=doc, page_num=1, width=1, height=1),
    ]
    # (page_num, x0, x1, text) for four boxes on the same text line per page
    box_specs = [
        (0, 0.1, 0.5, "Begin"),
        (0, 0.6, 0.7, "and"),
        (0, 0.8, 0.9, "end."),
        (1, 0.8, 0.9, "New page"),
    ]
    doc.content_boxes = [
        TextBox(
            doc=doc,
            page_num=page_num,
            x0=x0,
            y0=0.1,
            x1=x1,
            y1=0.2,
            label="body",
            text=text,
        )
        for page_num, x0, x1, text in box_specs
    ]

    aggregated = SimpleAggregator()(doc).aggregated_texts
    assert aggregated["body"].text == "Begin and end.\n\nNew page"
63 |
64 |
def test_styled_pdfminer_aggregation(styles_pdf):
    """Check that style properties survive aggregation and map back onto the
    aggregated text at the right offsets."""
    extractor = PdfMinerExtractor(extract_style=True)
    aggregator = SimpleAggregator(
        sort=True,
        label_map={
            "header": ["header"],
            "body": "body",
        },
    )

    doc = extractor(styles_pdf)
    # Alternate the two labels over the extracted boxes
    for b, label in zip(doc.text_boxes, cycle(["header", "body"])):
        b.label = label
    doc = aggregator(doc)
    texts = {k: v.text for k, v in doc.aggregated_texts.items()}
    props = {k: v.properties for k, v in doc.aggregated_texts.items()}

    assert set(texts.keys()) == {"body", "header"}
    assert isinstance(props["body"], list)

    # Offsets must be rebased per label: the first span starts at 0
    for value in props.values():
        assert value[0].begin == 0

    # Collect each styled span as a (text slice, "italic"/"bold" tags) pair
    pairs = set()
    for label in texts.keys():
        for prop in props[label]:
            pairs.add(
                (
                    texts[label][prop.begin : prop.end],
                    " ".join(
                        filter(
                            bool,
                            (
                                ("italic" if prop.italic else ""),
                                ("bold" if prop.bold else ""),
                            ),
                        )
                    ),
                )
            )

    assert pairs == {
        ("This is a", ""),
        ("test", "bold"),
        ("to check EDS-PDF’s", ""),
        ("ability", "italic"),
        ("to detect changing styles.", ""),
        ("Let’s up the stakes, with", ""),
        ("intra", "italic"),
        ("-word change. Or better yet,", ""),
        ("this mi", "bold"),
        ("ght be hard.", ""),
    }
118 |
119 |
def test_styled_pdfminer_aggregation_letter(letter_pdf):
    """Smoke-test style aggregation on the letter PDF: both labels appear and
    each label's first style property starts at offset 0."""
    extractor = PdfMinerExtractor(extract_style=True)
    aggregator = SimpleAggregator()

    doc = extractor(letter_pdf)
    # Alternate the two labels over the extracted boxes
    for box, label in zip(doc.content_boxes, cycle(["header", "body"])):
        box.label = label
    doc = aggregator(doc)

    texts = {label: agg.text for label, agg in doc.aggregated_texts.items()}
    props = {label: agg.properties for label, agg in doc.aggregated_texts.items()}

    assert set(texts) == {"body", "header"}
    assert isinstance(props["body"], list)

    for prop_list in props.values():
        assert prop_list[0].begin == 0

    # Traverse every styled span; this test only checks the traversal runs
    pairs = set()
    for label in texts:
        for prop in props[label]:
            style_parts = []
            if prop.italic:
                style_parts.append("italic")
            if prop.bold:
                style_parts.append("bold")
            snippet = texts[label][prop.begin : prop.end]
            pairs.add((snippet, " ".join(style_parts)))
154 |
155 |
def test_distant_superscript(distant_superscript_pdf):
    """A superscript far from its line should still merge into the body text."""
    model = edspdf.Pipeline()
    model.add_pipe("poppler-extractor")
    model.add_pipe("dummy-classifier", config=dict(label="body"))
    model.add_pipe("simple-aggregator")
    result = model(distant_superscript_pdf)
    assert result.aggregated_texts["body"].text == "3 test line"
163 |
--------------------------------------------------------------------------------
/tests/pipes/classifiers/conftest.py:
--------------------------------------------------------------------------------
1 | from pytest import fixture
2 |
3 | from edspdf.structures import Page, PDFDoc, TextBox
4 |
5 |
@fixture
def single_page_doc() -> PDFDoc:
    """A one-page document with three unlabelled text boxes."""
    doc = PDFDoc(id="doc", content=b"", pages=[])
    doc.pages = [Page(doc=doc, page_num=0, width=1.0, height=1.0)]
    box_coords = [
        (0.1, 0.1, 0.9, 0.2),
        (0.1, 0.6, 0.4, 0.7),
        (0.1, 0.6, 0.9, 0.7),
    ]
    doc.content_boxes = [
        TextBox(doc=doc, page_num=0, text="foo", x0=x0, y0=y0, x1=x1, y1=y1)
        for x0, y0, x1, y1 in box_coords
    ]
    return doc
16 |
17 |
@fixture
def multi_page_doc() -> PDFDoc:
    """A two-page document with the same three text boxes on each page."""
    doc = PDFDoc(id="doc", content=b"")
    doc.pages = [
        Page(doc=doc, page_num=page_num, width=1.0, height=1.0)
        for page_num in (0, 1)
    ]
    box_coords = [
        (0.1, 0.1, 0.9, 0.2),
        (0.1, 0.6, 0.4, 0.7),
        (0.1, 0.6, 0.9, 0.7),
    ]
    doc.content_boxes = [
        TextBox(doc=doc, page_num=page_num, text="foo", x0=x0, y0=y0, x1=x1, y1=y1)
        for page_num in (0, 1)
        for x0, y0, x1, y1 in box_coords
    ]

    return doc
35 |
--------------------------------------------------------------------------------
/tests/pipes/classifiers/test_align.py:
--------------------------------------------------------------------------------
1 | from edspdf.structures import Box
2 | from edspdf.utils.alignment import align_box_labels
3 |
4 |
def test_align_multi_page(multi_page_doc):
    """Page-scoped annotations only label the boxes on their own page."""
    annotations = [
        Box(x0=0.0, y0=0.0, x1=1.0, y1=1.0, page_num=0, label="big"),
        Box(x0=0.1, y0=0.1, x1=0.9, y1=0.9, page_num=1, label="small"),
    ]

    labelled = align_box_labels(annotations, multi_page_doc.text_boxes)
    labels = [box.label for box in labelled]
    assert labels == ["big"] * 3 + ["small"] * 3
20 |
21 |
def test_align_cross_page(multi_page_doc):
    """Without page_num, the tighter annotation wins on every page."""
    annotations = [
        Box(x0=0.0, y0=0.0, x1=1.0, y1=1.0, label="big"),
        Box(x0=0.1, y0=0.1, x1=0.9, y1=0.9, label="small"),
    ]

    labelled = align_box_labels(annotations, multi_page_doc.text_boxes)
    labels = [box.label for box in labelled]
    assert labels == ["small"] * 6
37 |
--------------------------------------------------------------------------------
/tests/pipes/classifiers/test_dummy.py:
--------------------------------------------------------------------------------
1 | from edspdf.pipes.classifiers.dummy import DummyClassifier
2 |
3 |
def test_dummy(single_page_doc):
    """The dummy classifier assigns its configured label to every box."""
    classifier = DummyClassifier(label="body")

    single_page_doc = classifier(single_page_doc)

    labels = [box.label for box in single_page_doc.text_boxes]
    assert labels == ["body", "body", "body"]
14 |
--------------------------------------------------------------------------------
/tests/pipes/classifiers/test_mask.py:
--------------------------------------------------------------------------------
1 | from confit import Config
2 |
3 | import edspdf
4 |
# Single-mask pipeline: boxes sufficiently overlapping the bottom-left
# quadrant get the default "body" label, the rest "pollution"
configuration = """
[pipeline]
pipeline = ["classifier"]
components = ${components}

[components.classifier]
@factory = "mask-classifier"
x0 = 0
y0 = 0.5
x1 = 0.5
y1 = 1
threshold = 0.4
"""
18 |
# Multi-mask pipeline with a single named "body" mask and a stricter
# overlap threshold
configuration_custom = """
[pipeline]
pipeline = ["classifier"]
components = ${components}

[components.classifier]
@factory = "multi-mask-classifier"
threshold = 0.9

[components.classifier.body]
label = "body"
x0 = 0
y0 = 0.5
x1 = 0.5
y1 = 1
"""
35 |
36 |
def test_simple_mask(single_page_doc):
    """Boxes overlapping the mask beyond the threshold get 'body'."""
    model = edspdf.load(Config.from_str(configuration))

    single_page_doc = model(single_page_doc)

    labels = [box.label for box in single_page_doc.text_boxes]
    assert labels == ["pollution", "body", "body"]
47 |
48 |
def test_custom_mask(single_page_doc):
    """With a 0.9 threshold, only the box fully inside the mask gets 'body'."""
    model = edspdf.load(Config.from_str(configuration_custom))

    single_page_doc = model(single_page_doc)

    labels = [box.label for box in single_page_doc.text_boxes]
    assert labels == ["pollution", "body", "pollution"]
59 |
--------------------------------------------------------------------------------
/tests/pipes/classifiers/test_random.py:
--------------------------------------------------------------------------------
1 | from confit import Config
2 |
3 | import edspdf
4 | from edspdf.structures import PDFDoc
5 |
# Pipeline with a random classifier choosing between two labels
configuration = """
[pipeline]
pipeline = ["classifier"]
components = ${components}

[components.classifier]
@factory = "random-classifier"
labels = [ "body", "header" ]
"""
15 |
16 |
def test_random_classifier(single_page_doc: PDFDoc):
    """Over three boxes and two labels, both labels should be assigned."""
    model = edspdf.load(Config.from_str(configuration))

    single_page_doc = model(single_page_doc)

    assigned = {box.label for box in single_page_doc.text_boxes}
    assert assigned == {"body", "header"}
23 |
--------------------------------------------------------------------------------
/tests/pipes/embeddings/test_custom.py:
--------------------------------------------------------------------------------
1 | from edspdf.pipes.embeddings.box_layout_embedding import BoxLayoutEmbedding
2 | from edspdf.pipes.embeddings.box_transformer import BoxTransformer
3 | from edspdf.pipes.embeddings.embedding_combiner import EmbeddingCombiner
4 | from edspdf.pipes.embeddings.simple_text_embedding import SimpleTextEmbedding
5 | from edspdf.pipes.embeddings.sub_box_cnn_pooler import SubBoxCNNPooler
6 | from edspdf.pipes.extractors.pdfminer import PdfMinerExtractor
7 |
8 |
def test_custom_embedding(pdf, error_pdf, tmp_path):
    """Build a custom embedding stack and exercise it end to end.

    Covers construction, ``__repr__``, ``post_init``, inference on a real
    document (including an unusually long token), extra-data round-trip,
    and inference on an unparsable (empty) document.
    """
    text_encoder = SubBoxCNNPooler(
        out_channels=64,
        kernel_sizes=(3, 4, 5),
        embedding=SimpleTextEmbedding(size=72),
    )
    layout_encoder = BoxLayoutEmbedding(
        n_positions=64,
        x_mode="sin",
        y_mode="sin",
        w_mode="learned",
        h_mode="learned",
        size=72,
    )
    embedding = BoxTransformer(
        num_heads=4,
        dropout_p=0.1,
        activation="gelu",
        init_resweight=0.01,
        head_size=16,
        attention_mode=["c2c", "c2p", "p2c"],
        n_layers=1,
        n_relative_positions=64,
        embedding=EmbeddingCombiner(
            dropout_p=0.1,
            text_encoder=text_encoder,
            layout_encoder=layout_encoder,
        ),
    )
    str(embedding)  # the repr should not crash

    extractor = PdfMinerExtractor(render_pages=True)
    document = extractor(pdf)
    # Stress the text embedding with a very long token
    document.text_boxes[0].text = "Very long word of 150 letters : " + "x" * 150
    embedding.post_init([document], set())
    embedding(document)
    embedding.save_extra_data(tmp_path, set())
    embedding.load_extra_data(tmp_path, set())

    # An error document yields no boxes and must not crash the embedding
    embedding(extractor(error_pdf))
50 |
--------------------------------------------------------------------------------
/tests/pipes/embeddings/test_huggingface.py:
--------------------------------------------------------------------------------
1 | from edspdf.pipes.embeddings.huggingface_embedding import HuggingfaceEmbedding
2 | from edspdf.pipes.extractors.pdfminer import PdfMinerExtractor
3 |
4 |
def test_huggingface_embedding(pdf, error_pdf):
    """Run the Huggingface embedding on a real and an unparsable PDF."""
    embedding = HuggingfaceEmbedding(
        pipeline=None,
        name="huggingface",
        model="hf-tiny-model-private/tiny-random-LayoutLMv3Model",
        window=32,
        stride=16,
        use_image=True,
    )
    # The tiny random checkpoint ships an inconsistent image-processor size:
    # realign it with the model's configured input size.
    input_size = embedding.hf_model.config.input_size
    embedding.image_processor.size = {"height": input_size, "width": input_size}

    extractor = PdfMinerExtractor(render_pages=True)
    for document in (pdf, error_pdf):
        embedding(extractor(document))
23 |
--------------------------------------------------------------------------------
/tests/pipes/extractors/test_pdfminer.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from blocks_ground_truth import blank_blocks, pdf_blocks, styles_blocks
3 | from pdfminer.pdfparser import PDFSyntaxError
4 |
5 | from edspdf.pipes.extractors.pdfminer import PdfMinerExtractor
6 |
7 |
def test_pdfminer(pdf, styles_pdf, blank_pdf):
    """Compare extracted text boxes against the ground-truth blocks.

    ``pytest.nested_approx`` (see tests/utils.py) *returns* a boolean and
    never raises, so its result must be asserted — the original calls
    discarded the result and the test could never fail.
    """
    extractor = PdfMinerExtractor(extract_style=False)

    assert pytest.nested_approx(extractor(pdf).text_boxes, pdf_blocks, abs=5e-2)
    assert pytest.nested_approx(
        extractor(styles_pdf).text_boxes, styles_blocks, abs=5e-2
    )
    assert pytest.nested_approx(
        extractor(blank_pdf).text_boxes, blank_blocks, abs=5e-2
    )
14 |
15 |
def test_pdfminer_image(pdf, styles_pdf, blank_pdf):
    """Rendered pages expose an image with shape (height, width, 3)."""
    extractor = PdfMinerExtractor(extract_style=False, render_pages=True)

    for document, expected_shape in [
        (pdf, (2339, 1654, 3)),
        (styles_pdf, (2200, 1700, 3)),
        (blank_pdf, (2339, 1654, 3)),
    ]:
        assert extractor(document).pages[0].image.shape == expected_shape
22 |
23 |
def test_pdfminer_error(error_pdf):
    """``raise_on_error`` toggles between raising and a flagged empty doc."""
    extractor = PdfMinerExtractor(extract_style=False, raise_on_error=True)

    with pytest.raises(PDFSyntaxError):
        extractor(error_pdf)

    # With raising disabled, the same input yields an empty document
    # marked with the error flag instead of an exception.
    extractor.raise_on_error = False
    document = extractor(error_pdf)
    assert document.error is True
    assert len(document.text_boxes) == 0
34 |
--------------------------------------------------------------------------------
/tests/recipes/config.cfg:
--------------------------------------------------------------------------------
1 | [train]
2 | model = ${pipeline}
3 | max_steps = 20
4 | lr = 8e-4
5 | seed = 43
6 |
7 | [train.train_data]
8 | @adapter = segmentation-adapter
9 |
10 | [train.val_data]
11 | @adapter = segmentation-adapter
12 |
13 | [pipeline]
14 | pipeline = ["extractor", "embedding", "classifier"]
15 | disabled = []
16 | components = ${components}
17 |
18 | [components]
19 |
20 | [components.extractor]
21 | @factory = "pdfminer-extractor"
22 |
23 | [components.embedding]
24 | @factory = "box-transformer"
25 | num_heads = 4
26 | dropout_p = 0.1
27 | head_size = 16
28 | activation = "gelu"
29 | init_resweight = 0.01
30 | n_relative_positions = 64
31 | attention_mode = ["c2c", "c2p", "p2c"]
32 | n_layers = 1
33 |
34 | [components.classifier]
35 | @factory = "trainable-classifier"
36 | labels = []
37 | embedding = ${components.embedding}
38 |
39 | [components.embedding.embedding]
40 | @factory = "embedding-combiner"
41 | dropout_p = 0.1
42 |
43 | [components.embedding.embedding.layout_encoder]
44 | @factory = "box-layout-embedding"
45 | n_positions = 64
46 | size = 72
47 | x_mode = "learned"
48 | y_mode = "learned"
49 | w_mode = "learned"
50 | h_mode = "learned"
51 |
52 | [components.embedding.embedding.text_encoder]
53 | @factory = "sub-box-cnn-pooler"
54 | out_channels = 64
55 | kernel_sizes = [3, 4, 5]
56 |
57 | [components.embedding.embedding.text_encoder.embedding]
58 | @factory = "simple-text-embedding"
59 | size = 72
60 |
--------------------------------------------------------------------------------
/tests/recipes/test_markdown_aggregator.py:
--------------------------------------------------------------------------------
1 | from edspdf import registry
2 | from edspdf.pipes.aggregators.simple import SimpleAggregator
3 | from edspdf.structures import PDFDoc, Text
4 |
5 |
@registry.factory.register("markdown-aggregator")
class MarkdownAggregator(SimpleAggregator):
    """Aggregator that renders bold/italic style spans as Markdown markup."""

    def __call__(self, doc: PDFDoc) -> PDFDoc:
        # Let the base aggregator build the plain texts and style spans first
        doc = super().__call__(doc)

        for label, aggregated in doc.aggregated_texts.items():
            raw = aggregated.text
            pieces = []
            cursor = 0

            for span in aggregated.properties:
                if span.begin >= span.end:
                    # Empty (or inverted) spans carry no text
                    continue
                if cursor < span.begin:
                    # Unstyled gap between the previous span and this one
                    pieces.append(raw[cursor : span.begin])
                cursor = span.end

                chunk = raw[span.begin : span.end]
                if span.bold:
                    chunk = f"**{chunk}**"
                if span.italic:
                    chunk = f"_{chunk}_"
                pieces.append(chunk)

            # Trailing unstyled text after the last span
            if cursor < len(raw):
                pieces.append(raw[cursor:])

            # Replace the aggregated text; style properties are dropped on
            # purpose since they are now encoded in the markup itself.
            doc.aggregated_texts[label] = Text(text="".join(pieces))

        return doc
37 |
38 |
def test_markdown_aggregator(styles_pdf):
    """Bold and italic spans end up wrapped in ** / _ markdown markup."""
    from edspdf import Pipeline

    model = Pipeline()
    # extract text lines, keeping the style information
    model.add_pipe("pdfminer-extractor", config=dict(extract_style=True))
    # everything inside the central bounding box becomes `body`
    model.add_pipe(
        "mask-classifier",
        config={"x0": 0.1, "y0": 0.1, "x1": 0.9, "y1": 0.9},
    )
    # glue the lines back together, injecting markdown markup
    model.add_pipe("markdown-aggregator")

    expected = (
        "Let’s up the stakes, with _intra_-word change. Or better yet, **this mi**ght "
        "be hard."
    )
    assert model(styles_pdf).aggregated_texts["body"].text == expected
62 |
--------------------------------------------------------------------------------
/tests/resources/blank.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aphp/edspdf/7177a29534181686d035c85fa667784e7d930482/tests/resources/blank.pdf
--------------------------------------------------------------------------------
/tests/resources/distant-superscript.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aphp/edspdf/7177a29534181686d035c85fa667784e7d930482/tests/resources/distant-superscript.pdf
--------------------------------------------------------------------------------
/tests/resources/error.pdf:
--------------------------------------------------------------------------------
1 | This is not a PDF : it will raise an error if anyone tries to parse it
2 |
3 |
--------------------------------------------------------------------------------
/tests/resources/letter.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aphp/edspdf/7177a29534181686d035c85fa667784e7d930482/tests/resources/letter.pdf
--------------------------------------------------------------------------------
/tests/resources/styles.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aphp/edspdf/7177a29534181686d035c85fa667784e7d930482/tests/resources/styles.pdf
--------------------------------------------------------------------------------
/tests/resources/test.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aphp/edspdf/7177a29534181686d035c85fa667784e7d930482/tests/resources/test.pdf
--------------------------------------------------------------------------------
/tests/utils.py:
--------------------------------------------------------------------------------
1 | import attr
2 | from pytest import approx as pytest_approx
3 |
4 | from edspdf.structures import BaseModel
5 |
6 |
def is_primitive(x):
    """Return True if ``x`` is None or an exact int/float/str/bool instance.

    Uses ``type(x) in (...)`` deliberately so that *subclasses* of the
    primitive types are not considered primitive.
    """
    if x is None:
        return True
    return type(x) in (int, float, str, bool)
9 |
10 |
# Enable the pytest-helpers-namespace plugin so helper functions (such as
# nested_approx below) can be exposed under the ``pytest`` namespace.
pytest_plugins = ["helpers_namespace"]
12 |
13 |
def nested_approx(A, B, abs=1e-6, rel=1e-6, enforce_same_type=False):
    """Recursively compare two nested structures with float tolerance.

    Parameters
    ----------
    A, B:
        Structures to compare: ``BaseModel`` instances, sets, dicts,
        lists/tuples, or primitive leaves.
    abs, rel:
        Absolute / relative tolerances forwarded to ``pytest.approx``
        (``abs`` intentionally shadows the builtin to mirror pytest's API).
    enforce_same_type:
        When True, non-primitive structures must share the exact same type.

    Returns
    -------
    bool

    Fixes over the previous version: the leaf comparison is computed once
    (it was computed twice, with the first result swallowed by the except),
    dicts must have identical key sets (extra keys in B were ignored and
    missing ones raised KeyError), and sequences must have equal lengths
    (a longer B compared equal, a shorter B raised IndexError).
    """
    if enforce_same_type and type(A) is not type(B) and not is_primitive(A):
        # Only enforce identical types for data structures, not primitives
        return False
    if isinstance(A, BaseModel):
        # Compare only the attrs fields that take part in equality (field.eq)
        names = [field.name for field in attr.fields(type(A)) if field.eq]
        return type(A) is type(B) and nested_approx(
            A.dict(filter=lambda a, v: a.name in names),
            B.dict(filter=lambda a, v: a.name in names),
            abs=abs,
            rel=rel,
        )

    elif isinstance(A, set) or isinstance(B, set):
        # Sets are unordered: compare them as sorted lists, but fail if
        # sorting somehow changed a length
        len_A, len_B = len(A), len(B)
        A, B = sorted(A), sorted(B)
        if len_A != len(A) or len_B != len(B) or len(A) != len(B):
            return False

        return all(nested_approx(a, b, abs, rel) for a, b in zip(A, B))
    elif isinstance(A, dict) and isinstance(B, dict):
        # Require identical key sets: extra or missing keys mean "not equal"
        if A.keys() != B.keys():
            return False

        return all(nested_approx(A[k], B[k], abs, rel) for k in A)
    elif isinstance(A, (list, tuple)) and isinstance(B, (list, tuple)):
        # Require equal lengths before the element-wise comparison
        if len(A) != len(B):
            return False

        return all(nested_approx(a, b, abs, rel) for a, b in zip(A, B))
    else:
        # Leaf comparison; TypeError covers values pytest.approx cannot
        # compare (e.g. a string against an approximated number)
        try:
            return A == pytest_approx(B, rel=rel, abs=abs)
        except TypeError:
            return False
62 |
--------------------------------------------------------------------------------
/tests/utils/test_py_utils.py:
--------------------------------------------------------------------------------
1 | from edspdf.utils.collections import (
2 | flatten_dict,
3 | get_deep_attr,
4 | nest_dict,
5 | set_deep_attr,
6 | )
7 |
8 |
def test_nest_dict():
    """Slash-separated flat keys are expanded into nested dictionaries."""
    flat = {"a/b/c": 4, "a/b/d": "ok", "a/x": {"key": "value"}}
    expected = {
        "a": {
            "b": {"c": 4, "d": "ok"},
            "x": {"key": "value"},
        }
    }
    assert nest_dict(flat) == expected

    # An empty mapping stays empty
    assert nest_dict({}) == {}
17 |
18 |
def test_flatten_dict():
    """Nested dicts collapse to slash-joined keys; empty sub-dicts vanish."""
    nested = {
        "a": {
            "b": {"c": 4, "d": "ok"},
            "x": {"key": "value"},
        },
        "empty": {},
    }
    expected = {
        "a/b/c": 4,
        "a/b/d": "ok",
        "a/x/key": "value",
    }
    assert flatten_dict(nested) == expected
35 |
36 |
class Point:
    """Minimal fixture object holding two coordinates and an arbitrary payload."""

    def __init__(self, x, y, data):
        self.x, self.y, self.data = x, y, data
42 |
43 |
def test_deep_attr():
    """get/set_deep_attr navigate mixed attribute, index and key paths."""
    target = [Point(2, 3, {"my": ({"attr": 4},)})]

    # Read through list index -> attribute -> dict key -> tuple index -> key
    assert get_deep_attr(target, "0.data.my.0.attr") == 4

    # Overwrite a deeply nested value
    set_deep_attr(target, "0.data.my.0.attr", 5)
    assert get_deep_attr(target, "0.data.my.0.attr") == 5

    # Replace a tuple element: the containing tuple is rebuilt
    set_deep_attr(target, "0.data.my.0", 5)
    assert get_deep_attr(target, "0.data.my") == (5,)

    # Setting an unknown attribute creates it
    set_deep_attr(target, "0.new_attr", "ok")
    assert get_deep_attr(target, "0.new_attr") == "ok"
59 |
--------------------------------------------------------------------------------
/tests/utils/test_torch_utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from edspdf.utils.torch import pad_2d
4 |
5 |
def test_pad_2d():
    """Ragged rows are right-padded with the fill value to a rectangle."""
    ragged = [
        [0, 1, 2, 3, 4],
        [0, 1, 2, 3, 4, 5, 6],
    ]
    expected = torch.tensor(
        [
            [0, 1, 2, 3, 4, -1, -1],
            [0, 1, 2, 3, 4, 5, 6],
        ]
    )
    torch.testing.assert_close(pad_2d(ragged, pad=-1), expected)

    # An empty input yields an empty tensor on the requested device
    empty = pad_2d([], pad=-1, device=torch.device("cpu"))
    torch.testing.assert_close(empty, torch.tensor([]))
24 |
--------------------------------------------------------------------------------
/tests/visualization/test_annotations.py:
--------------------------------------------------------------------------------
1 | from edspdf.pipes.classifiers.mask import simple_mask_classifier_factory
2 | from edspdf.pipes.extractors.pdfminer import PdfMinerExtractor
3 | from edspdf.visualization import compare_results, merge_boxes, show_annotations
4 |
5 |
def test_pipeline(pdf):
    """End-to-end: extract, mask-classify, merge, and render visualizations."""
    extractor = PdfMinerExtractor()
    classifier = simple_mask_classifier_factory(
        x0=0.1, y0=0.4, x1=0.5, y1=0.9, threshold=0.1
    )

    doc = classifier(extractor(pdf))
    merged = merge_boxes(doc.lines)

    # One rendered page is expected for the single-page fixture
    assert len(show_annotations(pdf, merged)) == 1
    assert len(compare_results(pdf, doc.lines, merged)) == 1
18 | assert len(compare_results(pdf, doc.lines, merged)) == 1
19 |
--------------------------------------------------------------------------------
/tests/visualization/test_merge.py:
--------------------------------------------------------------------------------
1 | from edspdf.pipeline import Pipeline
2 | from edspdf.structures import Box
3 | from edspdf.visualization.merge import merge_boxes
4 |
5 |
def test_merge():
    """Contiguous same-label boxes merge within a page; labels and pages
    are never mixed together."""
    lines = [
        Box(page_num=0, x0=0, x1=1, y0=0, y1=0.1, label="body"),
        Box(page_num=0, x0=0, x1=1, y0=0.1, y1=0.2, label="body"),
        Box(page_num=0, x0=0, x1=0.4, y0=0.2, y1=0.3, label="body"),
        Box(page_num=0, x0=0.6, x1=1, y0=0.2, y1=0.3, label="other"),
        Box(page_num=1, x0=0.6, x1=1, y0=0.2, y1=0.3, label="body"),
    ]
    expected = [
        Box(page_num=0, x0=0.0, x1=1.0, y0=0.0, y1=0.2, label="body"),
        Box(page_num=0, x0=0.0, x1=0.4, y0=0.2, y1=0.3, label="body"),
        Box(page_num=0, x0=0.6, x1=1.0, y0=0.2, y1=0.3, label="other"),
        Box(page_num=1, x0=0.6, x1=1.0, y0=0.2, y1=0.3, label="body"),
    ]

    merged = merge_boxes(lines)

    assert len(merged) == 4
    assert merged == expected
27 |
28 |
def test_pipeline(pdf, blank_pdf):
    """Merging per-page boxes yields 7 boxes for the sample PDF, 0 for blank."""
    model = Pipeline()
    model.add_pipe("pdfminer-extractor")
    model.add_pipe(
        "mask-classifier",
        config=dict(x0=0.1, y0=0.4, x1=0.5, y1=0.9, threshold=0.1),
    )

    def merged_count(document):
        # Total number of merged boxes across all pages
        return sum(len(merge_boxes(page.text_boxes)) for page in model(document).pages)

    assert merged_count(pdf) == 7
    assert merged_count(blank_pdf) == 0
40 |
--------------------------------------------------------------------------------