├── .github └── workflows │ ├── publish.yml │ └── test.yml ├── .gitignore ├── LICENSE ├── README.md ├── requirements.txt ├── setup.cfg ├── setup.py ├── spacy_layout ├── __init__.py ├── layout.py ├── types.py └── util.py └── tests ├── data ├── simple.docx ├── simple.pdf ├── starcraft.pdf ├── table.pdf └── table_document_index.pdf └── test_general.py /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | release: 5 | types: [published] 6 | workflow_dispatch: 7 | 8 | permissions: 9 | contents: read 10 | 11 | jobs: 12 | deploy: 13 | runs-on: ubuntu-latest 14 | environment: 15 | name: pypi 16 | url: https://pypi.org/p/spacy-layout 17 | permissions: 18 | id-token: write 19 | contents: read 20 | 21 | steps: 22 | - uses: actions/checkout@v4 23 | 24 | - name: Set up Python 25 | uses: actions/setup-python@v3 26 | with: 27 | python-version: '3.10' 28 | 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install build 33 | 34 | - name: Build package 35 | run: python -m build 36 | 37 | - name: Publish package 38 | uses: pypa/gh-action-pypi-publish@release/v1 39 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | paths-ignore: 8 | - "*.md" 9 | pull_request: 10 | paths-ignore: 11 | - "*.md" 12 | workflow_dispatch: 13 | 14 | jobs: 15 | run: 16 | runs-on: ubuntu-latest 17 | 18 | steps: 19 | - uses: actions/checkout@v4 20 | 21 | - name: Set up Python 22 | uses: actions/setup-python@v5 23 | with: 24 | python-version: "3.10" 25 | cache: "pip" # caching pip dependencies 26 | 27 | - name: Install 28 | run: | 29 | python -m pip install -U pip 30 | pip install -e . 31 | pip install pytest 32 | 33 | - name: Run tests 34 | shell: bash 35 | run: | 36 | python -m pytest tests 37 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | .pytest_cache/ 4 | .mypy_cache/ 5 | *.py[cod] 6 | *$py.class 7 | py36-64/ 8 | py35-64/ 9 | env3/ 10 | env3.9/ 11 | 12 | # C extensions 13 | *.so 14 | .cython_src/ 15 | cython_src/ 16 | 17 | # Distribution / packaging 18 | .Python 19 | env/ 20 | build/ 21 | develop-eggs/ 22 | dist/ 23 | downloads/ 24 | eggs/ 25 | .eggs/ 26 | lib/ 27 | lib64/ 28 | parts/ 29 | sdist/ 30 | var/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *,cover 54 | .hypothesis/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # IPython Notebook 78 | .ipynb_checkpoints 79 | 80 | # pyenv 81 | .python-version 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # dotenv 87 | .env* 88 | .~env 89 | .env-3.5.0 90 | .env-3.6.2 91 | 92 | # virtualenv 93 | venv/ 94 | ENV/ 95 | .linenv 96 | 97 | # Spyder project settings 98 | .spyderproject 99 | 100 | # Rope project settings 101 | .ropeproject 102 | 103 | # PyCharm project settings 104 | .idea 105 | 106 | # Node 107 | **/node_modules 108 | 109 | # Redis 110 | *.rdb 111 | 112 | /tmp 113 | .vscode 114 | 115 | # Vim 116 | *.swp 117 | 118 | # prodigy 119 | prodigy.json 120 | 121 | # cypress and e2e 122 | e2e_cy/testing_assets/test_*_output.jsonl 123 | e2e_cy/prodigy_home/prodigy_cypress.db 124 | app/cypress/videos 125 | app/cypress/screenshots 126 | app/coverage 127 | 128 | exports 129 | trash 130 | 131 | .DS_Store 132 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 ExplosionAI GmbH 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # spaCy Layout: Process PDFs, Word documents and more with spaCy 4 | 5 | This plugin integrates with [Docling](https://ds4sd.github.io/docling/) to bring structured processing of **PDFs**, **Word documents** and other input formats to your [spaCy](https://spacy.io) pipeline. It outputs clean, **structured data** in a text-based format and creates spaCy's familiar [`Doc`](https://spacy.io/api/doc) objects that let you access labelled text spans like sections or headings, and tables with their data converted to a `pandas.DataFrame`. 6 | 7 | This workflow makes it easy to apply powerful **NLP techniques** to your documents, including linguistic analysis, named entity recognition, text classification and more. It's also great for implementing **chunking for RAG** pipelines. 8 | 9 | > 📖 **Blog post:** ["From PDFs to AI-ready structured data: a deep dive" 10 | ](https://explosion.ai/blog/pdfs-nlp-structured-data) – A new modular workflow for converting PDFs and similar documents to structured data, featuring `spacy-layout` and Docling. 11 | 12 | [![Test](https://github.com/explosion/spacy-layout/actions/workflows/test.yml/badge.svg)](https://github.com/explosion/spacy-layout/actions/workflows/test.yml) 13 | [![Current Release Version](https://img.shields.io/github/release/explosion/spacy-layout.svg?style=flat-square&logo=github&include_prereleases)](https://github.com/explosion/spacy-layout/releases) 14 | [![pypi Version](https://img.shields.io/pypi/v/spacy-layout.svg?style=flat-square&logo=pypi&logoColor=white)](https://pypi.org/project/spacy-layout/) 15 | [![Built with spaCy](https://img.shields.io/badge/built%20with-spaCy-09a3d5.svg?style=flat-square)](https://spacy.io) 16 | 17 | ## 📝 Usage 18 | 19 | > ⚠️ This package requires **Python 3.10** or above. 20 | 21 | ```bash 22 | pip install spacy-layout 23 | ``` 24 | 25 | After initializing the `spaCyLayout` preprocessor with an `nlp` object for tokenization, you can call it on a document path to convert it to structured data. The resulting `Doc` object includes layout spans that map into the original raw text and expose various attributes, including the content type and layout features. 26 | 27 | ```python 28 | import spacy 29 | from spacy_layout import spaCyLayout 30 | 31 | nlp = spacy.blank("en") 32 | layout = spaCyLayout(nlp) 33 | 34 | # Process a document and create a spaCy Doc object 35 | doc = layout("./starcraft.pdf") 36 | 37 | # The text-based contents of the document 38 | print(doc.text) 39 | # Document layout including pages and page sizes 40 | print(doc._.layout) 41 | # Tables in the document and their extracted data 42 | print(doc._.tables) 43 | # Markdown representation of the document 44 | print(doc._.markdown) 45 | 46 | # Layout spans for different sections 47 | for span in doc.spans["layout"]: 48 | # Document section and token and character offsets into the text 49 | print(span.text, span.start, span.end, span.start_char, span.end_char) 50 | # Section type, e.g. "text", "title", "section_header" etc. 51 | print(span.label_) 52 | # Layout features of the section, including bounding box 53 | print(span._.layout) 54 | # Closest heading to the span (accuracy depends on document structure) 55 | print(span._.heading) 56 | ``` 57 | 58 | If you need to process larger volumes of documents at scale, you can use the `spaCyLayout.pipe` method, which takes an iterable of paths or bytes instead and yields `Doc` objects: 59 | 60 | ```python 61 | paths = ["one.pdf", "two.pdf", "three.pdf", ...] 62 | for doc in layout.pipe(paths): 63 | print(doc._.layout) 64 | ``` 65 | 66 | spaCy also allows you to call the `nlp` object on an already created `Doc`, so you can easily apply a pipeline of components for [linguistic analysis](https://spacy.io/usage/linguistic-features) or [named entity recognition](https://spacy.io/usage/linguistic-features#named-entities), use [rule-based matching](https://spacy.io/usage/rule-based-matching) or anything else you can do with spaCy. 67 | 68 | ```python 69 | # Load the transformer-based English pipeline 70 | # Installation: python -m spacy download en_core_web_trf 71 | nlp = spacy.load("en_core_web_trf") 72 | layout = spaCyLayout(nlp) 73 | 74 | doc = layout("./starcraft.pdf") 75 | # Apply the pipeline to access POS tags, dependencies, entities etc. 76 | doc = nlp(doc) 77 | ``` 78 | 79 | ### Tables and tabular data 80 | 81 | Tables are included in the layout spans with the label `"table"` and under the shortcut `Doc._.tables`. They expose a `layout` extension attribute, as well as an attribute `data`, which includes the tabular data converted to a `pandas.DataFrame`. 82 | 83 | ```python 84 | for table in doc._.tables: 85 | # Token position and bounding box 86 | print(table.start, table.end, table._.layout) 87 | # pandas.DataFrame of contents 88 | print(table._.data) 89 | ``` 90 | 91 | By default, the span text is a placeholder `TABLE`, but you can customize how a table is rendered by providing a `display_table` callback to `spaCyLayout`, which receives the `pandas.DataFrame` of the data. This allows you to include the table figures in the document text and use them later on, e.g. during information extraction with a trained named entity recognizer or text classifier. 92 | 93 | ```python 94 | def display_table(df: pd.DataFrame) -> str: 95 | return f"Table with columns: {', '.join(df.columns.tolist())}" 96 | 97 | layout = spaCyLayout(nlp, display_table=display_table) 98 | ``` 99 | 100 | ### Serialization 101 | 102 | After you've processed the documents, you can [serialize](https://spacy.io/usage/saving-loading#docs) the structured `Doc` objects in spaCy's efficient binary format, so you don't have to re-run the resource-intensive conversion. 103 | 104 | ```python 105 | from spacy.tokens import DocBin 106 | 107 | docs = layout.pipe(["one.pdf", "two.pdf", "three.pdf"]) 108 | doc_bin = DocBin(docs=docs, store_user_data=True) 109 | doc_bin.to_disk("./file.spacy") 110 | ``` 111 | 112 | > ⚠️ **Note on deserializing with extension attributes:** The custom extension attributes like `Doc._.layout` are currently registered when `spaCyLayout` is initialized. So if you're loading back `Doc` objects with layout information from a binary file, you'll need to initialize it so the custom attributes can be repopulated. We're planning on making this more elegant in an upcoming version. 113 | > 114 | > ```diff 115 | > + layout = spacyLayout(nlp) 116 | > doc_bin = DocBin(store_user_data=True).from_disk("./file.spacy") 117 | > docs = list(doc_bin.get_docs(nlp.vocab)) 118 | > ``` 119 | 120 | 121 | ## 🎛️ API 122 | 123 | ### Data and extension attributes 124 | 125 | ```python 126 | layout = spaCyLayout(nlp) 127 | doc = layout("./starcraft.pdf") 128 | print(doc._.layout) 129 | for span in doc.spans["layout"]: 130 | print(span.label_, span._.layout) 131 | ``` 132 | 133 | | Attribute | Type | Description | 134 | | --- | --- | --- | 135 | | `Doc._.layout` | `DocLayout` | Layout features of the document. | 136 | | `Doc._.pages` | `list[tuple[PageLayout, list[Span]]]` | Pages in the document and the spans they contain. | 137 | | `Doc._.tables` | `list[Span]` | All tables in the document. | 138 | | `Doc._.markdown` | `str` | Markdown representation of the document. | 139 | | `Doc.spans["layout"]` | `spacy.tokens.SpanGroup` | The layout spans in the document. | 140 | | `Span.label_` | `str` | The type of the extracted layout span, e.g. `"text"` or `"section_header"`. [See here](https://github.com/DS4SD/docling-core/blob/14cad33ae7f8dc011a79dd364361d2647c635466/docling_core/types/doc/labels.py) for options. | 141 | | `Span.label` | `int` | The integer ID of the span label. | 142 | | `Span.id` | `int` | Running index of layout span. | 143 | | `Span._.layout` | `SpanLayout \| None` | Layout features of a layout span. | 144 | | `Span._.heading` | `Span \| None` | Closest heading to a span, if available. | 145 | | `Span._.data` | `pandas.DataFrame \| None` | The extracted data for table spans. 146 | 147 | ### dataclass PageLayout 148 | 149 | | Attribute | Type | Description | 150 | | --- | --- | --- | 151 | | `page_no` | `int` | The page number (1-indexed). | 152 | | `width` | `float` | Page width in pixels. | 153 | | `height` | `float` | Page height in pixels. | 154 | 155 | ### dataclass DocLayout 156 | 157 | | Attribute | Type | Description | 158 | | --- | --- | --- | 159 | | `pages` | `list[PageLayout]` | The pages in the document. | 160 | 161 | ### dataclass SpanLayout 162 | 163 | | Attribute | Type | Description | 164 | | --- | --- | --- | 165 | | `x` | `float` | Horizontal offset of the bounding box in pixels. | 166 | | `y` | `float` | Vertical offset of the bounding box in pixels. | 167 | | `width` | `float` | Width of the bounding box in pixels. | 168 | | `height` | `float` | Height of the bounding box in pixels. | 169 | | `page_no` | `int` | Number of page the span is on. | 170 | 171 | ### class `spaCyLayout` 172 | 173 | #### method `spaCyLayout.__init__` 174 | 175 | Initialize the document processor. 176 | 177 | ```python 178 | nlp = spacy.blank("en") 179 | layout = spaCyLayout(nlp) 180 | ``` 181 | 182 | | Argument | Type | Description | 183 | | --- | --- | --- | 184 | | `nlp` | `spacy.language.Language` | The initialized `nlp` object to use for tokenization. | 185 | | `separator` | `str` | Token used to separate sections in the created `Doc` object. The separator won't be part of the layout span. If `None`, no separator will be added. Defaults to `"\n\n"`. | 186 | | `attrs` | `dict[str, str]` | Override the custom spaCy attributes. Can include `"doc_layout"`, `"doc_pages"`, `"doc_tables"`, `"doc_markdown"`, `"span_layout"`, `"span_data"`, `"span_heading"` and `"span_group"`. | 187 | | `headings` | `list[str]` | Labels of headings to consider for `Span._.heading` detection. Defaults to `["section_header", "page_header", "title"]`. | 188 | | `display_table` | `Callable[[pandas.DataFrame], str] \| str` | Function to generate the text-based representation of the table in the `Doc.text` or placeholder text. Defaults to `"TABLE"`. | 189 | | `docling_options` | `dict[InputFormat, FormatOption]` | [Format options](https://ds4sd.github.io/docling/usage/#advanced-options) passed to Docling's `DocumentConverter`. | 190 | | **RETURNS** | `spaCyLayout` | The initialized object. | 191 | 192 | #### method `spaCyLayout.__call__` 193 | 194 | Process a document and create a spaCy [`Doc`](https://spacy.io/api/doc) object containing the text content and layout spans, available via `Doc.spans["layout"]` by default. 195 | 196 | ```python 197 | layout = spaCyLayout(nlp) 198 | doc = layout("./starcraft.pdf") 199 | ``` 200 | 201 | | Argument | Type | Description | 202 | | --- | --- | --- | 203 | | `source` | `str \| Path \| bytes \| DoclingDocument` | Path of document to process, bytes or already created `DoclingDocument`. | 204 | | **RETURNS** | `Doc` | The processed spaCy `Doc` object. | 205 | 206 | #### method `spaCyLayout.pipe` 207 | 208 | Process multiple documents and create spaCy [`Doc`](https://spacy.io/api/doc) objects. You should use this method if you're processing larger volumes of documents at scale. The behavior of `as_tuples` works like it does in spaCy's [`Language.pipe`](https://spacy.io/api/language#pipe). 209 | 210 | ```python 211 | layout = spaCyLayout(nlp) 212 | paths = ["one.pdf", "two.pdf", "three.pdf", ...] 213 | docs = layout.pipe(paths) 214 | ``` 215 | 216 | ```python 217 | sources = [("one.pdf", {"id": 1}), ("two.pdf", {"id": 2})] 218 | for doc, context in layout.pipe(sources, as_tuples=True): 219 | ... 220 | ``` 221 | 222 | | Argument | Type | Description | 223 | | --- | --- | --- | 224 | | `sources` | `Iterable[str \| Path \| bytes] \| Iterable[tuple[str \| Path \| bytes, Any]]` | Paths of documents to process or bytes, or `(source, context)` tuples if `as_tuples` is set to `True`. | 225 | | `as_tuples` | `bool` | If set to `True`, inputs should be an iterable of `(source, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. | 226 | | **YIELDS** | `Doc \| tuple[Doc, Any]` | The processed spaCy `Doc` objects or `(doc, context)` tuples if `as_tuples` is set to `True`. | 227 | 228 | ## 💡 Examples and code snippets 229 | 230 | This section includes further examples of what you can do with `spacy-layout`. If you have an example that could be a good fit, feel free to submit a [pull request](https://github.com/explosion/spacy-layout/pulls)! 231 | 232 | ### Visualize a page and bounding boxes with matplotlib 233 | 234 | ```python 235 | import pypdfium2 as pdfium 236 | import matplotlib.pyplot as plt 237 | from matplotlib.patches import Rectangle 238 | import spacy 239 | from spacy_layout import spaCyLayout 240 | 241 | DOCUMENT_PATH = "./document.pdf" 242 | 243 | # Load and convert the PDF page to an image 244 | pdf = pdfium.PdfDocument(DOCUMENT_PATH) 245 | page_image = pdf[2].render(scale=1) # get page 3 (index 2) 246 | numpy_array = page_image.to_numpy() 247 | # Process document with spaCy 248 | nlp = spacy.blank("en") 249 | layout = spaCyLayout(nlp) 250 | doc = layout(DOCUMENT_PATH) 251 | 252 | # Get page 3 layout and sections 253 | page = doc._.pages[2] 254 | page_layout = doc._.layout.pages[2] 255 | # Create figure and axis with page dimensions 256 | fig, ax = plt.subplots(figsize=(12, 16)) 257 | # Display the PDF image 258 | ax.imshow(numpy_array) 259 | # Add rectangles for each section's bounding box 260 | for section in page[1]: 261 | # Create rectangle patch 262 | rect = Rectangle( 263 | (section._.layout.x, section._.layout.y), 264 | section._.layout.width, 265 | section._.layout.height, 266 | fill=False, 267 | color="blue", 268 | linewidth=1, 269 | alpha=0.5 270 | ) 271 | ax.add_patch(rect) 272 | # Add text label at top of box 273 | ax.text( 274 | section._.layout.x, 275 | section._.layout.y, 276 | section.label_, 277 | fontsize=8, 278 | color="red", 279 | verticalalignment="bottom" 280 | ) 281 | 282 | ax.axis("off") # hide axes 283 | plt.show() 284 | ``` 285 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | spacy>=3.7.5 2 | docling>=2.5.2 3 | pandas # version range set by Docling 4 | srsly # version range set by spaCy 5 | # Dev requirements 6 | pytest 7 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | version = 0.0.12 3 | description = Use spaCy with PDFs, Word docs and other documents 4 | url = https://github.com/explosion/spacy-layout 5 | author = Explosion 6 | author_email = contact@explosion.ai 7 | license = MIT 8 | long_description = file: README.md 9 | long_description_content_type = text/markdown 10 | classifiers = 11 | Development Status :: 4 - Beta 12 | Environment :: Console 13 | Intended Audience :: Developers 14 | Intended Audience :: Science/Research 15 | License :: OSI Approved :: MIT License 16 | Operating System :: POSIX :: Linux 17 | Operating System :: MacOS :: MacOS X 18 | Operating System :: Microsoft :: Windows 19 | Programming Language :: Python :: 3 20 | Programming Language :: Python :: 3.10 21 | Programming Language :: Python :: 3.11 22 | Programming Language :: Python :: 3.12 23 | Topic :: Scientific/Engineering 24 | project_urls = 25 | Release notes = https://github.com/explosion/spacy-layout/releases 26 | Source = https://github.com/explosion/spacy-layout 27 | 28 | [options] 29 | zip_safe = true 30 | python_requires = >=3.10 31 | install_requires = 32 | spacy>=3.7.5 33 | docling>=2.5.2 34 | pandas # version range set by Docling 35 | srsly # version range set by spaCy 36 | 37 | [bdist_wheel] 38 | universal = true 39 | 40 | [sdist] 41 | formats = gztar 42 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | if __name__ == "__main__": 4 | from setuptools import find_packages, setup 5 | 6 | setup(name="spacy_layout", packages=find_packages()) 7 | -------------------------------------------------------------------------------- /spacy_layout/__init__.py: -------------------------------------------------------------------------------- 1 | from .layout import spaCyLayout 2 | 3 | __all__ = ["spaCyLayout"] 4 | -------------------------------------------------------------------------------- /spacy_layout/layout.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | from pathlib import Path 3 | from typing import ( 4 | TYPE_CHECKING, 5 | Callable, 6 | Iterable, 7 | Iterator, 8 | Literal, 9 | TypeVar, 10 | cast, 11 | overload, 12 | ) 13 | 14 | import srsly 15 | from docling.datamodel.base_models import DocumentStream 16 | from docling.document_converter import DocumentConverter 17 | from docling_core.types.doc.document import DoclingDocument 18 | from docling_core.types.doc.labels import DocItemLabel 19 | from spacy.tokens import Doc, Span, SpanGroup 20 | 21 | from .types import Attrs, DocLayout, DoclingItem, PageLayout, SpanLayout 22 | from .util import decode_df, decode_obj, encode_df, encode_obj, get_bounding_box 23 | 24 | if TYPE_CHECKING: 25 | from docling.datamodel.base_models import InputFormat 26 | from docling.document_converter import FormatOption 27 | from pandas import DataFrame 28 | from spacy.language import Language 29 | 30 | # Type variable for contexts piped with documents 31 | _AnyContext = TypeVar("_AnyContext") 32 | 33 | TABLE_PLACEHOLDER = "TABLE" 34 | TABLE_ITEM_LABELS = [DocItemLabel.TABLE, DocItemLabel.DOCUMENT_INDEX] 35 | 36 | # Register msgpack encoders and decoders for custom types 37 | srsly.msgpack_encoders.register("spacy-layout.dataclass", func=encode_obj) 38 | srsly.msgpack_decoders.register("spacy-layout.dataclass", func=decode_obj) 39 | srsly.msgpack_encoders.register("spacy-layout.dataframe", func=encode_df) 40 | srsly.msgpack_decoders.register("spacy-layout.dataframe", func=decode_df) 41 | 42 | 43 | class spaCyLayout: 44 | def __init__( 45 | self, 46 | nlp: "Language", 47 | separator: str | None = "\n\n", 48 | attrs: dict[str, str] = {}, 49 | headings: list[str] = [ 50 | DocItemLabel.SECTION_HEADER, 51 | DocItemLabel.PAGE_HEADER, 52 | DocItemLabel.TITLE, 53 | ], 54 | display_table: Callable[["DataFrame"], str] | str = TABLE_PLACEHOLDER, 55 | docling_options: dict["InputFormat", "FormatOption"] | None = None, 56 | ) -> None: 57 | """Initialize the layout parser and Docling converter.""" 58 | self.nlp = nlp 59 | self.sep = separator 60 | self.attrs = Attrs( 61 | doc_layout=attrs.get("doc_layout", "layout"), 62 | doc_pages=attrs.get("doc_pages", "pages"), 63 | doc_tables=attrs.get("doc_tables", "tables"), 64 | doc_markdown=attrs.get("doc_markdown", "markdown"), 65 | span_layout=attrs.get("span_layout", "layout"), 66 | span_heading=attrs.get("span_heading", "heading"), 67 | span_data=attrs.get("span_data", "data"), 68 | span_group=attrs.get("span_group", "layout"), 69 | ) 70 | self.headings = headings 71 | self.display_table = display_table 72 | self.converter = DocumentConverter(format_options=docling_options) 73 | # Set spaCy extension attributes for custom data 74 | Doc.set_extension(self.attrs.doc_layout, default=None, force=True) 75 | Doc.set_extension(self.attrs.doc_pages, getter=self.get_pages, force=True) 76 | Doc.set_extension(self.attrs.doc_tables, getter=self.get_tables, force=True) 77 | Doc.set_extension(self.attrs.doc_markdown, default=None, force=True) 78 | Span.set_extension(self.attrs.span_layout, default=None, force=True) 79 | Span.set_extension(self.attrs.span_data, default=None, force=True) 80 | Span.set_extension(self.attrs.span_heading, getter=self.get_heading, force=True) 81 | 82 | def __call__(self, source: str | Path | bytes | DoclingDocument) -> Doc: 83 | """Call parser on a path to create a spaCy Doc object.""" 84 | if isinstance(source, DoclingDocument): 85 | result = source 86 | else: 87 | result = self.converter.convert(self._get_source(source)).document 88 | return self._result_to_doc(result) 89 | 90 | @overload 91 | def pipe( 92 | self, 93 | sources: Iterable[str | Path | bytes], 94 | as_tuples: Literal[False] = ..., 95 | ) -> Iterator[Doc]: ... 96 | 97 | @overload 98 | def pipe( 99 | self, 100 | sources: Iterable[tuple[str | Path | bytes, _AnyContext]], 101 | as_tuples: Literal[True] = ..., 102 | ) -> Iterator[tuple[Doc, _AnyContext]]: ... 103 | 104 | def pipe( 105 | self, 106 | sources: ( 107 | Iterable[str | Path | bytes] 108 | | Iterable[tuple[str | Path | bytes, _AnyContext]] 109 | ), 110 | as_tuples: bool = False, 111 | ) -> Iterator[Doc] | Iterator[tuple[Doc, _AnyContext]]: 112 | """Process multiple documents and create spaCy Doc objects.""" 113 | if as_tuples: 114 | sources = cast(Iterable[tuple[str | Path | bytes, _AnyContext]], sources) 115 | data = (self._get_source(source) for source, _ in sources) 116 | contexts = (context for _, context in sources) 117 | results = self.converter.convert_all(data) 118 | for result, context in zip(results, contexts): 119 | yield (self._result_to_doc(result.document), context) 120 | else: 121 | sources = cast(Iterable[str | Path | bytes], sources) 122 | data = (self._get_source(source) for source in sources) 123 | results = self.converter.convert_all(data) 124 | for result in results: 125 | yield self._result_to_doc(result.document) 126 | 127 | def _get_source(self, source: str | Path | bytes) -> str | Path | DocumentStream: 128 | if isinstance(source, (str, Path)): 129 | return source 130 | return DocumentStream(name="source", stream=BytesIO(source)) 131 | 132 | def _result_to_doc(self, document: DoclingDocument) -> Doc: 133 | inputs = [] 134 | pages = { 135 | (page.page_no): PageLayout( 136 | page_no=page.page_no, 137 | width=page.size.width if page.size else 0, 138 | height=page.size.height if page.size else 0, 139 | ) 140 | for _, page in document.pages.items() 141 | } 142 | text_items = {item.self_ref: item for item in document.texts} 143 | table_items = {item.self_ref: item for item in document.tables} 144 | # We want to iterate over the tree to get different elements in order 145 | for node, _ in document.iterate_items(): 146 | if node.self_ref in text_items: 147 | item = text_items[node.self_ref] 148 | if item.text == "": 149 | continue 150 | inputs.append((item.text, item)) 151 | elif node.self_ref in table_items: 152 | item = table_items[node.self_ref] 153 | if isinstance(self.display_table, str): 154 | table_text = self.display_table 155 | else: 156 | table_text = self.display_table(item.export_to_dataframe()) 157 | inputs.append((table_text, item)) 158 | doc = self._texts_to_doc(inputs, pages) 159 | doc._.set(self.attrs.doc_layout, DocLayout(pages=[p for p in pages.values()])) 160 | doc._.set(self.attrs.doc_markdown, document.export_to_markdown()) 161 | return doc 162 | 163 | def _texts_to_doc( 164 | self, inputs: list[tuple[str, DoclingItem]], pages: dict[int, PageLayout] 165 | ) -> Doc: 166 | """Convert Docling structure to spaCy Doc.""" 167 | words = [] 168 | spaces = [] 169 | span_data = [] 170 | token_idx = 0 171 | # Tokenize the span because we can't rely on the document parsing to 172 | # give us items that are not split across token boundaries 173 | with self.nlp.select_pipes(disable=self.nlp.pipe_names): 174 | for span_doc, item in self.nlp.pipe(inputs, as_tuples=True): 175 | words += [token.text for token in span_doc] 176 | spaces += [bool(token.whitespace_) for token in span_doc] 177 | # Add separator token and don't include it in the layout span 178 | if self.sep: 179 | words.append(self.sep) 180 | spaces[-1] = False 181 | spaces.append(False) 182 | end = token_idx + len(span_doc) 183 | span_data.append((item, token_idx, end)) 184 | token_idx += len(span_doc) + (1 if self.sep else 0) 185 | doc = Doc(self.nlp.vocab, words=words, spaces=spaces) 186 | spans = [] 187 | for i, (item, start, end) in enumerate(span_data): 188 | span = Span(doc, start=start, end=end, label=item.label, span_id=i) 189 | layout = self._get_span_layout(item, pages) 190 | span._.set(self.attrs.span_layout, layout) 191 | if item.label in TABLE_ITEM_LABELS: 192 | span._.set(self.attrs.span_data, item.export_to_dataframe()) 193 | spans.append(span) 194 | doc.spans[self.attrs.span_group] = SpanGroup( 195 | doc, name=self.attrs.span_group, spans=spans 196 | ) 197 | return doc 198 | 199 | def _get_span_layout( 200 | self, item: DoclingItem, pages: dict[int, PageLayout] 201 | ) -> SpanLayout | None: 202 | if item.prov: 203 | prov = item.prov[0] 204 | page = pages[prov.page_no] 205 | if page.width and page.height: 206 | x, y, width, height = get_bounding_box(prov.bbox, page.height) 207 | return SpanLayout( 208 | x=x, y=y, width=width, height=height, page_no=prov.page_no 209 | ) 210 | 211 | def get_pages(self, doc: Doc) -> list[tuple[PageLayout, list[Span]]]: 212 | """Get all pages and their layout spans.""" 213 | layout = doc._.get(self.attrs.doc_layout) 214 | pages = {page.page_no: page for page in layout.pages} 215 | page_spans = {page.page_no: [] for page in layout.pages} 216 | for span in doc.spans[self.attrs.span_group]: 217 | span_layout = span._.get(self.attrs.span_layout) 218 | page_spans[span_layout.page_no].append(span) 219 | return [(pages[i], page_spans[i]) for i in page_spans] 220 | 221 | def get_heading(self, span: Span) -> Span | None: 222 | """Get the closest heading for a span.""" 223 | spans = list(span.doc.spans[self.attrs.span_group]) 224 | if span.label_ not in self.headings: 225 | # Go through previous layout spans in reverse and find first match 226 | for candidate in spans[: span.id][::-1]: 227 | if candidate.label_ in self.headings: 228 | return candidate 229 | 230 | def get_tables(self, doc: Doc) -> list[Span]: 231 | """Get all tables in the document.""" 232 | return [ 233 | span 234 | for span in doc.spans[self.attrs.span_group] 235 | if span.label_ in TABLE_ITEM_LABELS 236 | ] 237 | -------------------------------------------------------------------------------- /spacy_layout/types.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | from docling_core.types.doc.document import ( 4 | ListItem, 5 | SectionHeaderItem, 6 | TableItem, 7 | TextItem, 8 | ) 9 | 10 | DoclingItem = ListItem | SectionHeaderItem | TextItem | TableItem 11 | 12 | 13 | @dataclass 14 | class Attrs: 15 | """Custom atributes used to extend spaCy""" 16 | 17 | doc_layout: str 18 | doc_pages: str 19 | doc_tables: str 20 | doc_markdown: str 21 | span_layout: str 22 | span_data: str 23 | span_heading: str 24 | span_group: str 25 | 26 | 27 | @dataclass 28 | class PageLayout: 29 | page_no: int 30 | width: float 31 | height: float 32 | 33 | @classmethod 34 | def from_dict(cls, data: dict) -> "PageLayout": 35 | return cls(**data) 36 | 37 | 38 | @dataclass 39 | class DocLayout: 40 | """Document layout features added to Doc object""" 41 | 42 | pages: list[PageLayout] 43 | 44 | @classmethod 45 | def from_dict(cls, data: dict) -> "DocLayout": 46 | pages = [PageLayout.from_dict(page) for page in data.get("pages", [])] 47 | return cls(pages=pages) 48 | 49 | 50 | @dataclass 51 | class SpanLayout: 52 | """Text span layout features added to Span object""" 53 | 54 | x: float 55 | y: float 56 | width: float 57 | height: float 58 | page_no: int 59 | 60 | @classmethod 61 | def from_dict(cls, data: dict) -> "SpanLayout": 62 | return cls(**data) 63 | -------------------------------------------------------------------------------- /spacy_layout/util.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | from typing import TYPE_CHECKING, Any, Callable 3 | 4 | from docling_core.types.doc.base import CoordOrigin 5 | from pandas import DataFrame 6 | 7 | from .types import DocLayout, PageLayout, SpanLayout 8 | 9 | if TYPE_CHECKING: 10 | from docling_core.types.doc.base import BoundingBox 11 | 12 | TYPE_ATTR = "__type__" 13 | OBJ_TYPES = {"SpanLayout": SpanLayout, "DocLayout": DocLayout, "PageLayout": PageLayout} 14 | 15 | 16 | def encode_obj(obj: Any, chain: Callable | None = None) -> Any: 17 | """Convert custom dataclass to dict for serialization.""" 18 | if isinstance(obj, tuple(OBJ_TYPES.values())): 19 | result = dataclasses.asdict(obj) 20 | result[TYPE_ATTR] = type(obj).__name__ 21 | return result 22 | return obj if chain is None else chain(obj) 23 | 24 | 25 | def decode_obj(obj: Any, chain: Callable | None = None) -> Any: 26 | """Load custom dataclass from serialized dict.""" 27 | if isinstance(obj, dict) and obj.get(TYPE_ATTR) in OBJ_TYPES: 28 | obj_type = obj.pop(TYPE_ATTR) 29 | return OBJ_TYPES[obj_type].from_dict(obj) 30 | return obj if chain is None else chain(obj) 31 | 32 | 33 | def encode_df(obj: Any, chain: Callable | None = None) -> Any: 34 | """Convert pandas.DataFrame for serialization.""" 35 | if isinstance(obj, DataFrame): 36 | return {"data": obj.to_dict(), TYPE_ATTR: "DataFrame"} 37 | return obj if chain is None else chain(obj) 38 | 39 | 40 | def decode_df(obj: Any, chain: Callable | None = None) -> Any: 41 | """Load pandas.DataFrame from serialized data.""" 42 | if isinstance(obj, dict) and obj.get(TYPE_ATTR) == "DataFrame": 43 | return DataFrame(obj["data"]) 44 | return obj if chain is None else chain(obj) 45 | 46 | 47 | def get_bounding_box( 48 | bbox: "BoundingBox", page_height: float 49 | ) -> tuple[float, float, float, float]: 50 | is_bottom = bbox.coord_origin == CoordOrigin.BOTTOMLEFT 51 | y = page_height - bbox.t if is_bottom else bbox.t 52 | height = bbox.t - bbox.b if is_bottom else bbox.b - bbox.t 53 | width = bbox.r - bbox.l 54 | return (bbox.l, y, width, height) 55 | -------------------------------------------------------------------------------- /tests/data/simple.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spacy-layout/1e6a51ac8abc6389b99f30981b0b31ab49caf5fb/tests/data/simple.docx -------------------------------------------------------------------------------- /tests/data/simple.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spacy-layout/1e6a51ac8abc6389b99f30981b0b31ab49caf5fb/tests/data/simple.pdf -------------------------------------------------------------------------------- /tests/data/starcraft.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spacy-layout/1e6a51ac8abc6389b99f30981b0b31ab49caf5fb/tests/data/starcraft.pdf -------------------------------------------------------------------------------- /tests/data/table.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spacy-layout/1e6a51ac8abc6389b99f30981b0b31ab49caf5fb/tests/data/table.pdf -------------------------------------------------------------------------------- /tests/data/table_document_index.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/explosion/spacy-layout/1e6a51ac8abc6389b99f30981b0b31ab49caf5fb/tests/data/table_document_index.pdf -------------------------------------------------------------------------------- /tests/test_general.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | import spacy 5 | import srsly 6 | from docling_core.types.doc.base import BoundingBox, CoordOrigin 7 | from docling_core.types.doc.labels import DocItemLabel 8 | from pandas import DataFrame 9 | from pandas.testing import assert_frame_equal 10 | from spacy.tokens import DocBin 11 | import pandas as pd 12 | 13 | from spacy_layout import spaCyLayout 14 | from spacy_layout.layout import TABLE_PLACEHOLDER, get_bounding_box 15 | from spacy_layout.types import DocLayout, PageLayout, SpanLayout 16 | 17 | PDF_STARCRAFT = Path(__file__).parent / "data" / "starcraft.pdf" 18 | PDF_SIMPLE = Path(__file__).parent / "data" / "simple.pdf" 19 | DOCX_SIMPLE = Path(__file__).parent / "data" / "simple.docx" 20 | PDF_SIMPLE_BYTES = PDF_SIMPLE.open("rb").read() 21 | PDF_TABLE = Path(__file__).parent / "data" / "table.pdf" 22 | PDF_INDEX = Path(__file__).parent / "data" / "table_document_index.pdf" 23 | 24 | 25 | @pytest.fixture 26 | def nlp(): 27 | return spacy.blank("en") 28 | 29 | 30 | @pytest.fixture 31 | def span_labels(): 32 | return [label.value for label in DocItemLabel] 33 | 34 | 35 | @pytest.mark.parametrize("path", [PDF_STARCRAFT, PDF_SIMPLE, PDF_SIMPLE_BYTES]) 36 | def test_general(path, nlp, span_labels): 37 | layout = spaCyLayout(nlp) 38 | doc = layout(path) 39 | assert isinstance(doc._.get(layout.attrs.doc_layout), DocLayout) 40 | for span in doc.spans[layout.attrs.span_group]: 41 | assert span.text 42 | assert span.label_ in span_labels 43 | assert isinstance(span._.get(layout.attrs.span_layout), SpanLayout) 44 | 45 | 46 | @pytest.mark.parametrize("path, pg_no", [(PDF_STARCRAFT, 6), (PDF_SIMPLE, 1)]) 47 | def test_pages(path, pg_no, nlp): 48 | layout = spaCyLayout(nlp) 49 | doc = layout(path) 50 | # This should not raise a KeyError when accessing `pages` dict 51 | # Key Error would mean a mismatched pagination on document layout and span layout 52 | result = layout.get_pages(doc) 53 | assert len(result) == pg_no 54 | assert result[0][0].page_no == 1 55 | if pg_no == 6: # there should be 16 or 18 spans on the pg_no 1 56 | assert len(result[0][1]) in (16, 18) 57 | elif pg_no == 1: # there should be 4 spans on pg_no 1 58 | assert len(result[0][1]) == 4 59 | 60 | 61 | @pytest.mark.parametrize("path", [PDF_SIMPLE, DOCX_SIMPLE]) 62 | @pytest.mark.parametrize("separator", ["\n\n", ""]) 63 | def test_simple(path, separator, nlp): 64 | layout = spaCyLayout(nlp, separator=separator) 65 | doc = layout(path) 66 | assert len(doc.spans[layout.attrs.span_group]) == 4 67 | assert doc.text.startswith(f"Lorem ipsum dolor sit amet{separator}") 68 | assert doc.spans[layout.attrs.span_group][0].text == "Lorem ipsum dolor sit amet" 69 | 70 | 71 | def test_simple_pipe(nlp): 72 | layout = spaCyLayout(nlp) 73 | for doc in layout.pipe([PDF_SIMPLE, DOCX_SIMPLE]): 74 | assert len(doc.spans[layout.attrs.span_group]) == 4 75 | 76 | 77 | def test_simple_pipe_as_tuples(nlp): 78 | layout = spaCyLayout(nlp) 79 | data = [(PDF_SIMPLE, "pdf"), (DOCX_SIMPLE, "docx")] 80 | result = list(layout.pipe(data, as_tuples=True)) 81 | for doc, _ in result: 82 | assert len(doc.spans[layout.attrs.span_group]) == 4 83 | assert [context for _, context in result] == ["pdf", "docx"] 84 | 85 | 86 | def test_table(nlp): 87 | layout = spaCyLayout(nlp) 88 | doc = layout(PDF_TABLE) 89 | assert len(doc._.get(layout.attrs.doc_tables)) == 1 90 | table = doc._.get(layout.attrs.doc_tables)[0] 91 | assert table.text == TABLE_PLACEHOLDER 92 | df = table._.get(layout.attrs.span_data) 93 | assert df.columns.tolist() == ["Name", "Type", "Place of birth"] 94 | assert df.to_dict(orient="list") == { 95 | "Name": ["Ines", "Matt", "Baikal", "Stanislav Petrov"], 96 | "Type": ["human", "human", "cat", "cat"], 97 | "Place of birth": [ 98 | "Cologne, Germany", 99 | "Sydney, Australia", 100 | "Berlin, Germany", 101 | "Chernihiv, Ukraine", 102 | ], 103 | } 104 | markdown = ( 105 | "| Name | Type | Place of birth |\n" 106 | "|------------------|--------|--------------------|\n" 107 | "| Ines | human | Cologne, Germany |\n" 108 | "| Matt | human | Sydney, Australia |\n" 109 | "| Baikal | cat | Berlin, Germany |\n" 110 | "| Stanislav Petrov | cat | Chernihiv, Ukraine |\n" 111 | ) 112 | assert markdown in doc._.get(layout.attrs.doc_markdown) 113 | 114 | 115 | def test_table_index(nlp): 116 | layout = spaCyLayout(nlp) 117 | doc = layout(PDF_INDEX) 118 | assert len(doc._.get(layout.attrs.doc_tables)) == 3 119 | table = doc._.get(layout.attrs.doc_tables)[0] 120 | assert table.text == TABLE_PLACEHOLDER 121 | assert table.label_ == DocItemLabel.DOCUMENT_INDEX.value 122 | 123 | # Check that each document_index table has a dataframe 124 | document_index_tables = [span for span in doc._.get( 125 | layout.attrs.doc_tables) if span.label_ == DocItemLabel.DOCUMENT_INDEX.value] 126 | for table in document_index_tables: 127 | assert table._.data is not None, "Table data not available" 128 | assert isinstance( 129 | table._.data, pd.DataFrame), "Table data is not a DataFrame" 130 | 131 | 132 | def test_table_placeholder(nlp): 133 | def display_table(df): 134 | return f"Table with columns: {', '.join(df.columns.tolist())}" 135 | 136 | layout = spaCyLayout(nlp, display_table=display_table) 137 | doc = layout(PDF_TABLE) 138 | table = doc._.get(layout.attrs.doc_tables)[0] 139 | assert table.text == "Table with columns: Name, Type, Place of birth" 140 | 141 | 142 | @pytest.mark.parametrize( 143 | "box,page_height,expected", 144 | [ 145 | ( 146 | (200.0, 50.0, 100.0, 400.0, CoordOrigin.BOTTOMLEFT), 147 | 1000.0, 148 | (100.0, 800.0, 300.0, 150.0), 149 | ), 150 | ( 151 | (200.0, 250.0, 100.0, 400.0, CoordOrigin.TOPLEFT), 152 | 1000.0, 153 | (100.0, 200.0, 300.0, 50.0), 154 | ), 155 | ( 156 | ( 157 | 648.3192749023438, 158 | 633.4112548828125, 159 | 155.50897216796875, 160 | 239.66929626464844, 161 | CoordOrigin.BOTTOMLEFT, 162 | ), 163 | 792.0, 164 | ( 165 | 155.50897216796875, 166 | 143.68072509765625, 167 | 84.16032409667969, 168 | 14.90802001953125, 169 | ), 170 | ), 171 | ], 172 | ) 173 | def test_bounding_box(box, page_height, expected): 174 | top, bottom, left, right, origin = box 175 | bbox = BoundingBox(t=top, b=bottom, l=left, r=right, coord_origin=origin) 176 | assert get_bounding_box(bbox, page_height) == expected 177 | 178 | 179 | def test_serialize_objects(): 180 | span_layout = SpanLayout(x=10, y=20, width=30, height=40, page_no=1) 181 | doc_layout = DocLayout(pages=[PageLayout(page_no=1, width=500, height=600)]) 182 | bytes_data = srsly.msgpack_dumps({"span": span_layout, "doc": doc_layout}) 183 | data = srsly.msgpack_loads(bytes_data) 184 | assert isinstance(data, dict) 185 | assert data["span"] == span_layout 186 | assert data["doc"] == doc_layout 187 | df = DataFrame(data={"col1": [1, 2], "col2": [3, 4]}) 188 | bytes_data = srsly.msgpack_dumps({"df": df}) 189 | data = srsly.msgpack_loads(bytes_data) 190 | assert isinstance(data, dict) 191 | assert_frame_equal(df, data["df"]) 192 | 193 | 194 | @pytest.mark.parametrize("path", [PDF_SIMPLE, PDF_TABLE]) 195 | def test_serialize_roundtrip(path, nlp): 196 | layout = spaCyLayout(nlp) 197 | doc = layout(path) 198 | doc_bin = DocBin(store_user_data=True) 199 | doc_bin.add(doc) 200 | bytes_data = doc_bin.to_bytes() 201 | new_doc_bin = DocBin().from_bytes(bytes_data) 202 | new_doc = list(new_doc_bin.get_docs(nlp.vocab))[0] 203 | layout_spans = new_doc.spans[layout.attrs.span_group] 204 | assert len(layout_spans) == len(doc.spans[layout.attrs.span_group]) 205 | assert all( 206 | isinstance(span._.get(layout.attrs.span_layout), SpanLayout) 207 | for span in layout_spans 208 | ) 209 | assert isinstance(new_doc._.get(layout.attrs.doc_layout), DocLayout) 210 | tables = doc._.get(layout.attrs.doc_tables) 211 | new_tables = new_doc._.get(layout.attrs.doc_tables) 212 | for before, after in zip(tables, new_tables): 213 | table_before = before._.get(layout.attrs.span_data) 214 | table_after = after._.get(layout.attrs.span_data) 215 | assert_frame_equal(table_before, table_after) 216 | --------------------------------------------------------------------------------