├── .github
│   └── workflows
│       ├── publish.yml
│       └── test.yml
├── .gitignore
├── LICENSE
├── README.md
├── requirements.txt
├── setup.cfg
├── setup.py
├── spacy_layout
│   ├── __init__.py
│   ├── layout.py
│   ├── types.py
│   └── util.py
└── tests
    ├── data
    │   ├── simple.docx
    │   ├── simple.pdf
    │   ├── starcraft.pdf
    │   ├── table.pdf
    │   └── table_document_index.pdf
    └── test_general.py
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
1 | name: Upload Python Package
2 |
3 | on:
4 | release:
5 | types: [published]
6 | workflow_dispatch:
7 |
8 | permissions:
9 | contents: read
10 |
11 | jobs:
12 | deploy:
13 | runs-on: ubuntu-latest
14 | environment:
15 | name: pypi
16 | url: https://pypi.org/p/spacy-layout
17 | permissions:
18 | id-token: write
19 | contents: read
20 |
21 | steps:
22 | - uses: actions/checkout@v4
23 |
24 | - name: Set up Python
25 | uses: actions/setup-python@v3
26 | with:
27 | python-version: '3.10'
28 |
29 | - name: Install dependencies
30 | run: |
31 | python -m pip install --upgrade pip
32 | pip install build
33 |
34 | - name: Build package
35 | run: python -m build
36 |
37 | - name: Publish package
38 | uses: pypa/gh-action-pypi-publish@release/v1
39 |
--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
1 | name: Test
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | paths-ignore:
8 | - "*.md"
9 | pull_request:
10 | paths-ignore:
11 | - "*.md"
12 | workflow_dispatch:
13 |
14 | jobs:
15 | run:
16 | runs-on: ubuntu-latest
17 |
18 | steps:
19 | - uses: actions/checkout@v4
20 |
21 | - name: Set up Python
22 | uses: actions/setup-python@v5
23 | with:
24 | python-version: "3.10"
25 | cache: "pip" # caching pip dependencies
26 |
27 | - name: Install
28 | run: |
29 | python -m pip install -U pip
30 | pip install -e .
31 | pip install pytest
32 |
33 | - name: Run tests
34 | shell: bash
35 | run: |
36 | python -m pytest tests
37 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | .pytest_cache/
4 | .mypy_cache/
5 | *.py[cod]
6 | *$py.class
7 | py36-64/
8 | py35-64/
9 | env3/
10 | env3.9/
11 |
12 | # C extensions
13 | *.so
14 | .cython_src/
15 | cython_src/
16 |
17 | # Distribution / packaging
18 | .Python
19 | env/
20 | build/
21 | develop-eggs/
22 | dist/
23 | downloads/
24 | eggs/
25 | .eggs/
26 | lib/
27 | lib64/
28 | parts/
29 | sdist/
30 | var/
31 | *.egg-info/
32 | .installed.cfg
33 | *.egg
34 |
35 | # PyInstaller
36 | # Usually these files are written by a python script from a template
37 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
38 | *.manifest
39 | *.spec
40 |
41 | # Installer logs
42 | pip-log.txt
43 | pip-delete-this-directory.txt
44 |
45 | # Unit test / coverage reports
46 | htmlcov/
47 | .tox/
48 | .coverage
49 | .coverage.*
50 | .cache
51 | nosetests.xml
52 | coverage.xml
53 | *,cover
54 | .hypothesis/
55 |
56 | # Translations
57 | *.mo
58 | *.pot
59 |
60 | # Django stuff:
61 | *.log
62 | local_settings.py
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # IPython Notebook
78 | .ipynb_checkpoints
79 |
80 | # pyenv
81 | .python-version
82 |
83 | # celery beat schedule file
84 | celerybeat-schedule
85 |
86 | # dotenv
87 | .env*
88 | .~env
89 | .env-3.5.0
90 | .env-3.6.2
91 |
92 | # virtualenv
93 | venv/
94 | ENV/
95 | .linenv
96 |
97 | # Spyder project settings
98 | .spyderproject
99 |
100 | # Rope project settings
101 | .ropeproject
102 |
103 | # PyCharm project settings
104 | .idea
105 |
106 | # Node
107 | **/node_modules
108 |
109 | # Redis
110 | *.rdb
111 |
112 | /tmp
113 | .vscode
114 |
115 | # Vim
116 | *.swp
117 |
118 | # prodigy
119 | prodigy.json
120 |
121 | # cypress and e2e
122 | e2e_cy/testing_assets/test_*_output.jsonl
123 | e2e_cy/prodigy_home/prodigy_cypress.db
124 | app/cypress/videos
125 | app/cypress/screenshots
126 | app/coverage
127 |
128 | exports
129 | trash
130 |
131 | .DS_Store
132 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 ExplosionAI GmbH
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # spaCy Layout: Process PDFs, Word documents and more with spaCy
4 |
5 | This plugin integrates with [Docling](https://ds4sd.github.io/docling/) to bring structured processing of **PDFs**, **Word documents** and other input formats to your [spaCy](https://spacy.io) pipeline. It outputs clean, **structured data** in a text-based format and creates spaCy's familiar [`Doc`](https://spacy.io/api/doc) objects that let you access labelled text spans like sections or headings, and tables with their data converted to a `pandas.DataFrame`.
6 |
7 | This workflow makes it easy to apply powerful **NLP techniques** to your documents, including linguistic analysis, named entity recognition, text classification and more. It's also great for implementing **chunking for RAG** pipelines.
8 |
 9 | > 📖 **Blog post:** ["From PDFs to AI-ready structured data: a deep dive"](https://explosion.ai/blog/pdfs-nlp-structured-data) – A new modular workflow for converting PDFs and similar documents to structured data, featuring `spacy-layout` and Docling.
10 |
11 |
12 | [Tests](https://github.com/explosion/spacy-layout/actions/workflows/test.yml)
13 | [Releases](https://github.com/explosion/spacy-layout/releases)
14 | [PyPI package](https://pypi.org/project/spacy-layout/)
15 | [Built with spaCy](https://spacy.io)
16 |
17 | ## 📝 Usage
18 |
19 | > ⚠️ This package requires **Python 3.10** or above.
20 |
21 | ```bash
22 | pip install spacy-layout
23 | ```
24 |
25 | After initializing the `spaCyLayout` preprocessor with an `nlp` object for tokenization, you can call it on a document path to convert it to structured data. The resulting `Doc` object includes layout spans that map into the original raw text and expose various attributes, including the content type and layout features.
26 |
27 | ```python
28 | import spacy
29 | from spacy_layout import spaCyLayout
30 |
31 | nlp = spacy.blank("en")
32 | layout = spaCyLayout(nlp)
33 |
34 | # Process a document and create a spaCy Doc object
35 | doc = layout("./starcraft.pdf")
36 |
37 | # The text-based contents of the document
38 | print(doc.text)
39 | # Document layout including pages and page sizes
40 | print(doc._.layout)
41 | # Tables in the document and their extracted data
42 | print(doc._.tables)
43 | # Markdown representation of the document
44 | print(doc._.markdown)
45 |
46 | # Layout spans for different sections
47 | for span in doc.spans["layout"]:
48 | # Document section and token and character offsets into the text
49 | print(span.text, span.start, span.end, span.start_char, span.end_char)
50 | # Section type, e.g. "text", "title", "section_header" etc.
51 | print(span.label_)
52 | # Layout features of the section, including bounding box
53 | print(span._.layout)
54 | # Closest heading to the span (accuracy depends on document structure)
55 | print(span._.heading)
56 | ```
57 |
58 | If you need to process larger volumes of documents at scale, you can use the `spaCyLayout.pipe` method, which takes an iterable of paths or bytes instead and yields `Doc` objects:
59 |
60 | ```python
61 | paths = ["one.pdf", "two.pdf", "three.pdf", ...]
62 | for doc in layout.pipe(paths):
63 | print(doc._.layout)
64 | ```
65 |
66 | spaCy also allows you to call the `nlp` object on an already created `Doc`, so you can easily apply a pipeline of components for [linguistic analysis](https://spacy.io/usage/linguistic-features) or [named entity recognition](https://spacy.io/usage/linguistic-features#named-entities), use [rule-based matching](https://spacy.io/usage/rule-based-matching) or anything else you can do with spaCy.
67 |
68 | ```python
69 | # Load the transformer-based English pipeline
70 | # Installation: python -m spacy download en_core_web_trf
71 | nlp = spacy.load("en_core_web_trf")
72 | layout = spaCyLayout(nlp)
73 |
74 | doc = layout("./starcraft.pdf")
75 | # Apply the pipeline to access POS tags, dependencies, entities etc.
76 | doc = nlp(doc)
77 | ```
78 |
79 | ### Tables and tabular data
80 |
81 | Tables are included in the layout spans with the label `"table"` and under the shortcut `Doc._.tables`. They expose a `layout` extension attribute, as well as an attribute `data`, which includes the tabular data converted to a `pandas.DataFrame`.
82 |
83 | ```python
84 | for table in doc._.tables:
85 | # Token position and bounding box
86 | print(table.start, table.end, table._.layout)
87 | # pandas.DataFrame of contents
88 | print(table._.data)
89 | ```
90 |
91 | By default, the span text is a placeholder `TABLE`, but you can customize how a table is rendered by providing a `display_table` callback to `spaCyLayout`, which receives the `pandas.DataFrame` of the data. This allows you to include the table figures in the document text and use them later on, e.g. during information extraction with a trained named entity recognizer or text classifier.
92 |
93 | ```python
94 | def display_table(df: pd.DataFrame) -> str:
95 | return f"Table with columns: {', '.join(df.columns.tolist())}"
96 |
97 | layout = spaCyLayout(nlp, display_table=display_table)
98 | ```
99 |
100 | ### Serialization
101 |
102 | After you've processed the documents, you can [serialize](https://spacy.io/usage/saving-loading#docs) the structured `Doc` objects in spaCy's efficient binary format, so you don't have to re-run the resource-intensive conversion.
103 |
104 | ```python
105 | from spacy.tokens import DocBin
106 |
107 | docs = layout.pipe(["one.pdf", "two.pdf", "three.pdf"])
108 | doc_bin = DocBin(docs=docs, store_user_data=True)
109 | doc_bin.to_disk("./file.spacy")
110 | ```
111 |
112 | > ⚠️ **Note on deserializing with extension attributes:** The custom extension attributes like `Doc._.layout` are currently registered when `spaCyLayout` is initialized. So if you're loading back `Doc` objects with layout information from a binary file, you'll need to initialize `spaCyLayout` first so the custom attributes can be repopulated. We're planning on making this more elegant in an upcoming version.
113 | >
114 | > ```diff
115 | > + layout = spaCyLayout(nlp)
116 | > doc_bin = DocBin(store_user_data=True).from_disk("./file.spacy")
117 | > docs = list(doc_bin.get_docs(nlp.vocab))
118 | > ```
119 |
120 |
121 | ## 🎛️ API
122 |
123 | ### Data and extension attributes
124 |
125 | ```python
126 | layout = spaCyLayout(nlp)
127 | doc = layout("./starcraft.pdf")
128 | print(doc._.layout)
129 | for span in doc.spans["layout"]:
130 | print(span.label_, span._.layout)
131 | ```
132 |
133 | | Attribute | Type | Description |
134 | | --- | --- | --- |
135 | | `Doc._.layout` | `DocLayout` | Layout features of the document. |
136 | | `Doc._.pages` | `list[tuple[PageLayout, list[Span]]]` | Pages in the document and the spans they contain. |
137 | | `Doc._.tables` | `list[Span]` | All tables in the document. |
138 | | `Doc._.markdown` | `str` | Markdown representation of the document. |
139 | | `Doc.spans["layout"]` | `spacy.tokens.SpanGroup` | The layout spans in the document. |
140 | | `Span.label_` | `str` | The type of the extracted layout span, e.g. `"text"` or `"section_header"`. [See here](https://github.com/DS4SD/docling-core/blob/14cad33ae7f8dc011a79dd364361d2647c635466/docling_core/types/doc/labels.py) for options. |
141 | | `Span.label` | `int` | The integer ID of the span label. |
142 | | `Span.id` | `int` | Running index of layout span. |
143 | | `Span._.layout` | `SpanLayout \| None` | Layout features of a layout span. |
144 | | `Span._.heading` | `Span \| None` | Closest heading to a span, if available. |
145 | | `Span._.data` | `pandas.DataFrame \| None` | The extracted data for table spans. |
146 |
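For example, the `Doc._.pages` shortcut lets you group the layout spans by page. A minimal sketch, reusing the `doc` from the snippets above:

```python
# Iterate over the pages and the layout spans they contain
for page_layout, spans in doc._.pages:
    print(f"Page {page_layout.page_no}: {page_layout.width} x {page_layout.height}")
    for span in spans:
        print(f"  {span.label_}: {span.text[:40]!r}")
```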
147 | ### dataclass PageLayout
148 |
149 | | Attribute | Type | Description |
150 | | --- | --- | --- |
151 | | `page_no` | `int` | The page number (1-indexed). |
152 | | `width` | `float` | Page width in pixels. |
153 | | `height` | `float` | Page height in pixels. |
154 |
155 | ### dataclass DocLayout
156 |
157 | | Attribute | Type | Description |
158 | | --- | --- | --- |
159 | | `pages` | `list[PageLayout]` | The pages in the document. |
160 |
161 | ### dataclass SpanLayout
162 |
163 | | Attribute | Type | Description |
164 | | --- | --- | --- |
165 | | `x` | `float` | Horizontal offset of the bounding box in pixels. |
166 | | `y` | `float` | Vertical offset of the bounding box in pixels. |
167 | | `width` | `float` | Width of the bounding box in pixels. |
168 | | `height` | `float` | Height of the bounding box in pixels. |
169 | | `page_no` | `int` | The number of the page the span is on. |
170 |
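Together, these fields describe the span's bounding box on its page, with the offsets measured from the top-left corner. A minimal sketch that prints the centre of each span's box, reusing the `doc` from above:

```python
# Compute the centre point of each layout span's bounding box
for span in doc.spans["layout"]:
    box = span._.layout
    if box is None:
        continue
    center = (box.x + box.width / 2, box.y + box.height / 2)
    print(span.label_, box.page_no, center)
```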
171 | ### class `spaCyLayout`
172 |
173 | #### method `spaCyLayout.__init__`
174 |
175 | Initialize the document processor.
176 |
177 | ```python
178 | nlp = spacy.blank("en")
179 | layout = spaCyLayout(nlp)
180 | ```
181 |
182 | | Argument | Type | Description |
183 | | --- | --- | --- |
184 | | `nlp` | `spacy.language.Language` | The initialized `nlp` object to use for tokenization. |
185 | | `separator` | `str` | Token used to separate sections in the created `Doc` object. The separator won't be part of the layout span. If `None`, no separator will be added. Defaults to `"\n\n"`. |
186 | | `attrs` | `dict[str, str]` | Override the custom spaCy attributes. Can include `"doc_layout"`, `"doc_pages"`, `"doc_tables"`, `"doc_markdown"`, `"span_layout"`, `"span_data"`, `"span_heading"` and `"span_group"`. |
187 | | `headings` | `list[str]` | Labels of headings to consider for `Span._.heading` detection. Defaults to `["section_header", "page_header", "title"]`. |
188 | | `display_table` | `Callable[[pandas.DataFrame], str] \| str` | Function to generate the text-based representation of the table in the `Doc.text` or placeholder text. Defaults to `"TABLE"`. |
189 | | `docling_options` | `dict[InputFormat, FormatOption]` | [Format options](https://ds4sd.github.io/docling/usage/#advanced-options) passed to Docling's `DocumentConverter`. |
190 | | **RETURNS** | `spaCyLayout` | The initialized object. |
191 |
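For instance, here's a sketch that overrides two of the extension attribute names and restricts heading detection to section headers and titles. The names `layout_tables` and `parent_heading` are purely illustrative:

```python
layout = spaCyLayout(
    nlp,
    attrs={"doc_tables": "layout_tables", "span_heading": "parent_heading"},
    headings=["section_header", "title"],
)
doc = layout("./starcraft.pdf")
print(doc._.layout_tables)                      # instead of doc._.tables
print(doc.spans["layout"][0]._.parent_heading)  # instead of span._.heading
```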
192 | #### method `spaCyLayout.__call__`
193 |
194 | Process a document and create a spaCy [`Doc`](https://spacy.io/api/doc) object containing the text content and layout spans, available via `Doc.spans["layout"]` by default.
195 |
196 | ```python
197 | layout = spaCyLayout(nlp)
198 | doc = layout("./starcraft.pdf")
199 | ```
200 |
201 | | Argument | Type | Description |
202 | | --- | --- | --- |
203 | | `source` | `str \| Path \| bytes \| DoclingDocument` | Path of the document to process, document bytes, or an already created `DoclingDocument`. |
204 | | **RETURNS** | `Doc` | The processed spaCy `Doc` object. |
205 |
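Because `source` can also be raw bytes, in-memory documents work as well. A minimal sketch:

```python
from pathlib import Path

pdf_bytes = Path("./starcraft.pdf").read_bytes()
doc = layout(pdf_bytes)
```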
206 | #### method `spaCyLayout.pipe`
207 |
208 | Process multiple documents and create spaCy [`Doc`](https://spacy.io/api/doc) objects. You should use this method if you're processing larger volumes of documents at scale. The behavior of `as_tuples` works like it does in spaCy's [`Language.pipe`](https://spacy.io/api/language#pipe).
209 |
210 | ```python
211 | layout = spaCyLayout(nlp)
212 | paths = ["one.pdf", "two.pdf", "three.pdf", ...]
213 | docs = layout.pipe(paths)
214 | ```
215 |
216 | ```python
217 | sources = [("one.pdf", {"id": 1}), ("two.pdf", {"id": 2})]
218 | for doc, context in layout.pipe(sources, as_tuples=True):
219 | ...
220 | ```
221 |
222 | | Argument | Type | Description |
223 | | --- | --- | --- |
224 | | `sources` | `Iterable[str \| Path \| bytes] \| Iterable[tuple[str \| Path \| bytes, Any]]` | Paths or bytes of the documents to process, or `(source, context)` tuples if `as_tuples` is set to `True`. |
225 | | `as_tuples` | `bool` | If set to `True`, inputs should be an iterable of `(source, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. |
226 | | **YIELDS** | `Doc \| tuple[Doc, Any]` | The processed spaCy `Doc` objects or `(doc, context)` tuples if `as_tuples` is set to `True`. |
227 |
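A common pattern is to chain `layout.pipe` with `nlp.pipe`, so the converted `Doc` objects are passed straight on to the rest of the pipeline. A sketch, assuming a trained pipeline such as `en_core_web_sm` is installed:

```python
import spacy
from spacy_layout import spaCyLayout

nlp = spacy.load("en_core_web_sm")
layout = spaCyLayout(nlp)

paths = ["one.pdf", "two.pdf", "three.pdf"]
# layout.pipe yields Doc objects, which nlp.pipe accepts directly
for doc in nlp.pipe(layout.pipe(paths)):
    print([(ent.text, ent.label_) for ent in doc.ents])
```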
228 | ## 💡 Examples and code snippets
229 |
230 | This section includes further examples of what you can do with `spacy-layout`. If you have an example that could be a good fit, feel free to submit a [pull request](https://github.com/explosion/spacy-layout/pulls)!
231 |
232 | ### Visualize a page and bounding boxes with matplotlib
233 |
234 | ```python
235 | import pypdfium2 as pdfium
236 | import matplotlib.pyplot as plt
237 | from matplotlib.patches import Rectangle
238 | import spacy
239 | from spacy_layout import spaCyLayout
240 |
241 | DOCUMENT_PATH = "./document.pdf"
242 |
243 | # Load and convert the PDF page to an image
244 | pdf = pdfium.PdfDocument(DOCUMENT_PATH)
245 | page_image = pdf[2].render(scale=1) # get page 3 (index 2)
246 | numpy_array = page_image.to_numpy()
247 | # Process document with spaCy
248 | nlp = spacy.blank("en")
249 | layout = spaCyLayout(nlp)
250 | doc = layout(DOCUMENT_PATH)
251 |
252 | # Get page 3 layout and sections
253 | page = doc._.pages[2]
254 | page_layout = doc._.layout.pages[2]
255 | # Create figure and axis with page dimensions
256 | fig, ax = plt.subplots(figsize=(12, 16))
257 | # Display the PDF image
258 | ax.imshow(numpy_array)
259 | # Add rectangles for each section's bounding box
260 | for section in page[1]:
261 | # Create rectangle patch
262 | rect = Rectangle(
263 | (section._.layout.x, section._.layout.y),
264 | section._.layout.width,
265 | section._.layout.height,
266 | fill=False,
267 | color="blue",
268 | linewidth=1,
269 | alpha=0.5
270 | )
271 | ax.add_patch(rect)
272 | # Add text label at top of box
273 | ax.text(
274 | section._.layout.x,
275 | section._.layout.y,
276 | section.label_,
277 | fontsize=8,
278 | color="red",
279 | verticalalignment="bottom"
280 | )
281 |
282 | ax.axis("off") # hide axes
283 | plt.show()
284 | ```
285 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | spacy>=3.7.5
2 | docling>=2.5.2
3 | pandas # version range set by Docling
4 | srsly # version range set by spaCy
5 | # Dev requirements
6 | pytest
7 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | version = 0.0.12
3 | description = Use spaCy with PDFs, Word docs and other documents
4 | url = https://github.com/explosion/spacy-layout
5 | author = Explosion
6 | author_email = contact@explosion.ai
7 | license = MIT
8 | long_description = file: README.md
9 | long_description_content_type = text/markdown
10 | classifiers =
11 | Development Status :: 4 - Beta
12 | Environment :: Console
13 | Intended Audience :: Developers
14 | Intended Audience :: Science/Research
15 | License :: OSI Approved :: MIT License
16 | Operating System :: POSIX :: Linux
17 | Operating System :: MacOS :: MacOS X
18 | Operating System :: Microsoft :: Windows
19 | Programming Language :: Python :: 3
20 | Programming Language :: Python :: 3.10
21 | Programming Language :: Python :: 3.11
22 | Programming Language :: Python :: 3.12
23 | Topic :: Scientific/Engineering
24 | project_urls =
25 | Release notes = https://github.com/explosion/spacy-layout/releases
26 | Source = https://github.com/explosion/spacy-layout
27 |
28 | [options]
29 | zip_safe = true
30 | python_requires = >=3.10
31 | install_requires =
32 | spacy>=3.7.5
33 | docling>=2.5.2
34 | pandas # version range set by Docling
35 | srsly # version range set by spaCy
36 |
37 | [bdist_wheel]
38 | universal = true
39 |
40 | [sdist]
41 | formats = gztar
42 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | if __name__ == "__main__":
4 | from setuptools import find_packages, setup
5 |
6 | setup(name="spacy_layout", packages=find_packages())
7 |
--------------------------------------------------------------------------------
/spacy_layout/__init__.py:
--------------------------------------------------------------------------------
1 | from .layout import spaCyLayout
2 |
3 | __all__ = ["spaCyLayout"]
4 |
--------------------------------------------------------------------------------
/spacy_layout/layout.py:
--------------------------------------------------------------------------------
1 | from io import BytesIO
2 | from pathlib import Path
3 | from typing import (
4 | TYPE_CHECKING,
5 | Callable,
6 | Iterable,
7 | Iterator,
8 | Literal,
9 | TypeVar,
10 | cast,
11 | overload,
12 | )
13 |
14 | import srsly
15 | from docling.datamodel.base_models import DocumentStream
16 | from docling.document_converter import DocumentConverter
17 | from docling_core.types.doc.document import DoclingDocument
18 | from docling_core.types.doc.labels import DocItemLabel
19 | from spacy.tokens import Doc, Span, SpanGroup
20 |
21 | from .types import Attrs, DocLayout, DoclingItem, PageLayout, SpanLayout
22 | from .util import decode_df, decode_obj, encode_df, encode_obj, get_bounding_box
23 |
24 | if TYPE_CHECKING:
25 | from docling.datamodel.base_models import InputFormat
26 | from docling.document_converter import FormatOption
27 | from pandas import DataFrame
28 | from spacy.language import Language
29 |
30 | # Type variable for contexts piped with documents
31 | _AnyContext = TypeVar("_AnyContext")
32 |
33 | TABLE_PLACEHOLDER = "TABLE"
34 | TABLE_ITEM_LABELS = [DocItemLabel.TABLE, DocItemLabel.DOCUMENT_INDEX]
35 |
36 | # Register msgpack encoders and decoders for custom types
37 | srsly.msgpack_encoders.register("spacy-layout.dataclass", func=encode_obj)
38 | srsly.msgpack_decoders.register("spacy-layout.dataclass", func=decode_obj)
39 | srsly.msgpack_encoders.register("spacy-layout.dataframe", func=encode_df)
40 | srsly.msgpack_decoders.register("spacy-layout.dataframe", func=decode_df)
41 |
42 |
43 | class spaCyLayout:
44 | def __init__(
45 | self,
46 | nlp: "Language",
47 | separator: str | None = "\n\n",
48 | attrs: dict[str, str] = {},
49 | headings: list[str] = [
50 | DocItemLabel.SECTION_HEADER,
51 | DocItemLabel.PAGE_HEADER,
52 | DocItemLabel.TITLE,
53 | ],
54 | display_table: Callable[["DataFrame"], str] | str = TABLE_PLACEHOLDER,
55 | docling_options: dict["InputFormat", "FormatOption"] | None = None,
56 | ) -> None:
57 | """Initialize the layout parser and Docling converter."""
58 | self.nlp = nlp
59 | self.sep = separator
60 | self.attrs = Attrs(
61 | doc_layout=attrs.get("doc_layout", "layout"),
62 | doc_pages=attrs.get("doc_pages", "pages"),
63 | doc_tables=attrs.get("doc_tables", "tables"),
64 | doc_markdown=attrs.get("doc_markdown", "markdown"),
65 | span_layout=attrs.get("span_layout", "layout"),
66 | span_heading=attrs.get("span_heading", "heading"),
67 | span_data=attrs.get("span_data", "data"),
68 | span_group=attrs.get("span_group", "layout"),
69 | )
70 | self.headings = headings
71 | self.display_table = display_table
72 | self.converter = DocumentConverter(format_options=docling_options)
73 | # Set spaCy extension attributes for custom data
74 | Doc.set_extension(self.attrs.doc_layout, default=None, force=True)
75 | Doc.set_extension(self.attrs.doc_pages, getter=self.get_pages, force=True)
76 | Doc.set_extension(self.attrs.doc_tables, getter=self.get_tables, force=True)
77 | Doc.set_extension(self.attrs.doc_markdown, default=None, force=True)
78 | Span.set_extension(self.attrs.span_layout, default=None, force=True)
79 | Span.set_extension(self.attrs.span_data, default=None, force=True)
80 | Span.set_extension(self.attrs.span_heading, getter=self.get_heading, force=True)
81 |
82 | def __call__(self, source: str | Path | bytes | DoclingDocument) -> Doc:
83 | """Call parser on a path to create a spaCy Doc object."""
84 | if isinstance(source, DoclingDocument):
85 | result = source
86 | else:
87 | result = self.converter.convert(self._get_source(source)).document
88 | return self._result_to_doc(result)
89 |
90 | @overload
91 | def pipe(
92 | self,
93 | sources: Iterable[str | Path | bytes],
94 | as_tuples: Literal[False] = ...,
95 | ) -> Iterator[Doc]: ...
96 |
97 | @overload
98 | def pipe(
99 | self,
100 | sources: Iterable[tuple[str | Path | bytes, _AnyContext]],
101 | as_tuples: Literal[True] = ...,
102 | ) -> Iterator[tuple[Doc, _AnyContext]]: ...
103 |
104 | def pipe(
105 | self,
106 | sources: (
107 | Iterable[str | Path | bytes]
108 | | Iterable[tuple[str | Path | bytes, _AnyContext]]
109 | ),
110 | as_tuples: bool = False,
111 | ) -> Iterator[Doc] | Iterator[tuple[Doc, _AnyContext]]:
112 | """Process multiple documents and create spaCy Doc objects."""
113 | if as_tuples:
114 | sources = cast(Iterable[tuple[str | Path | bytes, _AnyContext]], sources)
115 | data = (self._get_source(source) for source, _ in sources)
116 | contexts = (context for _, context in sources)
117 | results = self.converter.convert_all(data)
118 | for result, context in zip(results, contexts):
119 | yield (self._result_to_doc(result.document), context)
120 | else:
121 | sources = cast(Iterable[str | Path | bytes], sources)
122 | data = (self._get_source(source) for source in sources)
123 | results = self.converter.convert_all(data)
124 | for result in results:
125 | yield self._result_to_doc(result.document)
126 |
127 | def _get_source(self, source: str | Path | bytes) -> str | Path | DocumentStream:
128 | if isinstance(source, (str, Path)):
129 | return source
130 | return DocumentStream(name="source", stream=BytesIO(source))
131 |
132 | def _result_to_doc(self, document: DoclingDocument) -> Doc:
133 | inputs = []
134 | pages = {
135 | (page.page_no): PageLayout(
136 | page_no=page.page_no,
137 | width=page.size.width if page.size else 0,
138 | height=page.size.height if page.size else 0,
139 | )
140 | for _, page in document.pages.items()
141 | }
142 | text_items = {item.self_ref: item for item in document.texts}
143 | table_items = {item.self_ref: item for item in document.tables}
144 | # We want to iterate over the tree to get different elements in order
145 | for node, _ in document.iterate_items():
146 | if node.self_ref in text_items:
147 | item = text_items[node.self_ref]
148 | if item.text == "":
149 | continue
150 | inputs.append((item.text, item))
151 | elif node.self_ref in table_items:
152 | item = table_items[node.self_ref]
153 | if isinstance(self.display_table, str):
154 | table_text = self.display_table
155 | else:
156 | table_text = self.display_table(item.export_to_dataframe())
157 | inputs.append((table_text, item))
158 | doc = self._texts_to_doc(inputs, pages)
159 | doc._.set(self.attrs.doc_layout, DocLayout(pages=[p for p in pages.values()]))
160 | doc._.set(self.attrs.doc_markdown, document.export_to_markdown())
161 | return doc
162 |
163 | def _texts_to_doc(
164 | self, inputs: list[tuple[str, DoclingItem]], pages: dict[int, PageLayout]
165 | ) -> Doc:
166 | """Convert Docling structure to spaCy Doc."""
167 | words = []
168 | spaces = []
169 | span_data = []
170 | token_idx = 0
171 | # Tokenize the span because we can't rely on the document parsing to
172 | # give us items that are not split across token boundaries
173 | with self.nlp.select_pipes(disable=self.nlp.pipe_names):
174 | for span_doc, item in self.nlp.pipe(inputs, as_tuples=True):
175 | words += [token.text for token in span_doc]
176 | spaces += [bool(token.whitespace_) for token in span_doc]
177 | # Add separator token and don't include it in the layout span
178 | if self.sep:
179 | words.append(self.sep)
180 | spaces[-1] = False
181 | spaces.append(False)
182 | end = token_idx + len(span_doc)
183 | span_data.append((item, token_idx, end))
184 | token_idx += len(span_doc) + (1 if self.sep else 0)
185 | doc = Doc(self.nlp.vocab, words=words, spaces=spaces)
186 | spans = []
187 | for i, (item, start, end) in enumerate(span_data):
188 | span = Span(doc, start=start, end=end, label=item.label, span_id=i)
189 | layout = self._get_span_layout(item, pages)
190 | span._.set(self.attrs.span_layout, layout)
191 | if item.label in TABLE_ITEM_LABELS:
192 | span._.set(self.attrs.span_data, item.export_to_dataframe())
193 | spans.append(span)
194 | doc.spans[self.attrs.span_group] = SpanGroup(
195 | doc, name=self.attrs.span_group, spans=spans
196 | )
197 | return doc
198 |
199 | def _get_span_layout(
200 | self, item: DoclingItem, pages: dict[int, PageLayout]
201 | ) -> SpanLayout | None:
202 | if item.prov:
203 | prov = item.prov[0]
204 | page = pages[prov.page_no]
205 | if page.width and page.height:
206 | x, y, width, height = get_bounding_box(prov.bbox, page.height)
207 | return SpanLayout(
208 | x=x, y=y, width=width, height=height, page_no=prov.page_no
209 | )
210 |
211 | def get_pages(self, doc: Doc) -> list[tuple[PageLayout, list[Span]]]:
212 | """Get all pages and their layout spans."""
213 | layout = doc._.get(self.attrs.doc_layout)
214 | pages = {page.page_no: page for page in layout.pages}
215 | page_spans = {page.page_no: [] for page in layout.pages}
216 | for span in doc.spans[self.attrs.span_group]:
217 | span_layout = span._.get(self.attrs.span_layout)
218 | page_spans[span_layout.page_no].append(span)
219 | return [(pages[i], page_spans[i]) for i in page_spans]
220 |
221 | def get_heading(self, span: Span) -> Span | None:
222 | """Get the closest heading for a span."""
223 | spans = list(span.doc.spans[self.attrs.span_group])
224 | if span.label_ not in self.headings:
225 | # Go through previous layout spans in reverse and find first match
226 | for candidate in spans[: span.id][::-1]:
227 | if candidate.label_ in self.headings:
228 | return candidate
229 |
230 | def get_tables(self, doc: Doc) -> list[Span]:
231 | """Get all tables in the document."""
232 | return [
233 | span
234 | for span in doc.spans[self.attrs.span_group]
235 | if span.label_ in TABLE_ITEM_LABELS
236 | ]
237 |
--------------------------------------------------------------------------------
/spacy_layout/types.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 |
3 | from docling_core.types.doc.document import (
4 | ListItem,
5 | SectionHeaderItem,
6 | TableItem,
7 | TextItem,
8 | )
9 |
10 | DoclingItem = ListItem | SectionHeaderItem | TextItem | TableItem
11 |
12 |
13 | @dataclass
14 | class Attrs:
15 | """Custom atributes used to extend spaCy"""
16 |
17 | doc_layout: str
18 | doc_pages: str
19 | doc_tables: str
20 | doc_markdown: str
21 | span_layout: str
22 | span_data: str
23 | span_heading: str
24 | span_group: str
25 |
26 |
27 | @dataclass
28 | class PageLayout:
29 | page_no: int
30 | width: float
31 | height: float
32 |
33 | @classmethod
34 | def from_dict(cls, data: dict) -> "PageLayout":
35 | return cls(**data)
36 |
37 |
38 | @dataclass
39 | class DocLayout:
40 | """Document layout features added to Doc object"""
41 |
42 | pages: list[PageLayout]
43 |
44 | @classmethod
45 | def from_dict(cls, data: dict) -> "DocLayout":
46 | pages = [PageLayout.from_dict(page) for page in data.get("pages", [])]
47 | return cls(pages=pages)
48 |
49 |
50 | @dataclass
51 | class SpanLayout:
52 | """Text span layout features added to Span object"""
53 |
54 | x: float
55 | y: float
56 | width: float
57 | height: float
58 | page_no: int
59 |
60 | @classmethod
61 | def from_dict(cls, data: dict) -> "SpanLayout":
62 | return cls(**data)
63 |
--------------------------------------------------------------------------------
/spacy_layout/util.py:
--------------------------------------------------------------------------------
1 | import dataclasses
2 | from typing import TYPE_CHECKING, Any, Callable
3 |
4 | from docling_core.types.doc.base import CoordOrigin
5 | from pandas import DataFrame
6 |
7 | from .types import DocLayout, PageLayout, SpanLayout
8 |
9 | if TYPE_CHECKING:
10 | from docling_core.types.doc.base import BoundingBox
11 |
12 | TYPE_ATTR = "__type__"
13 | OBJ_TYPES = {"SpanLayout": SpanLayout, "DocLayout": DocLayout, "PageLayout": PageLayout}
14 |
15 |
16 | def encode_obj(obj: Any, chain: Callable | None = None) -> Any:
17 | """Convert custom dataclass to dict for serialization."""
18 | if isinstance(obj, tuple(OBJ_TYPES.values())):
19 | result = dataclasses.asdict(obj)
20 | result[TYPE_ATTR] = type(obj).__name__
21 | return result
22 | return obj if chain is None else chain(obj)
23 |
24 |
25 | def decode_obj(obj: Any, chain: Callable | None = None) -> Any:
26 | """Load custom dataclass from serialized dict."""
27 | if isinstance(obj, dict) and obj.get(TYPE_ATTR) in OBJ_TYPES:
28 | obj_type = obj.pop(TYPE_ATTR)
29 | return OBJ_TYPES[obj_type].from_dict(obj)
30 | return obj if chain is None else chain(obj)
31 |
32 |
33 | def encode_df(obj: Any, chain: Callable | None = None) -> Any:
34 | """Convert pandas.DataFrame for serialization."""
35 | if isinstance(obj, DataFrame):
36 | return {"data": obj.to_dict(), TYPE_ATTR: "DataFrame"}
37 | return obj if chain is None else chain(obj)
38 |
39 |
40 | def decode_df(obj: Any, chain: Callable | None = None) -> Any:
41 | """Load pandas.DataFrame from serialized data."""
42 | if isinstance(obj, dict) and obj.get(TYPE_ATTR) == "DataFrame":
43 | return DataFrame(obj["data"])
44 | return obj if chain is None else chain(obj)
45 |
46 |
47 | def get_bounding_box(
48 | bbox: "BoundingBox", page_height: float
49 | ) -> tuple[float, float, float, float]:
50 | is_bottom = bbox.coord_origin == CoordOrigin.BOTTOMLEFT
51 | y = page_height - bbox.t if is_bottom else bbox.t
52 | height = bbox.t - bbox.b if is_bottom else bbox.b - bbox.t
53 | width = bbox.r - bbox.l
54 | return (bbox.l, y, width, height)
55 |
--------------------------------------------------------------------------------
/tests/data/simple.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/explosion/spacy-layout/1e6a51ac8abc6389b99f30981b0b31ab49caf5fb/tests/data/simple.docx
--------------------------------------------------------------------------------
/tests/data/simple.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/explosion/spacy-layout/1e6a51ac8abc6389b99f30981b0b31ab49caf5fb/tests/data/simple.pdf
--------------------------------------------------------------------------------
/tests/data/starcraft.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/explosion/spacy-layout/1e6a51ac8abc6389b99f30981b0b31ab49caf5fb/tests/data/starcraft.pdf
--------------------------------------------------------------------------------
/tests/data/table.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/explosion/spacy-layout/1e6a51ac8abc6389b99f30981b0b31ab49caf5fb/tests/data/table.pdf
--------------------------------------------------------------------------------
/tests/data/table_document_index.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/explosion/spacy-layout/1e6a51ac8abc6389b99f30981b0b31ab49caf5fb/tests/data/table_document_index.pdf
--------------------------------------------------------------------------------
/tests/test_general.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 |
3 | import pytest
4 | import spacy
5 | import srsly
6 | from docling_core.types.doc.base import BoundingBox, CoordOrigin
7 | from docling_core.types.doc.labels import DocItemLabel
8 | from pandas import DataFrame
9 | from pandas.testing import assert_frame_equal
10 | from spacy.tokens import DocBin
11 | import pandas as pd
12 |
13 | from spacy_layout import spaCyLayout
14 | from spacy_layout.layout import TABLE_PLACEHOLDER, get_bounding_box
15 | from spacy_layout.types import DocLayout, PageLayout, SpanLayout
16 |
17 | PDF_STARCRAFT = Path(__file__).parent / "data" / "starcraft.pdf"
18 | PDF_SIMPLE = Path(__file__).parent / "data" / "simple.pdf"
19 | DOCX_SIMPLE = Path(__file__).parent / "data" / "simple.docx"
20 | PDF_SIMPLE_BYTES = PDF_SIMPLE.read_bytes()
21 | PDF_TABLE = Path(__file__).parent / "data" / "table.pdf"
22 | PDF_INDEX = Path(__file__).parent / "data" / "table_document_index.pdf"
23 |
24 |
25 | @pytest.fixture
26 | def nlp():
27 | return spacy.blank("en")
28 |
29 |
30 | @pytest.fixture
31 | def span_labels():
32 | return [label.value for label in DocItemLabel]
33 |
34 |
35 | @pytest.mark.parametrize("path", [PDF_STARCRAFT, PDF_SIMPLE, PDF_SIMPLE_BYTES])
36 | def test_general(path, nlp, span_labels):
37 | layout = spaCyLayout(nlp)
38 | doc = layout(path)
39 | assert isinstance(doc._.get(layout.attrs.doc_layout), DocLayout)
40 | for span in doc.spans[layout.attrs.span_group]:
41 | assert span.text
42 | assert span.label_ in span_labels
43 | assert isinstance(span._.get(layout.attrs.span_layout), SpanLayout)
44 |
45 |
46 | @pytest.mark.parametrize("path, pg_no", [(PDF_STARCRAFT, 6), (PDF_SIMPLE, 1)])
47 | def test_pages(path, pg_no, nlp):
48 | layout = spaCyLayout(nlp)
49 | doc = layout(path)
50 | # This should not raise a KeyError when accessing the pages dict:
51 | # a KeyError would mean mismatched pagination between the document layout and the span layouts
52 | result = layout.get_pages(doc)
53 | assert len(result) == pg_no
54 | assert result[0][0].page_no == 1
55 | if pg_no == 6:  # there should be 16 or 18 spans on page 1
56 | assert len(result[0][1]) in (16, 18)
57 | elif pg_no == 1:  # there should be 4 spans on page 1
58 | assert len(result[0][1]) == 4
59 |
60 |
61 | @pytest.mark.parametrize("path", [PDF_SIMPLE, DOCX_SIMPLE])
62 | @pytest.mark.parametrize("separator", ["\n\n", ""])
63 | def test_simple(path, separator, nlp):
64 | layout = spaCyLayout(nlp, separator=separator)
65 | doc = layout(path)
66 | assert len(doc.spans[layout.attrs.span_group]) == 4
67 | assert doc.text.startswith(f"Lorem ipsum dolor sit amet{separator}")
68 | assert doc.spans[layout.attrs.span_group][0].text == "Lorem ipsum dolor sit amet"
69 |
70 |
71 | def test_simple_pipe(nlp):
72 | layout = spaCyLayout(nlp)
73 | for doc in layout.pipe([PDF_SIMPLE, DOCX_SIMPLE]):
74 | assert len(doc.spans[layout.attrs.span_group]) == 4
75 |
76 |
77 | def test_simple_pipe_as_tuples(nlp):
78 | layout = spaCyLayout(nlp)
79 | data = [(PDF_SIMPLE, "pdf"), (DOCX_SIMPLE, "docx")]
80 | result = list(layout.pipe(data, as_tuples=True))
81 | for doc, _ in result:
82 | assert len(doc.spans[layout.attrs.span_group]) == 4
83 | assert [context for _, context in result] == ["pdf", "docx"]
84 |
85 |
86 | def test_table(nlp):
87 | layout = spaCyLayout(nlp)
88 | doc = layout(PDF_TABLE)
89 | assert len(doc._.get(layout.attrs.doc_tables)) == 1
90 | table = doc._.get(layout.attrs.doc_tables)[0]
91 | assert table.text == TABLE_PLACEHOLDER
92 | df = table._.get(layout.attrs.span_data)
93 | assert df.columns.tolist() == ["Name", "Type", "Place of birth"]
94 | assert df.to_dict(orient="list") == {
95 | "Name": ["Ines", "Matt", "Baikal", "Stanislav Petrov"],
96 | "Type": ["human", "human", "cat", "cat"],
97 | "Place of birth": [
98 | "Cologne, Germany",
99 | "Sydney, Australia",
100 | "Berlin, Germany",
101 | "Chernihiv, Ukraine",
102 | ],
103 | }
104 | markdown = (
105 | "| Name | Type | Place of birth |\n"
106 | "|------------------|--------|--------------------|\n"
107 | "| Ines | human | Cologne, Germany |\n"
108 | "| Matt | human | Sydney, Australia |\n"
109 | "| Baikal | cat | Berlin, Germany |\n"
110 | "| Stanislav Petrov | cat | Chernihiv, Ukraine |\n"
111 | )
112 | assert markdown in doc._.get(layout.attrs.doc_markdown)
113 |
114 |
115 | def test_table_index(nlp):
116 | layout = spaCyLayout(nlp)
117 | doc = layout(PDF_INDEX)
118 | assert len(doc._.get(layout.attrs.doc_tables)) == 3
119 | table = doc._.get(layout.attrs.doc_tables)[0]
120 | assert table.text == TABLE_PLACEHOLDER
121 | assert table.label_ == DocItemLabel.DOCUMENT_INDEX.value
122 |
123 | # Check that each document_index table has a dataframe
124 | document_index_tables = [span for span in doc._.get(
125 | layout.attrs.doc_tables) if span.label_ == DocItemLabel.DOCUMENT_INDEX.value]
126 | for table in document_index_tables:
127 | assert table._.data is not None, "Table data not available"
128 | assert isinstance(
129 | table._.data, pd.DataFrame), "Table data is not a DataFrame"
130 |
131 |
132 | def test_table_placeholder(nlp):
133 | def display_table(df):
134 | return f"Table with columns: {', '.join(df.columns.tolist())}"
135 |
136 | layout = spaCyLayout(nlp, display_table=display_table)
137 | doc = layout(PDF_TABLE)
138 | table = doc._.get(layout.attrs.doc_tables)[0]
139 | assert table.text == "Table with columns: Name, Type, Place of birth"
140 |
141 |
142 | @pytest.mark.parametrize(
143 | "box,page_height,expected",
144 | [
145 | (
146 | (200.0, 50.0, 100.0, 400.0, CoordOrigin.BOTTOMLEFT),
147 | 1000.0,
148 | (100.0, 800.0, 300.0, 150.0),
149 | ),
150 | (
151 | (200.0, 250.0, 100.0, 400.0, CoordOrigin.TOPLEFT),
152 | 1000.0,
153 | (100.0, 200.0, 300.0, 50.0),
154 | ),
155 | (
156 | (
157 | 648.3192749023438,
158 | 633.4112548828125,
159 | 155.50897216796875,
160 | 239.66929626464844,
161 | CoordOrigin.BOTTOMLEFT,
162 | ),
163 | 792.0,
164 | (
165 | 155.50897216796875,
166 | 143.68072509765625,
167 | 84.16032409667969,
168 | 14.90802001953125,
169 | ),
170 | ),
171 | ],
172 | )
173 | def test_bounding_box(box, page_height, expected):
174 | top, bottom, left, right, origin = box
175 | bbox = BoundingBox(t=top, b=bottom, l=left, r=right, coord_origin=origin)
176 | assert get_bounding_box(bbox, page_height) == expected
177 |
178 |
179 | def test_serialize_objects():
180 | span_layout = SpanLayout(x=10, y=20, width=30, height=40, page_no=1)
181 | doc_layout = DocLayout(pages=[PageLayout(page_no=1, width=500, height=600)])
182 | bytes_data = srsly.msgpack_dumps({"span": span_layout, "doc": doc_layout})
183 | data = srsly.msgpack_loads(bytes_data)
184 | assert isinstance(data, dict)
185 | assert data["span"] == span_layout
186 | assert data["doc"] == doc_layout
187 | df = DataFrame(data={"col1": [1, 2], "col2": [3, 4]})
188 | bytes_data = srsly.msgpack_dumps({"df": df})
189 | data = srsly.msgpack_loads(bytes_data)
190 | assert isinstance(data, dict)
191 | assert_frame_equal(df, data["df"])
192 |
193 |
194 | @pytest.mark.parametrize("path", [PDF_SIMPLE, PDF_TABLE])
195 | def test_serialize_roundtrip(path, nlp):
196 | layout = spaCyLayout(nlp)
197 | doc = layout(path)
198 | doc_bin = DocBin(store_user_data=True)
199 | doc_bin.add(doc)
200 | bytes_data = doc_bin.to_bytes()
201 | new_doc_bin = DocBin().from_bytes(bytes_data)
202 | new_doc = list(new_doc_bin.get_docs(nlp.vocab))[0]
203 | layout_spans = new_doc.spans[layout.attrs.span_group]
204 | assert len(layout_spans) == len(doc.spans[layout.attrs.span_group])
205 | assert all(
206 | isinstance(span._.get(layout.attrs.span_layout), SpanLayout)
207 | for span in layout_spans
208 | )
209 | assert isinstance(new_doc._.get(layout.attrs.doc_layout), DocLayout)
210 | tables = doc._.get(layout.attrs.doc_tables)
211 | new_tables = new_doc._.get(layout.attrs.doc_tables)
212 | for before, after in zip(tables, new_tables):
213 | table_before = before._.get(layout.attrs.span_data)
214 | table_after = after._.get(layout.attrs.span_data)
215 | assert_frame_equal(table_before, table_after)
216 |
--------------------------------------------------------------------------------