├── .flake8 ├── .gitattributes ├── .github ├── codecov.yml ├── mergify.yml ├── scripts │ └── release.sh └── workflows │ ├── cd.yml │ ├── checks.yml │ ├── ci.yml │ └── pypi.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .whitesource ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── MAINTAINERS.md ├── README.md ├── docling_core ├── __init__.py ├── cli │ ├── __init__.py │ └── view.py ├── experimental │ └── __init__.py ├── py.typed ├── resources │ └── schemas │ │ ├── doc │ │ ├── ANN.json │ │ ├── DOC.json │ │ ├── OCR-output.json │ │ └── RAW.json │ │ ├── generated │ │ ├── ccs_document_schema.json │ │ └── minimal_document_schema_flat.json │ │ └── search │ │ ├── search_doc_mapping.json │ │ └── search_doc_mapping_v2.json ├── search │ ├── __init__.py │ ├── json_schema_to_search_mapper.py │ ├── mapping.py │ ├── meta.py │ └── package.py ├── transforms │ ├── __init__.py │ ├── chunker │ │ ├── __init__.py │ │ ├── base.py │ │ ├── hierarchical_chunker.py │ │ ├── hybrid_chunker.py │ │ └── tokenizer │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── huggingface.py │ │ │ └── openai.py │ ├── serializer │ │ ├── __init__.py │ │ ├── base.py │ │ ├── common.py │ │ ├── doctags.py │ │ ├── html.py │ │ ├── html_styles.py │ │ └── markdown.py │ └── visualizer │ │ ├── __init__.py │ │ ├── base.py │ │ ├── layout_visualizer.py │ │ └── reading_order_visualizer.py ├── types │ ├── __init__.py │ ├── base.py │ ├── doc │ │ ├── __init__.py │ │ ├── base.py │ │ ├── document.py │ │ ├── labels.py │ │ ├── page.py │ │ ├── tokens.py │ │ └── utils.py │ ├── gen │ │ ├── __init__.py │ │ └── generic.py │ ├── io │ │ └── __init__.py │ ├── legacy_doc │ │ ├── __init__.py │ │ ├── base.py │ │ ├── doc_ann.py │ │ ├── doc_ocr.py │ │ ├── doc_raw.py │ │ ├── document.py │ │ └── tokens.py │ ├── nlp │ │ ├── __init__.py │ │ ├── qa.py │ │ └── qa_labels.py │ └── rec │ │ ├── __init__.py │ │ ├── attribute.py │ │ ├── base.py │ │ ├── predicate.py │ │ ├── record.py │ │ ├── statement.py │ │ └── subject.py └── utils │ ├── __init__.py │ ├── alias.py │ ├── file.py │ ├── generate_docs.py │ ├── generate_jsonschema.py │ ├── legacy.py │ ├── validate.py │ └── validators.py ├── docs ├── DoclingDocument.json ├── Generic.json └── Record.json ├── examples ├── 2408.09869v3.json ├── chunking_and_serialization.ipynb └── table_annotations.ipynb ├── pyproject.toml ├── test ├── __init__.py ├── data │ ├── chunker │ │ ├── 0_inp_dl_doc.json │ │ ├── 0_out_chunks.json │ │ ├── 0b_out_chunks.json │ │ ├── 2_inp_dl_doc.json │ │ ├── 2a_out_chunks.json │ │ ├── 2a_out_ser_chunks.json │ │ ├── 2b_out_chunks.json │ │ ├── 2c_out_chunks.json │ │ ├── 2d_out_ser_chunks.json │ │ ├── 2e_out_chunks.json │ │ ├── 2f_out_chunks.json │ │ └── 2g_out_chunks.json │ ├── doc │ │ ├── 01030000000083.dt │ │ ├── 01030000000083.png │ │ ├── 01030000000111.dt │ │ ├── 01030000000111.png │ │ ├── 2106.09680v1.json │ │ ├── 2206.01062-1.0.0.json │ │ ├── 2206.01062.yaml │ │ ├── 2206.01062.yaml.dt │ │ ├── 2206.01062.yaml.dt.json │ │ ├── 2206.01062.yaml.et │ │ ├── 2206.01062.yaml.html │ │ ├── 2206.01062.yaml.md │ │ ├── 2206.01062.yaml.min.dt │ │ ├── 2206.01062.yaml.paged.md │ │ ├── 2408.09869_p1.json │ │ ├── 2408.09869_p1_split.gt.html │ │ ├── 2408.09869v3_enriched.dt │ │ ├── 2408.09869v3_enriched.dt.json │ │ ├── 2408.09869v3_enriched.gt.md │ │ ├── 2408.09869v3_enriched.json │ │ ├── 2408.09869v3_enriched.out.dt │ │ ├── 2408.09869v3_enriched.out.dt.json │ │ ├── 2408.09869v3_enriched_p1_include_annotations_false.gt.html │ │ ├── 2408.09869v3_enriched_p1_include_annotations_false.gt.md │ │ ├── 
2408.09869v3_enriched_p1_include_annotations_true.gt.html │ │ ├── 2408.09869v3_enriched_p1_mark_annotations_false.gt.md │ │ ├── 2408.09869v3_enriched_p1_mark_annotations_true.gt.md │ │ ├── 2408.09869v3_enriched_split.gt.html │ │ ├── 2408.09869v3_enriched_split_p2.gt.html │ │ ├── activities.gt.html │ │ ├── activities.gt.md │ │ ├── activities.json │ │ ├── activities_p1.gt.html │ │ ├── activities_p2.gt.html │ │ ├── activities_p2.gt.md │ │ ├── activities_pb_empty.gt.md │ │ ├── activities_pb_non_empty.gt.md │ │ ├── activities_pb_none.gt.md │ │ ├── bad_doc.yaml.dt │ │ ├── bad_doc.yaml.et │ │ ├── bad_doc.yaml.html │ │ ├── bad_doc.yaml.md │ │ ├── barchart.dt │ │ ├── barchart.gt.html │ │ ├── barchart.gt.md │ │ ├── barchart.json │ │ ├── barchart.png │ │ ├── constructed_doc.appended_child.json.gt │ │ ├── constructed_doc.deleted_group.json.gt │ │ ├── constructed_doc.deleted_picture.json.gt │ │ ├── constructed_doc.deleted_table.json.gt │ │ ├── constructed_doc.deleted_text.json.gt │ │ ├── constructed_doc.dt │ │ ├── constructed_doc.dt.gt │ │ ├── constructed_doc.embedded.html.gt │ │ ├── constructed_doc.embedded.json.gt │ │ ├── constructed_doc.embedded.md.gt │ │ ├── constructed_doc.embedded.yaml.gt │ │ ├── constructed_doc.html │ │ ├── constructed_doc.inserted_text.json.gt │ │ ├── constructed_doc.placeholder.html.gt │ │ ├── constructed_doc.placeholder.md.gt │ │ ├── constructed_doc.referenced.html.gt │ │ ├── constructed_doc.referenced.json.gt │ │ ├── constructed_doc.referenced.md.gt │ │ ├── constructed_doc.referenced.yaml.gt │ │ ├── constructed_doc.replaced_item.json.gt │ │ ├── constructed_document.yaml.dt │ │ ├── constructed_document.yaml.et │ │ ├── constructed_document.yaml.html │ │ ├── constructed_document.yaml.md │ │ ├── constructed_images │ │ │ ├── image_000001_797618e862d279d4e3e92f4b6313175f67e08fc36051dfda092bf63220568703.png │ │ │ ├── image_000001_ccb4cbe7039fe17892f3d611cfb71eafff1d4d230b19b10779334cc4b63c98bc.png │ │ │ └── image_000001_f3cc103136423a57975750907ebc1d367e2985ac6338976d4d5a439f50323f4a.png │ │ ├── doc_with_kv.dt │ │ ├── doc_with_kv.dt.json │ │ ├── doc_with_kv.png │ │ ├── dummy_doc.yaml │ │ ├── dummy_doc.yaml.dt │ │ ├── dummy_doc.yaml.et │ │ ├── dummy_doc.yaml.html │ │ ├── dummy_doc.yaml.md │ │ ├── dummy_doc.yaml.min.dt │ │ ├── misplaced_list_items.out.yaml │ │ ├── misplaced_list_items.yaml │ │ ├── misplaced_list_items.yaml.dt │ │ ├── page_with_pic.dt │ │ ├── page_with_pic.dt.json │ │ ├── page_with_pic.png │ │ ├── page_with_pic_from_files.dt.json │ │ └── page_without_pic.dt.json │ ├── docling_document │ │ ├── export │ │ │ └── formula_mathml.html │ │ └── unit │ │ │ ├── CodeItem.yaml │ │ │ ├── FloatingItem.yaml │ │ │ ├── FormItem.yaml │ │ │ ├── FormulaItem.yaml │ │ │ ├── KeyValueItem.yaml │ │ │ ├── ListItem.yaml │ │ │ ├── PictureItem.yaml │ │ │ ├── SectionHeaderItem.yaml │ │ │ ├── TableItem.yaml │ │ │ ├── TextItem.yaml │ │ │ └── TitleItem.yaml │ ├── json_schemas │ │ ├── base_identifier.json │ │ ├── base_log.json │ │ ├── dbrecord-ref.json │ │ └── document-ref.json │ ├── legacy_doc │ │ ├── doc-1.json │ │ ├── doc-1.json_table_0.dt.txt │ │ ├── doc-2.json │ │ ├── doc-2.json_table_0.dt.txt │ │ ├── doc-3.json │ │ ├── doc-4.json │ │ ├── doc-5.json │ │ ├── doc-6.json │ │ ├── doc-6.json_table_0.dt.txt │ │ ├── doc-7.json │ │ ├── doc-7.json_table_0.dt.txt │ │ ├── doc-8.json │ │ ├── doc-8.json_table_0.dt.txt │ │ ├── doc-9.json │ │ ├── doc-export.docling.yaml.gt │ │ ├── doc-export.dt.txt │ │ ├── doc-export.json │ │ ├── doc-export.json_table_0.dt.txt │ │ ├── doc-export.md │ │ ├── error-1.json │ │ ├── 
error-2.json │ │ ├── error-3.json │ │ ├── ext-1.json │ │ └── intermediates │ │ │ ├── ann.01.json │ │ │ ├── cells.01.json │ │ │ ├── final-doc.01.json │ │ │ ├── pdf.meta.01.json │ │ │ ├── publication_journal.json │ │ │ ├── publication_venue.json │ │ │ └── raw.meta.01.json │ ├── nlp │ │ ├── error-qa-1.json │ │ ├── error-qa-3.json │ │ ├── qa-1.json │ │ ├── qa-2.json │ │ └── qa-3.json │ ├── rec │ │ ├── attribute-01.json │ │ ├── attribute-02.json │ │ ├── attribute-03.json │ │ ├── error-attribute-01.json │ │ ├── error-attribute-02.json │ │ ├── error-predicate-01.json │ │ ├── error-predicate-02.json │ │ ├── predicate-01.json │ │ ├── predicate-02.json │ │ ├── record-01.json │ │ ├── record-02.json │ │ ├── record-03.json │ │ ├── record-04.json │ │ ├── record-05.json │ │ ├── record-gleif-01.json │ │ ├── statement-01.json │ │ ├── statement-02.json │ │ ├── statement-gleif-01.json │ │ ├── subject-01.json │ │ └── subject-02.json │ ├── search │ │ ├── error-meta-01.json │ │ ├── error-meta-02.json │ │ ├── error-meta-03.json │ │ ├── meta-01.json │ │ ├── meta-02.json │ │ ├── meta-03.json │ │ └── meta-04.json │ └── viz │ │ ├── 2408.09869v3_enriched.dt_viz_p2.png │ │ ├── 2408.09869v3_enriched_viz_p1.png │ │ ├── 2408.09869v3_enriched_viz_p2.png │ │ ├── 2408.09869v3_enriched_viz_p3.png │ │ ├── 2408.09869v3_enriched_viz_wout_lbl_p1.png │ │ ├── 2408.09869v3_enriched_viz_wout_lbl_p2.png │ │ └── 2408.09869v3_enriched_viz_wout_lbl_p3.png ├── test_base.py ├── test_collection.py ├── test_data_gen_flag.py ├── test_doc_base.py ├── test_doc_legacy_convert.py ├── test_doc_schema.py ├── test_doc_schema_extractor.py ├── test_docling_doc.py ├── test_doctags_load.py ├── test_hierarchical_chunker.py ├── test_hybrid_chunker.py ├── test_json_schema_to_search_mapper.py ├── test_nlp_qa.py ├── test_otsl_table_export.py ├── test_page.py ├── test_rec_schema.py ├── test_search_meta.py ├── test_serialization.py ├── test_utils.py └── test_visualization.py └── uv.lock /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | per-file-ignores = __init__.py:F401 3 | max-line-length = 120 4 | exclude = test/* 5 | max-complexity = 25 6 | docstring-convention = google 7 | ignore = W503,E203 8 | classmethod-decorators = classmethod,validator 9 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | test/data/** linguist-vendored 2 | -------------------------------------------------------------------------------- /.github/codecov.yml: -------------------------------------------------------------------------------- 1 | codecov: 2 | # https://docs.codecov.io/docs/comparing-commits 3 | allow_coverage_offsets: true 4 | coverage: 5 | status: 6 | project: 7 | default: 8 | informational: true 9 | target: auto # auto compares coverage to the previous base commit 10 | if_ci_failed: success 11 | flags: 12 | - docling 13 | comment: 14 | layout: "reach, diff, flags, files" 15 | behavior: default 16 | require_changes: false # if true: only post the comment if coverage changes 17 | branches: # branch names that can post comment 18 | - "main" 19 | -------------------------------------------------------------------------------- /.github/mergify.yml: -------------------------------------------------------------------------------- 1 | merge_protections: 2 | - name: Enforce conventional commit 3 | description: Make sure that we follow https://www.conventionalcommits.org/en/v1.0.0/ 4 | if: 5 
| - base = main 6 | success_conditions: 7 | - "title ~= 8 | ^(fix|feat|docs|style|refactor|perf|test|build|ci|chore|revert)(?:\\(.+\ 9 | \\))?(!)?:" 10 | - name: Require two reviewers for test updates 11 | description: When test data is updated, we require two reviewers 12 | if: 13 | - base = main 14 | - files ~= ^test 15 | success_conditions: 16 | - "#approved-reviews-by >= 2" 17 | -------------------------------------------------------------------------------- /.github/scripts/release.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e # trigger failure on error - do not remove! 4 | set -x # display command on output 5 | 6 | if [ -z "${TARGET_VERSION}" ]; then 7 | >&2 echo "No TARGET_VERSION specified" 8 | exit 1 9 | fi 10 | CHGLOG_FILE="${CHGLOG_FILE:-CHANGELOG.md}" 11 | 12 | # update package version 13 | uvx --from=toml-cli toml set --toml-path=pyproject.toml project.version "${TARGET_VERSION}" 14 | uv lock --upgrade-package docling-core 15 | 16 | # collect release notes 17 | REL_NOTES=$(mktemp) 18 | uv run --no-sync semantic-release changelog --unreleased >> "${REL_NOTES}" 19 | 20 | # update changelog 21 | TMP_CHGLOG=$(mktemp) 22 | TARGET_TAG_NAME="v${TARGET_VERSION}" 23 | RELEASE_URL="$(gh repo view --json url -q ".url")/releases/tag/${TARGET_TAG_NAME}" 24 | printf "## [${TARGET_TAG_NAME}](${RELEASE_URL}) - $(date -Idate)\n\n" >> "${TMP_CHGLOG}" 25 | cat "${REL_NOTES}" >> "${TMP_CHGLOG}" 26 | if [ -f "${CHGLOG_FILE}" ]; then 27 | printf "\n" | cat - "${CHGLOG_FILE}" >> "${TMP_CHGLOG}" 28 | fi 29 | mv "${TMP_CHGLOG}" "${CHGLOG_FILE}" 30 | 31 | # push changes 32 | git config --global user.name 'github-actions[bot]' 33 | git config --global user.email 'github-actions[bot]@users.noreply.github.com' 34 | git add pyproject.toml uv.lock "${CHGLOG_FILE}" 35 | COMMIT_MSG="chore: bump version to ${TARGET_VERSION} [skip ci]" 36 | git commit -m "${COMMIT_MSG}" 37 | git push origin main 38 | 39 | # create GitHub release (incl.
Git tag) 40 | gh release create "${TARGET_TAG_NAME}" -F "${REL_NOTES}" 41 | -------------------------------------------------------------------------------- /.github/workflows/cd.yml: -------------------------------------------------------------------------------- 1 | name: "Run CD" 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | env: 7 | # disable keyring (https://github.com/actions/runner-images/issues/6185): 8 | PYTHON_KEYRING_BACKEND: keyring.backends.null.Keyring 9 | 10 | jobs: 11 | code-checks: 12 | uses: ./.github/workflows/checks.yml 13 | with: 14 | push_coverage: false 15 | pre-release-check: 16 | runs-on: ubuntu-latest 17 | outputs: 18 | TARGET_TAG_V: ${{ steps.version_check.outputs.TRGT_VERSION }} 19 | steps: 20 | - uses: actions/checkout@v4 21 | with: 22 | fetch-depth: 0 # for fetching tags, required for semantic-release 23 | - name: Install uv and set the python version 24 | uses: astral-sh/setup-uv@v5 25 | with: 26 | enable-cache: true 27 | - name: Install dependencies 28 | run: uv sync --only-dev 29 | - name: Check version of potential release 30 | id: version_check 31 | run: | 32 | TRGT_VERSION=$(uv run --no-sync semantic-release print-version) 33 | echo "TRGT_VERSION=${TRGT_VERSION}" >> "$GITHUB_OUTPUT" 34 | echo "${TRGT_VERSION}" 35 | - name: Check notes of potential release 36 | run: uv run --no-sync semantic-release changelog --unreleased 37 | release: 38 | needs: [code-checks, pre-release-check] 39 | if: needs.pre-release-check.outputs.TARGET_TAG_V != '' 40 | environment: auto-release 41 | runs-on: ubuntu-latest 42 | concurrency: release 43 | steps: 44 | - uses: actions/create-github-app-token@v1 45 | id: app-token 46 | with: 47 | app-id: ${{ vars.CI_APP_ID }} 48 | private-key: ${{ secrets.CI_PRIVATE_KEY }} 49 | - uses: actions/checkout@v4 50 | with: 51 | token: ${{ steps.app-token.outputs.token }} 52 | fetch-depth: 0 # for fetching tags, required for semantic-release 53 | - name: Install uv and set the python version 54 | uses: astral-sh/setup-uv@v5 55 | with: 56 | enable-cache: true 57 | - name: Install dependencies 58 | run: uv sync --only-dev 59 | - name: Run release script 60 | env: 61 | GH_TOKEN: ${{ steps.app-token.outputs.token }} 62 | TARGET_VERSION: ${{ needs.pre-release-check.outputs.TARGET_TAG_V }} 63 | CHGLOG_FILE: CHANGELOG.md 64 | run: ./.github/scripts/release.sh 65 | shell: bash 66 | -------------------------------------------------------------------------------- /.github/workflows/checks.yml: -------------------------------------------------------------------------------- 1 | on: 2 | workflow_call: 3 | inputs: 4 | push_coverage: 5 | type: boolean 6 | description: "If true, the coverage results are pushed to codecov.io." 
7 | default: true 8 | secrets: 9 | CODECOV_TOKEN: 10 | required: false 11 | 12 | env: 13 | HF_HUB_DOWNLOAD_TIMEOUT: "60" 14 | HF_HUB_ETAG_TIMEOUT: "60" 15 | 16 | jobs: 17 | run-checks: 18 | runs-on: ubuntu-latest 19 | strategy: 20 | matrix: 21 | python-version: ['3.9', '3.10', '3.11', '3.12', '3.13'] 22 | steps: 23 | - uses: actions/checkout@v4 24 | - name: Cache Hugging Face models 25 | uses: actions/cache@v4 26 | with: 27 | path: ~/.cache/huggingface 28 | key: huggingface-cache-py${{ matrix.python-version }} 29 | - name: Install uv and set the python version 30 | uses: astral-sh/setup-uv@v5 31 | with: 32 | python-version: ${{ matrix.python-version }} 33 | enable-cache: true 34 | - name: pre-commit cache key 35 | run: echo "PY=$(python -VV | sha256sum | cut -d' ' -f1)" >> "$GITHUB_ENV" 36 | - uses: actions/cache@v4 37 | with: 38 | path: ~/.cache/pre-commit 39 | key: pre-commit|${{ env.PY }}|${{ hashFiles('.pre-commit-config.yaml') }} 40 | - name: Install dependencies 41 | run: uv sync --frozen --all-extras 42 | - name: Check style and run tests 43 | run: pre-commit run --all-files 44 | - name: Upload coverage to Codecov 45 | if: inputs.push_coverage 46 | uses: codecov/codecov-action@v5 47 | with: 48 | token: ${{ secrets.CODECOV_TOKEN }} 49 | files: ./coverage.xml 50 | 51 | build-package: 52 | runs-on: ubuntu-latest 53 | strategy: 54 | matrix: 55 | python-version: ['3.12'] 56 | steps: 57 | - uses: actions/checkout@v4 58 | - name: Install uv and set the python version 59 | uses: astral-sh/setup-uv@v5 60 | with: 61 | python-version: ${{ matrix.python-version }} 62 | enable-cache: true 63 | - name: Install dependencies 64 | run: uv sync --all-extras 65 | - name: Build package 66 | run: uv build 67 | - name: Check content of wheel 68 | run: unzip -l dist/*.whl 69 | - name: Store the distribution packages 70 | uses: actions/upload-artifact@v4 71 | with: 72 | name: python-package-distributions 73 | path: dist/ 74 | 75 | test-package: 76 | needs: 77 | - build-package 78 | runs-on: ubuntu-latest 79 | strategy: 80 | matrix: 81 | python-version: ['3.12'] 82 | steps: 83 | - name: Download all the dists 84 | uses: actions/download-artifact@v4 85 | with: 86 | name: python-package-distributions 87 | path: dist/ 88 | - name: Install uv and set the python version 89 | uses: astral-sh/setup-uv@v5 90 | with: 91 | python-version: ${{ matrix.python-version }} 92 | enable-cache: true 93 | - name: Install package 94 | run: uv pip install dist/*.whl 95 | - name: Load the DoclingDocument package 96 | run: python -c 'from docling_core.types.doc import DoclingDocument' 97 | - name: Check if package data is present 98 | run: python -c 'from importlib import resources; from pathlib import Path; p=Path(resources.files("docling_core").joinpath("resources/schemas/doc/DOC.json")); assert p.exists()' 99 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: "Run CI" 2 | 3 | on: 4 | pull_request: 5 | types: [opened, reopened, synchronize] 6 | push: 7 | branches: 8 | - "**" 9 | - "!gh-pages" 10 | 11 | jobs: 12 | code-checks: 13 | if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name != 'docling-project/docling-core' }} 14 | uses: ./.github/workflows/checks.yml 15 | secrets: 16 | CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} 17 | 18 |
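# Illustrative sketch (not part of the repository's files): checks.yml above is a
# reusable workflow (`on: workflow_call`), so a caller can toggle the Codecov
# upload via its `push_coverage` input; cd.yml in this repo calls it with the
# flag disabled. A minimal caller job (job name arbitrary) would be:
#
#   jobs:
#     code-checks:
#       uses: ./.github/workflows/checks.yml
#       with:
#         push_coverage: false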
-------------------------------------------------------------------------------- /.github/workflows/pypi.yml: -------------------------------------------------------------------------------- 1 | name: "Build and publish package" 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | permissions: 8 | contents: read 9 | 10 | jobs: 11 | build-and-publish: 12 | runs-on: ubuntu-latest 13 | strategy: 14 | matrix: 15 | python-version: ['3.12'] 16 | environment: 17 | name: pypi 18 | url: https://pypi.org/p/docling-core 19 | permissions: 20 | id-token: write # IMPORTANT: mandatory for trusted publishing 21 | steps: 22 | - uses: actions/checkout@v4 23 | - name: Install uv and set the python version 24 | uses: astral-sh/setup-uv@v5 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | enable-cache: true 28 | - name: Install dependencies 29 | run: uv sync --all-extras 30 | - name: Build package 31 | run: uv build 32 | - name: Publish distribution 📦 to PyPI 33 | uses: pypa/gh-action-pypi-publish@release/v1 34 | with: 35 | attestations: true 36 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | 3 | .idea/ 4 | *~ 5 | *.DS_Store 6 | test/data/constructed_images* 7 | test/data/doc/constructed_doc*.html 8 | test/data/doc/constructed_doc*.yaml 9 | test/data/doc/constructed_doc*.json 10 | test/data/doc/constructed_doc*.dt 11 | test/data/doc/constructed_doc*.md 12 | 13 | # Byte-compiled / optimized / DLL files 14 | __pycache__/ 15 | *.py[cod] 16 | *$py.class 17 | 18 | # C extensions 19 | *.so 20 | 21 | # Distribution / packaging 22 | .Python 23 | build/ 24 | develop-eggs/ 25 | dist/ 26 | downloads/ 27 | eggs/ 28 | .eggs/ 29 | lib/ 30 | lib64/ 31 | parts/ 32 | sdist/ 33 | var/ 34 | wheels/ 35 | *.egg-info/ 36 | .installed.cfg 37 | *.egg 38 | MANIFEST 39 | 40 | # PyInstaller 41 | # Usually these files are written by a python script from a template 42 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
43 | *.manifest 44 | *.spec 45 | 46 | # Installer logs 47 | pip-log.txt 48 | pip-delete-this-directory.txt 49 | 50 | # Unit test / coverage reports 51 | htmlcov/ 52 | .tox/ 53 | .coverage 54 | .coverage.* 55 | .cache 56 | nosetests.xml 57 | coverage.xml 58 | *.cover 59 | .hypothesis/ 60 | .pytest_cache/ 61 | 62 | # Translations 63 | *.mo 64 | *.pot 65 | 66 | # Django stuff: 67 | *.log 68 | local_settings.py 69 | db.sqlite3 70 | 71 | # Flask stuff: 72 | instance/ 73 | .webassets-cache 74 | 75 | # Scrapy stuff: 76 | .scrapy 77 | 78 | # Sphinx documentation 79 | docs/_build/ 80 | 81 | # PyBuilder 82 | target/ 83 | 84 | # Jupyter Notebook 85 | .ipynb_checkpoints 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # celery beat schedule file 91 | celerybeat-schedule 92 | 93 | # SageMath parsed files 94 | *.sage.py 95 | 96 | # Environments 97 | .env 98 | .venv 99 | env/ 100 | venv/ 101 | ENV/ 102 | env.bak/ 103 | venv.bak/ 104 | 105 | # Spyder project settings 106 | .spyderproject 107 | .spyproject 108 | 109 | # Rope project settings 110 | .ropeproject 111 | 112 | # mkdocs documentation 113 | /site 114 | 115 | # mypy 116 | .mypy_cache/ 117 | 118 | # VisualStudioCode 119 | .vscode/ -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | fail_fast: true 2 | repos: 3 | - repo: local 4 | hooks: 5 | - id: black 6 | name: Black 7 | entry: uv run --no-sync black docling_core test 8 | pass_filenames: false 9 | language: system 10 | files: '\.py$' 11 | - repo: local 12 | hooks: 13 | - id: isort 14 | name: isort 15 | entry: uv run --no-sync isort docling_core test 16 | pass_filenames: false 17 | language: system 18 | files: '\.py$' 19 | - repo: local 20 | hooks: 21 | - id: autoflake 22 | name: autoflake 23 | entry: uv run --no-sync autoflake docling_core test 24 | pass_filenames: false 25 | language: system 26 | files: '\.py$' 27 | - repo: local 28 | hooks: 29 | - id: mypy 30 | name: MyPy 31 | entry: uv run --no-sync mypy docling_core test 32 | pass_filenames: false 33 | language: system 34 | files: '\.py$' 35 | - repo: local 36 | hooks: 37 | - id: flake8 38 | name: Flake8 39 | entry: uv run --no-sync flake8 docling_core 40 | pass_filenames: false 41 | language: system 42 | files: '\.py$' 43 | - repo: local 44 | hooks: 45 | - id: pytest 46 | name: Pytest 47 | entry: uv run --no-sync pytest --cov=docling_core --cov-report=xml test 48 | pass_filenames: false 49 | language: system 50 | files: '\.py$' 51 | - repo: local 52 | hooks: 53 | - id: docs 54 | name: Docs 55 | entry: uv run --no-sync python -m docling_core.utils.generate_docs docs 56 | pass_filenames: false 57 | language: system 58 | files: '\.py$' 59 | - repo: https://github.com/astral-sh/uv-pre-commit 60 | rev: 0.7.8 61 | hooks: 62 | - id: uv-lock 63 | -------------------------------------------------------------------------------- /.whitesource: -------------------------------------------------------------------------------- 1 | { 2 | "settingsInheritedFrom": "whitesource-config/whitesource-config@master" 3 | } -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | 
our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the [project team](./MAINTAINERS.md). All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 
67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## Contributing In General 2 | Our project welcomes external contributions. If you have an itch, please feel 3 | free to scratch it. 4 | 5 | For more details on the contributing guidelines head to the Docling Project [community repository](https://github.com/docling-project/community). 6 | 7 | ## Developing 8 | 9 | ### Usage of uv 10 | 11 | We use [uv](https://docs.astral.sh/uv/) as package and project manager. 12 | 13 | #### Installation 14 | 15 | To install `uv`, check the documentation on [Installing uv](https://docs.astral.sh/uv/getting-started/installation/). 16 | 17 | #### Create an environment and sync it 18 | 19 | You can use the `uv sync` command to create a project virtual environment (if it does not already exist) and sync 20 | the project's dependencies with the environment. 21 | 22 | ```bash 23 | uv sync 24 | ``` 25 | 26 | #### Use a specific Python version (optional) 27 | 28 | If you need to work with a specific version of Python, you can create a new virtual environment for that version 29 | and run the sync command: 30 | 31 | ```bash 32 | uv venv --python 3.12 33 | uv sync 34 | ``` 35 | 36 | More detailed options are described on the [Using Python environments](https://docs.astral.sh/uv/pip/environments/) documentation. 37 | 38 | #### Add a new dependency 39 | 40 | Simply use the `uv add` command. The `pyproject.toml` and `uv.lock` files will be updated. 41 | 42 | ```bash 43 | uv add [OPTIONS] <PACKAGES> 44 | ``` 45 | 46 | ### Code style guidelines 47 | 48 | We use the following tools to enforce code style: 49 | 50 | - isort, to sort imports 51 | - Black, to format code 52 | - Flake8, to lint code 53 | - autoflake, to remove unused variables and imports 54 | - [MyPy](https://mypy.readthedocs.io), as static type checker 55 | 56 | A set of styling checks, as well as regression tests, are defined and managed through the [pre-commit](https://pre-commit.com/) framework. To ensure that those scripts run automatically before a commit is finalized, install `pre-commit` on your local repository: 57 | 58 | ```bash 59 | uv run pre-commit install 60 | ``` 61 | 62 | To run the checks on-demand, type: 63 | 64 | ```bash 65 | uv run pre-commit run --all-files 66 | ``` 67 | 68 | Note: Checks like `Black` and `isort` will _fail_ if they modify files. This is because `pre-commit` doesn't like to see files modified by their hooks. In these cases, `git add` the modified files and `git commit` again. 69 | 70 | 71 | ### Documentation 72 | 73 | We use [JSON Schema for Humans](https://github.com/coveooss/json-schema-for-humans) to generate Markdown pages documenting the JSON schema of the Docling objects. 74 | 75 | The documentation pages are stored in the [docs](./docs/) folder and are updated at every commit, as part of the `pre-commit` check hooks.
76 | To generate the documentation on-demand, run: 77 | 78 | ```bash 79 | uv run python -m docling_core.utils.generate_docs docs 80 | ``` 81 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 International Business Machines 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /MAINTAINERS.md: -------------------------------------------------------------------------------- 1 | # MAINTAINERS 2 | 3 | - Cesar Berrospi Ramis - [@ceberam](https://github.com/ceberam) 4 | - Michele Dolfi - [@dolfim-ibm](https://github.com/dolfim-ibm) 5 | - Christoph Auer - [@cau-git](https://github.com/cau-git) 6 | - Panos Vagenas - [@vagenas](https://github.com/vagenas) 7 | - Peter Staar - [@PeterStaar-IBM](https://github.com/PeterStaar-IBM) 8 | 9 | Maintainers can be contacted at [deepsearch-core@zurich.ibm.com](mailto:deepsearch-core@zurich.ibm.com). -------------------------------------------------------------------------------- /docling_core/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Main package.""" 7 | -------------------------------------------------------------------------------- /docling_core/cli/__init__.py: -------------------------------------------------------------------------------- 1 | """CLI package.""" 2 | -------------------------------------------------------------------------------- /docling_core/cli/view.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 
2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """CLI for docling viewer.""" 7 | import importlib 8 | import tempfile 9 | import webbrowser 10 | from pathlib import Path 11 | from typing import Annotated, Optional 12 | 13 | import typer 14 | 15 | from docling_core.types.doc import DoclingDocument 16 | from docling_core.types.doc.base import ImageRefMode 17 | from docling_core.utils.file import resolve_source_to_path 18 | 19 | app = typer.Typer( 20 | name="Docling", 21 | no_args_is_help=True, 22 | add_completion=False, 23 | pretty_exceptions_enable=False, 24 | ) 25 | 26 | 27 | def version_callback(value: bool): 28 | """Callback for version inspection.""" 29 | if value: 30 | docling_core_version = importlib.metadata.version("docling-core") 31 | print(f"Docling Core version: {docling_core_version}") 32 | raise typer.Exit() 33 | 34 | 35 | @app.command(no_args_is_help=True) 36 | def view( 37 | source: Annotated[ 38 | str, 39 | typer.Argument( 40 | ..., 41 | metavar="source", 42 | help="Docling JSON file to view.", 43 | ), 44 | ], 45 | version: Annotated[ 46 | Optional[bool], 47 | typer.Option( 48 | "--version", 49 | callback=version_callback, 50 | is_eager=True, 51 | help="Show version information.", 52 | ), 53 | ] = None, 54 | ): 55 | """Display a Docling JSON file in the default browser.""" 56 | path = resolve_source_to_path(source=source) 57 | doc = DoclingDocument.load_from_json(filename=path) 58 | target_path = Path(tempfile.mkdtemp()) / "out.html" 59 | html_output = doc.export_to_html(image_mode=ImageRefMode.EMBEDDED) 60 | with open(target_path, "w", encoding="utf-8") as f: 61 | f.write(html_output) 62 | webbrowser.open(url=f"file://{target_path.absolute().resolve()}") 63 | 64 | 65 | click_app = typer.main.get_command(app) 66 | 67 | if __name__ == "__main__": 68 | app() 69 | -------------------------------------------------------------------------------- /docling_core/experimental/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp.
2024 - 2025 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Experimental features.""" 7 | -------------------------------------------------------------------------------- /docling_core/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/docling-project/docling-core/878aafd1b78f7f3d730ccb4bd519650230651498/docling_core/py.typed -------------------------------------------------------------------------------- /docling_core/resources/schemas/search/search_doc_mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "mappings": { 3 | "dynamic": false, 4 | "_size": { 5 | "enabled": true 6 | }, 7 | "_meta": { 8 | "$ref": "ccs:schemas#/Document" 9 | }, 10 | "properties": { 11 | "description": { 12 | "type": "object", 13 | "properties": { 14 | "abstract": { 15 | "type": "text" 16 | }, 17 | "affiliations": { 18 | "type": "keyword" 19 | }, 20 | "authors": { 21 | "type": "keyword" 22 | }, 23 | "title": { 24 | "type": "text" 25 | } 26 | } 27 | }, 28 | "figures": { 29 | "type": "object", 30 | "properties": { 31 | "text": { 32 | "type": "text" 33 | }, 34 | "type": { 35 | "type": "keyword" 36 | }, 37 | "prov": { 38 | "type": "object", 39 | "properties": { 40 | "page": { 41 | "type": "integer" 42 | } 43 | } 44 | } 45 | } 46 | }, 47 | "file-info": { 48 | "type": "object", 49 | "properties": { 50 | "filename": { 51 | "type": "text" 52 | } 53 | } 54 | }, 55 | "main-text": { 56 | "type": "object", 57 | "properties": { 58 | "text": { 59 | "type": "text" 60 | }, 61 | "type": { 62 | "type": "keyword" 63 | }, 64 | "name": { 65 | "type": "keyword" 66 | }, 67 | "prov": { 68 | "type": "object", 69 | "properties": { 70 | "page": { 71 | "type": "integer" 72 | } 73 | } 74 | } 75 | } 76 | }, 77 | "_name": { 78 | "type": "keyword" 79 | }, 80 | "tables": { 81 | "type": "object", 82 | "properties": { 83 | "text": { 84 | "type": "text" 85 | }, 86 | "type": { 87 | "type": "keyword" 88 | }, 89 | "prov": { 90 | "type": "object", 91 | "properties": { 92 | "page": { 93 | "type": "integer" 94 | } 95 | } 96 | } 97 | } 98 | }, 99 | "type": { 100 | "type": "keyword" 101 | } 102 | } 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /docling_core/search/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Package for models and utility functions for search database mappings.""" 7 | -------------------------------------------------------------------------------- /docling_core/search/mapping.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 
2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Methods to define fields in an index mapping of a search database.""" 7 | from typing import Any, Optional 8 | 9 | 10 | def es_field( 11 | *, 12 | type: Optional[str] = None, 13 | ignore_above: Optional[int] = None, 14 | term_vector: Optional[str] = None, 15 | **kwargs: Any, 16 | ): 17 | """Create x-es kwargs to be passed to a `pydantic.Field` via unpacking.""" 18 | all_kwargs = {**kwargs} 19 | 20 | if type is not None: 21 | all_kwargs["type"] = type 22 | 23 | if ignore_above is not None: 24 | all_kwargs["ignore_above"] = ignore_above 25 | 26 | if term_vector is not None: 27 | all_kwargs["term_vector"] = term_vector 28 | 29 | return {f"x-es-{k}": v for k, v in all_kwargs.items()} 30 | -------------------------------------------------------------------------------- /docling_core/search/meta.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Models and methods to define the metadata fields in database index mappings.""" 7 | from pathlib import Path 8 | from typing import Generic, Optional, TypeVar 9 | 10 | from pydantic import BaseModel, Field, StrictStr, ValidationInfo, field_validator 11 | 12 | from docling_core.search.package import Package 13 | from docling_core.types.base import CollectionTypeEnum, StrictDateTime, UniqueList 14 | from docling_core.utils.alias import AliasModel 15 | 16 | ClassificationT = TypeVar("ClassificationT", bound=str) 17 | DomainT = TypeVar("DomainT", bound=str) 18 | 19 | 20 | class S3Path(BaseModel, extra="forbid"): 21 | """The path details within a cloud object storage for CCS-parsed files.""" 22 | 23 | bucket: StrictStr 24 | prefix: StrictStr 25 | infix: StrictStr 26 | 27 | def __hash__(self): 28 | """Return the hash value for this S3Path object.""" 29 | return hash((type(self),) + tuple(self.__dict__.values())) 30 | 31 | 32 | class S3CcsData(BaseModel, extra="forbid"): 33 | """The access details to a cloud object storage for CCS-parsed files.""" 34 | 35 | endpoint: StrictStr 36 | paths: UniqueList[S3Path] = Field(min_length=1) 37 | 38 | 39 | class DocumentLicense(BaseModel, extra="forbid"): 40 | """Document license for a search database index within the index mappings.""" 41 | 42 | code: Optional[list[StrictStr]] = None 43 | text: Optional[list[StrictStr]] = None 44 | 45 | 46 | class Meta(AliasModel, Generic[ClassificationT, DomainT], extra="forbid"): 47 | """Metadata of a search database index within the index mappings.""" 48 | 49 | aliases: Optional[list[StrictStr]] = None 50 | created: StrictDateTime 51 | description: Optional[StrictStr] = None 52 | source: StrictStr 53 | storage: Optional[StrictStr] = None 54 | display_name: Optional[StrictStr] = None 55 | type: CollectionTypeEnum 56 | classification: Optional[list[ClassificationT]] = None 57 | version: UniqueList[Package] = Field(min_length=1) 58 | license: Optional[StrictStr] = None 59 | filename: Optional[Path] = None 60 | domain: Optional[list[DomainT]] = None 61 | reference: Optional[StrictStr] = Field(default=None, alias="$ref") 62 | ccs_s3_data: Optional[S3CcsData] = None 63 | document_license: Optional[DocumentLicense] = None 64 | index_key: Optional[StrictStr] = None 65 | project_key: Optional[StrictStr] = None 66 | 67 | @field_validator("reference") 68 | @classmethod 69 | def reference_for_document(cls, v, info: ValidationInfo): 70 | """Validate the reference field for indexes of type Document.""" 71 | if 
"type" in info.data and info.data["type"] == "Document": 72 | if v and v != "ccs:schemas#/Document": 73 | raise ValueError("wrong reference value for Document type") 74 | else: 75 | return "ccs:schemas#/Document" 76 | else: 77 | return v 78 | 79 | @field_validator("version") 80 | @classmethod 81 | def version_has_schema(cls, v): 82 | """Validate that the docling-core library is always set in version field.""" 83 | docling_core = [item for item in v if item.name == "docling-core"] 84 | if not docling_core: 85 | raise ValueError( 86 | "the version should include at least a valid docling-core package" 87 | ) 88 | elif len(docling_core) > 1: 89 | raise ValueError( 90 | "the version must not include more than 1 docling-core package" 91 | ) 92 | else: 93 | return v 94 | -------------------------------------------------------------------------------- /docling_core/search/package.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Models and methods to define a package model.""" 7 | 8 | import importlib.metadata 9 | import re 10 | from typing import Final 11 | 12 | from pydantic import BaseModel, StrictStr, StringConstraints 13 | from typing_extensions import Annotated 14 | 15 | VERSION_PATTERN: Final = ( 16 | r"^(?P0|[1-9]\d*)\.(?P0|[1-9]\d*)\.(?P0|[1-9]\d*)" 17 | r"(?:-(?P(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)" 18 | r"(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\+" 19 | r"(?P[0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?$" 20 | ) 21 | 22 | 23 | class Package(BaseModel, extra="forbid"): 24 | """Representation of a software package. 25 | 26 | The version needs to comply with Semantic Versioning 2.0.0. 27 | """ 28 | 29 | name: StrictStr = "docling-core" 30 | version: Annotated[str, StringConstraints(strict=True, pattern=VERSION_PATTERN)] = ( 31 | importlib.metadata.version("docling-core") 32 | ) 33 | 34 | def __hash__(self): 35 | """Return the hash value for this S3Path object.""" 36 | return hash((type(self),) + tuple(self.__dict__.values())) 37 | 38 | def get_major(self): 39 | """Get the major version of this package.""" 40 | return re.match(VERSION_PATTERN, self.version)["major"] 41 | 42 | def get_minor(self): 43 | """Get the major version of this package.""" 44 | return re.match(VERSION_PATTERN, self.version)["minor"] 45 | 46 | def get_patch(self): 47 | """Get the major version of this package.""" 48 | return re.match(VERSION_PATTERN, self.version)["patch"] 49 | 50 | def get_pre_release(self): 51 | """Get the pre-release version of this package.""" 52 | return re.match(VERSION_PATTERN, self.version)["prerelease"] 53 | 54 | def get_build_metadata(self): 55 | """Get the build metadata version of this package.""" 56 | return re.match(VERSION_PATTERN, self.version)["buildmetadata"] 57 | -------------------------------------------------------------------------------- /docling_core/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Data transformations package.""" 7 | -------------------------------------------------------------------------------- /docling_core/transforms/chunker/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 
2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Define the chunker types.""" 7 | 8 | from docling_core.transforms.chunker.base import BaseChunk, BaseChunker, BaseMeta 9 | from docling_core.transforms.chunker.hierarchical_chunker import ( 10 | DocChunk, 11 | DocMeta, 12 | HierarchicalChunker, 13 | ) 14 | -------------------------------------------------------------------------------- /docling_core/transforms/chunker/base.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Define base classes for chunking.""" 7 | import json 8 | from abc import ABC, abstractmethod 9 | from typing import Any, ClassVar, Iterator 10 | 11 | from pydantic import BaseModel 12 | from typing_extensions import deprecated 13 | 14 | from docling_core.types.doc import DoclingDocument as DLDocument 15 | 16 | DFLT_DELIM = "\n" 17 | 18 | 19 | class BaseMeta(BaseModel): 20 | """Chunk metadata base class.""" 21 | 22 | excluded_embed: ClassVar[list[str]] = [] 23 | excluded_llm: ClassVar[list[str]] = [] 24 | 25 | def export_json_dict(self) -> dict[str, Any]: 26 | """Helper method for exporting non-None keys to JSON mode. 27 | 28 | Returns: 29 | dict[str, Any]: The exported dictionary. 30 | """ 31 | return self.model_dump(mode="json", by_alias=True, exclude_none=True) 32 | 33 | 34 | class BaseChunk(BaseModel): 35 | """Chunk base class.""" 36 | 37 | text: str 38 | meta: BaseMeta 39 | 40 | def export_json_dict(self) -> dict[str, Any]: 41 | """Helper method for exporting non-None keys to JSON mode. 42 | 43 | Returns: 44 | dict[str, Any]: The exported dictionary. 45 | """ 46 | return self.model_dump(mode="json", by_alias=True, exclude_none=True) 47 | 48 | 49 | class BaseChunker(BaseModel, ABC): 50 | """Chunker base class.""" 51 | 52 | delim: str = DFLT_DELIM 53 | 54 | @abstractmethod 55 | def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]: 56 | """Chunk the provided document. 57 | 58 | Args: 59 | dl_doc (DLDocument): document to chunk 60 | 61 | Raises: 62 | NotImplementedError: in this abstract implementation 63 | 64 | Yields: 65 | Iterator[BaseChunk]: iterator over extracted chunks 66 | """ 67 | raise NotImplementedError() 68 | 69 | def contextualize(self, chunk: BaseChunk) -> str: 70 | """Contextualize the given chunk. This implementation is embedding-targeted. 71 | 72 | Args: 73 | chunk: chunk to serialize 74 | 75 | Returns: 76 | str: the serialized form of the chunk 77 | """ 78 | meta = chunk.meta.export_json_dict() 79 | 80 | items = [] 81 | for k in meta: 82 | if k not in chunk.meta.excluded_embed: 83 | if isinstance(meta[k], list): 84 | items.append( 85 | self.delim.join( 86 | [ 87 | d if isinstance(d, str) else json.dumps(d) 88 | for d in meta[k] 89 | ] 90 | ) 91 | ) 92 | else: 93 | items.append(json.dumps(meta[k])) 94 | items.append(chunk.text) 95 | 96 | return self.delim.join(items) 97 | 98 | @deprecated("Use contextualize() instead.") 99 | def serialize(self, chunk: BaseChunk) -> str: 100 | """Contextualize the given chunk. 
This implementation is embedding-targeted.""" 101 | return self.contextualize(chunk=chunk) 102 | -------------------------------------------------------------------------------- /docling_core/transforms/chunker/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | """Define the tokenizer types.""" 2 | -------------------------------------------------------------------------------- /docling_core/transforms/chunker/tokenizer/base.py: -------------------------------------------------------------------------------- 1 | """Define base classes for tokenization.""" 2 | 3 | from abc import ABC, abstractmethod 4 | from typing import Any 5 | 6 | from pydantic import BaseModel 7 | 8 | 9 | class BaseTokenizer(BaseModel, ABC): 10 | """Base tokenizer class.""" 11 | 12 | @abstractmethod 13 | def count_tokens(self, text: str) -> int: 14 | """Get number of tokens for given text.""" 15 | ... 16 | 17 | @abstractmethod 18 | def get_max_tokens(self) -> int: 19 | """Get maximum number of tokens allowed.""" 20 | ... 21 | 22 | @abstractmethod 23 | def get_tokenizer(self) -> Any: 24 | """Get underlying tokenizer object.""" 25 | ... 26 | -------------------------------------------------------------------------------- /docling_core/transforms/chunker/tokenizer/huggingface.py: -------------------------------------------------------------------------------- 1 | """HuggingFace tokenization.""" 2 | 3 | import json 4 | from os import PathLike 5 | from typing import Optional, Union 6 | 7 | from huggingface_hub import hf_hub_download 8 | from pydantic import ConfigDict, model_validator 9 | from typing_extensions import Self 10 | 11 | from docling_core.transforms.chunker.tokenizer.base import BaseTokenizer 12 | 13 | try: 14 | from transformers import AutoTokenizer, PreTrainedTokenizerBase 15 | except ImportError: 16 | raise RuntimeError( 17 | "Module requires 'chunking' extra; to install, run: " 18 | "`pip install 'docling-core[chunking]'`" 19 | ) 20 | 21 | 22 | class HuggingFaceTokenizer(BaseTokenizer): 23 | """HuggingFace tokenizer.""" 24 | 25 | model_config = ConfigDict(arbitrary_types_allowed=True) 26 | 27 | tokenizer: PreTrainedTokenizerBase 28 | max_tokens: int = None # type: ignore[assignment] 29 | 30 | @model_validator(mode="after") 31 | def _patch(self) -> Self: 32 | if self.max_tokens is None: 33 | try: 34 | # try to use SentenceTransformers-specific config as that seems to be 35 | # reliable (whenever available) 36 | config_name = "sentence_bert_config.json" 37 | config_path = hf_hub_download( 38 | repo_id=self.tokenizer.name_or_path, 39 | filename=config_name, 40 | ) 41 | with open(config_path) as f: 42 | data = json.load(f) 43 | self.max_tokens = int(data["max_seq_length"]) 44 | except Exception as e: 45 | raise RuntimeError( 46 | "max_tokens could not be determined automatically; please set " 47 | "explicitly." 
48 | ) from e 49 | return self 50 | 51 | def count_tokens(self, text: str): 52 | """Get number of tokens for given text.""" 53 | return len(self.tokenizer.tokenize(text=text)) 54 | 55 | def get_max_tokens(self): 56 | """Get maximum number of tokens allowed.""" 57 | return self.max_tokens 58 | 59 | @classmethod 60 | def from_pretrained( 61 | cls, 62 | model_name: Union[str, PathLike], 63 | max_tokens: Optional[int] = None, 64 | **kwargs, 65 | ) -> Self: 66 | """Create tokenizer from model name.""" 67 | my_kwargs = { 68 | "tokenizer": AutoTokenizer.from_pretrained( 69 | pretrained_model_name_or_path=model_name, **kwargs 70 | ), 71 | } 72 | if max_tokens is not None: 73 | my_kwargs["max_tokens"] = max_tokens 74 | return cls(**my_kwargs) 75 | 76 | def get_tokenizer(self): 77 | """Get underlying tokenizer object.""" 78 | return self.tokenizer 79 | -------------------------------------------------------------------------------- /docling_core/transforms/chunker/tokenizer/openai.py: -------------------------------------------------------------------------------- 1 | """OpenAI tokenization.""" 2 | 3 | from pydantic import ConfigDict 4 | 5 | from docling_core.transforms.chunker.hybrid_chunker import BaseTokenizer 6 | 7 | try: 8 | import tiktoken 9 | except ImportError: 10 | raise RuntimeError( 11 | "Module requires 'chunking-openai' extra; to install, run: " 12 | "`pip install 'docling-core[chunking-openai]'`" 13 | ) 14 | 15 | 16 | class OpenAITokenizer(BaseTokenizer): 17 | """OpenAI tokenizer.""" 18 | 19 | model_config = ConfigDict(arbitrary_types_allowed=True) 20 | 21 | tokenizer: tiktoken.Encoding 22 | max_tokens: int 23 | 24 | def count_tokens(self, text: str) -> int: 25 | """Get number of tokens for given text.""" 26 | return len(self.tokenizer.encode(text=text)) 27 | 28 | def get_max_tokens(self) -> int: 29 | """Get maximum number of tokens allowed.""" 30 | return self.max_tokens 31 | 32 | def get_tokenizer(self) -> tiktoken.Encoding: 33 | """Get underlying tokenizer object.""" 34 | return self.tokenizer 35 | -------------------------------------------------------------------------------- /docling_core/transforms/serializer/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 
2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Define the serializer types.""" 7 | -------------------------------------------------------------------------------- /docling_core/transforms/visualizer/__init__.py: -------------------------------------------------------------------------------- 1 | """Define the visualizer types.""" 2 | -------------------------------------------------------------------------------- /docling_core/transforms/visualizer/base.py: -------------------------------------------------------------------------------- 1 | """Define base classes for visualization.""" 2 | 3 | from abc import ABC, abstractmethod 4 | from typing import Optional 5 | 6 | from PIL.Image import Image 7 | from pydantic import BaseModel 8 | 9 | from docling_core.types.doc import DoclingDocument 10 | 11 | 12 | class BaseVisualizer(BaseModel, ABC): 13 | """Visualize base class.""" 14 | 15 | @abstractmethod 16 | def get_visualization( 17 | self, 18 | *, 19 | doc: DoclingDocument, 20 | **kwargs, 21 | ) -> dict[Optional[int], Image]: 22 | """Get visualization of the document as images by page.""" 23 | raise NotImplementedError() 24 | -------------------------------------------------------------------------------- /docling_core/types/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Define the main types.""" 7 | 8 | from docling_core.types.doc.document import DoclingDocument 9 | from docling_core.types.gen.generic import Generic 10 | from docling_core.types.rec.record import Record 11 | -------------------------------------------------------------------------------- /docling_core/types/doc/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Package for models defined by the Document type.""" 7 | 8 | from .base import BoundingBox, CoordOrigin, ImageRefMode, Size 9 | from .document import ( 10 | CodeItem, 11 | DocItem, 12 | DoclingDocument, 13 | DocumentOrigin, 14 | FloatingItem, 15 | GroupItem, 16 | ImageRef, 17 | KeyValueItem, 18 | NodeItem, 19 | PageItem, 20 | PictureClassificationClass, 21 | PictureClassificationData, 22 | PictureDataType, 23 | PictureItem, 24 | ProvenanceItem, 25 | RefItem, 26 | SectionHeaderItem, 27 | TableCell, 28 | TableData, 29 | TableItem, 30 | TextItem, 31 | ) 32 | from .labels import DocItemLabel, GroupLabel, TableCellLabel 33 | -------------------------------------------------------------------------------- /docling_core/types/doc/utils.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Utils for document types.""" 7 | 8 | import unicodedata 9 | from pathlib import Path 10 | 11 | 12 | def relative_path(src: Path, target: Path) -> Path: 13 | """Compute the relative path from `src` to `target`. 14 | 15 | Args: 16 | src (str | Path): The source directory or file path (must be absolute). 17 | target (str | Path): The target directory or file path (must be absolute). 18 | 19 | Returns: 20 | Path: The relative path from `src` to `target`. 21 | 22 | Raises: 23 | ValueError: If either `src` or `target` is not an absolute path. 
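Example (illustrative, hypothetical paths): `relative_path(Path("/a/b/c"), Path("/a/d/e.txt"))` walks two levels up from `src` to the common ancestor `/a` and back down to `target`, returning `Path("../../d/e.txt")`.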
24 | """ 25 | src = Path(src).resolve() 26 | target = Path(target).resolve() 27 | 28 | # Ensure both paths are absolute 29 | if not src.is_absolute(): 30 | raise ValueError(f"The source path must be absolute: {src}") 31 | if not target.is_absolute(): 32 | raise ValueError(f"The target path must be absolute: {target}") 33 | 34 | # Find the common ancestor 35 | common_parts = [] 36 | for src_part, target_part in zip(src.parts, target.parts): 37 | if src_part == target_part: 38 | common_parts.append(src_part) 39 | else: 40 | break 41 | 42 | # Determine the path to go up from src to the common ancestor 43 | up_segments = [".."] * (len(src.parts) - len(common_parts)) 44 | 45 | # Add the path from the common ancestor to the target 46 | down_segments = target.parts[len(common_parts) :] 47 | 48 | # Combine and return the result 49 | return Path(*up_segments, *down_segments) 50 | 51 | 52 | def get_html_tag_with_text_direction(html_tag: str, text: str) -> str: 53 | """Form the HTML element with tag, text, and optional dir attribute.""" 54 | text_dir = get_text_direction(text) 55 | 56 | if text_dir == "ltr": 57 | return f"<{html_tag}>{text}" 58 | else: 59 | return f'<{html_tag} dir="{text_dir}">{text}' 60 | 61 | 62 | def get_text_direction(text: str) -> str: 63 | """Determine the text direction of a given string as LTR or RTL script.""" 64 | if not text: 65 | return "ltr" # Default for empty input 66 | 67 | rtl_scripts = {"R", "AL"} 68 | rtl_chars = sum(unicodedata.bidirectional(c) in rtl_scripts for c in text) 69 | 70 | return ( 71 | "rtl" 72 | if unicodedata.bidirectional(text[0]) in rtl_scripts 73 | or rtl_chars > len(text) / 2 74 | else "ltr" 75 | ) 76 | -------------------------------------------------------------------------------- /docling_core/types/gen/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Package for models defined by the Generic type.""" 7 | -------------------------------------------------------------------------------- /docling_core/types/gen/generic.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Define a generic Docling type.""" 7 | 8 | from typing import Optional 9 | 10 | from pydantic import Field, StrictStr 11 | 12 | from docling_core.search.mapping import es_field 13 | from docling_core.types.base import FileInfoObject 14 | from docling_core.utils.alias import AliasModel 15 | 16 | 17 | class Generic(AliasModel): 18 | """A representation of a generic document.""" 19 | 20 | name: Optional[StrictStr] = Field( 21 | default=None, 22 | description="A short description or summary of the document.", 23 | alias="_name", 24 | json_schema_extra=es_field(type="text"), 25 | ) 26 | 27 | file_info: FileInfoObject = Field( 28 | title="Document information", 29 | description=( 30 | "Minimal identification information of the document within a collection." 31 | ), 32 | alias="file-info", 33 | ) 34 | -------------------------------------------------------------------------------- /docling_core/types/io/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 
2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Models for io.""" 7 | 8 | from io import BytesIO 9 | 10 | from pydantic import BaseModel, ConfigDict 11 | 12 | 13 | class DocumentStream(BaseModel): 14 | """Wrapper class for a bytes stream with a filename.""" 15 | 16 | model_config = ConfigDict(arbitrary_types_allowed=True) 17 | 18 | name: str 19 | stream: BytesIO 20 | -------------------------------------------------------------------------------- /docling_core/types/legacy_doc/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Package for models defined by the Document type.""" 7 | -------------------------------------------------------------------------------- /docling_core/types/legacy_doc/doc_ann.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Models for annotations and predictions in CCS.""" 7 | from typing import Any 8 | 9 | from pydantic import BaseModel 10 | 11 | from docling_core.types.legacy_doc.base import BoundingBox 12 | 13 | AnnotationReport = Any # TODO 14 | 15 | 16 | class Cell(BaseModel): 17 | """Cell.""" 18 | 19 | id: int 20 | rawcell_id: int 21 | label: str 22 | 23 | 24 | class Cluster(BaseModel): 25 | """Cluster.""" 26 | 27 | model: str 28 | type: str 29 | bbox: BoundingBox 30 | cell_ids: list[int] 31 | merged: bool 32 | id: int 33 | 34 | 35 | class Table(BaseModel): 36 | """Table.""" 37 | 38 | cell_id: int 39 | label: str 40 | rows: list[int] 41 | cols: list[int] 42 | 43 | 44 | class Info(BaseModel): 45 | """Info.""" 46 | 47 | display_name: str 48 | model_name: str 49 | model_class: str 50 | model_version: str 51 | model_id: str 52 | 53 | 54 | class Source(BaseModel): 55 | """Source.""" 56 | 57 | type: str 58 | timestamp: float 59 | info: Info 60 | 61 | 62 | class AnnotPredItem(BaseModel): 63 | """Annotation or prediction item.""" 64 | 65 | cells: list[Cell] 66 | clusters: list[Cluster] 67 | tables: list[Table] 68 | source: Source 69 | 70 | 71 | class Annotation(BaseModel): 72 | """Annotations.""" 73 | 74 | annotations: list[AnnotPredItem] 75 | predictions: list[AnnotPredItem] 76 | reports: list[AnnotationReport] 77 | -------------------------------------------------------------------------------- /docling_core/types/legacy_doc/doc_ocr.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 
2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Models for CCS objects with OCR.""" 7 | from typing import Any, Dict, List, Literal 8 | 9 | from pydantic import BaseModel, Field 10 | 11 | from docling_core.types.legacy_doc.base import BoundingBox 12 | from docling_core.utils.alias import AliasModel 13 | 14 | CoordsOrder = Literal["x1", "y1", "x2", "y2"] 15 | 16 | CoordsOrigin = Literal["top-left"] # TODO 17 | 18 | Info = Dict[str, Any] # TODO 19 | 20 | 21 | class Page(BaseModel): 22 | """Page.""" 23 | 24 | width: float 25 | height: float 26 | 27 | 28 | class Meta(AliasModel): 29 | """Meta.""" 30 | 31 | page: Page 32 | coords_order: List[CoordsOrder] = Field(..., alias="coords-order") 33 | coords_origin: CoordsOrigin = Field(..., alias="coords-origin") 34 | 35 | 36 | class Dimension(BaseModel): 37 | """Dimension.""" 38 | 39 | width: float 40 | height: float 41 | 42 | 43 | class Word(BaseModel): 44 | """Word.""" 45 | 46 | confidence: float 47 | bbox: BoundingBox 48 | content: str 49 | 50 | 51 | class Cell(BaseModel): 52 | """Cell.""" 53 | 54 | confidence: float 55 | bbox: BoundingBox 56 | content: str 57 | 58 | 59 | class Box(BaseModel): 60 | """Box.""" 61 | 62 | confidence: float 63 | bbox: BoundingBox 64 | content: str 65 | 66 | 67 | class Path(BaseModel): 68 | """Path.""" 69 | 70 | x: List[float] 71 | y: List[float] 72 | 73 | 74 | class OcrOutput(AliasModel): 75 | """OCR output.""" 76 | 77 | meta: Meta = Field(..., alias="_meta") 78 | info: Info 79 | dimension: Dimension 80 | words: List[Word] 81 | cells: List[Cell] 82 | boxes: List[Box] 83 | paths: List[Path] 84 | -------------------------------------------------------------------------------- /docling_core/types/nlp/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Package for models defining NLP artifacts.""" 7 | -------------------------------------------------------------------------------- /docling_core/types/nlp/qa.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Define the model for Q&A pairs.""" 7 | from typing import Generic, Optional 8 | 9 | from pydantic import BaseModel, Field, StrictBool, StrictStr 10 | 11 | from docling_core.search.mapping import es_field 12 | from docling_core.types.base import DescriptionAdvancedT, StrictDateTime, UniqueList 13 | from docling_core.types.nlp.qa_labels import QALabelling 14 | 15 | 16 | class QAPair(BaseModel, Generic[DescriptionAdvancedT]): 17 | """A representation of a question-answering (QA) pair.""" 18 | 19 | context: StrictStr = Field( 20 | description=( 21 | "A single string containing the context of the question enabling the" 22 | " presentation of the answer." 23 | ) 24 | ) 25 | question: StrictStr = Field(description="A question on the given context.") 26 | answer: StrictStr = Field( 27 | description="The answer to the question from the context." 28 | ) 29 | short_answer: Optional[StrictStr] = Field( 30 | default=None, description="Alternative and concise answer." 31 | ) 32 | retrieved_context: Optional[StrictBool] = Field( 33 | default=False, 34 | description="Whether the context was retrieved from the question.", 35 | ) 36 | generated_question: Optional[StrictBool] = Field( 37 | default=False, description="Whether the question was generated by an AI model." 
38 | ) 39 | generated_answer: Optional[StrictBool] = Field( 40 | default=False, description="Whether the answer was generated by an AI model." 41 | ) 42 | created: StrictDateTime = Field( 43 | description="Datetime when the QA pair was created." 44 | ) 45 | user: Optional[StrictStr] = Field( 46 | default=None, 47 | description=( 48 | "Unique identifier of the user that created or curated this QA pair." 49 | ), 50 | json_schema_extra=es_field(type="keyword", ignore_above=8191), 51 | ) 52 | model: Optional[StrictStr] = Field( 53 | default=None, 54 | description="Unique identifier of the model used to generate this QA pair.", 55 | json_schema_extra=es_field(type="keyword", ignore_above=8191), 56 | ) 57 | paths: UniqueList[StrictStr] = Field( 58 | description=( 59 | "One or more references to a document that identify the provenance of the" 60 | " QA pair context." 61 | ), 62 | examples=[ 63 | "badce7c84d0ba7ba0fb5e94492b0d91e2506a7cb48e4524ad572c546a35f768e#/" 64 | "main-text/4" 65 | ], 66 | json_schema_extra=es_field(type="keyword", ignore_above=8191), 67 | ) 68 | advanced: Optional[DescriptionAdvancedT] = Field( 69 | default=None, 70 | description="Document metadata to provide more details on the context.", 71 | ) 72 | labels: Optional[QALabelling] = Field( 73 | default=None, description="QA pair labelling axes." 74 | ) 75 | -------------------------------------------------------------------------------- /docling_core/types/rec/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Package for models defined by the Record type.""" 7 | -------------------------------------------------------------------------------- /docling_core/types/rec/attribute.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 
2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Define the model Attribute.""" 7 | from typing import Generic, Optional 8 | 9 | from pydantic import Field 10 | from typing_extensions import Annotated 11 | 12 | from docling_core.search.mapping import es_field 13 | from docling_core.types.base import ( 14 | IdentifierTypeT, 15 | PredicateKeyNameT, 16 | PredicateKeyTypeT, 17 | PredicateValueTypeT, 18 | ProvenanceTypeT, 19 | ) 20 | from docling_core.types.rec.base import ProvenanceItem 21 | from docling_core.types.rec.predicate import Predicate 22 | from docling_core.utils.alias import AliasModel 23 | 24 | 25 | class Attribute( 26 | AliasModel, 27 | Generic[ 28 | IdentifierTypeT, 29 | PredicateValueTypeT, 30 | PredicateKeyNameT, 31 | PredicateKeyTypeT, 32 | ProvenanceTypeT, 33 | ], 34 | extra="forbid", 35 | ): 36 | """Attribute model that describes a list of characteristics.""" 37 | 38 | conf: Annotated[float, Field(strict=True, ge=0.0, le=1.0, allow_inf_nan=False)] = ( 39 | Field( 40 | ..., 41 | title="Confidence", 42 | description="The confidence level of this attribute's characteristics.", 43 | json_schema_extra=es_field(type="float"), 44 | ) 45 | ) 46 | 47 | prov: Optional[list[ProvenanceItem[IdentifierTypeT, ProvenanceTypeT]]] = Field( 48 | default=None, 49 | title="Provenance", 50 | description="The sources of this attribute's characteristics.", 51 | ) 52 | 53 | predicates: list[ 54 | Predicate[PredicateValueTypeT, PredicateKeyNameT, PredicateKeyTypeT] 55 | ] = Field(..., description="A list of characteristics (type, value, and name).") 56 | -------------------------------------------------------------------------------- /docling_core/types/rec/base.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Define the base models for the Record type.""" 7 | from typing import Generic, List, Optional 8 | 9 | from pydantic import Field, StrictInt, StrictStr 10 | from typing_extensions import Annotated 11 | 12 | from docling_core.search.mapping import es_field 13 | from docling_core.types.base import Identifier, IdentifierTypeT, ProvenanceTypeT 14 | from docling_core.utils.alias import AliasModel 15 | 16 | 17 | class ProvenanceItem( 18 | AliasModel, Generic[IdentifierTypeT, ProvenanceTypeT], extra="forbid" 19 | ): 20 | """A representation of an object's provenance.""" 21 | 22 | type_: Optional[ProvenanceTypeT] = Field( 23 | default=None, 24 | alias="type", 25 | title="The provenance type", 26 | description=( 27 | "Any string representing the type of provenance, e.g. `sentence`, " 28 | "`table`, or `doi`." 29 | ), 30 | json_schema_extra=es_field(type="keyword", ignore_above=8191), 31 | ) 32 | 33 | text: Optional[StrictStr] = Field( 34 | default=None, 35 | title="Evidence of the provenance", 36 | description=( 37 | "A text representing the evidence of the provenance, e.g. the sentence " 38 | "text or the content of a table cell" 39 | ), 40 | json_schema_extra=es_field(type="keyword", ignore_above=8191), 41 | ) 42 | 43 | reference: Optional[Identifier[IdentifierTypeT]] = Field( 44 | default=None, 45 | title="Reference to the provenance object", 46 | description=( 47 | "Reference to another object, e.g. 
record, statement, URL, or any other " 48 | "object that identifies the provenance" 49 | ), 50 | ) 51 | 52 | path: Optional[StrictStr] = Field( 53 | default=None, 54 | title="The location of the provenance within the referenced object", 55 | description=( 56 | "A path that locates the evidence within the provenance object identified " 57 | "by the `reference` field using a JSON pointer notation, e.g., " 58 | "`#/main-text/5` to locate the `main-text` paragraph at index 5" 59 | ), 60 | json_schema_extra=es_field(type="keyword", ignore_above=8191), 61 | ) 62 | 63 | span: Optional[Annotated[List[StrictInt], Field(min_length=2, max_length=2)]] = ( 64 | Field( 65 | default=None, 66 | title="The location of the item in the text/table", 67 | description=( 68 | "location of the item in the text/table referenced by the `path`," 69 | " e.g., `[34, 67]`" 70 | ), 71 | ) 72 | ) 73 | 74 | 75 | class Provenance(AliasModel, Generic[IdentifierTypeT, ProvenanceTypeT]): 76 | """A representation of evidence, as a list of provenance objects.""" 77 | 78 | conf: Annotated[float, Field(strict=True, ge=0.0, le=1.0)] = Field( 79 | ..., 80 | title="The confidence of the evidence", 81 | description=( 82 | "This value represents a score assigned to the data item. Items" 83 | " originating from databases will typically have a score of 1.0, while" 84 | " items resulting from an NLP model may have a value between 0.0 and 1.0." 85 | ), 86 | json_schema_extra=es_field(type="float"), 87 | ) 88 | prov: list[ProvenanceItem[IdentifierTypeT, ProvenanceTypeT]] = Field( 89 | title="Provenance", description="A list of provenance items." 90 | ) 91 | -------------------------------------------------------------------------------- /docling_core/types/rec/record.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Define the model Record.""" 7 | from typing import Generic, Optional 8 | 9 | from pydantic import BaseModel, Field, StrictStr 10 | 11 | from docling_core.search.mapping import es_field 12 | from docling_core.types.base import ( 13 | Acquisition, 14 | CollectionNameTypeT, 15 | CollectionRecordInfo, 16 | FileInfoObject, 17 | Identifier, 18 | IdentifierTypeT, 19 | Log, 20 | PredicateKeyNameT, 21 | PredicateKeyTypeT, 22 | PredicateValueTypeT, 23 | StrictDateTime, 24 | SubjectNameTypeT, 25 | SubjectTypeT, 26 | ) 27 | from docling_core.types.rec.attribute import Attribute 28 | from docling_core.types.rec.base import Provenance, ProvenanceTypeT 29 | from docling_core.types.rec.subject import Subject 30 | 31 | 32 | class RecordDescription(BaseModel, Generic[CollectionNameTypeT]): 33 | """Additional record metadata, including optional collection-specific fields.""" 34 | 35 | logs: list[Log] = Field( 36 | description="Logs that describe the ETL tasks applied to this record." 37 | ) 38 | publication_date: Optional[StrictDateTime] = Field( 39 | default=None, 40 | title="Publication date", 41 | description=( 42 | "The date that best represents the last publication time of a record." 43 | ), 44 | ) 45 | collection: Optional[CollectionRecordInfo[CollectionNameTypeT]] = Field( 46 | default=None, description="The collection information of this record." 47 | ) 48 | acquisition: Optional[Acquisition] = Field( 49 | default=None, 50 | description=( 51 | "Information on how the document was obtained, for data governance" 52 | " purposes."
53 | ), 54 | ) 55 | 56 | 57 | class Record( 58 | Provenance, 59 | Generic[ 60 | IdentifierTypeT, 61 | PredicateValueTypeT, 62 | PredicateKeyNameT, 63 | PredicateKeyTypeT, 64 | ProvenanceTypeT, 65 | SubjectTypeT, 66 | SubjectNameTypeT, 67 | CollectionNameTypeT, 68 | ], 69 | ): 70 | """A representation of a structured record in a database.""" 71 | 72 | file_info: FileInfoObject = Field(alias="file-info") 73 | description: RecordDescription 74 | subject: Subject[IdentifierTypeT, SubjectTypeT, SubjectNameTypeT] 75 | attributes: Optional[ 76 | list[ 77 | Attribute[ 78 | IdentifierTypeT, 79 | PredicateValueTypeT, 80 | PredicateKeyNameT, 81 | PredicateKeyTypeT, 82 | ProvenanceTypeT, 83 | ] 84 | ] 85 | ] = None 86 | name: Optional[StrictStr] = Field( 87 | default=None, 88 | description="A short description or summary of the record.", 89 | alias="_name", 90 | json_schema_extra=es_field(type="text"), 91 | ) 92 | identifiers: Optional[list[Identifier[IdentifierTypeT]]] = Field( 93 | default=None, 94 | description="A list of unique identifiers of this record in a database.", 95 | ) 96 | -------------------------------------------------------------------------------- /docling_core/types/rec/statement.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Define the model Statement.""" 7 | from enum import Enum 8 | from typing import Generic 9 | 10 | from pydantic import Field 11 | 12 | from docling_core.types.base import ( 13 | IdentifierTypeT, 14 | PredicateKeyNameT, 15 | PredicateKeyTypeT, 16 | PredicateValueTypeT, 17 | ProvenanceTypeT, 18 | SubjectNameTypeT, 19 | SubjectTypeT, 20 | ) 21 | from docling_core.types.rec.attribute import Attribute 22 | from docling_core.types.rec.subject import Subject 23 | 24 | 25 | class StatementToken(Enum): 26 | """Class to represent an LLM friendly representation of statements.""" 27 | 28 | BEG_STATEMENTS = "<statements>" 29 | END_STATEMENTS = "</statements>" 30 | 31 | BEG_STATEMENT = "<statement>" 32 | END_STATEMENT = "</statement>" 33 | 34 | BEG_PROV = "<prov>" 35 | END_PROV = "</prov>" 36 | 37 | BEG_SUBJECT = "<subject>" 38 | END_SUBJECT = "</subject>" 39 | 40 | BEG_PREDICATE = "<predicate>" 41 | END_PREDICATE = "</predicate>" 42 | 43 | BEG_PROPERTY = "<property>" 44 | END_PROPERTY = "</property>" 45 | 46 | BEG_VALUE = "<value>" 47 | END_VALUE = "</value>" 48 | 49 | BEG_UNIT = "<unit>" 50 | END_UNIT = "</unit>" 51 | 52 | @classmethod 53 | def get_special_tokens(cls): 54 | """Function to get all special statements tokens.""" 55 | return [token.value for token in cls] 56 | 57 | 58 | class Statement( 59 | Attribute, 60 | Generic[ 61 | IdentifierTypeT, 62 | PredicateValueTypeT, 63 | PredicateKeyNameT, 64 | PredicateKeyTypeT, 65 | ProvenanceTypeT, 66 | SubjectTypeT, 67 | SubjectNameTypeT, 68 | ], 69 | extra="allow", 70 | ): 71 | """A representation of a statement on a subject.""" 72 | 73 | subject: Subject[IdentifierTypeT, SubjectTypeT, SubjectNameTypeT] = Field( 74 | description="The subject (entity) of this statement." 75 | ) 76 | -------------------------------------------------------------------------------- /docling_core/types/rec/subject.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 
2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Define the model Subject.""" 7 | from typing import Generic, Optional 8 | 9 | from pydantic import Field, StrictStr 10 | 11 | from docling_core.search.mapping import es_field 12 | from docling_core.types.base import ( 13 | Identifier, 14 | IdentifierTypeT, 15 | SubjectNameTypeT, 16 | SubjectTypeT, 17 | ) 18 | from docling_core.types.legacy_doc.base import S3Reference 19 | from docling_core.utils.alias import AliasModel 20 | 21 | 22 | class SubjectNameIdentifier(Identifier[SubjectNameTypeT], Generic[SubjectNameTypeT]): 23 | """Identifier of subject names.""" 24 | 25 | 26 | class Subject( 27 | AliasModel, 28 | Generic[IdentifierTypeT, SubjectTypeT, SubjectNameTypeT], 29 | extra="forbid", 30 | ): 31 | """A representation of a subject.""" 32 | 33 | display_name: StrictStr = Field( 34 | title="Display Name", 35 | description=( 36 | "Name of the subject in natural language. It can be used for end-user " 37 | "applications to display a human-readable name. For instance, `B(2) Mg(1)` " 38 | "for `MgB2` or `International Business Machines` for `IBM`" 39 | ), 40 | json_schema_extra=es_field(type="keyword", ignore_above=8191), 41 | ) 42 | display_image: Optional[S3Reference] = Field( 43 | default=None, 44 | title="Display Image", 45 | description=( 46 | "Image representing the subject. It can be used for end-user applications. " 47 | "For example, the chemical structure drawing of a compound " 48 | "or the eight bar IBM logo for IBM." 49 | ), 50 | json_schema_extra=es_field(suppress=True), 51 | ) 52 | type_: SubjectTypeT = Field( 53 | alias="type", 54 | description=( 55 | "Main subject type. For instance, `material`, `material-class`, " 56 | "`material-device`, `company`, or `person`." 57 | ), 58 | json_schema_extra=es_field(type="keyword", ignore_above=8191), 59 | ) 60 | names: list[SubjectNameIdentifier[SubjectNameTypeT]] = Field( 61 | description=( 62 | "List of given names for this subject. They may not be unique across " 63 | "different subjects." 64 | ) 65 | ) 66 | identifiers: Optional[list[Identifier[IdentifierTypeT]]] = Field( 67 | default=None, 68 | description=( 69 | "List of unique identifiers in database. For instance, the `PubChem ID` " 70 | "of a record in the PubChem database." 71 | ), 72 | ) 73 | labels: Optional[list[StrictStr]] = Field( 74 | default=None, 75 | description="List of labels or categories for this subject.", 76 | json_schema_extra=es_field(type="keyword", ignore_above=8191), 77 | ) 78 | -------------------------------------------------------------------------------- /docling_core/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Package for modules to support data models.""" 7 | -------------------------------------------------------------------------------- /docling_core/utils/alias.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 
2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Define utility models and types related to field aliases.""" 7 | from pydantic import BaseModel, ConfigDict 8 | 9 | 10 | class AliasModel(BaseModel): 11 | """Model for alias fields to ensure instantiation and serialization by alias.""" 12 | 13 | model_config = ConfigDict(populate_by_name=True) 14 | 15 | def model_dump(self, **kwargs) -> dict: 16 | """Generate a dictionary representation of the model using field aliases.""" 17 | if "by_alias" not in kwargs: 18 | kwargs = {**kwargs, "by_alias": True} 19 | 20 | return super().model_dump(**kwargs) 21 | 22 | def model_dump_json(self, **kwargs) -> str: 23 | """Generate a JSON representation of the model using field aliases.""" 24 | if "by_alias" not in kwargs: 25 | kwargs = {**kwargs, "by_alias": True} 26 | 27 | return super().model_dump_json(**kwargs) 28 | -------------------------------------------------------------------------------- /docling_core/utils/generate_docs.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Generate documentation of Docling types as JSON schema. 7 | 8 | Example: 9 | python docling_core/utils/generate_docs.py /tmp/docling_core_files 10 | """ 11 | import argparse 12 | import json 13 | import os 14 | from argparse import BooleanOptionalAction 15 | from pathlib import Path 16 | from shutil import rmtree 17 | from typing import Final 18 | 19 | from docling_core.utils.generate_jsonschema import generate_json_schema 20 | 21 | MODELS: Final = ["DoclingDocument", "Record", "Generic"] 22 | 23 | 24 | def _prepare_directory(folder: str, clean: bool = False) -> None: 25 | """Create a directory or empty its content if it already exists. 26 | 27 | Args: 28 | folder: The name of the directory. 29 | clean: Whether any existing content in the directory should be removed. 30 | """ 31 | if os.path.isdir(folder): 32 | if clean: 33 | for path in Path(folder).glob("**/*"): 34 | if path.is_file(): 35 | path.unlink() 36 | elif path.is_dir(): 37 | rmtree(path) 38 | else: 39 | os.makedirs(folder, exist_ok=True) 40 | 41 | 42 | def generate_collection_jsonschema(folder: str): 43 | """Generate the JSON schema of Docling collections and export them to a folder. 44 | 45 | Args: 46 | folder: The name of the directory. 47 | """ 48 | for item in MODELS: 49 | json_schema = generate_json_schema(item) 50 | with open( 51 | os.path.join(folder, f"{item}.json"), mode="w", encoding="utf8" 52 | ) as json_file: 53 | json.dump(json_schema, json_file, ensure_ascii=False, indent=2) 54 | 55 | 56 | def main() -> None: 57 | """Generate the JSON Schema of Docling collections and export documentation.""" 58 | argparser = argparse.ArgumentParser() 59 | argparser.add_argument( 60 | "directory", 61 | help=( 62 | "Directory to generate files. If it exists, any existing content will be" 63 | " removed when --clean is set."
64 | ), 65 | ) 66 | argparser.add_argument( 67 | "--clean", 68 | help="Whether any existing content in directory should be removed.", 69 | action=BooleanOptionalAction, 70 | dest="clean", 71 | default=False, 72 | required=False, 73 | ) 74 | args = argparser.parse_args() 75 | 76 | _prepare_directory(args.directory, args.clean) 77 | 78 | generate_collection_jsonschema(args.directory) 79 | 80 | 81 | if __name__ == "__main__": 82 | main() 83 | -------------------------------------------------------------------------------- /docling_core/utils/generate_jsonschema.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Generate the JSON Schema of pydantic models and export them to files. 7 | 8 | Example: 9 | python docling_core/utils/generate_jsonschema.py doc.document.TableCell 10 | 11 | """ 12 | import argparse 13 | import json 14 | from typing import Any, Union 15 | 16 | from pydantic import BaseModel 17 | 18 | 19 | def _import_class(class_reference: str) -> Any: 20 | components = class_reference.split(".") 21 | module_ref = ".".join(components[:-1]) 22 | class_name = components[-1] 23 | mod = __import__(module_ref, fromlist=[class_name]) 24 | class_type = getattr(mod, class_name) 25 | 26 | return class_type 27 | 28 | 29 | def generate_json_schema(class_reference: str) -> Union[dict, None]: 30 | """Generate a jsonable dict of a model's schema from a data type. 31 | 32 | Args: 33 | class_reference: The reference to a class in 'docling_core.types'. 34 | 35 | Returns: 36 | A jsonable dict of the model's schema. 37 | """ 38 | if not class_reference.startswith("docling_core.types."): 39 | class_reference = "docling_core.types." + class_reference 40 | class_type = _import_class(class_reference) 41 | if issubclass(class_type, BaseModel): 42 | return class_type.model_json_schema() 43 | else: 44 | return None 45 | 46 | 47 | def main() -> None: 48 | """Print the JSON Schema of a model.""" 49 | argparser = argparse.ArgumentParser() 50 | argparser.add_argument( 51 | "class_ref", help="Class reference, e.g., doc.document.TableCell" 52 | ) 53 | args = argparser.parse_args() 54 | 55 | json_schema = generate_json_schema(args.class_ref) 56 | print( 57 | json.dumps(json_schema, ensure_ascii=False, indent=2).encode("utf-8").decode() 58 | ) 59 | 60 | 61 | if __name__ == "__main__": 62 | main() 63 | -------------------------------------------------------------------------------- /docling_core/utils/validate.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 
2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Validation of Document-related files against their data schemas.""" 7 | import argparse 8 | import json 9 | import logging 10 | 11 | from docling_core.utils.validators import ( 12 | validate_ann_schema, 13 | validate_ocr_schema, 14 | validate_raw_schema, 15 | ) 16 | 17 | logger = logging.getLogger("docling-core") 18 | 19 | 20 | def parse_arguments(): 21 | """Parse the arguments from the command line.""" 22 | argparser = argparse.ArgumentParser(description="validate example-file with schema") 23 | 24 | argparser.add_argument( 25 | "-f", "--format", required=True, help="format of the file [RAW, ANN, OCR]" 26 | ) 27 | 28 | argparser.add_argument( 29 | "-i", "--input-file", required=True, help="JSON filename to be validated" 30 | ) 31 | 32 | pargs = argparser.parse_args() 33 | 34 | return pargs.format, pargs.input_file 35 | 36 | 37 | def run(): 38 | """Run the validation of a file containing a Document.""" 39 | file_format, input_file = parse_arguments() 40 | 41 | with open(input_file, "r", encoding="utf-8") as fd: 42 | file_ = json.load(fd) 43 | 44 | result = (False, "Empty result") 45 | 46 | if file_format == "RAW": 47 | result = validate_raw_schema(file_) 48 | 49 | elif file_format == "ANN": 50 | result = validate_ann_schema(file_) 51 | 52 | elif file_format == "OCR": 53 | result = validate_ocr_schema(file_) 54 | 55 | else: 56 | logger.error("format of the file needs to be `RAW`, `ANN` or `OCR`") 57 | 58 | if result[0]: 59 | logger.info("Done!") 60 | else: 61 | logger.error("invalid schema: {}".format(result[1])) 62 | 63 | 64 | def main(): 65 | """Set up the environment and run the validation of a Document.""" 66 | logger.setLevel(logging.DEBUG) 67 | 68 | # create console handler and set level to debug 69 | ch = logging.StreamHandler() 70 | ch.setLevel(logging.DEBUG) 71 | 72 | # create formatter 73 | formatter = logging.Formatter("[%(asctime)s] [%(levelname)s] %(message)s") 74 | 75 | # add formatter to ch 76 | ch.setFormatter(formatter) 77 | 78 | # add ch to logger 79 | # logger.addHandler(ch) 80 | 81 | logging.basicConfig(handlers=[ch]) 82 | run() 83 | 84 | 85 | if __name__ == "__main__": 86 | main() 87 | -------------------------------------------------------------------------------- /docling_core/utils/validators.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Module for custom type validators.""" 7 | import json 8 | import logging 9 | from datetime import datetime 10 | from importlib import resources 11 | from typing import Hashable, TypeVar 12 | 13 | import jsonschema 14 | from pydantic_core import PydanticCustomError 15 | 16 | logger = logging.getLogger("docling-core") 17 | 18 | T = TypeVar("T", bound=Hashable) 19 | 20 | 21 | def validate_schema(file_: dict, schema: dict) -> tuple[bool, str]: 22 | """Check whether the file is properly formatted JSON and contains valid keys. 23 | 24 | Where possible, this also checks a few basic dependencies between properties, but 25 | this functionality is limited. 26 | """ 27 | try: 28 | jsonschema.validate(file_, schema) 29 | return (True, "All good!") 30 | 31 | except jsonschema.ValidationError as err: 32 | return (False, err.message) 33 | 34 | 35 | def validate_raw_schema(file_: dict) -> tuple[bool, str]: 36 | """Validate a RAW file.""" 37 | logger.debug("validate RAW schema ... 
") 38 | 39 | schema_txt = ( 40 | resources.files("docling_core") 41 | .joinpath("resources/schemas/legacy_doc/RAW.json") 42 | .read_text("utf-8") 43 | ) 44 | schema = json.loads(schema_txt) 45 | 46 | return validate_schema(file_, schema) 47 | 48 | 49 | def validate_ann_schema(file_: dict) -> tuple[bool, str]: 50 | """Validate an annotated (ANN) file.""" 51 | logger.debug("validate ANN schema ... ") 52 | 53 | schema_txt = ( 54 | resources.files("docling_core") 55 | .joinpath("resources/schemas/legacy_doc/ANN.json") 56 | .read_text("utf-8") 57 | ) 58 | schema = json.loads(schema_txt) 59 | 60 | return validate_schema(file_, schema) 61 | 62 | 63 | def validate_ocr_schema(file_: dict) -> tuple[bool, str]: 64 | """Validate an OCR file.""" 65 | logger.debug("validate OCR schema ... ") 66 | 67 | schema_txt = ( 68 | resources.files("docling_core") 69 | .joinpath("resources/schemas/legacy_doc/OCR-output.json") 70 | .read_text("utf-8") 71 | ) 72 | schema = json.loads(schema_txt) 73 | 74 | return validate_schema(file_, schema) 75 | 76 | 77 | def validate_unique_list(v: list[T]) -> list[T]: 78 | """Validate that a list has unique values. 79 | 80 | Validator for list types, since pydantic V2 does not support the `unique_items` 81 | parameter from V1. More information on 82 | https://github.com/pydantic/pydantic-core/pull/820#issuecomment-1670475909 83 | 84 | Args: 85 | v: any list of hashable types 86 | 87 | Returns: 88 | The list, after checking for unique items. 89 | """ 90 | if len(v) != len(set(v)): 91 | raise PydanticCustomError("unique_list", "List must be unique") 92 | return v 93 | 94 | 95 | def validate_datetime(v, handler): 96 | """Validate that a value is a datetime or a non-numeric string.""" 97 | if type(v) is datetime or (type(v) is str and not v.isnumeric()): 98 | return handler(v) 99 | else: 100 | raise ValueError("Value type must be a datetime or a non-numeric string") 101 | -------------------------------------------------------------------------------- /docs/Generic.json: -------------------------------------------------------------------------------- 1 | { 2 | "$defs": { 3 | "FileInfoObject": { 4 | "description": "Filing information for any data object to be stored in a Docling database.", 5 | "properties": { 6 | "filename": { 7 | "description": "The name of a persistent object that created this data object", 8 | "title": "Filename", 9 | "type": "string", 10 | "x-es-ignore_above": 8191, 11 | "x-es-type": "keyword" 12 | }, 13 | "filename-prov": { 14 | "anyOf": [ 15 | { 16 | "type": "string" 17 | }, 18 | { 19 | "type": "null" 20 | } 21 | ], 22 | "default": null, 23 | "description": "The provenance of this data object, e.g. 
an archive file, a URL, or any other repository.", 24 | "title": "Filename-Prov", 25 | "x-es-ignore_above": 8191, 26 | "x-es-type": "keyword" 27 | }, 28 | "document-hash": { 29 | "description": "A unique identifier of this data object within a collection of a Docling database", 30 | "title": "Document-Hash", 31 | "type": "string", 32 | "x-es-ignore_above": 8191, 33 | "x-es-type": "keyword" 34 | } 35 | }, 36 | "required": [ 37 | "filename", 38 | "document-hash" 39 | ], 40 | "title": "FileInfoObject", 41 | "type": "object" 42 | } 43 | }, 44 | "description": "A representation of a generic document.", 45 | "properties": { 46 | "_name": { 47 | "anyOf": [ 48 | { 49 | "type": "string" 50 | }, 51 | { 52 | "type": "null" 53 | } 54 | ], 55 | "default": null, 56 | "description": "A short description or summary of the document.", 57 | "title": "Name", 58 | "x-es-type": "text" 59 | }, 60 | "file-info": { 61 | "$ref": "#/$defs/FileInfoObject", 62 | "description": "Minimal identification information of the document within a collection.", 63 | "title": "Document information" 64 | } 65 | }, 66 | "required": [ 67 | "file-info" 68 | ], 69 | "title": "Generic", 70 | "type": "object" 71 | } -------------------------------------------------------------------------------- /examples/chunking_and_serialization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "
\n", 8 | " 👉 INFO: This notebook has moved to the Docling docs, check it out \n", 9 | " \n", 10 | " here.\n", 11 | "
" 12 | ] 13 | } 14 | ], 15 | "metadata": { 16 | "kernelspec": { 17 | "display_name": ".venv", 18 | "language": "python", 19 | "name": "python3" 20 | }, 21 | "language_info": { 22 | "codemirror_mode": { 23 | "name": "ipython", 24 | "version": 3 25 | }, 26 | "file_extension": ".py", 27 | "mimetype": "text/x-python", 28 | "name": "python", 29 | "nbconvert_exporter": "python", 30 | "pygments_lexer": "ipython3", 31 | "version": "3.12.4" 32 | } 33 | }, 34 | "nbformat": 4, 35 | "nbformat_minor": 2 36 | } 37 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | -------------------------------------------------------------------------------- /test/data/doc/01030000000083.dt: -------------------------------------------------------------------------------- 1 | CategoryNumber of clauses in Union lawsIn percentNumber of clauses in State lawsIn percentCommercial52910.1%8173.9%Environment, Health and Safety83415.9%3451.7%Finance & Taxation410.8%8884.2%General751.4%3601.7%Industry Specific297956.9%12005.7%Labour53410.2%1728582.7%Secretarial2474.7%00.0%TABLE 35: UNION-STATE BREAKDOWN OF IMPRISONMENT CLAUSES BY CATEGORIES 2 | CompliancesSmallMediumLargeTotal Applicable Compliances6693,1095,796Compliances with imprisonment4612,1724,085Percentage of imprisonment clauses69%70%70%TABLE 36: THREE CASE STUDIES ON MANUFACTURING COMPLIANCES* 3 | * These are real data from three companies operating in the automotive components business 4 | SmallMediumLargeLess than 3 months25821853 months to less than 1 year1876991,2201 year to less than 3 years1781,0701,9643 years to less than 5 years592455055 years to 10 years1276211TABLE 37: BREAKDOWN OF IMPRISONMENT CLAUSES IN MANUFACTURING CASE STUDIES* 5 | * In Table 36 6 | 85 7 | Appendices 8 | -------------------------------------------------------------------------------- /test/data/doc/01030000000083.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/docling-project/docling-core/878aafd1b78f7f3d730ccb4bd519650230651498/test/data/doc/01030000000083.png -------------------------------------------------------------------------------- /test/data/doc/01030000000111.dt: -------------------------------------------------------------------------------- 1 | Figure 8.1: a) P6238 CUSSONS free and forced vortex apparatus, b) push-in orifices, c) free vortex measuring caliper, d) force vortex measuring probes 2 | 7. THEORY 3 | Two types of vortices are distinguished in the dynamics of the motion: forced and free vortices. The forced vortex is caused by external forces on the fluid, such as the impeller of a pump, and the free vortex naturally occurs in the flow and can be observed in a drain or in the atmosphere of a tornado. 4 | 7.1. FREE VORTEX 5 | A free vortex is formed when water flows out of a vessel through a central hole in the base (Figure 8.2). The degree of the rotation depends on the initial disturbance. In a free cylindrical vortex, the velocity varies inversely with the distance from the axis of rotation (Figure 8.3). 
6 | \upsilon = \frac { k } { r } \quad \quad ( 1 ) 7 | The equation governing the surface profile is derived from the Bernoulli's theorem: 8 | \frac { \upsilon ^ { 2 } } { 2 g } + \, z = C \quad \quad ( 2 ) 9 | Substituting Equation (1) into (2) will give a new expression: 10 | \frac { k ^ { 2 } } { 2 g r ^ { 2 } } + \, z = C \quad \quad ( 3 ) 11 | or: 12 | 68 APPLIED FLUID MECHANICS LAB MANUAL 13 | -------------------------------------------------------------------------------- /test/data/doc/01030000000111.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/docling-project/docling-core/878aafd1b78f7f3d730ccb4bd519650230651498/test/data/doc/01030000000111.png -------------------------------------------------------------------------------- /test/data/doc/2206.01062.yaml.et: -------------------------------------------------------------------------------- 1 | 0: unspecified with name=_root_ 2 | 1: section_header 3 | 2: text 4 | 3: text 5 | 4: text 6 | 5: text 7 | 6: text 8 | 7: section_header 9 | 8: text 10 | 9: section_header 11 | 10: text 12 | 11: text 13 | 12: text 14 | 13: text 15 | 14: text 16 | 15: text 17 | 16: picture 18 | 17: caption 19 | 18: section_header 20 | 19: text 21 | 20: section_header 22 | 21: text 23 | 22: section_header 24 | 23: text 25 | 24: text 26 | 25: text 27 | 26: list with name=list 28 | 27: list_item 29 | 28: list_item 30 | 29: list_item 31 | 30: list_item 32 | 31: footnote 33 | 32: text 34 | 33: list with name=list 35 | 34: list_item 36 | 35: text 37 | 36: text 38 | 37: section_header 39 | 38: text 40 | 39: text 41 | 40: section_header 42 | 41: text 43 | 42: text 44 | 43: picture 45 | 44: caption 46 | 45: text 47 | 46: text 48 | 47: text 49 | 48: text 50 | 49: footnote 51 | 50: text 52 | 51: text 53 | 52: text 54 | 53: section_header 55 | 54: text 56 | 55: table 57 | 56: caption 58 | 57: picture 59 | 58: caption 60 | 59: text 61 | 60: text 62 | 61: text 63 | 62: text 64 | 63: footnote 65 | 64: text 66 | 65: text 67 | 66: text 68 | 67: list with name=list 69 | 68: list_item 70 | 69: list_item 71 | 70: list_item 72 | 71: list_item 73 | 72: list_item 74 | 73: list_item 75 | 74: text 76 | 75: text 77 | 76: picture 78 | 77: text 79 | 78: caption 80 | 79: text 81 | 80: text 82 | 81: text 83 | 82: table 84 | 83: text 85 | 84: section_header 86 | 85: text 87 | 86: picture 88 | 87: caption 89 | 88: text 90 | 89: text 91 | 90: section_header 92 | 91: text 93 | 92: text 94 | 93: table 95 | 94: section_header 96 | 95: text 97 | 96: section_header 98 | 97: text 99 | 98: text 100 | 99: table 101 | 100: text 102 | 101: section_header 103 | 102: text 104 | 103: section_header 105 | 104: text 106 | 105: text 107 | 106: table 108 | 107: text 109 | 108: text 110 | 109: section_header 111 | 110: text 112 | 111: section_header 113 | 112: text 114 | 113: text 115 | 114: text 116 | 115: section_header 117 | 116: list with name=list 118 | 117: list_item 119 | 118: list_item 120 | 119: list_item 121 | 120: list_item 122 | 121: list_item 123 | 122: list_item 124 | 123: list_item 125 | 124: list_item 126 | 125: list_item 127 | 126: list_item 128 | 127: list_item 129 | 128: list_item 130 | 129: list_item 131 | 130: picture 132 | 131: caption 133 | 132: text 134 | 133: text 135 | 134: list with name=list 136 | 135: list_item 137 | 136: list_item 138 | 137: list_item 139 | 138: list_item 140 | 139: list_item 141 | 140: list_item 142 | 141: list_item 143 | 142: list_item 144 | 143: list_item 145 | 144: list_item 146 | 
-------------------------------------------------------------------------------- /test/data/doc/2408.09869v3_enriched_p1_include_annotations_false.gt.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |
6 |

Docling Technical Report

7 |

Version 1.0

8 |

Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar

9 |

AI4K Group, IBM Research R¨ uschlikon, Switzerland

10 |

Abstract

11 |

This technical report introduces Docling , an easy to use, self-contained, MITlicensed open-source package for PDF document conversion. It is powered by state-of-the-art specialized AI models for layout analysis (DocLayNet) and table structure recognition (TableFormer), and runs efficiently on commodity hardware in a small resource budget. The code interface allows for easy extensibility and addition of new features and models.

12 |

1 Introduction

13 |

Converting PDF documents back into a machine-processable format has been a major challenge for decades due to their huge variability in formats, weak standardization and printing-optimized characteristic, which discards most structural features and metadata. With the advent of LLMs and popular application patterns such as retrieval-augmented generation (RAG), leveraging the rich content embedded in PDFs has become ever more relevant. In the past decade, several powerful document understanding solutions have emerged on the market, most of which are commercial software, cloud offerings [3] and most recently, multi-modal vision-language models. As of today, only a handful of open-source tools cover PDF conversion, leaving a significant feature and quality gap to proprietary solutions.

14 |

With Docling , we open-source a very capable and efficient document conversion tool which builds on the powerful, specialized AI models and datasets for layout analysis and table structure recognition we developed and presented in the recent past [12, 13, 9]. Docling is designed as a simple, self-contained python library with permissive license, running entirely locally on commodity hardware. Its code architecture allows for easy extensibility and addition of new features and models.

15 |
16 | 17 | 18 | -------------------------------------------------------------------------------- /test/data/doc/2408.09869v3_enriched_p1_include_annotations_true.gt.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |
6 |

Docling Technical Report

7 |
In this image we can see a cartoon image of a duck holding a paper.
8 |

Version 1.0

9 |

Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar

10 |

AI4K Group, IBM Research R¨ uschlikon, Switzerland

11 |

Abstract

12 |

This technical report introduces Docling , an easy to use, self-contained, MITlicensed open-source package for PDF document conversion. It is powered by state-of-the-art specialized AI models for layout analysis (DocLayNet) and table structure recognition (TableFormer), and runs efficiently on commodity hardware in a small resource budget. The code interface allows for easy extensibility and addition of new features and models.

13 |

1 Introduction

14 |

Converting PDF documents back into a machine-processable format has been a major challenge for decades due to their huge variability in formats, weak standardization and printing-optimized characteristic, which discards most structural features and metadata. With the advent of LLMs and popular application patterns such as retrieval-augmented generation (RAG), leveraging the rich content embedded in PDFs has become ever more relevant. In the past decade, several powerful document understanding solutions have emerged on the market, most of which are commercial software, cloud offerings [3] and most recently, multi-modal vision-language models. As of today, only a handful of open-source tools cover PDF conversion, leaving a significant feature and quality gap to proprietary solutions.

15 |

With Docling , we open-source a very capable and efficient document conversion tool which builds on the powerful, specialized AI models and datasets for layout analysis and table structure recognition we developed and presented in the recent past [12, 13, 9]. Docling is designed as a simple, self-contained python library with permissive license, running entirely locally on commodity hardware. Its code architecture allows for easy extensibility and addition of new features and models.

16 |
17 | 18 | 19 | -------------------------------------------------------------------------------- /test/data/doc/activities.gt.md: -------------------------------------------------------------------------------- 1 | ## Summer activities 2 | 3 | ## Swimming in the lake 4 | 5 | Duck 6 | 7 | Figure 1: This is a cute duckling 8 | 9 | ## Let's swim! 10 | 11 | To get started with swimming, first lay down in a water and try not to drown: 12 | 13 | - ∞ You can relax and look around 14 | - ∞ Paddle about 15 | - ∞ Enjoy summer warmth 16 | 17 | Also, don't forget: 18 | 19 | - 1. Wear sunglasses 20 | - 2. Don't forget to drink water 21 | - 3. Use sun cream 22 | 23 | Hmm, what else… 24 | 25 | - -Another activity item 26 | 27 | - -Yet another one 28 | - -Stopping it here 29 | 30 | Some text. 31 | 32 | 33 | 34 | - -Starting the next page with a list item. 35 | - -Second item. 36 | -------------------------------------------------------------------------------- /test/data/doc/activities_p2.gt.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | activities 6 | 7 | 124 | 125 | 126 |
127 |
    128 |
  • -Yet another one
  • 129 |
  • -Stopping it here
  • 130 |
131 |

Some text.

132 |
133 | 134 | 135 | -------------------------------------------------------------------------------- /test/data/doc/activities_p2.gt.md: -------------------------------------------------------------------------------- 1 | - -Yet another one 2 | - -Stopping it here 3 | 4 | Some text. 5 | -------------------------------------------------------------------------------- /test/data/doc/activities_pb_empty.gt.md: -------------------------------------------------------------------------------- 1 | ## Summer activities 2 | 3 | ## Swimming in the lake 4 | 5 | Duck 6 | 7 | Figure 1: This is a cute duckling 8 | 9 | ## Let's swim! 10 | 11 | To get started with swimming, first lay down in a water and try not to drown: 12 | 13 | - ∞ You can relax and look around 14 | - ∞ Paddle about 15 | - ∞ Enjoy summer warmth 16 | 17 | Also, don't forget: 18 | 19 | - 1. Wear sunglasses 20 | - 2. Don't forget to drink water 21 | - 3. Use sun cream 22 | 23 | Hmm, what else… 24 | 25 | - -Another activity item 26 | 27 | - -Yet another one 28 | - -Stopping it here 29 | 30 | Some text. 31 | 32 | 33 | 34 | - -Starting the next page with a list item. 35 | - -Second item. 36 | -------------------------------------------------------------------------------- /test/data/doc/activities_pb_non_empty.gt.md: -------------------------------------------------------------------------------- 1 | ## Summer activities 2 | 3 | ## Swimming in the lake 4 | 5 | Duck 6 | 7 | Figure 1: This is a cute duckling 8 | 9 | ## Let's swim! 10 | 11 | To get started with swimming, first lay down in a water and try not to drown: 12 | 13 | - ∞ You can relax and look around 14 | - ∞ Paddle about 15 | - ∞ Enjoy summer warmth 16 | 17 | Also, don't forget: 18 | 19 | - 1. Wear sunglasses 20 | - 2. Don't forget to drink water 21 | - 3. Use sun cream 22 | 23 | Hmm, what else… 24 | 25 | - -Another activity item 26 | 27 | - -Yet another one 28 | - -Stopping it here 29 | 30 | Some text. 31 | 32 | 33 | 34 | - -Starting the next page with a list item. 35 | - -Second item. 36 | -------------------------------------------------------------------------------- /test/data/doc/activities_pb_none.gt.md: -------------------------------------------------------------------------------- 1 | ## Summer activities 2 | 3 | ## Swimming in the lake 4 | 5 | Duck 6 | 7 | Figure 1: This is a cute duckling 8 | 9 | ## Let's swim! 10 | 11 | To get started with swimming, first lay down in a water and try not to drown: 12 | 13 | - ∞ You can relax and look around 14 | - ∞ Paddle about 15 | - ∞ Enjoy summer warmth 16 | 17 | Also, don't forget: 18 | 19 | - 1. Wear sunglasses 20 | - 2. Don't forget to drink water 21 | - 3. Use sun cream 22 | 23 | Hmm, what else… 24 | 25 | - -Another activity item 26 | - -Yet another one 27 | - -Stopping it here 28 | 29 | Some text. 30 | 31 | - -Starting the next page with a list item. 32 | - -Second item. 
33 | -------------------------------------------------------------------------------- /test/data/doc/bad_doc.yaml.dt: -------------------------------------------------------------------------------- 1 | This is the title 2 | This is the first section 3 | 4 | -------------------------------------------------------------------------------- /test/data/doc/bad_doc.yaml.et: -------------------------------------------------------------------------------- 1 | 0: unspecified with name=_root_ 2 | 1: title 3 | 2: unspecified with name=chapter 1 4 | 3: section_header 5 | -------------------------------------------------------------------------------- /test/data/doc/bad_doc.yaml.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | 8 | Powered by Docling 9 | 10 | 71 | 72 |

This is the title

73 |

This is the first section

74 | 75 | -------------------------------------------------------------------------------- /test/data/doc/bad_doc.yaml.md: -------------------------------------------------------------------------------- 1 | # This is the title 2 | 3 | ### This is the first section 4 | -------------------------------------------------------------------------------- /test/data/doc/barchart.dt: -------------------------------------------------------------------------------- 1 | Probability, Combinatorics and Control 2 | Number of impellerssingle-frequencymulti-frequency10.060.1620.120.2630.160.2740.140.2650.160.2560.240.24 3 | -------------------------------------------------------------------------------- /test/data/doc/barchart.gt.md: -------------------------------------------------------------------------------- 1 | bar chart 2 | 3 | 4 | 5 | | Number of impellers | single-frequency | multi-frequency | 6 | |-----------------------|--------------------|-------------------| 7 | | 1 | 0.06 | 0.16 | 8 | | 2 | 0.12 | 0.26 | 9 | | 3 | 0.16 | 0.27 | 10 | | 4 | 0.14 | 0.26 | 11 | | 5 | 0.16 | 0.25 | 12 | | 6 | 0.24 | 0.24 | 13 | -------------------------------------------------------------------------------- /test/data/doc/barchart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/docling-project/docling-core/878aafd1b78f7f3d730ccb4bd519650230651498/test/data/doc/barchart.png -------------------------------------------------------------------------------- /test/data/doc/constructed_doc.dt: -------------------------------------------------------------------------------- 1 | item of leading list 2 | 3 | Title of the Document 4 | Author 1 5 | Affiliation 1 6 | Author 2 7 | Affiliation 2 8 | 1. Introduction 9 | This paper introduces the biggest invention ever made. ... 10 | list item 1 11 | list item 2 12 | list item 3 13 | list item 3.a 14 | list item 3.b 15 | list item 3.c 16 | list item 3.c.i 17 | 18 | 19 | list item 4 20 | 21 | ProductYears20162017Apple49823695944This is the caption of table 1. 22 | This is the caption of figure 1. 23 | This is the caption of figure 2. 24 | item 1 of list 25 | 26 | item 1 of list after empty list 27 | item 2 of list after empty list 28 | 29 | item 1 of neighboring list 30 | item 2 of neighboring list 31 | item 1 of sub list 32 | Here a code snippet: 33 | <_unknown_>print("Hello world") 34 | (to be displayed inline) 35 | 36 | Here a formula: 37 | E=mc^2 38 | (to be displayed inline) 39 | 40 | 41 | 42 | Here a code block: 43 | <_unknown_>print("Hello world") 44 | Here a formula block: 45 | E=mc^2 46 | number1 47 | Some formatting chops: 48 | bold 49 | italic 50 | underline 51 | strikethrough 52 | subscript 53 | superscript 54 | hyperlink 55 | & 56 | everything at the same time. 57 | 58 | Item 1 in A 59 | Item 2 in A 60 | Item 3 in A 61 | Item 1 in B 62 | Item 2 in B 63 | Item 1 in C 64 | Item 2 in C 65 | 66 | Item 3 in B 67 | 68 | Item 4 in A 69 | 70 | The end. 71 | -------------------------------------------------------------------------------- /test/data/doc/constructed_doc.dt.gt: -------------------------------------------------------------------------------- 1 | item of leading list 2 | 3 | Title of the Document 4 | Author 1 5 | Affiliation 1 6 | Author 2 7 | Affiliation 2 8 | 1. Introduction 9 | This paper introduces the biggest invention ever made. ... 
10 | list item 1 11 | list item 2 12 | list item 3 13 | list item 3.a 14 | list item 3.b 15 | list item 3.c 16 | list item 3.c.i 17 | 18 | 19 | list item 4 20 | 21 | ProductYears20162017Apple49823695944This is the caption of table 1. 22 | This is the caption of figure 1. 23 | This is the caption of figure 2. 24 | item 1 of list 25 | 26 | item 1 of list after empty list 27 | item 2 of list after empty list 28 | 29 | item 1 of neighboring list 30 | item 2 of neighboring list 31 | item 1 of sub list 32 | Here a code snippet: 33 | <_unknown_>print("Hello world") 34 | (to be displayed inline) 35 | 36 | Here a formula: 37 | E=mc^2 38 | (to be displayed inline) 39 | 40 | 41 | 42 | Here a code block: 43 | <_unknown_>print("Hello world") 44 | Here a formula block: 45 | E=mc^2 46 | number1 47 | Some formatting chops: 48 | bold 49 | italic 50 | underline 51 | strikethrough 52 | subscript 53 | superscript 54 | hyperlink 55 | & 56 | everything at the same time. 57 | 58 | Item 1 in A 59 | Item 2 in A 60 | Item 3 in A 61 | Item 1 in B 62 | Item 2 in B 63 | Item 1 in C 64 | Item 2 in C 65 | 66 | Item 3 in B 67 | 68 | Item 4 in A 69 | 70 | The end. 71 | -------------------------------------------------------------------------------- /test/data/doc/constructed_doc.embedded.md.gt: -------------------------------------------------------------------------------- 1 | - item of leading list 2 | 3 | # Title of the Document 4 | 5 | Author 1 6 | Affiliation 1 7 | 8 | Author 2 9 | Affiliation 2 10 | 11 | ## 1. Introduction 12 | 13 | This paper introduces the biggest invention ever made. ... 14 | 15 | - list item 1 16 | - list item 2 17 | - list item 3 18 | 1. list item 3.a 19 | 2. list item 3.b 20 | 3. list item 3.c 21 | 1. list item 3.c.i 22 | - list item 4 23 | 24 | This is the caption of table 1. 25 | 26 | | Product | Years | Years | 27 | |-----------|---------|---------| 28 | | Product | 2016 | 2017 | 29 | | Apple | 49823 | 695944 | 30 | 31 | This is the caption of figure 1. 32 | 33 | 34 | 35 | This is the caption of figure 2. 36 | 37 | ![Image](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAEAAAABACAIAAAAlC+aJAAAAIklEQVR4nO3BAQ0AAADCoPdPbQ8HFAAAAAAAAAAAAAAA8G4wQAABiwCo9wAAAABJRU5ErkJggg==) 38 | 39 | - item 1 of list 40 | 41 | - item 1 of list after empty list 42 | - item 2 of list after empty list 43 | 44 | - item 1 of neighboring list 45 | - item 2 of neighboring list 46 | - item 1 of sub list 47 | - Here a code snippet: `print("Hello world")` (to be displayed inline) 48 | - Here a formula: $E=mc^2$ (to be displayed inline) 49 | 50 | Here a code block: 51 | 52 | ``` 53 | print("Hello world") 54 | ``` 55 | 56 | Here a formula block: 57 | 58 | $$E=mc^2$$ 59 | 60 | 61 | 62 | 63 | 64 | Some formatting chops: **bold** *italic* underline ~~strikethrough~~ subscript superscript [hyperlink](.) & [~~***everything at the same time.***~~](https://github.com/DS4SD/docling) 65 | 66 | 1. Item 1 in A 67 | 2. Item 2 in A 68 | 3. Item 3 in A 69 | 1. Item 1 in B 70 | 2. Item 2 in B 71 | 1. Item 1 in C 72 | 2. Item 2 in C 73 | 3. Item 3 in B 74 | 4. Item 4 in A 75 | 76 | The end. -------------------------------------------------------------------------------- /test/data/doc/constructed_doc.placeholder.md.gt: -------------------------------------------------------------------------------- 1 | - item of leading list 2 | 3 | # Title of the Document 4 | 5 | Author 1 6 | Affiliation 1 7 | 8 | Author 2 9 | Affiliation 2 10 | 11 | ## 1. Introduction 12 | 13 | This paper introduces the biggest invention ever made. ... 
14 | 15 | - list item 1 16 | - list item 2 17 | - list item 3 18 | 1. list item 3.a 19 | 2. list item 3.b 20 | 3. list item 3.c 21 | 1. list item 3.c.i 22 | - list item 4 23 | 24 | This is the caption of table 1. 25 | 26 | | Product | Years | Years | 27 | |-----------|---------|---------| 28 | | Product | 2016 | 2017 | 29 | | Apple | 49823 | 695944 | 30 | 31 | This is the caption of figure 1. 32 | 33 | 34 | 35 | This is the caption of figure 2. 36 | 37 | 38 | 39 | - item 1 of list 40 | 41 | - item 1 of list after empty list 42 | - item 2 of list after empty list 43 | 44 | - item 1 of neighboring list 45 | - item 2 of neighboring list 46 | - item 1 of sub list 47 | - Here a code snippet: `print("Hello world")` (to be displayed inline) 48 | - Here a formula: $E=mc^2$ (to be displayed inline) 49 | 50 | Here a code block: 51 | 52 | ``` 53 | print("Hello world") 54 | ``` 55 | 56 | Here a formula block: 57 | 58 | $$E=mc^2$$ 59 | 60 | 61 | 62 | 63 | 64 | Some formatting chops: **bold** *italic* underline ~~strikethrough~~ subscript superscript [hyperlink](.) & [~~***everything at the same time.***~~](https://github.com/DS4SD/docling) 65 | 66 | 1. Item 1 in A 67 | 2. Item 2 in A 68 | 3. Item 3 in A 69 | 1. Item 1 in B 70 | 2. Item 2 in B 71 | 1. Item 1 in C 72 | 2. Item 2 in C 73 | 3. Item 3 in B 74 | 4. Item 4 in A 75 | 76 | The end. -------------------------------------------------------------------------------- /test/data/doc/constructed_doc.referenced.md.gt: -------------------------------------------------------------------------------- 1 | - item of leading list 2 | 3 | # Title of the Document 4 | 5 | Author 1 6 | Affiliation 1 7 | 8 | Author 2 9 | Affiliation 2 10 | 11 | ## 1. Introduction 12 | 13 | This paper introduces the biggest invention ever made. ... 14 | 15 | - list item 1 16 | - list item 2 17 | - list item 3 18 | 1. list item 3.a 19 | 2. list item 3.b 20 | 3. list item 3.c 21 | 1. list item 3.c.i 22 | - list item 4 23 | 24 | This is the caption of table 1. 25 | 26 | | Product | Years | Years | 27 | |-----------|---------|---------| 28 | | Product | 2016 | 2017 | 29 | | Apple | 49823 | 695944 | 30 | 31 | This is the caption of figure 1. 32 | 33 | 34 | 35 | This is the caption of figure 2. 36 | 37 | ![Image](constructed_images/image_000001_f3cc103136423a57975750907ebc1d367e2985ac6338976d4d5a439f50323f4a.png) 38 | 39 | - item 1 of list 40 | 41 | - item 1 of list after empty list 42 | - item 2 of list after empty list 43 | 44 | - item 1 of neighboring list 45 | - item 2 of neighboring list 46 | - item 1 of sub list 47 | - Here a code snippet: `print("Hello world")` (to be displayed inline) 48 | - Here a formula: $E=mc^2$ (to be displayed inline) 49 | 50 | Here a code block: 51 | 52 | ``` 53 | print("Hello world") 54 | ``` 55 | 56 | Here a formula block: 57 | 58 | $$E=mc^2$$ 59 | 60 | 61 | 62 | 63 | 64 | Some formatting chops: **bold** *italic* underline ~~strikethrough~~ subscript superscript [hyperlink](.) & [~~***everything at the same time.***~~](https://github.com/DS4SD/docling) 65 | 66 | 1. Item 1 in A 67 | 2. Item 2 in A 68 | 3. Item 3 in A 69 | 1. Item 1 in B 70 | 2. Item 2 in B 71 | 1. Item 1 in C 72 | 2. Item 2 in C 73 | 3. Item 3 in B 74 | 4. Item 4 in A 75 | 76 | The end. 
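The three ground truths above differ only in how picture items are serialized: `embedded` inlines each image as a base64 data URI, `placeholder` stands in for the image with a comment, and `referenced` links a PNG written under `constructed_images/`. A minimal sketch of how such fixtures might be regenerated with docling-core — assuming `DoclingDocument.load_from_json` and `save_as_markdown(image_mode=...)` behave as in recent releases; the input path is illustrative:

```python
# Sketch: regenerate the three image-mode markdown ground truths.
# Assumes docling-core's DoclingDocument.load_from_json and
# save_as_markdown(image_mode=...) APIs; the input path is illustrative.
from pathlib import Path

from docling_core.types.doc import DoclingDocument, ImageRefMode

doc = DoclingDocument.load_from_json("test/data/doc/activities.json")

for mode in (ImageRefMode.EMBEDDED, ImageRefMode.PLACEHOLDER, ImageRefMode.REFERENCED):
    # EMBEDDED -> base64 data URIs inline, PLACEHOLDER -> an image comment,
    # REFERENCED -> PNGs saved alongside the file and linked by relative path.
    doc.save_as_markdown(Path(f"out.{mode.value}.md"), image_mode=mode)
```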
-------------------------------------------------------------------------------- /test/data/doc/constructed_document.yaml.dt: -------------------------------------------------------------------------------- 1 | item of leading list 2 | 3 | Title of the Document 4 | Author 1 5 | Affiliation 1 6 | Author 2 7 | Affiliation 2 8 | 1. Introduction 9 | This paper introduces the biggest invention ever made. ... 10 | list item 1 11 | list item 2 12 | list item 3 13 | list item 3.a 14 | list item 3.b 15 | list item 3.c 16 | list item 3.c.i 17 | 18 | 19 | list item 4 20 | 21 | ProductYears20162017Apple49823695944This is the caption of table 1. 22 | This is the caption of figure 1. 23 | This is the caption of figure 2. 24 | item 1 of list 25 | 26 | item 1 of list after empty list 27 | item 2 of list after empty list 28 | 29 | item 1 of neighboring list 30 | item 2 of neighboring list 31 | item 1 of sub list 32 | Here a code snippet: 33 | <_unknown_>print("Hello world") 34 | (to be displayed inline) 35 | 36 | Here a formula: 37 | E=mc^2 38 | (to be displayed inline) 39 | 40 | 41 | 42 | Here a code block: 43 | <_unknown_>print("Hello world") 44 | Here a formula block: 45 | E=mc^2 46 | number1 47 | Some formatting chops: 48 | bold 49 | italic 50 | underline 51 | strikethrough 52 | subscript 53 | superscript 54 | hyperlink 55 | & 56 | everything at the same time. 57 | 58 | Item 1 in A 59 | Item 2 in A 60 | Item 3 in A 61 | Item 1 in B 62 | Item 2 in B 63 | Item 1 in C 64 | Item 2 in C 65 | 66 | Item 3 in B 67 | 68 | Item 4 in A 69 | 70 | The end. 71 | 72 | -------------------------------------------------------------------------------- /test/data/doc/constructed_document.yaml.et: -------------------------------------------------------------------------------- 1 | 0: unspecified with name=_root_ 2 | 1: list with name=group 3 | 2: list_item 4 | 3: title 5 | 4: text 6 | 5: text 7 | 6: chapter with name=Introduction 8 | 7: section_header 9 | 8: text 10 | 9: list with name=group 11 | 10: list_item 12 | 11: list_item 13 | 12: list_item 14 | 13: ordered_list with name=group 15 | 14: list_item 16 | 15: list_item 17 | 16: list_item 18 | 17: ordered_list with name=group 19 | 18: list_item 20 | 19: list_item 21 | 20: caption 22 | 21: table 23 | 22: caption 24 | 23: picture 25 | 24: caption 26 | 25: picture 27 | 26: list with name=group 28 | 27: list_item 29 | 28: list with name=group 30 | 29: list with name=group 31 | 30: list_item 32 | 31: list_item 33 | 32: list with name=group 34 | 33: list_item 35 | 34: list_item 36 | 35: list with name=group 37 | 36: list_item 38 | 37: inline with name=group 39 | 38: text 40 | 39: code 41 | 40: text 42 | 41: inline with name=group 43 | 42: text 44 | 43: formula 45 | 44: text 46 | 45: text 47 | 46: code 48 | 47: text 49 | 48: formula 50 | 49: key_value_region 51 | 50: form 52 | 51: inline with name=group 53 | 52: text 54 | 53: text 55 | 54: text 56 | 55: text 57 | 56: text 58 | 57: text 59 | 58: text 60 | 59: text 61 | 60: text 62 | 61: text 63 | 62: ordered_list with name=list A 64 | 63: list_item 65 | 64: list_item 66 | 65: list_item 67 | 66: ordered_list with name=list B 68 | 67: list_item 69 | 68: list_item 70 | 69: ordered_list with name=list C 71 | 70: list_item 72 | 71: list_item 73 | 72: list_item 74 | 73: list_item 75 | 74: text 76 | -------------------------------------------------------------------------------- /test/data/doc/constructed_document.yaml.md: -------------------------------------------------------------------------------- 1 | - item of leading list 2 | 3 
| # Title of the Document 4 | 5 | Author 1 6 | Affiliation 1 7 | 8 | Author 2 9 | Affiliation 2 10 | 11 | ## 1. Introduction 12 | 13 | This paper introduces the biggest invention ever made. ... 14 | 15 | - list item 1 16 | - list item 2 17 | - list item 3 18 | 1. list item 3.a 19 | 2. list item 3.b 20 | 3. list item 3.c 21 | 1. list item 3.c.i 22 | - list item 4 23 | 24 | This is the caption of table 1. 25 | 26 | | Product | Years | Years | 27 | |-----------|---------|---------| 28 | | Product | 2016 | 2017 | 29 | | Apple | 49823 | 695944 | 30 | 31 | This is the caption of figure 1. 32 | 33 | 34 | 35 | This is the caption of figure 2. 36 | 37 | 38 | 39 | - item 1 of list 40 | 41 | - item 1 of list after empty list 42 | - item 2 of list after empty list 43 | 44 | - item 1 of neighboring list 45 | - item 2 of neighboring list 46 | - item 1 of sub list 47 | - Here a code snippet: `print("Hello world")` (to be displayed inline) 48 | - Here a formula: $E=mc^2$ (to be displayed inline) 49 | 50 | Here a code block: 51 | 52 | ``` 53 | print("Hello world") 54 | ``` 55 | 56 | Here a formula block: 57 | 58 | $$E=mc^2$$ 59 | 60 | 61 | 62 | 63 | 64 | Some formatting chops: **bold** *italic* underline ~~strikethrough~~ subscript superscript [hyperlink](.) & [~~***everything at the same time.***~~](https://github.com/DS4SD/docling) 65 | 66 | 1. Item 1 in A 67 | 2. Item 2 in A 68 | 3. Item 3 in A 69 | 1. Item 1 in B 70 | 2. Item 2 in B 71 | 1. Item 1 in C 72 | 2. Item 2 in C 73 | 3. Item 3 in B 74 | 4. Item 4 in A 75 | 76 | The end. 77 | -------------------------------------------------------------------------------- /test/data/doc/constructed_images/image_000001_797618e862d279d4e3e92f4b6313175f67e08fc36051dfda092bf63220568703.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/docling-project/docling-core/878aafd1b78f7f3d730ccb4bd519650230651498/test/data/doc/constructed_images/image_000001_797618e862d279d4e3e92f4b6313175f67e08fc36051dfda092bf63220568703.png -------------------------------------------------------------------------------- /test/data/doc/constructed_images/image_000001_ccb4cbe7039fe17892f3d611cfb71eafff1d4d230b19b10779334cc4b63c98bc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/docling-project/docling-core/878aafd1b78f7f3d730ccb4bd519650230651498/test/data/doc/constructed_images/image_000001_ccb4cbe7039fe17892f3d611cfb71eafff1d4d230b19b10779334cc4b63c98bc.png -------------------------------------------------------------------------------- /test/data/doc/constructed_images/image_000001_f3cc103136423a57975750907ebc1d367e2985ac6338976d4d5a439f50323f4a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/docling-project/docling-core/878aafd1b78f7f3d730ccb4bd519650230651498/test/data/doc/constructed_images/image_000001_f3cc103136423a57975750907ebc1d367e2985ac6338976d4d5a439f50323f4a.png -------------------------------------------------------------------------------- /test/data/doc/doc_with_kv.dt: -------------------------------------------------------------------------------- 1 | TO:FROM:8623474Mrs. K. A. SparrowR. G. RyanJUNE7AUG.2OCT.7SUBMISSION DATE:NEWPORT LIGHTS HEAVY UP PROGRESS REPORTEFFECTIVENESS OF DISTRIBUTION ALLOWANCE:DIRECT ACCOUNT/ WHOLESALERS:Distribution allowance was very effective in accomplishing our objectives. 
All accounts have purchased introductory products.DIRECT ACCOUNT CHAINS:Eagle Foods is the only Void.NON- DIRECT ACCOUNT CHAINS:Reception from these accounts is most positive with a solid incentitive to purchase.EFFECTIVENESS OF THE RETAIL (1 00 OFF CARTON) DISTRIBUTION ALLOWANCE:Has been most helpful in acquiring desireable distribution when needed by Sales Reps.PROMOTIONAL ACTIVITY40c OFF PACK- GENERAL MARKET:The 40c off promotions continue to be well received at the retail stores and by consumers, as well. -------------------------------------------------------------------------------- /test/data/doc/doc_with_kv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/docling-project/docling-core/878aafd1b78f7f3d730ccb4bd519650230651498/test/data/doc/doc_with_kv.png -------------------------------------------------------------------------------- /test/data/doc/dummy_doc.yaml.dt: -------------------------------------------------------------------------------- 1 | <loc_42><loc_26><loc_406><loc_46>DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis 2 | CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1Figure 1: Four examples of complex page layouts across different document categories 3 | 4 | 5 | -------------------------------------------------------------------------------- /test/data/doc/dummy_doc.yaml.et: -------------------------------------------------------------------------------- 1 | 0: unspecified with name=_root_ 2 | 1: title 3 | 2: picture 4 | 3: caption 5 | 4: table 6 | -------------------------------------------------------------------------------- /test/data/doc/dummy_doc.yaml.md: -------------------------------------------------------------------------------- 1 | # DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis 2 | 3 | Figure 1: Four examples of complex page layouts across different document categories 4 | 5 | bar chart 6 | 7 | ... 8 | 9 | CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1 10 | 11 | 12 | 13 | A description annotation for this table. 
14 | -------------------------------------------------------------------------------- /test/data/doc/dummy_doc.yaml.min.dt: -------------------------------------------------------------------------------- 1 | <loc_42><loc_26><loc_406><loc_46>DocLayNet: A Large Human-Annotated Dataset for Document-Layout AnalysisCC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1Figure 1: Four examples of complex page layouts across different document categories 2 | -------------------------------------------------------------------------------- /test/data/doc/misplaced_list_items.out.yaml: -------------------------------------------------------------------------------- 1 | body: 2 | children: 3 | - $ref: '#/groups/1' 4 | - $ref: '#/texts/0' 5 | - $ref: '#/groups/0' 6 | content_layer: body 7 | label: unspecified 8 | name: _root_ 9 | self_ref: '#/body' 10 | form_items: [] 11 | furniture: 12 | children: [] 13 | content_layer: furniture 14 | label: unspecified 15 | name: _root_ 16 | self_ref: '#/furniture' 17 | groups: 18 | - children: 19 | - $ref: '#/texts/1' 20 | - $ref: '#/texts/2' 21 | content_layer: body 22 | label: list 23 | name: group 24 | parent: 25 | $ref: '#/body' 26 | self_ref: '#/groups/0' 27 | - children: 28 | - $ref: '#/texts/3' 29 | content_layer: body 30 | label: ordered_list 31 | name: group 32 | parent: 33 | $ref: '#/body' 34 | self_ref: '#/groups/1' 35 | key_value_items: [] 36 | name: '' 37 | pages: {} 38 | pictures: [] 39 | schema_name: DoclingDocument 40 | tables: [] 41 | texts: 42 | - children: [] 43 | content_layer: body 44 | label: text 45 | orig: bar 46 | parent: 47 | $ref: '#/body' 48 | prov: [] 49 | self_ref: '#/texts/0' 50 | text: bar 51 | - children: [] 52 | content_layer: body 53 | enumerated: false 54 | label: list_item 55 | marker: '-' 56 | orig: here 57 | parent: 58 | $ref: '#/groups/0' 59 | prov: [] 60 | self_ref: '#/texts/1' 61 | text: here 62 | - children: [] 63 | content_layer: body 64 | enumerated: false 65 | label: list_item 66 | marker: '-' 67 | orig: there 68 | parent: 69 | $ref: '#/groups/0' 70 | prov: [] 71 | self_ref: '#/texts/2' 72 | text: there 73 | - children: [] 74 | content_layer: body 75 | enumerated: true 76 | label: list_item 77 | marker: '1.' 78 | orig: foo 79 | parent: 80 | $ref: '#/groups/1' 81 | prov: [] 82 | self_ref: '#/texts/3' 83 | text: foo 84 | version: 1.4.0 85 | -------------------------------------------------------------------------------- /test/data/doc/misplaced_list_items.yaml: -------------------------------------------------------------------------------- 1 | body: 2 | children: 3 | - $ref: '#/texts/0' 4 | - $ref: '#/texts/1' 5 | - $ref: '#/texts/2' 6 | - $ref: '#/texts/3' 7 | content_layer: body 8 | label: unspecified 9 | name: _root_ 10 | self_ref: '#/body' 11 | form_items: [] 12 | furniture: 13 | children: [] 14 | content_layer: furniture 15 | label: unspecified 16 | name: _root_ 17 | self_ref: '#/furniture' 18 | groups: [] 19 | key_value_items: [] 20 | name: '' 21 | pages: {} 22 | pictures: [] 23 | schema_name: DoclingDocument 24 | tables: [] 25 | texts: 26 | - children: [] 27 | content_layer: body 28 | enumerated: true 29 | label: list_item 30 | marker: '1.' 
31 | orig: foo 32 | parent: 33 | $ref: '#/body' 34 | prov: [] 35 | self_ref: '#/texts/0' 36 | text: foo 37 | - children: [] 38 | content_layer: body 39 | label: text 40 | orig: bar 41 | parent: 42 | $ref: '#/body' 43 | prov: [] 44 | self_ref: '#/texts/1' 45 | text: bar 46 | - children: [] 47 | content_layer: body 48 | enumerated: false 49 | label: list_item 50 | marker: '-' 51 | orig: here 52 | parent: 53 | $ref: '#/body' 54 | prov: [] 55 | self_ref: '#/texts/2' 56 | text: here 57 | - children: [] 58 | content_layer: body 59 | enumerated: false 60 | label: list_item 61 | marker: '-' 62 | orig: there 63 | parent: 64 | $ref: '#/body' 65 | prov: [] 66 | self_ref: '#/texts/3' 67 | text: there 68 | version: 1.3.0 69 | -------------------------------------------------------------------------------- /test/data/doc/misplaced_list_items.yaml.dt: -------------------------------------------------------------------------------- 1 | foo 2 | 3 | bar 4 | here 5 | there 6 | 7 | 8 | -------------------------------------------------------------------------------- /test/data/doc/page_with_pic.dt: -------------------------------------------------------------------------------- 1 | Assistant: Optimized Table Tokenization for Table Structure Recognition 2 | 7 3 | Fig. 3. OTSL description of table structure: A - table example; B - graphical representation of table structure; C - mapping structure on a grid; D - OTSL structure encoding; E - explanation on cell encoding 4 | 4.2 Language Syntax 5 | The OTSL representation follows these syntax rules: 6 | 1. Left-looking cell rule: The left neighbour of an "L" cell must be either another "L" cell or a "C" cell. 7 | 2. Up-looking cell rule: The upper neighbour of a "U" cell must be either another "U" cell or a "C" cell. 8 | 3. Cross cell rule: 9 | 10 | The left neighbour of an "X" cell must be either another "X" cell or a "U" cell, and the upper neighbour of an "X" cell must be either another "X" cell or an "L" cell. 11 | 4. First row rule: Only "L" cells and "C" cells are allowed in the first row. 12 | 5. First column rule: Only "U" cells and "C" cells are allowed in the first column. 13 | 6. Rectangular rule: The table representation is always rectangular - all rows must have an equal number of tokens, terminated with "NL" token. 14 | 15 | The application of these rules gives OTSL a set of unique properties. First of all, the OTSL enforces a strictly rectangular structure representation, where every new-line token starts a new row. As a consequence, all rows and all columns have exactly the same number of tokens, irrespective of cell spans. Secondly, the OTSL representation is unambiguous: Every table structure is represented in one way. In this representation every table cell corresponds to a "C"-cell token, which in case of spans is always located in the top-left corner of the table cell definition. Third, OTSL syntax rules are only backward-looking. As a consequence, every predicted token can be validated straight during sequence generation by looking at the previously predicted sequence. As such, OTSL can guarantee that every predicted sequence is syntactically valid. 16 | These characteristics can be easily learned by sequence generator networks, as we demonstrate further below. 
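The six syntax rules quoted above are all backward-looking, so a token stream can be validated in a single left-to-right pass. A minimal sketch of such a checker — not the paper's reference implementation; only the token names `C`, `L`, `U`, `X`, and `NL` are taken from the text:

```python
def otsl_is_valid(tokens: list[str]) -> bool:
    """Single-pass check of the six OTSL syntax rules (illustrative sketch)."""
    if any(t not in {"C", "L", "U", "X", "NL"} for t in tokens):
        return False
    if not tokens or tokens[-1] != "NL":
        return False  # rule 6: every row is terminated with an "NL" token
    rows, row = [], []
    for tok in tokens:
        if tok == "NL":
            rows.append(row)
            row = []
        else:
            row.append(tok)
    width = len(rows[0])
    for i, r in enumerate(rows):
        if len(r) != width:  # rule 6: all rows have equally many tokens
            return False
        for j, cell in enumerate(r):
            left = r[j - 1] if j > 0 else None
            up = rows[i - 1][j] if i > 0 else None
            if i == 0 and cell not in {"L", "C"}:  # rule 4: first row
                return False
            if j == 0 and cell not in {"U", "C"}:  # rule 5: first column
                return False
            if cell == "L" and left not in {"L", "C"}:  # rule 1: left-looking
                return False
            if cell == "U" and up not in {"U", "C"}:  # rule 2: up-looking
                return False
            if cell == "X" and (left not in {"X", "U"} or up not in {"X", "L"}):
                return False  # rule 3: cross cell
    return True


# A 2x2 grid whose top cell spans both columns, and an invalid stream:
assert otsl_is_valid(["C", "L", "NL", "C", "C", "NL"])
assert not otsl_is_valid(["L", "C", "NL"])  # "L" cannot start the first row
```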
We find strong indications that this pattern 17 | -------------------------------------------------------------------------------- /test/data/doc/page_with_pic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/docling-project/docling-core/878aafd1b78f7f3d730ccb4bd519650230651498/test/data/doc/page_with_pic.png -------------------------------------------------------------------------------- /test/data/docling_document/export/formula_mathml.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |
6 |
\frac{1}{x}
7 |
8 | 9 | 10 | -------------------------------------------------------------------------------- /test/data/docling_document/unit/CodeItem.yaml: -------------------------------------------------------------------------------- 1 | children: [] 2 | captions: [] 3 | footnotes: [] 4 | references: [] 5 | image: null 6 | code_language: Python 7 | content_layer: body 8 | label: code 9 | orig: whatever 10 | parent: null 11 | prov: [] 12 | self_ref: '#' 13 | text: print(Hello World!) 14 | formatting: null 15 | hyperlink: null 16 | -------------------------------------------------------------------------------- /test/data/docling_document/unit/FloatingItem.yaml: -------------------------------------------------------------------------------- 1 | captions: [] 2 | children: [] 3 | footnotes: [] 4 | image: null 5 | label: text 6 | parent: null 7 | prov: [] 8 | references: [] 9 | self_ref: '#' 10 | content_layer: body -------------------------------------------------------------------------------- /test/data/docling_document/unit/FormItem.yaml: -------------------------------------------------------------------------------- 1 | captions: [] 2 | children: [] 3 | content_layer: body 4 | footnotes: [] 5 | graph: 6 | cells: 7 | - cell_id: 0 8 | item_ref: null 9 | label: key 10 | orig: '#' 11 | prov: null 12 | text: number 13 | - cell_id: 1 14 | item_ref: null 15 | label: value 16 | orig: '1' 17 | prov: null 18 | text: '1' 19 | links: 20 | - label: to_value 21 | source_cell_id: 0 22 | target_cell_id: 1 23 | - label: to_key 24 | source_cell_id: 1 25 | target_cell_id: 0 26 | image: null 27 | label: form 28 | parent: null 29 | prov: [] 30 | references: [] 31 | self_ref: '#' -------------------------------------------------------------------------------- /test/data/docling_document/unit/FormulaItem.yaml: -------------------------------------------------------------------------------- 1 | children: [] 2 | label: formula 3 | orig: whatever 4 | parent: null 5 | prov: [] 6 | self_ref: '#' 7 | text: E=mc^2 8 | content_layer: body 9 | formatting: null 10 | hyperlink: null 11 | -------------------------------------------------------------------------------- /test/data/docling_document/unit/KeyValueItem.yaml: -------------------------------------------------------------------------------- 1 | captions: [] 2 | children: [] 3 | content_layer: body 4 | footnotes: [] 5 | graph: 6 | cells: 7 | - cell_id: 0 8 | item_ref: null 9 | label: key 10 | orig: '#' 11 | prov: null 12 | text: number 13 | - cell_id: 1 14 | item_ref: null 15 | label: value 16 | orig: '1' 17 | prov: null 18 | text: '1' 19 | links: 20 | - label: to_value 21 | source_cell_id: 0 22 | target_cell_id: 1 23 | - label: to_key 24 | source_cell_id: 1 25 | target_cell_id: 0 26 | image: null 27 | label: key_value_region 28 | parent: null 29 | prov: [] 30 | references: [] 31 | self_ref: '#' -------------------------------------------------------------------------------- /test/data/docling_document/unit/ListItem.yaml: -------------------------------------------------------------------------------- 1 | children: [] 2 | enumerated: true 3 | label: list_item 4 | marker: (1) 5 | orig: whatever 6 | parent: null 7 | prov: [] 8 | self_ref: '#' 9 | text: whatever 10 | content_layer: body 11 | formatting: null 12 | hyperlink: null 13 | -------------------------------------------------------------------------------- /test/data/docling_document/unit/PictureItem.yaml: -------------------------------------------------------------------------------- 1 | annotations: [] 2 | captions: [] 
3 | children: [] 4 | footnotes: [] 5 | image: null 6 | label: picture 7 | parent: null 8 | prov: [] 9 | references: [] 10 | self_ref: '#' 11 | content_layer: body -------------------------------------------------------------------------------- /test/data/docling_document/unit/SectionHeaderItem.yaml: -------------------------------------------------------------------------------- 1 | children: [] 2 | label: section_header 3 | level: 2 4 | orig: whatever 5 | parent: null 6 | prov: [] 7 | self_ref: '#' 8 | text: whatever 9 | content_layer: body 10 | formatting: null 11 | hyperlink: null 12 | -------------------------------------------------------------------------------- /test/data/docling_document/unit/TextItem.yaml: -------------------------------------------------------------------------------- 1 | children: [] 2 | label: text 3 | orig: whatever 4 | parent: null 5 | prov: [] 6 | self_ref: '#' 7 | text: whatever 8 | content_layer: body 9 | formatting: null 10 | hyperlink: null 11 | -------------------------------------------------------------------------------- /test/data/docling_document/unit/TitleItem.yaml: -------------------------------------------------------------------------------- 1 | children: [] 2 | label: title 3 | orig: whatever 4 | parent: null 5 | prov: [] 6 | self_ref: '#' 7 | text: whatever 8 | content_layer: body 9 | formatting: null 10 | hyperlink: null 11 | -------------------------------------------------------------------------------- /test/data/json_schemas/base_identifier.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Identifier", 3 | "description": "Unique identifier of a Docling data object.", 4 | "type": "object", 5 | "properties": { 6 | "type": { 7 | "title": "Type", 8 | "description": "A string representing a collection or database that contains this data object.", 9 | "x-es-type": "keyword", 10 | "x-es-ignore_above": 8191, 11 | "type": "string" 12 | }, 13 | "value": { 14 | "title": "Value", 15 | "description": "The identifier value of the data object within a collection or database.", 16 | "x-es-type": "keyword", 17 | "x-es-ignore_above": 8191, 18 | "type": "string" 19 | }, 20 | "_name": { 21 | "title": "_Name", 22 | "description": "A unique identifier of the data object across Docling, consisting of the concatenation of type and value in lower case, separated by hash (#).", 23 | "x-es-type": "keyword", 24 | "x-es-ignore_above": 8191, 25 | "pattern": "^.+#.+$", 26 | "type": "string" 27 | } 28 | }, 29 | "required": [ 30 | "type", 31 | "value", 32 | "_name" 33 | ], 34 | "additionalProperties": false 35 | } -------------------------------------------------------------------------------- /test/data/json_schemas/base_log.json: -------------------------------------------------------------------------------- 1 | { 2 | "additionalProperties": false, 3 | "description": "Log entry to describe an ETL task on a document.", 4 | "properties": { 5 | "task": { 6 | "anyOf": [ 7 | { 8 | "type": "string" 9 | }, 10 | { 11 | "type": "null" 12 | } 13 | ], 14 | "default": null, 15 | "description": "An identifier of this task. 
It may be used to identify this task from other tasks of the same agent and type.", 16 | "title": "Task", 17 | "x-es-ignore_above": 8191, 18 | "x-es-type": "keyword" 19 | }, 20 | "agent": { 21 | "description": "The Docling agent that performed the task, e.g., CCS or CXS.", 22 | "title": "Agent", 23 | "type": "string", 24 | "x-es-ignore_above": 8191, 25 | "x-es-type": "keyword" 26 | }, 27 | "type": { 28 | "description": "A task category.", 29 | "title": "Type", 30 | "type": "string", 31 | "x-es-ignore_above": 8191, 32 | "x-es-type": "keyword" 33 | }, 34 | "comment": { 35 | "anyOf": [ 36 | { 37 | "type": "string" 38 | }, 39 | { 40 | "type": "null" 41 | } 42 | ], 43 | "default": null, 44 | "description": "A description of the task or any comments in natural language.", 45 | "title": "Comment" 46 | }, 47 | "date": { 48 | "description": "A string representation of the task execution datetime in ISO 8601 format.", 49 | "format": "date-time", 50 | "title": "Date", 51 | "type": "string" 52 | } 53 | }, 54 | "required": [ 55 | "agent", 56 | "type", 57 | "date" 58 | ], 59 | "title": "Log", 60 | "type": "object" 61 | } -------------------------------------------------------------------------------- /test/data/legacy_doc/doc-4.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": { 3 | "publication_date": "2024-07-01T12:00:00.000+00:00", 4 | "languages": [ 5 | "en" 6 | ], 7 | "url_refs": [ 8 | "https://www.link-to-pdf-626144176a8a0616ce8c111ecda4bc30b4a.com/file.pdf" 9 | ], 10 | "title": "Lorem ipsum", 11 | "affiliations": [ 12 | { 13 | "name": "Affiliation Name" 14 | } 15 | ], 16 | "abstract": [ 17 | "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum." 
18 | ], 19 | "authors": [ 20 | { 21 | "name": "Author 1" 22 | } 23 | ], 24 | "publication": [ 25 | { 26 | "identifiers": [ 27 | { 28 | "_name": "collection#12345", 29 | "type": "collection", 30 | "value": "12345" 31 | } 32 | ], 33 | "name": "International Conference", 34 | "type": [ 35 | "conference" 36 | ], 37 | "alternate_names": [ 38 | "Int Conference", 39 | "IC" 40 | ], 41 | "url": "https://en.wikipedia.org/wiki/Lorem_ipsum" 42 | }, 43 | { 44 | "name": "Lorem ipsum", 45 | "pages": "130-189", 46 | "volume": "87" 47 | } 48 | ], 49 | "reference_count": 15, 50 | "citation_count": 3, 51 | "citation_date": "2023-05-23T12:00:00.000+00:00", 52 | "logs": [ 53 | { 54 | "agent": "CXS", 55 | "type": "parsing", 56 | "comment": "parsing of documents", 57 | "date": "2022-11-09T21:22:19.248+00:00" 58 | } 59 | ], 60 | "acquisition": { 61 | "type": "Download", 62 | "date": "2022-11-06T07:13:09.317+00:00", 63 | "link": "https://en.wikipedia.org/wiki/Lorem_ipsum", 64 | "size": 102356 65 | }, 66 | "collection": { 67 | "name": "Sample Collection", 68 | "type": "Document", 69 | "version": "1.2.3", 70 | "alias": [ 71 | "SC" 72 | ] 73 | } 74 | }, 75 | "main-text": [ 76 | { 77 | "name": "title", 78 | "type": "title", 79 | "text": "Lorem ipsum" 80 | }, 81 | { 82 | "name": "text", 83 | "type": "paragraph", 84 | "text": "Author" 85 | }, 86 | { 87 | "name": "abstract", 88 | "type": "paragraph", 89 | "text": "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum." 90 | } 91 | ], 92 | "file-info": { 93 | "document-hash": "9cdad4912f0b81298c96478626144176a8a0616fe8c101ecda4bc30b4a518374", 94 | "filename": "12345", 95 | "filename-prov": "12345.zip" 96 | }, 97 | "identifiers": [ 98 | { 99 | "_name": "collection#12345", 100 | "type": "collection", 101 | "value": "12345" 102 | } 103 | ], 104 | "type": "article", 105 | "_name": "Lorem ipsum" 106 | } -------------------------------------------------------------------------------- /test/data/legacy_doc/doc-5.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": { 3 | "publication_date": "2024-07-01T12:00:00.000+00:00", 4 | "languages": [ 5 | "en" 6 | ], 7 | "url_refs": [ 8 | "https://www.link-to-pdf-626144176a8a0616ce8c111ecda4bc30b4a.com/file.pdf" 9 | ], 10 | "title": "Lorem ipsum", 11 | "affiliations": [ 12 | { 13 | "name": "Affiliation Name" 14 | } 15 | ], 16 | "abstract": [ 17 | "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum." 
18 | ], 19 | "authors": [ 20 | { 21 | "name": "Author 1" 22 | } 23 | ], 24 | "publication": [ 25 | { 26 | "identifiers": [ 27 | { 28 | "_name": "collection#12345", 29 | "type": "collection", 30 | "value": "12345" 31 | } 32 | ], 33 | "name": "International Conference", 34 | "type": [ 35 | "conference" 36 | ], 37 | "alternate_names": [ 38 | "Int Conference", 39 | "IC" 40 | ], 41 | "url": "https://en.wikipedia.org/wiki/Lorem_ipsum" 42 | }, 43 | { 44 | "name": "Lorem ipsum", 45 | "pages": "130-189", 46 | "volume": "87" 47 | } 48 | ], 49 | "reference_count": 15, 50 | "citation_count": 3, 51 | "citation_date": "2023-05-23T12:00:00.000+00:00", 52 | "logs": [ 53 | { 54 | "agent": "CXS", 55 | "type": "parsing", 56 | "comment": "parsing of documents", 57 | "date": "2022-11-09T21:22:19.248+00:00" 58 | } 59 | ], 60 | "collection": { 61 | "name": "Sample Collection", 62 | "type": "Document", 63 | "version": "1.2.3", 64 | "alias": [ 65 | "SC" 66 | ] 67 | } 68 | }, 69 | "file-info": { 70 | "document-hash": "9cdad4912f0b81298c96478626144176a8a0616fe8c101ecda4bc30b4a518374", 71 | "filename": "12345", 72 | "filename-prov": "12345.zip" 73 | }, 74 | "identifiers": [ 75 | { 76 | "_name": "collection#12345", 77 | "type": "collection", 78 | "value": "12345" 79 | } 80 | ], 81 | "type": "article", 82 | "_name": "Lorem ipsum" 83 | } -------------------------------------------------------------------------------- /test/data/legacy_doc/doc-8.json_table_0.dt.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | Letter from Our Chairman and CEO5 4 | Our ESG Goals7 5 | Accountability for ESG at IBM9 6 | Human Rights at IBM10 7 |
8 | -------------------------------------------------------------------------------- /test/data/legacy_doc/error-3.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": { 3 | "publication_date": "2024-07-01T12:00:00.000+00:00", 4 | "languages": [ 5 | "en" 6 | ], 7 | "url_refs": [ 8 | "https://www.link-to-pdf-626144176a8a0616ce8c111ecda4bc30b4a.com/file.pdf" 9 | ], 10 | "title": "Lorem ipsum", 11 | "affiliations": [ 12 | { 13 | "name": "Affiliation Name" 14 | } 15 | ], 16 | "abstract": [ 17 | "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum." 18 | ], 19 | "authors": [ 20 | { 21 | "name": "Author 1" 22 | } 23 | ], 24 | "publication": [ 25 | { 26 | "id": "publication-id-12345", 27 | "name": "International Conference", 28 | "type": [ 29 | "conference" 30 | ], 31 | "alternate_names": [ 32 | "Int Conference", 33 | "IC" 34 | ], 35 | "url": "https://en.wikipedia.org/wiki/Lorem_ipsum" 36 | }, 37 | { 38 | "name": "Lorem ipsum", 39 | "pages": "130-189", 40 | "volume": "87" 41 | } 42 | ], 43 | "reference_count": -1, 44 | "citation_count": "3", 45 | "logs": [ 46 | { 47 | "agent": "CXS", 48 | "type": "parsing", 49 | "comment": "parsing of documents", 50 | "date": "2022-11-09T21:22:19.248+00:00" 51 | } 52 | ], 53 | "collection": { 54 | "name": "Sample Collection", 55 | "type": "Document", 56 | "version": "1.2.3", 57 | "alias": [ 58 | "SC" 59 | ] 60 | } 61 | }, 62 | "main-text": [ 63 | { 64 | "name": "title", 65 | "type": "title", 66 | "text": "Lorem ipsum" 67 | }, 68 | { 69 | "name": "text", 70 | "type": "paragraph", 71 | "text": "Author" 72 | }, 73 | { 74 | "name": "abstract", 75 | "type": "paragraph", 76 | "text": "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum." 
77 | } 78 | ], 79 | "file-info": { 80 | "document-hash": "9cdad4912f0b81298c96478626144176a8a0616fe8c101ecda4bc30b4a518374", 81 | "filename": "12345", 82 | "filename-prov": "12345.zip" 83 | }, 84 | "identifiers": [ 85 | { 86 | "_name": "collection#12345", 87 | "type": "collection", 88 | "value": "12345" 89 | } 90 | ], 91 | "type": "article", 92 | "_name": "Lorem ipsum" 93 | } -------------------------------------------------------------------------------- /test/data/legacy_doc/intermediates/ann.01.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": [], 3 | "predictions": [ 4 | { 5 | "cells": [ 6 | { 7 | "id": 0, 8 | "rawcell_id": 0, 9 | "label": "List-identifier" 10 | }, 11 | { 12 | "id": 1, 13 | "rawcell_id": 1, 14 | "label": "List-item" 15 | }, 16 | { 17 | "id": 2, 18 | "rawcell_id": 2, 19 | "label": "List-identifier" 20 | }, 21 | { 22 | "id": 3, 23 | "rawcell_id": 3, 24 | "label": "List-item" 25 | }, 26 | { 27 | "id": 4, 28 | "rawcell_id": 4, 29 | "label": "Footnote" 30 | }, 31 | { 32 | "id": 5, 33 | "rawcell_id": 5, 34 | "label": "Footnote" 35 | }, 36 | { 37 | "id": 6, 38 | "rawcell_id": 6, 39 | "label": "Footnote" 40 | } 41 | ], 42 | "clusters": [ 43 | { 44 | "model": "RRF-image", 45 | "type": "Picture", 46 | "bbox": [ 47 | 72.0, 48 | 366.100006, 49 | 612.0, 50 | 720.099976 51 | ], 52 | "cell_ids": [], 53 | "merged": false, 54 | "id": 0 55 | } 56 | ], 57 | "tables": [], 58 | "source": { 59 | "type": "model", 60 | "info": { 61 | "display_name": "Applied predictions of collection model", 62 | "model_name": "CollectionModel", 63 | "model_class": "models.interactive", 64 | "model_version": "20171024-16:04", 65 | "model_id": "a423918e-77b9-025d-a06e-56a02f2z4f3b" 66 | }, 67 | "timestamp": 1549956870.877 68 | } 69 | } 70 | ], 71 | "reports": [] 72 | } -------------------------------------------------------------------------------- /test/data/legacy_doc/intermediates/pdf.meta.01.json: -------------------------------------------------------------------------------- 1 | { 2 | "_id": "5bd03fdcdeff5a006862ee70", 3 | "file-info": { 4 | "filename": "test.pdf", 5 | "page-no": 5, 6 | "#-pages": 10, 7 | "document-hash": "a91d9bd6083c5adf1738589e12569f4f1e04f895aaa9a92d03e8a52137753fa5", 8 | "page-hash": "05956039dc5ea674f57cce469a3e86365c1047df9821b9ec55d5e16dbd4e9dcd", 9 | "description": {} 10 | }, 11 | "_parse-status": "SUCCESS", 12 | "lastModified": 1540374535.437 13 | } -------------------------------------------------------------------------------- /test/data/legacy_doc/intermediates/publication_journal.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Journal of Environment", 3 | "pages": "314-391", 4 | "issue": "3", 5 | "volume": "125", 6 | "type": ["JournalArticle"], 7 | "url": "https://www.ibm.com" 8 | } -------------------------------------------------------------------------------- /test/data/legacy_doc/intermediates/publication_venue.json: -------------------------------------------------------------------------------- 1 | { 2 | "identifiers": [ 3 | { 4 | "_name": "venue#12345", 5 | "type": "venue", 6 | "value": "12345" 7 | } 8 | ], 9 | "name": "International Conference on Technology", 10 | "type": [ 11 | "conference" 12 | ], 13 | "alternate_names": [ 14 | "ICoT", 15 | "Random Conference on Technology" 16 | ], 17 | "url": "http://www.ibm.com" 18 | } -------------------------------------------------------------------------------- 
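The identifiers in these intermediates follow the `base_identifier.json` schema shown earlier: `_name` must be the lower-cased concatenation of `type` and `value`, separated by `#` (pattern `^.+#.+$`). A small sketch of building a conforming identifier — `make_identifier` is a hypothetical helper, not part of docling-core:

```python
def make_identifier(type_: str, value: str) -> dict:
    # Hypothetical helper: _name is "type#value" in lower case, per the
    # base_identifier.json schema quoted above.
    return {"type": type_, "value": value, "_name": f"{type_}#{value}".lower()}


ident = make_identifier("venue", "12345")
assert ident["_name"] == "venue#12345"  # matches publication_venue.json above
```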
/test/data/legacy_doc/intermediates/raw.meta.01.json: -------------------------------------------------------------------------------- 1 | { 2 | "file-info" : 3 | { 4 | "#-pages" : 10, 5 | "description" : 6 | { 7 | }, 8 | "document-hash" : "a91d9bd6083c5adf1738589e12569f4f1e04f895aaa9a92d03e8a52137753fa5", 9 | "filename" : "ocr.pdf", 10 | "page-hash" : "05956039dc5ea674f57cce469a3e86365c1047df9821b9ec55d5e16dbd4e9dcd", 11 | "page-no" : 5 12 | } 13 | } -------------------------------------------------------------------------------- /test/data/nlp/error-qa-1.json: -------------------------------------------------------------------------------- 1 | { 2 | "context": "International Business Machines Corporation (using the trademark IBM), nicknamed Big Blue, is an American multinational technology company headquartered in Armonk, New York and present in over 175 countries. IBM is the largest industrial research organization in the world, with 19 research facilities across a dozen countries, having held the record for most annual U.S. patents generated by a business for 29 consecutive years from 1993 to 2021.", 3 | "question": null, 4 | "answer": "Armonk, New York.", 5 | "short_answer": null, 6 | "created": "2024-02-19T12:00:00.000+00:00", 7 | "generated_question": true, 8 | "generated_answer": true, 9 | "model": "model-name/model-task", 10 | "paths": [ 11 | "3c57cc90136eed2007b40835bb88f22fc2e81b81c9ddd8ca25265ae7ea154393#main-text/35" 12 | ], 13 | "advanced": { 14 | "submitter": "Wikipedia" 15 | } 16 | } -------------------------------------------------------------------------------- /test/data/nlp/error-qa-3.json: -------------------------------------------------------------------------------- 1 | { 2 | "context": "International Business Machines Corporation (using the trademark IBM), nicknamed Big Blue, is an American multinational technology company headquartered in Armonk, New York and present in over 175 countries. IBM is the largest industrial research organization in the world, with 19 research facilities across a dozen countries, having held the record for most annual U.S. patents generated by a business for 29 consecutive years from 1993 to 2021.", 3 | "question": "Where is the headquarters of IBM located?", 4 | "answer": "IBM is headquartered in Armonk, New York.", 5 | "short_answer": null, 6 | "created": "2024-02-19T12:00:00.000+00:00", 7 | "generated_question": true, 8 | "generated_answer": true, 9 | "model": "model-name/model-task", 10 | "paths": [ 11 | "3c57cc90136eed2007b40835bb88f22fc2e81b81c9ddd8ca25265ae7ea154393#main-text/35", 12 | "3c57cc90136eed2007b40835bb88f22fc2e81b81c9ddd8ca25265ae7ea154393#main-text/21", 13 | "3c57cc90136eed2007b40835bb88f22fc2e81b81c9ddd8ca25265ae7ea154393#main-text/35" 14 | ], 15 | "advanced": { 16 | "submitter": "Wikipedia" 17 | } 18 | } -------------------------------------------------------------------------------- /test/data/nlp/qa-1.json: -------------------------------------------------------------------------------- 1 | { 2 | "context": "International Business Machines Corporation (using the trademark IBM), nicknamed Big Blue, is an American multinational technology company headquartered in Armonk, New York and present in over 175 countries. IBM is the largest industrial research organization in the world, with 19 research facilities across a dozen countries, having held the record for most annual U.S. 
patents generated by a business for 29 consecutive years from 1993 to 2021.", 3 | "question": "Where is the headquarters of IBM located?", 4 | "answer": "IBM is headquartered in Armonk, New York.", 5 | "short_answer": null, 6 | "created": "2024-02-19T12:00:00.000+00:00", 7 | "generated_question": true, 8 | "generated_answer": true, 9 | "model": "model-name/model-task", 10 | "paths": [ 11 | "3c57cc90136eed2007b40835bb88f22fc2e81b81c9ddd8ca25265ae7ea154393#main-text/35" 12 | ], 13 | "advanced": { 14 | "submitter": "Wikipedia" 15 | } 16 | } -------------------------------------------------------------------------------- /test/data/nlp/qa-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "context": "International Business Machines Corporation (using the trademark IBM), nicknamed Big Blue, is an American multinational technology company headquartered in Armonk, New York and present in over 175 countries. IBM is the largest industrial research organization in the world, with 19 research facilities across a dozen countries, having held the record for most annual U.S. patents generated by a business for 29 consecutive years from 1993 to 2021.", 3 | "question": "Where is the headquarters of IBM located?", 4 | "answer": "IBM is headquartered in Armonk, New York.", 5 | "short_answer": null, 6 | "created": "2024-02-19T12:00:00.000+00:00", 7 | "generated_question": true, 8 | "generated_answer": true, 9 | "model": "model-name/model-task", 10 | "paths": [], 11 | "advanced": { 12 | "submitter": "Wikipedia" 13 | } 14 | } -------------------------------------------------------------------------------- /test/data/nlp/qa-3.json: -------------------------------------------------------------------------------- 1 | { 2 | "context": "International Business Machines Corporation (using the trademark IBM), nicknamed Big Blue, is an American multinational technology company headquartered in Armonk, New York and present in over 175 countries. IBM is the largest industrial research organization in the world, with 19 research facilities across a dozen countries, having held the record for most annual U.S. patents generated by a business for 29 consecutive years from 1993 to 2021.", 3 | "question": "Where is the headquarters of IBM located?", 4 | "answer": "IBM is headquartered in Armonk, New York.", 5 | "short_answer": null, 6 | "created": "2024-02-19T12:00:00.000+00:00", 7 | "generated_question": true, 8 | "generated_answer": true, 9 | "model": "model-name/model-task", 10 | "paths": [], 11 | "advanced": { 12 | "submitter": "Wikipedia" 13 | }, 14 | "labels": { 15 | "scope": "document", 16 | "alignment": "aligned", 17 | "correctness": "entailed", 18 | "completeness": "complete", 19 | "information": "procedure" 20 | } 21 | } -------------------------------------------------------------------------------- /test/data/rec/attribute-01.json: -------------------------------------------------------------------------------- 1 | { 2 | "conf": 1.0, 3 | "prov": [ 4 | { 5 | "type": "sentence", 6 | "text": "FeSe has a Tc of 1K and 5K at 1GP and 5GPa respectivly." 
7 | } 8 | ], 9 | "predicates": [ 10 | { 11 | "key": { 12 | "type": "property", 13 | "name": "Tc" 14 | }, 15 | "value": { 16 | "type": "property-value", 17 | "name": "5K" 18 | } 19 | }, 20 | { 21 | "key": { 22 | "type": "property", 23 | "name": "pressure" 24 | }, 25 | "value": { 26 | "type": "property-value", 27 | "name": "5GPa" 28 | } 29 | } 30 | ] 31 | } -------------------------------------------------------------------------------- /test/data/rec/attribute-02.json: -------------------------------------------------------------------------------- 1 | { 2 | "conf": 0.799, 3 | "prov": [ 4 | { 5 | "type": "sentence", 6 | "text": "provenance in a sentence." 7 | }, 8 | { 9 | "type": "table", 10 | "text": "provenance in a table." 11 | } 12 | ], 13 | "predicates": [ 14 | { 15 | "key": { 16 | "type": "property", 17 | "name": "Tc" 18 | }, 19 | "value": { 20 | "type": "property-value", 21 | "name": "5K" 22 | } 23 | } 24 | ] 25 | } -------------------------------------------------------------------------------- /test/data/rec/attribute-03.json: -------------------------------------------------------------------------------- 1 | { 2 | "conf": 0.799, 3 | "prov": [ 4 | { 5 | "type": "sentence", 6 | "text": "provenance in a sentence." 7 | }, 8 | { 9 | "type": "table", 10 | "text": "provenance in a table." 11 | } 12 | ], 13 | "predicates": [] 14 | } -------------------------------------------------------------------------------- /test/data/rec/error-attribute-01.json: -------------------------------------------------------------------------------- 1 | { 2 | "conf": 1.1, 3 | "prov": [ 4 | { 5 | "type": "sentence", 6 | "text": "FeSe has a Tc of 1K and 5K at 1GP and 5GPa respectivly." 7 | } 8 | ], 9 | "predicates": [ 10 | { 11 | "key": { 12 | "type": "property", 13 | "name": "Tc" 14 | }, 15 | "value": { 16 | "type": "property-value", 17 | "name": "5K" 18 | } 19 | }, 20 | { 21 | "key": { 22 | "type": "property", 23 | "name": "pressure" 24 | }, 25 | "value": { 26 | "type": "property-value", 27 | "name": "5GPa" 28 | } 29 | } 30 | ] 31 | } -------------------------------------------------------------------------------- /test/data/rec/error-attribute-02.json: -------------------------------------------------------------------------------- 1 | { 2 | "conf": 1.0, 3 | "prov": [ 4 | { 5 | "type": "sentence", 6 | "text": "FeSe has a Tc of 1K and 5K at 1GP and 5GPa respectivly." 
7 | } 8 | ] 9 | } -------------------------------------------------------------------------------- /test/data/rec/error-predicate-01.json: -------------------------------------------------------------------------------- 1 | { 2 | "key": { 3 | "type": "property", 4 | "name": "geopoint" 5 | }, 6 | "value": { 7 | "type": "property-value", 8 | "name": "91.203494,-73.7238702" 9 | }, 10 | "geopoint_value": { 11 | "value": [ 12 | -73.7238702, 13 | 91.203494 14 | ], 15 | "conf": 0.902875 16 | } 17 | } -------------------------------------------------------------------------------- /test/data/rec/error-predicate-02.json: -------------------------------------------------------------------------------- 1 | { 2 | "key": { 3 | "type": "property", 4 | "name": "geopoint" 5 | }, 6 | "value": { 7 | "type": "property-value", 8 | "name": "41.1096169,-73.7238702" 9 | }, 10 | "geopoint_value": { 11 | "value": [ 12 | -73.7238702, 13 | 41.1096169 14 | ], 15 | "conf": 2.902875 16 | } 17 | } -------------------------------------------------------------------------------- /test/data/rec/predicate-01.json: -------------------------------------------------------------------------------- 1 | { 2 | "key": { 3 | "type": "property", 4 | "name": "geopoint" 5 | }, 6 | "value": { 7 | "type": "property-value", 8 | "name": "41.1096169,-73.7238702" 9 | }, 10 | "geopoint_value": { 11 | "value": [ 12 | -73.7238702, 13 | 41.1096169 14 | ], 15 | "conf": 0.902875 16 | } 17 | } -------------------------------------------------------------------------------- /test/data/rec/predicate-02.json: -------------------------------------------------------------------------------- 1 | { 2 | "key": { 3 | "type": "property", 4 | "name": "legal entity creation date" 5 | }, 6 | "value": { 7 | "type": "property-value", 8 | "name": "2012-11-29T00:00:00.000Z" 9 | }, 10 | "datetime_value": { 11 | "value": "2012-11-29T00:00:00.000Z" 12 | } 13 | } -------------------------------------------------------------------------------- /test/data/rec/record-01.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name": "some text here", 3 | "file-info": { 4 | "filename": "filename.pdf", 5 | "document-hash": "qwertyuiop1234567890" 6 | }, 7 | "description": { 8 | "logs": [] 9 | }, 10 | "conf": 1.0, 11 | "prov": [ 12 | { 13 | "type": "sentence", 14 | "text": "FeSe has a Tc of 1K and 5K at 1GP and 5GPa respectivily." 15 | } 16 | ], 17 | "identifiers": [ 18 | { 19 | "_name": "db#1234567", 20 | "type": "db", 21 | "value": "1234567" 22 | } 23 | ], 24 | "subject": { 25 | "display_name": "FeSe", 26 | "type": "material", 27 | "names": [ 28 | { 29 | "type": "chemical_name", 30 | "value": "FeSe", 31 | "_name": "chemical_name#fese" 32 | }, 33 | { 34 | "type": "sum_formula", 35 | "value": "Fe(1) Se(1)", 36 | "_name": "sum_formula#fe(1) se(1)" 37 | } 38 | ], 39 | "identifiers": [ 40 | { 41 | "_name": "db#1234567", 42 | "type": "db", 43 | "value": "1234567" 44 | } 45 | ] 46 | }, 47 | "attributes": [ 48 | { 49 | "predicates": [ 50 | { 51 | "key": { 52 | "type": "property", 53 | "name": "Tc" 54 | }, 55 | "value": { 56 | "type": "property-value", 57 | "name": "5K" 58 | } 59 | }, 60 | { 61 | "key": { 62 | "type": "property", 63 | "name": "pressure" 64 | }, 65 | "value": { 66 | "type": "property-value", 67 | "name": "5GPa" 68 | } 69 | } 70 | ], 71 | "conf": 1.0, 72 | "prov": [ 73 | { 74 | "type": "sentence", 75 | "text": "FeSe has a Tc of 1K and 5K at 1GP and 5GPa respectivily." 
76 | } 77 | ] 78 | } 79 | ] 80 | } -------------------------------------------------------------------------------- /test/data/rec/record-02.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name": "some text here", 3 | "file-info": { 4 | "filename": "filename.pdf", 5 | "document-hash": "qwertyuiop1234567890" 6 | }, 7 | "description": { 8 | "logs": [] 9 | }, 10 | "conf": 1.0, 11 | "prov": [ 12 | { 13 | "type": "sentence", 14 | "text": "FeSe has a Tc of 1K and 5K at 1GP and 5GPa respectivily." 15 | } 16 | ], 17 | "identifiers": [ 18 | { 19 | "_name": "db#1234567", 20 | "type": "db", 21 | "value": "1234567" 22 | } 23 | ], 24 | "extra": "Extra field temporarlly allowed", 25 | "subject": { 26 | "display_name": "FeSe", 27 | "type": "material", 28 | "names": [ 29 | { 30 | "type": "chemical_name", 31 | "value": "FeSe", 32 | "_name": "chemical_name#fese" 33 | }, 34 | { 35 | "type": "sum_formula", 36 | "value": "Fe(1) Se(1)", 37 | "_name": "sum_formula#fe(1) se(1)" 38 | } 39 | ], 40 | "identifiers": [ 41 | { 42 | "_name": "db#1234567", 43 | "type": "db", 44 | "value": "1234567" 45 | } 46 | ] 47 | }, 48 | "attributes": [ 49 | { 50 | "predicates": [ 51 | { 52 | "key": { 53 | "type": "property", 54 | "name": "Tc" 55 | }, 56 | "value": { 57 | "type": "property-value", 58 | "name": "5K" 59 | } 60 | }, 61 | { 62 | "key": { 63 | "type": "property", 64 | "name": "pressure" 65 | }, 66 | "value": { 67 | "type": "property-value", 68 | "name": "5GPa" 69 | } 70 | } 71 | ], 72 | "conf": 1.0, 73 | "prov": [ 74 | { 75 | "type": "sentence", 76 | "text": "FeSe has a Tc of 1K and 5K at 1GP and 5GPa respectivily." 77 | } 78 | ] 79 | } 80 | ] 81 | } -------------------------------------------------------------------------------- /test/data/rec/record-03.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name": "some text here", 3 | "file-info": { 4 | "filename": "filename.pdf", 5 | "document-hash": "qwertyuiop1234567890" 6 | }, 7 | "description": { 8 | "logs": [] 9 | }, 10 | "conf": 1.0, 11 | "prov": [ 12 | { 13 | "type": "sentence", 14 | "text": "FeSe has a Tc of 1K and 5K at 1GP and 5GPa respectivily." 
15 | } 16 | ], 17 | "identifiers": [ 18 | { 19 | "_name": "db#1234567", 20 | "type": "db", 21 | "value": "1234567" 22 | } 23 | ], 24 | "subject": { 25 | "display_name": "FeSe", 26 | "type": "material", 27 | "names": [ 28 | { 29 | "type": "chemical_name", 30 | "value": "FeSe", 31 | "_name": "chemical_name#fese" 32 | }, 33 | { 34 | "type": "sum_formula", 35 | "value": "Fe(1) Se(1)", 36 | "_name": "sum_formula#fe(1) se(1)" 37 | } 38 | ], 39 | "identifiers": [ 40 | { 41 | "_name": "db#1234567", 42 | "type": "db", 43 | "value": "1234567" 44 | } 45 | ] 46 | } 47 | } -------------------------------------------------------------------------------- /test/data/rec/record-04.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name": "some text here", 3 | "file-info": { 4 | "filename": "filename.pdf", 5 | "document-hash": "qwertyuiop1234567890" 6 | }, 7 | "description": { 8 | "logs": [ 9 | { 10 | "date": "2023-03-01T19:32:20.000000Z", 11 | "agent": "CXS", 12 | "type": "parsing" 13 | } 14 | ], 15 | "collection": { 16 | "name": "DB", 17 | "type": "Record", 18 | "version": "3.2.0", 19 | "alias": [ 20 | "db" 21 | ] 22 | }, 23 | "publication_date": "2023-03-01T18:32:20.416449Z" 24 | }, 25 | "conf": 1.0, 26 | "prov": [ 27 | { 28 | "type": "sentence", 29 | "text": "FeSe has a Tc of 1K and 5K at 1GP and 5GPa respectivily." 30 | } 31 | ], 32 | "identifiers": [ 33 | { 34 | "_name": "db#1234567", 35 | "type": "db", 36 | "value": "1234567" 37 | } 38 | ], 39 | "subject": { 40 | "display_name": "FeSe", 41 | "type": "material", 42 | "names": [ 43 | { 44 | "type": "chemical_name", 45 | "value": "FeSe", 46 | "_name": "chemical_name#fese" 47 | }, 48 | { 49 | "type": "sum_formula", 50 | "value": "Fe(1) Se(1)", 51 | "_name": "sum_formula#fe(1) se(1)" 52 | } 53 | ], 54 | "identifiers": [ 55 | { 56 | "_name": "db#1234567", 57 | "type": "db", 58 | "value": "1234567" 59 | } 60 | ] 61 | }, 62 | "attributes": [ 63 | { 64 | "predicates": [ 65 | { 66 | "key": { 67 | "type": "property", 68 | "name": "temperature" 69 | }, 70 | "value": { 71 | "type": "property-value", 72 | "name": "5K" 73 | } 74 | }, 75 | { 76 | "key": { 77 | "type": "property", 78 | "name": "pressure" 79 | }, 80 | "value": { 81 | "type": "property-value", 82 | "name": "5GPa" 83 | } 84 | } 85 | ], 86 | "conf": 1.0, 87 | "prov": [ 88 | { 89 | "type": "sentence", 90 | "text": "FeSe has a Tc of 1K and 5K at 1GP and 5GPa respectivily." 
91 | } 92 | ] 93 | } 94 | ] 95 | } -------------------------------------------------------------------------------- /test/data/rec/record-05.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name": "some text here", 3 | "file-info": { 4 | "filename": "filename.pdf", 5 | "document-hash": "qwertyuiop1234567890" 6 | }, 7 | "description": { 8 | "logs": [], 9 | "collection": { 10 | "name": "DB", 11 | "type": "Record", 12 | "version": "3.2.0", 13 | "alias": [ 14 | "db" 15 | ] 16 | }, 17 | "publication_date": "2023-03-01T18:32:20.416449Z" 18 | }, 19 | "conf": 1.0, 20 | "prov": [], 21 | "identifiers": [ 22 | { 23 | "_name": "db#1234567", 24 | "type": "db", 25 | "value": "1234567" 26 | } 27 | ], 28 | "subject": { 29 | "display_name": "FeSe", 30 | "display_image": { 31 | "__ref_s3_data": "#/_s3_data/pdf_pages/0" 32 | }, 33 | "type": "material", 34 | "names": [ 35 | { 36 | "type": "chemical_name", 37 | "value": "FeSe", 38 | "_name": "chemical_name#fese" 39 | } 40 | ], 41 | "identifiers": [ 42 | { 43 | "_name": "db#1234567", 44 | "type": "db", 45 | "value": "1234567" 46 | } 47 | ] 48 | }, 49 | "attributes": [], 50 | "_s3_data": { 51 | "pdf_pages": [ 52 | { 53 | "mime": "application/png", 54 | "path": "PDFImages/3d201262771eb38591c1112c0ad52bfdc7ef5a352.png", 55 | "page": 9, 56 | "url": "https://s3.somecosurl.com/3d201262771eb38591c1112c0ad52bfdc7ef5a352" 57 | } 58 | ] 59 | } 60 | } -------------------------------------------------------------------------------- /test/data/rec/record-gleif-01.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name": "some text here", 3 | "file-info": { 4 | "filename": "VGRQXHF3J8VDLUA7XE92", 5 | "document-hash": "f54e9703c17364ea904d2a07d3e2739ed7c7de0f86b2800ed25bdeea5871eb88", 6 | "filename-prov": "20221013-0800-gleif-goldencopy-lei2-golden-copy.csv.zip" 7 | }, 8 | "description": { 9 | "publication_date": "2021-06-05T18:00:00.000Z", 10 | "logs": [ 11 | { 12 | "date": "2022-09-16T09:36:35.741+00:00", 13 | "agent": "CXS", 14 | "comment": "gleif parsing", 15 | "type": "parsing" 16 | } 17 | ] 18 | }, 19 | "conf": 1.0, 20 | "prov": [ 21 | { 22 | "type": "database", 23 | "text": "Gleif golden copy" 24 | } 25 | ], 26 | "identifiers": [ 27 | { 28 | "type": "some_type", 29 | "value": "my_id", 30 | "_name": "some_type#my_id" 31 | } 32 | ], 33 | "subject": { 34 | "display_name": "INTERNATIONAL BUSINESS MACHINES CORPORATION", 35 | "type": "company", 36 | "names": [ 37 | { 38 | "type": "company", 39 | "value": "INTERNATIONAL BUSINESS MACHINES CORPORATION", 40 | "_name": "company#international business machines corporation" 41 | } 42 | ], 43 | "identifiers": [ 44 | { 45 | "type": "ticker", 46 | "value": "IBM:NYSE", 47 | "_name": "ticker#ibm:nyse" 48 | } 49 | ] 50 | }, 51 | "attributes": [ 52 | { 53 | "predicates": [ 54 | { 55 | "key": { 56 | "type": "property", 57 | "name": "legal address country" 58 | }, 59 | "value": { 60 | "type": "property-value", 61 | "name": "US" 62 | } 63 | }, 64 | { 65 | "key": { 66 | "type": "property", 67 | "name": "legal address city" 68 | }, 69 | "value": { 70 | "type": "property-value", 71 | "name": "Armonk" 72 | } 73 | } 74 | ], 75 | "conf": 1.0, 76 | "prov": [ 77 | { 78 | "type": "database record", 79 | "text": "VGRQXHF3J8VDLUA7XE92" 80 | } 81 | ] 82 | } 83 | ] 84 | } -------------------------------------------------------------------------------- /test/data/rec/statement-01.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "conf": 0.25, 3 | "prov": [ 4 | { 5 | "type": "sentence", 6 | "text": "FeSe has a Tc of 1K and 5K at 1GP and 5GPa respectivily." 7 | } 8 | ], 9 | "subject": { 10 | "display_name": "FeSe", 11 | "type": "material", 12 | "names": [ 13 | { 14 | "type": "material", 15 | "value": "FeSe", 16 | "_name": "material#fese" 17 | } 18 | ], 19 | "identifiers": [ 20 | { 21 | "type": "material", 22 | "value": "Fe(1) Se(1)", 23 | "_name": "material#fe(1) se(1)" 24 | } 25 | ] 26 | }, 27 | "predicates": [ 28 | { 29 | "key": { 30 | "type": "property", 31 | "name": "Tc" 32 | }, 33 | "value": { 34 | "type": "property-value", 35 | "name": "5K" 36 | } 37 | }, 38 | { 39 | "key": { 40 | "type": "property", 41 | "name": "pressure" 42 | }, 43 | "value": { 44 | "type": "property-value", 45 | "name": "5GPa" 46 | } 47 | } 48 | ] 49 | } -------------------------------------------------------------------------------- /test/data/rec/statement-02.json: -------------------------------------------------------------------------------- 1 | { 2 | "subject": { 3 | "names": [ 4 | { 5 | "_name": "chemical_name#bi2sr2cacu2o8", 6 | "type": "chemical_name", 7 | "value": "Bi2Sr2CaCu2O8" 8 | } 9 | ], 10 | "identifiers": [ 11 | { 12 | "_name": "ent_id#b94dls9d", 13 | "type": "ent_id", 14 | "value": "b94dls9d" 15 | }, 16 | { 17 | "_name": "sum_formula#bi(2) ca(1) cu(2) o(8) sr(2)", 18 | "type": "sum_formula", 19 | "value": "Bi(2) Ca(1) Cu(2) O(8) Sr(2)" 20 | } 21 | ], 22 | "display_name": "Bi(2) Ca(1) Cu(2) O(8) Sr(2)", 23 | "type": "material" 24 | }, 25 | "conf": 1.0, 26 | "predicates": [ 27 | { 28 | "numerical_value": { 29 | "val": 0.23, 30 | "unit": "dimensionless", 31 | "min": 0.11, 32 | "err": 0.05, 33 | "max": 0.35 34 | }, 35 | "numerical_value_si": { 36 | "val": 0.23, 37 | "unit": "dimensionless", 38 | "min": 0.11, 39 | "err": 0.05, 40 | "max": 0.35 41 | }, 42 | "value": { 43 | "name": "0.11 to 0.35", 44 | "type": "property-value" 45 | }, 46 | "key": { 47 | "name": "hole concentration", 48 | "type": "property" 49 | } 50 | } 51 | ], 52 | "type": "statement", 53 | "subtype": "mat_to_prop_to_pvls", 54 | "model": "Docling Model 0.0.0", 55 | "source": "sentence.3", 56 | "match": "89f0d4058c2483678b2cc4f515acf463", 57 | "range": [ 58 | 430, 59 | 668 60 | ], 61 | "prov": [ 62 | { 63 | "text": "Here is a sentence with measurements with high-Tc superconductor, on Bi2Sr2CaCu2O8 samples with different Tc values (hole concentration of 0.11 to 0.35).", 64 | "type": "sentence", 65 | "reference": { 66 | "_name": "arxivid#0706.0214", 67 | "type": "arxivid", 68 | "value": "0706.0214" 69 | } 70 | } 71 | ], 72 | "_name": "sentence.3", 73 | "identifiers": [ 74 | { 75 | "_name": "ent_id#b94dls9d", 76 | "type": "ent_id", 77 | "value": "b94dls9d" 78 | }, 79 | { 80 | "_name": "sum_formula#bi(2) ca(1) cu(2) o(8) sr(2)", 81 | "type": "sum_formula", 82 | "value": "Bi(2) Ca(1) Cu(2) O(8) Sr(2)" 83 | } 84 | ], 85 | "file-info": { 86 | "document-hash": "ff94fd78199fe714f2bf6143ab4af8379a581587fd7049aa1a62167196f8f07e", 87 | "filename": "db-file.pdf", 88 | "filename-prov": "db-archive.zip" 89 | }, 90 | "description": { 91 | "provenance": { 92 | "source": "arXiv abstracts" 93 | }, 94 | "publication_date": "2007-06-01T20:36:32.000+00:00", 95 | "logs": [ 96 | { 97 | "agent": "CXS", 98 | "type": "parsing", 99 | "comment": "statement extraction", 100 | "date": "2022-12-15T13:24:51.778+00:00" 101 | } 102 | ] 103 | } 104 | } 
-------------------------------------------------------------------------------- /test/data/rec/statement-gleif-01.json: -------------------------------------------------------------------------------- 1 | { 2 | "conf": 0.723, 3 | "prov": [ 4 | { 5 | "type": "sentence", 6 | "reference": { 7 | "_name": "report#nyse:ibm-2022q2", 8 | "type": "report", 9 | "value": "NYSE:IBM-2022Q2" 10 | }, 11 | "path": "#/main-text/30", 12 | "span": [ 13 | 23, 14 | 67 15 | ], 16 | "text": "IBM Q2 2022 revenue reached $15.5 billion in the period ending June 30" 17 | } 18 | ], 19 | "subject": { 20 | "display_name": "IBM", 21 | "type": "company", 22 | "names": [ 23 | { 24 | "type": "company", 25 | "value": "IBM", 26 | "_name": "company#ibm" 27 | } 28 | ], 29 | "identifiers": [ 30 | { 31 | "type": "ticker", 32 | "value": "IBM:NYSE", 33 | "_name": "ticker#ibm:nyse" 34 | } 35 | ] 36 | }, 37 | "predicates": [ 38 | { 39 | "key": { 40 | "type": "property", 41 | "name": "kpi" 42 | }, 43 | "value": { 44 | "type": "property-value", 45 | "name": "$15.5 billion" 46 | } 47 | }, 48 | { 49 | "key": { 50 | "type": "property", 51 | "name": "date" 52 | }, 53 | "value": { 54 | "type": "property-value", 55 | "name": "Q2 2022" 56 | } 57 | }, 58 | { 59 | "key": { 60 | "type": "property", 61 | "name": "reporting_period" 62 | }, 63 | "value": { 64 | "type": "property-value", 65 | "name": "ending June 30" 66 | } 67 | } 68 | ] 69 | } -------------------------------------------------------------------------------- /test/data/rec/subject-01.json: -------------------------------------------------------------------------------- 1 | { 2 | "display_name": "FeSe", 3 | "type": "material", 4 | "names": [ 5 | { 6 | "type": "chemical_name", 7 | "value": "FeSe", 8 | "_name": "chemical_name#fese" 9 | }, 10 | { 11 | "type": "sum_formula", 12 | "value": "Fe(1) Se(1)", 13 | "_name": "sum_formula#fe(1) se(1)" 14 | } 15 | ], 16 | "identifiers": [ 17 | { 18 | "_name": "db#1234567", 19 | "type": "db", 20 | "value": "1234567" 21 | } 22 | ] 23 | } -------------------------------------------------------------------------------- /test/data/rec/subject-02.json: -------------------------------------------------------------------------------- 1 | { 2 | "display_name": "FeSe", 3 | "display_image": { 4 | "__ref_s3_data": "#/s3_data/figures/0" 5 | }, 6 | "type": "material", 7 | "names": [ 8 | { 9 | "type": "chemical_name", 10 | "value": "FeSe", 11 | "_name": "chemical_name#fese" 12 | } 13 | ], 14 | "identifiers": [ 15 | { 16 | "_name": "db#1234567", 17 | "type": "db", 18 | "value": "1234567" 19 | } 20 | ] 21 | } -------------------------------------------------------------------------------- /test/data/search/error-meta-01.json: -------------------------------------------------------------------------------- 1 | { 2 | "aliases": [ 3 | ".production", 4 | "arxiv" 5 | ], 6 | "created": "2022-08-15T14:10:32.768+00:00", 7 | "description": "arXiv® is a curated research-sharing platform open to anyone. 
It stores scholarly articles in the fields of physics, mathematics, computer science, quantitative biology, quantitative finance, statistics, electrical engineering and systems science, and economics.", 8 | "source": "https://arxiv.org", 9 | "storage": "crn:v1:bluemix:public:cloud-object-storage:global:a/a01ec3a49b79bf6abe91ea574f0f4715:fbb68e6e-2371-48e0-bfe3-4960a445df21:bucket:foc-deepsearch-arxiv-data", 10 | "display_name": "arXiv full documents", 11 | "type": "Reference", 12 | "classification": [ 13 | "Public", 14 | "PI" 15 | ], 16 | "license": "https://arxiv.org/about", 17 | "filename": "arxiv-gs.json", 18 | "domain": [ 19 | "Science", 20 | "Literature" 21 | ], 22 | "$ref": "ccs:schemas#/Document", 23 | "ccs_s3_data": { 24 | "endpoint": "s3.eu-de.cloud-object-storage.appdomain.cloud", 25 | "paths": [ 26 | { 27 | "bucket": "foc-deepsearch-s3-elastic", 28 | "prefix": "deepsearch-elastic-dataplatform", 29 | "infix": "cxs-8a1925c96f7b49508855ab270d0f3281" 30 | } 31 | ] 32 | } 33 | } -------------------------------------------------------------------------------- /test/data/search/error-meta-02.json: -------------------------------------------------------------------------------- 1 | { 2 | "aliases": [ 3 | ".production", 4 | "arxiv" 5 | ], 6 | "created": "2022-08-15T14:10:32.768+00:00", 7 | "description": "arXiv® is a curated research-sharing platform open to anyone. It stores scholarly articles in the fields of physics, mathematics, computer science, quantitative biology, quantitative finance, statistics, electrical engineering and systems science, and economics.", 8 | "source": "https://arxiv.org", 9 | "storage": "crn:v1:bluemix:public:cloud-object-storage:global:a/a01ec3a49b79bf6abe91ea574f0f4715:fbb68e6e-2371-48e0-bfe3-4960a445df21:bucket:foc-deepsearch-arxiv-data", 10 | "display_name": "arXiv full documents", 11 | "type": "Document", 12 | "classification": [ 13 | "Public", 14 | "PI" 15 | ], 16 | "version": [ 17 | { 18 | "name": "docling-core", 19 | "version": "beta" 20 | } 21 | ], 22 | "license": "https://arxiv.org/about", 23 | "filename": "arxiv-gs.json", 24 | "domain": [ 25 | "Science", 26 | "Banking & Finance" 27 | ], 28 | "$ref": "cps:schemas#/Record", 29 | "ccs_s3_data": { 30 | "endpoint": "s3.eu-de.cloud-object-storage.appdomain.cloud", 31 | "paths": [ 32 | { 33 | "bucket": "foc-deepsearch-s3-elastic", 34 | "prefix": "deepsearch-elastic-dataplatform", 35 | "infix": "cxs-8a1925c96f7b49508855ab270d0f3281" 36 | } 37 | ] 38 | } 39 | } -------------------------------------------------------------------------------- /test/data/search/error-meta-03.json: -------------------------------------------------------------------------------- 1 | { 2 | "created": "2022-08-15T14:10:32.768+00:00", 3 | "version": [ 4 | { 5 | "name": "docling-core", 6 | "version": "1.0.0" 7 | } 8 | ], 9 | "type": "Record", 10 | "$ref": "cps:schemas#/Record", 11 | "extra": "an extra field" 12 | } -------------------------------------------------------------------------------- /test/data/search/meta-01.json: -------------------------------------------------------------------------------- 1 | { 2 | "aliases": [ 3 | ".production", 4 | "arxiv" 5 | ], 6 | "created": "2022-08-15T14:10:32.768+00:00", 7 | "description": "arXiv® is a curated research-sharing platform open to anyone. 
It stores scholarly articles in the fields of physics, mathematics, computer science, quantitative biology, quantitative finance, statistics, electrical engineering and systems science, and economics.", 8 | "source": "https://arxiv.org", 9 | "storage": "crn:v1:bluemix:public:cloud-object-storage:global:a/a01ec3a49b79bf6abe91ea574f0f4715:fbb68e6e-2371-48e0-bfe3-4960a445df21:bucket:foc-deepsearch-arxiv-data", 10 | "display_name": "arXiv full documents", 11 | "type": "Document", 12 | "classification": [ 13 | "Public", 14 | "PI" 15 | ], 16 | "version": [ 17 | { 18 | "name": "docling-core", 19 | "version": "1.0.1" 20 | } 21 | ], 22 | "license": "https://arxiv.org/about", 23 | "filename": "arxiv-gs.json", 24 | "domain": [ 25 | "Science" 26 | ], 27 | "$ref": "ccs:schemas#/Document", 28 | "ccs_s3_data": { 29 | "endpoint": "s3.eu-de.cloud-object-storage.appdomain.cloud", 30 | "paths": [ 31 | { 32 | "bucket": "foc-deepsearch-s3-elastic", 33 | "prefix": "deepsearch-elastic-dataplatform", 34 | "infix": "cxs-8a1925c96f7b49508855ab270d0f3281" 35 | } 36 | ] 37 | } 38 | } -------------------------------------------------------------------------------- /test/data/search/meta-02.json: -------------------------------------------------------------------------------- 1 | { 2 | "aliases": [ 3 | ".production", 4 | "patent-uspto" 5 | ], 6 | "created": "2022-10-09T20:57:06.860+00:00", 7 | "description": "Patents from the US Patent and Trade Office (USPTO)", 8 | "source": "https://www.uspto.gov/", 9 | "storage": "crn:v1:bluemix:public:cloud-object-storage:global:a/a01ec3a49b79bf6abe91ea574f0f4715:fbb68e6e-2371-48e0-bfe3-4960a445df21:bucket:foc-deepsearch-uspto-data", 10 | "display_name": "Patents from USPTO", 11 | "type": "Document", 12 | "classification": [ 13 | "Public", 14 | "PI" 15 | ], 16 | "version": [ 17 | { 18 | "name": "docling-core", 19 | "version": "1.0.1" 20 | }, 21 | { 22 | "name": "deepsearch-cxs", 23 | "version": "1.0.0-alpha0.valid" 24 | } 25 | ], 26 | "license": "https://www.uspto.gov/terms-use-uspto-websites", 27 | "domain": [ 28 | "Science", 29 | "Technology" 30 | ], 31 | "$ref": "ccs:schemas#/Document" 32 | } -------------------------------------------------------------------------------- /test/data/search/meta-03.json: -------------------------------------------------------------------------------- 1 | { 2 | "aliases": [ 3 | ".production", 4 | "osm" 5 | ], 6 | "created": "2022-10-09T20:57:06.860+00:00", 7 | "description": "OpenStreetMap data", 8 | "source": "https://www.openstreetmap.org", 9 | "storage": "crn:v1:bluemix:public:cloud-object-storage:global:a/a01ec3a49b79bf6abe91ea574f0f4715:fbb68e6e-2371-48e0-bfe3-4960a445df21:bucket:foc-deepsearch-osm-data", 10 | "display_name": "OpenStreetMap", 11 | "type": "Generic", 12 | "classification": [ 13 | "Public" 14 | ], 15 | "version": [ 16 | { 17 | "name": "docling-core", 18 | "version": "1.0.1" 19 | }, 20 | { 21 | "name": "deepsearch-cxs", 22 | "version": "1.0.0-alpha0.valid" 23 | } 24 | ], 25 | "license": "https://www.openstreetmap.org/copyright", 26 | "domain": [ 27 | "Geography" 28 | ] 29 | } -------------------------------------------------------------------------------- /test/data/search/meta-04.json: -------------------------------------------------------------------------------- 1 | { 2 | "license": "", 3 | "index_key": "07a32fab8e3cf827a1d6691fd530941485282dd7", 4 | "created": "2023-11-28T15:10:08.226942+00:00", 5 | "project_key": "9cd28d76ca3d0cc853edb23976f32b44f5739839", 6 | "description": "", 7 | "source": "", 8 | "display_name": "", 9
| "type": "Document", 10 | "version": [ 11 | { 12 | "name": "docling-core", 13 | "version": "1.4.0" 14 | } 15 | ], 16 | "$ref": "ccs:schemas#/Document" 17 | } -------------------------------------------------------------------------------- /test/data/viz/2408.09869v3_enriched.dt_viz_p2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/docling-project/docling-core/878aafd1b78f7f3d730ccb4bd519650230651498/test/data/viz/2408.09869v3_enriched.dt_viz_p2.png -------------------------------------------------------------------------------- /test/data/viz/2408.09869v3_enriched_viz_p1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/docling-project/docling-core/878aafd1b78f7f3d730ccb4bd519650230651498/test/data/viz/2408.09869v3_enriched_viz_p1.png -------------------------------------------------------------------------------- /test/data/viz/2408.09869v3_enriched_viz_p2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/docling-project/docling-core/878aafd1b78f7f3d730ccb4bd519650230651498/test/data/viz/2408.09869v3_enriched_viz_p2.png -------------------------------------------------------------------------------- /test/data/viz/2408.09869v3_enriched_viz_p3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/docling-project/docling-core/878aafd1b78f7f3d730ccb4bd519650230651498/test/data/viz/2408.09869v3_enriched_viz_p3.png -------------------------------------------------------------------------------- /test/data/viz/2408.09869v3_enriched_viz_wout_lbl_p1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/docling-project/docling-core/878aafd1b78f7f3d730ccb4bd519650230651498/test/data/viz/2408.09869v3_enriched_viz_wout_lbl_p1.png -------------------------------------------------------------------------------- /test/data/viz/2408.09869v3_enriched_viz_wout_lbl_p2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/docling-project/docling-core/878aafd1b78f7f3d730ccb4bd519650230651498/test/data/viz/2408.09869v3_enriched_viz_wout_lbl_p2.png -------------------------------------------------------------------------------- /test/data/viz/2408.09869v3_enriched_viz_wout_lbl_p3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/docling-project/docling-core/878aafd1b78f7f3d730ccb4bd519650230651498/test/data/viz/2408.09869v3_enriched_viz_wout_lbl_p3.png -------------------------------------------------------------------------------- /test/test_data_gen_flag.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pydantic import TypeAdapter 4 | 5 | GEN_TEST_DATA = TypeAdapter(bool).validate_python(os.getenv("DOCLING_GEN_TEST_DATA", 0)) 6 | 7 | 8 | def test_gen_test_data_flag(): 9 | assert not GEN_TEST_DATA 10 | -------------------------------------------------------------------------------- /test/test_doc_base.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 
2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | 7 | import pytest 8 | from pydantic import ValidationError 9 | 10 | from docling_core.types.legacy_doc.base import Prov, S3Reference 11 | 12 | 13 | def test_s3_reference(): 14 | """Validate data with the S3Reference model.""" 15 | gold_dict = {"__ref_s3_data": "#/s3_data/figures/0"} 16 | data = S3Reference(__ref_s3_data="#/s3_data/figures/0") 17 | 18 | assert data.model_dump() == gold_dict 19 | assert data.model_dump(by_alias=True) == gold_dict 20 | 21 | with pytest.raises(ValidationError, match="required"): 22 | S3Reference() 23 | 24 | 25 | def test_prov(): 26 | prov = { 27 | "bbox": [ 28 | 48.19645328521729, 29 | 644.2883926391602, 30 | 563.6185592651367, 31 | 737.4546043395997, 32 | ], 33 | "page": 2, 34 | "span": [0, 0], 35 | } 36 | 37 | assert Prov(**prov) 38 | 39 | with pytest.raises(ValidationError, match="valid integer"): 40 | prov["span"] = ["foo", 0] 41 | Prov(**prov) 42 | 43 | with pytest.raises(ValidationError, match="at least 2 items"): 44 | prov["span"] = [0] 45 | Prov(**prov) 46 | -------------------------------------------------------------------------------- /test/test_doc_legacy_convert.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import yaml 4 | 5 | from docling_core.types.doc import DoclingDocument 6 | from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument 7 | from docling_core.utils.legacy import ( 8 | docling_document_to_legacy, 9 | legacy_to_docling_document, 10 | ) 11 | 12 | GENERATE = False 13 | 14 | 15 | def test_new_to_old(): 16 | filename = "test/data/doc/2206.01062.yaml" 17 | 18 | with open(filename, "r", encoding="utf-8") as fp: 19 | dict_from_yaml = yaml.safe_load(fp) 20 | 21 | doc = DoclingDocument.model_validate(dict_from_yaml) 22 | 23 | docling_document_to_legacy(doc) 24 | 25 | 26 | def test_old_to_new(): 27 | filepath = Path("test/data/legacy_doc/doc-export.json") 28 | leg_doc = DsDocument.model_validate_json(filepath.read_text()) 29 | 30 | doc = legacy_to_docling_document(leg_doc) 31 | 32 | gt_filepath = Path(filepath.with_suffix(".docling.yaml.gt")) 33 | if GENERATE: 34 | doc.save_as_yaml(gt_filepath) 35 | 36 | with gt_filepath.open() as gt_fp: 37 | gt_dict = yaml.safe_load(gt_fp) 38 | gt_doc = DoclingDocument.model_validate(gt_dict) 39 | 40 | assert doc == gt_doc 41 | -------------------------------------------------------------------------------- /test/test_doc_schema_extractor.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 
2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Test the pydantic models in module docling_core.types.legacy_doc.document.""" 7 | import json 8 | 9 | from pydantic import ValidationError 10 | 11 | from docling_core.types.legacy_doc.document import CCSDocument 12 | 13 | 14 | def test_ccs_document_update(): 15 | """Validate data with CCSDocument extract.""" 16 | filename = "test/data/legacy_doc/ext-1.json" 17 | try: 18 | with open(filename, encoding="utf-8") as f: 19 | raw_doc = json.load(f) 20 | for item in raw_doc["main-text"]: 21 | if "$ref" in item: 22 | assert False, f"$ref should not be in file {filename}" 23 | 24 | doc = CCSDocument.model_validate(raw_doc) 25 | 26 | if doc.description.abstract: 27 | assert False, "Abstract should not be present" 28 | 29 | except ValidationError as e: 30 | print(f"Validation error in file {filename}:\n{e.json()}") 31 | raise 32 | -------------------------------------------------------------------------------- /test/test_hierarchical_chunker.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | import json 7 | 8 | from docling_core.transforms.chunker import HierarchicalChunker 9 | from docling_core.transforms.chunker.hierarchical_chunker import ( 10 | ChunkingDocSerializer, 11 | ChunkingSerializerProvider, 12 | DocChunk, 13 | ) 14 | from docling_core.transforms.serializer.markdown import MarkdownTableSerializer 15 | from docling_core.types.doc import DoclingDocument as DLDocument 16 | from docling_core.types.doc.document import DoclingDocument 17 | 18 | from .test_data_gen_flag import GEN_TEST_DATA 19 | 20 | 21 | def _process(act_data, exp_path_str): 22 | if GEN_TEST_DATA: 23 | with open(exp_path_str, mode="w", encoding="utf-8") as f: 24 | json.dump(act_data, fp=f, indent=4) 25 | f.write("\n") 26 | else: 27 | with open(exp_path_str, encoding="utf-8") as f: 28 | exp_data = json.load(fp=f) 29 | assert exp_data == act_data 30 | 31 | 32 | def test_chunk(): 33 | with open("test/data/chunker/0_inp_dl_doc.json", encoding="utf-8") as f: 34 | data_json = f.read() 35 | dl_doc = DLDocument.model_validate_json(data_json) 36 | chunker = HierarchicalChunker( 37 | merge_list_items=True, 38 | ) 39 | chunks = chunker.chunk(dl_doc=dl_doc) 40 | act_data = dict( 41 | root=[DocChunk.model_validate(n).export_json_dict() for n in chunks] 42 | ) 43 | _process( 44 | act_data=act_data, 45 | exp_path_str="test/data/chunker/0_out_chunks.json", 46 | ) 47 | 48 | 49 | def test_chunk_custom_serializer(): 50 | with open("test/data/chunker/0_inp_dl_doc.json", encoding="utf-8") as f: 51 | data_json = f.read() 52 | dl_doc = DLDocument.model_validate_json(data_json) 53 | 54 | class MySerializerProvider(ChunkingSerializerProvider): 55 | def get_serializer(self, doc: DoclingDocument): 56 | return ChunkingDocSerializer( 57 | doc=doc, 58 | table_serializer=MarkdownTableSerializer(), 59 | ) 60 | 61 | chunker = HierarchicalChunker( 62 | merge_list_items=True, 63 | serializer_provider=MySerializerProvider(), 64 | ) 65 | 66 | chunks = chunker.chunk(dl_doc=dl_doc) 67 | act_data = dict( 68 | root=[DocChunk.model_validate(n).export_json_dict() for n in chunks] 69 | ) 70 | _process( 71 | act_data=act_data, 72 | exp_path_str="test/data/chunker/0b_out_chunks.json", 73 | ) 74 | -------------------------------------------------------------------------------- /test/test_nlp_qa.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 
2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Test the pydantic models in module data_types.nlp.qa.py""" 7 | import glob 8 | import unittest 9 | 10 | import pytest 11 | from pydantic import ValidationError 12 | 13 | from docling_core.types.nlp.qa import QAPair 14 | 15 | 16 | class TestQAPair(unittest.TestCase): 17 | """Test QAPair model.""" 18 | 19 | def test_qapair_read(self): 20 | """Validate data read from files.""" 21 | for filename in glob.glob("test/data/nlp/qa-*.json"): 22 | try: 23 | with open(filename, encoding="utf-8") as file_obj: 24 | file_json = file_obj.read() 25 | QAPair.model_validate_json(file_json) 26 | except ValidationError as e: 27 | print(f"Validation error in file {filename}", e.json()) 28 | raise 29 | 30 | def test_qapair_wrong(self): 31 | """Validates wrong format from files.""" 32 | filename = "test/data/nlp/error-qa-1.json" 33 | with ( 34 | pytest.raises(ValidationError, match="Input should be a valid string"), 35 | open(filename, encoding="utf-8") as file_obj, 36 | ): 37 | file_json = file_obj.read() 38 | QAPair.model_validate_json(file_json) 39 | 40 | filename = "test/data/nlp/error-qa-3.json" 41 | with ( 42 | pytest.raises(ValidationError, match="List must be unique"), 43 | open(filename, encoding="utf-8") as file_obj, 44 | ): 45 | file_json = file_obj.read() 46 | QAPair.model_validate_json(file_json) 47 | -------------------------------------------------------------------------------- /test/test_page.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import numpy as np 4 | import pytest 5 | 6 | from docling_core.types.doc.page import BoundingRectangle 7 | 8 | SQRT_2 = math.sqrt(2) 9 | 10 | R_0 = BoundingRectangle(r_x0=0, r_y0=0, r_x1=1, r_y1=0, r_x2=1, r_y2=1, r_x3=0, r_y3=1) 11 | R_45 = BoundingRectangle( 12 | r_x0=0, 13 | r_y0=0, 14 | r_x1=SQRT_2 / 2, 15 | r_y1=SQRT_2 / 2, 16 | r_x2=0, 17 | r_y2=SQRT_2, 18 | r_x3=-SQRT_2 / 2, 19 | r_y3=SQRT_2 / 2, 20 | ) 21 | R_90 = BoundingRectangle( 22 | r_x0=0, r_y0=0, r_x1=0, r_y1=1, r_x2=-1, r_y2=1, r_x3=-1, r_y3=0 23 | ) 24 | R_135 = BoundingRectangle( 25 | r_x0=0, 26 | r_y0=0, 27 | r_x1=-SQRT_2 / 2, 28 | r_y1=SQRT_2 / 2, 29 | r_x2=-SQRT_2, 30 | r_y2=0, 31 | r_x3=-SQRT_2 / 2, 32 | r_y3=-SQRT_2 / 2, 33 | ) 34 | R_180 = BoundingRectangle( 35 | r_x0=0, r_y0=0, r_x1=-0, r_y1=0, r_x2=-1, r_y2=-1, r_x3=0, r_y3=-1 36 | ) 37 | R_MINUS_135 = BoundingRectangle( 38 | r_x0=0, 39 | r_y0=0, 40 | r_x1=-SQRT_2 / 2, 41 | r_y1=-SQRT_2 / 2, 42 | r_x2=0, 43 | r_y2=-SQRT_2, 44 | r_x3=SQRT_2 / 2, 45 | r_y3=-SQRT_2 / 2, 46 | ) 47 | R_MINUS_90 = BoundingRectangle( 48 | r_x0=0, r_y0=0, r_x1=0, r_y1=-1, r_x2=1, r_y2=-1, r_x3=1, r_y3=0 49 | ) 50 | R_MINUS_45 = BoundingRectangle( 51 | r_x0=0, 52 | r_y0=0, 53 | r_x1=SQRT_2 / 2, 54 | r_y1=-SQRT_2 / 2, 55 | r_x2=SQRT_2, 56 | r_y2=0, 57 | r_x3=SQRT_2 / 2, 58 | r_y3=SQRT_2 / 2, 59 | ) 60 | 61 | 62 | @pytest.mark.parametrize( 63 | ("rectangle", "expected_angle", "expected_angle_360"), 64 | [ 65 | (R_0, 0, 0.0), 66 | (R_45, np.pi / 4, 45), 67 | (R_90, np.pi / 2, 90), 68 | (R_135, 3 * np.pi / 4, 135), 69 | (R_180, np.pi, 180), 70 | (R_MINUS_135, 5 * np.pi / 4, 225), 71 | (R_MINUS_90, 3 * np.pi / 2, 270), 72 | (R_MINUS_45, 7 * np.pi / 4, 315), 73 | ], 74 | ) 75 | def test_bounding_rectangle_angle( 76 | rectangle: BoundingRectangle, expected_angle: float, expected_angle_360: int 77 | ): 78 | assert pytest.approx(rectangle.angle, abs=1e-6) == expected_angle 79 | assert pytest.approx(rectangle.angle_360, abs=1e-6) == 
expected_angle_360 80 | -------------------------------------------------------------------------------- /test/test_search_meta.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Test the pydantic models in module search.metadata.py.""" 7 | import glob 8 | import os 9 | from typing import Literal 10 | 11 | from pydantic import ValidationError 12 | 13 | from docling_core.search.meta import Meta 14 | 15 | 16 | def test_meta(): 17 | """Validate data with Meta schema.""" 18 | taxonomy = Literal["Public", "PI"] 19 | domain = Literal[ 20 | "Science", "Technology", "History", "Art", "Literature", "Geography" 21 | ] 22 | 23 | for filename in glob.glob("test/data/search/meta-*.json"): 24 | try: 25 | with open(filename, encoding="utf-8") as file_obj: 26 | file_json = file_obj.read() 27 | Meta[taxonomy, domain].model_validate_json(file_json) 28 | except ValidationError as e: 29 | print(f"Validation error in file {filename}", e.json()) 30 | raise 31 | 32 | # test invalid documents 33 | gold_errors = { 34 | "error-meta-01.json": ["type", "version"], 35 | "error-meta-02.json": ["version", "domain", "$ref"], 36 | "error-meta-03.json": ["source", "extra"], 37 | } 38 | 39 | for filename in glob.glob("test/data/search/error-meta-*.json"): 40 | gold = gold_errors[os.path.basename(filename)] 41 | try: 42 | with open(filename, encoding="utf-8") as file_obj: 43 | file_json = file_obj.read() 44 | Meta[taxonomy, domain].model_validate_json(file_json) 45 | assert False, f"File {filename} should be an invalid metadata" 46 | except ValidationError as e: 47 | errors = e.errors() 48 | assert len(errors) == len(gold), f"Wrong number of errors in {filename}" 49 | assert all(errors[zdx]["loc"][0] == gold[zdx] for zdx in range(len(errors))) 50 | -------------------------------------------------------------------------------- /test/test_utils.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 
2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Test the pydantic models in package utils.""" 7 | import json 8 | 9 | from pydantic import Field 10 | from requests import Response 11 | 12 | from docling_core.utils.alias import AliasModel 13 | from docling_core.utils.file import resolve_source_to_path, resolve_source_to_stream 14 | 15 | 16 | def test_alias_model(): 17 | """Test the functionality of AliasModel.""" 18 | 19 | class AliasModelChild(AliasModel): 20 | foo: str = Field(alias="boo") 21 | 22 | data = {"foo": "lorem ipsum"} 23 | data_alias = {"boo": "lorem ipsum"} 24 | 25 | # data validated from dict, JSON, and constructor can use field names or aliases 26 | 27 | AliasModelChild.model_validate(data_alias) 28 | AliasModelChild.model_validate(data) 29 | 30 | AliasModelChild.model_validate_json(json.dumps(data_alias)) 31 | AliasModelChild.model_validate_json(json.dumps(data)) 32 | 33 | AliasModelChild(boo="lorem ipsum") 34 | AliasModelChild(foo="lorem ipsum") 35 | 36 | # child classes will also inherit the populate_by_name setting 37 | 38 | class AliasModelGrandChild(AliasModelChild): 39 | var: int 40 | 41 | AliasModelGrandChild(boo="lorem ipsum", var=3) 42 | AliasModelGrandChild(foo="lorem ipsum", var=3) 43 | 44 | # serialized data will always use aliases 45 | 46 | obj = AliasModelChild.model_validate(data_alias) 47 | assert obj.model_dump() == data_alias 48 | assert obj.model_dump() != data 49 | 50 | assert obj.model_dump_json() == json.dumps(data_alias, separators=(",", ":")) 51 | assert obj.model_dump_json() != json.dumps(data, separators=(",", ":")) 52 | 53 | 54 | def test_resolve_source_to_path_url_wout_path(monkeypatch): 55 | expected_str = "foo" 56 | expected_bytes = bytes(expected_str, "utf-8") 57 | 58 | def get_dummy_response(*args, **kwargs): 59 | r = Response() 60 | r.status_code = 200 61 | r._content = expected_bytes 62 | return r 63 | 64 | monkeypatch.setattr("requests.get", get_dummy_response) 65 | monkeypatch.setattr( 66 | "requests.models.Response.iter_content", 67 | lambda *args, **kwargs: [expected_bytes], 68 | ) 69 | path = resolve_source_to_path("https://pypi.org") 70 | with open(path, encoding="utf-8") as f: 71 | text = f.read() 72 | assert text == expected_str 73 | 74 | 75 | def test_resolve_source_to_stream_url_wout_path(monkeypatch): 76 | expected_str = "foo" 77 | expected_bytes = bytes(expected_str, "utf-8") 78 | 79 | def get_dummy_response(*args, **kwargs): 80 | r = Response() 81 | r.status_code = 200 82 | r._content = expected_bytes 83 | return r 84 | 85 | monkeypatch.setattr("requests.get", get_dummy_response) 86 | monkeypatch.setattr( 87 | "requests.models.Response.iter_content", 88 | lambda *args, **kwargs: [expected_bytes], 89 | ) 90 | doc_stream = resolve_source_to_stream("https://pypi.org") 91 | assert doc_stream.name == "file" 92 | 93 | text = doc_stream.stream.read().decode("utf8") 94 | assert text == expected_str 95 | -------------------------------------------------------------------------------- /test/test_visualization.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import PIL.Image 4 | 5 | from docling_core.types.doc.document import DoclingDocument 6 | 7 | from .test_data_gen_flag import GEN_TEST_DATA 8 | 9 | VIZ_TEST_DATA_PATH = Path("./test/data/viz") 10 | 11 | 12 | def verify(exp_file: Path, actual: PIL.Image.Image): 13 | if GEN_TEST_DATA: 14 | # regenerating: save the reference image directly to the expected path 15 | actual.save(exp_file) 16 | else: 17 | with 
PIL.Image.open(exp_file) as expected: 18 | assert actual == expected 19 | 20 | 21 | def test_doc_visualization(): 22 | src = Path("./test/data/doc/2408.09869v3_enriched.json") 23 | doc = DoclingDocument.load_from_json(src) 24 | viz_pages = doc.get_visualization() 25 | for k in viz_pages: 26 | if k <= 3: 27 | verify( 28 | exp_file=VIZ_TEST_DATA_PATH / f"{src.stem}_viz_p{k}.png", 29 | actual=viz_pages[k], 30 | ) 31 | 32 | 33 | def test_doc_visualization_inline_circumscribed_bbox(): 34 | src = Path("./test/data/doc/2408.09869v3_enriched.dt.json") 35 | doc = DoclingDocument.load_from_json(src) 36 | viz_pages = doc.get_visualization() 37 | for k in viz_pages: 38 | if k == 2: 39 | verify( 40 | exp_file=VIZ_TEST_DATA_PATH / f"{src.stem}_viz_p{k}.png", 41 | actual=viz_pages[k], 42 | ) 43 | 44 | 45 | def test_doc_visualization_no_label(): 46 | src = Path("./test/data/doc/2408.09869v3_enriched.json") 47 | doc = DoclingDocument.load_from_json(src) 48 | viz_pages = doc.get_visualization(show_label=False) 49 | for k in viz_pages: 50 | if k <= 3: 51 | verify( 52 | exp_file=VIZ_TEST_DATA_PATH / f"{src.stem}_viz_wout_lbl_p{k}.png", 53 | actual=viz_pages[k], 54 | ) 55 | --------------------------------------------------------------------------------