├── .flake8 ├── .gitattributes ├── .github ├── codecov.yml ├── mergify.yml ├── scripts │ └── release.sh └── workflows │ ├── cd.yml │ ├── checks.yml │ ├── ci.yml │ └── pypi.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .whitesource ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── MAINTAINERS.md ├── README.md ├── docling_core ├── __init__.py ├── cli │ ├── __init__.py │ └── view.py ├── experimental │ └── __init__.py ├── py.typed ├── resources │ └── schemas │ │ ├── doc │ │ ├── ANN.json │ │ ├── DOC.json │ │ ├── OCR-output.json │ │ └── RAW.json │ │ ├── generated │ │ ├── ccs_document_schema.json │ │ └── minimal_document_schema_flat.json │ │ └── search │ │ ├── search_doc_mapping.json │ │ └── search_doc_mapping_v2.json ├── search │ ├── __init__.py │ ├── json_schema_to_search_mapper.py │ ├── mapping.py │ ├── meta.py │ └── package.py ├── transforms │ ├── __init__.py │ ├── chunker │ │ ├── __init__.py │ │ ├── base.py │ │ ├── hierarchical_chunker.py │ │ ├── hybrid_chunker.py │ │ └── tokenizer │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── huggingface.py │ │ │ └── openai.py │ ├── serializer │ │ ├── __init__.py │ │ ├── base.py │ │ ├── common.py │ │ ├── doctags.py │ │ ├── html.py │ │ ├── html_styles.py │ │ └── markdown.py │ └── visualizer │ │ ├── __init__.py │ │ ├── base.py │ │ ├── layout_visualizer.py │ │ └── reading_order_visualizer.py ├── types │ ├── __init__.py │ ├── base.py │ ├── doc │ │ ├── __init__.py │ │ ├── base.py │ │ ├── document.py │ │ ├── labels.py │ │ ├── page.py │ │ ├── tokens.py │ │ └── utils.py │ ├── gen │ │ ├── __init__.py │ │ └── generic.py │ ├── io │ │ └── __init__.py │ ├── legacy_doc │ │ ├── __init__.py │ │ ├── base.py │ │ ├── doc_ann.py │ │ ├── doc_ocr.py │ │ ├── doc_raw.py │ │ ├── document.py │ │ └── tokens.py │ ├── nlp │ │ ├── __init__.py │ │ ├── qa.py │ │ └── qa_labels.py │ └── rec │ │ ├── __init__.py │ │ ├── attribute.py │ │ ├── base.py │ │ ├── predicate.py │ │ ├── record.py │ │ ├── statement.py │ │ └── subject.py └── utils │ ├── __init__.py │ ├── alias.py │ ├── file.py │ ├── generate_docs.py │ ├── generate_jsonschema.py │ ├── legacy.py │ ├── validate.py │ └── validators.py ├── docs ├── DoclingDocument.json ├── Generic.json └── Record.json ├── examples ├── 2408.09869v3.json ├── chunking_and_serialization.ipynb └── table_annotations.ipynb ├── pyproject.toml ├── test ├── __init__.py ├── data │ ├── chunker │ │ ├── 0_inp_dl_doc.json │ │ ├── 0_out_chunks.json │ │ ├── 0b_out_chunks.json │ │ ├── 2_inp_dl_doc.json │ │ ├── 2a_out_chunks.json │ │ ├── 2a_out_ser_chunks.json │ │ ├── 2b_out_chunks.json │ │ ├── 2c_out_chunks.json │ │ ├── 2d_out_ser_chunks.json │ │ ├── 2e_out_chunks.json │ │ ├── 2f_out_chunks.json │ │ └── 2g_out_chunks.json │ ├── doc │ │ ├── 01030000000083.dt │ │ ├── 01030000000083.png │ │ ├── 01030000000111.dt │ │ ├── 01030000000111.png │ │ ├── 2106.09680v1.json │ │ ├── 2206.01062-1.0.0.json │ │ ├── 2206.01062.yaml │ │ ├── 2206.01062.yaml.dt │ │ ├── 2206.01062.yaml.dt.json │ │ ├── 2206.01062.yaml.et │ │ ├── 2206.01062.yaml.html │ │ ├── 2206.01062.yaml.md │ │ ├── 2206.01062.yaml.min.dt │ │ ├── 2206.01062.yaml.paged.md │ │ ├── 2408.09869_p1.json │ │ ├── 2408.09869_p1_split.gt.html │ │ ├── 2408.09869v3_enriched.dt │ │ ├── 2408.09869v3_enriched.dt.json │ │ ├── 2408.09869v3_enriched.gt.md │ │ ├── 2408.09869v3_enriched.json │ │ ├── 2408.09869v3_enriched.out.dt │ │ ├── 2408.09869v3_enriched.out.dt.json │ │ ├── 2408.09869v3_enriched_p1_include_annotations_false.gt.html │ │ ├── 2408.09869v3_enriched_p1_include_annotations_false.gt.md │ │ ├── 
2408.09869v3_enriched_p1_include_annotations_true.gt.html │ │ ├── 2408.09869v3_enriched_p1_mark_annotations_false.gt.md │ │ ├── 2408.09869v3_enriched_p1_mark_annotations_true.gt.md │ │ ├── 2408.09869v3_enriched_split.gt.html │ │ ├── 2408.09869v3_enriched_split_p2.gt.html │ │ ├── activities.gt.html │ │ ├── activities.gt.md │ │ ├── activities.json │ │ ├── activities_p1.gt.html │ │ ├── activities_p2.gt.html │ │ ├── activities_p2.gt.md │ │ ├── activities_pb_empty.gt.md │ │ ├── activities_pb_non_empty.gt.md │ │ ├── activities_pb_none.gt.md │ │ ├── bad_doc.yaml.dt │ │ ├── bad_doc.yaml.et │ │ ├── bad_doc.yaml.html │ │ ├── bad_doc.yaml.md │ │ ├── barchart.dt │ │ ├── barchart.gt.html │ │ ├── barchart.gt.md │ │ ├── barchart.json │ │ ├── barchart.png │ │ ├── constructed_doc.appended_child.json.gt │ │ ├── constructed_doc.deleted_group.json.gt │ │ ├── constructed_doc.deleted_picture.json.gt │ │ ├── constructed_doc.deleted_table.json.gt │ │ ├── constructed_doc.deleted_text.json.gt │ │ ├── constructed_doc.dt │ │ ├── constructed_doc.dt.gt │ │ ├── constructed_doc.embedded.html.gt │ │ ├── constructed_doc.embedded.json.gt │ │ ├── constructed_doc.embedded.md.gt │ │ ├── constructed_doc.embedded.yaml.gt │ │ ├── constructed_doc.html │ │ ├── constructed_doc.inserted_text.json.gt │ │ ├── constructed_doc.placeholder.html.gt │ │ ├── constructed_doc.placeholder.md.gt │ │ ├── constructed_doc.referenced.html.gt │ │ ├── constructed_doc.referenced.json.gt │ │ ├── constructed_doc.referenced.md.gt │ │ ├── constructed_doc.referenced.yaml.gt │ │ ├── constructed_doc.replaced_item.json.gt │ │ ├── constructed_document.yaml.dt │ │ ├── constructed_document.yaml.et │ │ ├── constructed_document.yaml.html │ │ ├── constructed_document.yaml.md │ │ ├── constructed_images │ │ │ ├── image_000001_797618e862d279d4e3e92f4b6313175f67e08fc36051dfda092bf63220568703.png │ │ │ ├── image_000001_ccb4cbe7039fe17892f3d611cfb71eafff1d4d230b19b10779334cc4b63c98bc.png │ │ │ └── image_000001_f3cc103136423a57975750907ebc1d367e2985ac6338976d4d5a439f50323f4a.png │ │ ├── doc_with_kv.dt │ │ ├── doc_with_kv.dt.json │ │ ├── doc_with_kv.png │ │ ├── dummy_doc.yaml │ │ ├── dummy_doc.yaml.dt │ │ ├── dummy_doc.yaml.et │ │ ├── dummy_doc.yaml.html │ │ ├── dummy_doc.yaml.md │ │ ├── dummy_doc.yaml.min.dt │ │ ├── misplaced_list_items.out.yaml │ │ ├── misplaced_list_items.yaml │ │ ├── misplaced_list_items.yaml.dt │ │ ├── page_with_pic.dt │ │ ├── page_with_pic.dt.json │ │ ├── page_with_pic.png │ │ ├── page_with_pic_from_files.dt.json │ │ └── page_without_pic.dt.json │ ├── docling_document │ │ ├── export │ │ │ └── formula_mathml.html │ │ └── unit │ │ │ ├── CodeItem.yaml │ │ │ ├── FloatingItem.yaml │ │ │ ├── FormItem.yaml │ │ │ ├── FormulaItem.yaml │ │ │ ├── KeyValueItem.yaml │ │ │ ├── ListItem.yaml │ │ │ ├── PictureItem.yaml │ │ │ ├── SectionHeaderItem.yaml │ │ │ ├── TableItem.yaml │ │ │ ├── TextItem.yaml │ │ │ └── TitleItem.yaml │ ├── json_schemas │ │ ├── base_identifier.json │ │ ├── base_log.json │ │ ├── dbrecord-ref.json │ │ └── document-ref.json │ ├── legacy_doc │ │ ├── doc-1.json │ │ ├── doc-1.json_table_0.dt.txt │ │ ├── doc-2.json │ │ ├── doc-2.json_table_0.dt.txt │ │ ├── doc-3.json │ │ ├── doc-4.json │ │ ├── doc-5.json │ │ ├── doc-6.json │ │ ├── doc-6.json_table_0.dt.txt │ │ ├── doc-7.json │ │ ├── doc-7.json_table_0.dt.txt │ │ ├── doc-8.json │ │ ├── doc-8.json_table_0.dt.txt │ │ ├── doc-9.json │ │ ├── doc-export.docling.yaml.gt │ │ ├── doc-export.dt.txt │ │ ├── doc-export.json │ │ ├── doc-export.json_table_0.dt.txt │ │ ├── doc-export.md │ │ ├── error-1.json │ │ ├── 
error-2.json │ │ ├── error-3.json │ │ ├── ext-1.json │ │ └── intermediates │ │ │ ├── ann.01.json │ │ │ ├── cells.01.json │ │ │ ├── final-doc.01.json │ │ │ ├── pdf.meta.01.json │ │ │ ├── publication_journal.json │ │ │ ├── publication_venue.json │ │ │ └── raw.meta.01.json │ ├── nlp │ │ ├── error-qa-1.json │ │ ├── error-qa-3.json │ │ ├── qa-1.json │ │ ├── qa-2.json │ │ └── qa-3.json │ ├── rec │ │ ├── attribute-01.json │ │ ├── attribute-02.json │ │ ├── attribute-03.json │ │ ├── error-attribute-01.json │ │ ├── error-attribute-02.json │ │ ├── error-predicate-01.json │ │ ├── error-predicate-02.json │ │ ├── predicate-01.json │ │ ├── predicate-02.json │ │ ├── record-01.json │ │ ├── record-02.json │ │ ├── record-03.json │ │ ├── record-04.json │ │ ├── record-05.json │ │ ├── record-gleif-01.json │ │ ├── statement-01.json │ │ ├── statement-02.json │ │ ├── statement-gleif-01.json │ │ ├── subject-01.json │ │ └── subject-02.json │ ├── search │ │ ├── error-meta-01.json │ │ ├── error-meta-02.json │ │ ├── error-meta-03.json │ │ ├── meta-01.json │ │ ├── meta-02.json │ │ ├── meta-03.json │ │ └── meta-04.json │ └── viz │ │ ├── 2408.09869v3_enriched.dt_viz_p2.png │ │ ├── 2408.09869v3_enriched_viz_p1.png │ │ ├── 2408.09869v3_enriched_viz_p2.png │ │ ├── 2408.09869v3_enriched_viz_p3.png │ │ ├── 2408.09869v3_enriched_viz_wout_lbl_p1.png │ │ ├── 2408.09869v3_enriched_viz_wout_lbl_p2.png │ │ └── 2408.09869v3_enriched_viz_wout_lbl_p3.png ├── test_base.py ├── test_collection.py ├── test_data_gen_flag.py ├── test_doc_base.py ├── test_doc_legacy_convert.py ├── test_doc_schema.py ├── test_doc_schema_extractor.py ├── test_docling_doc.py ├── test_doctags_load.py ├── test_hierarchical_chunker.py ├── test_hybrid_chunker.py ├── test_json_schema_to_search_mapper.py ├── test_nlp_qa.py ├── test_otsl_table_export.py ├── test_page.py ├── test_rec_schema.py ├── test_search_meta.py ├── test_serialization.py ├── test_utils.py └── test_visualization.py └── uv.lock /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | per-file-ignores = __init__.py:F401 3 | max-line-length = 120 4 | exclude = test/* 5 | max-complexity = 25 6 | docstring-convention = google 7 | ignore = W503,E203 8 | classmethod-decorators = classmethod,validator 9 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | test/data/** linguist-vendored 2 | -------------------------------------------------------------------------------- /.github/codecov.yml: -------------------------------------------------------------------------------- 1 | codecov: 2 | # https://docs.codecov.io/docs/comparing-commits 3 | allow_coverage_offsets: true 4 | coverage: 5 | status: 6 | project: 7 | default: 8 | informational: true 9 | target: auto # auto compares coverage to the previous base commit 10 | if_ci_failed: success 11 | flags: 12 | - docling 13 | comment: 14 | layout: "reach, diff, flags, files" 15 | behavior: default 16 | require_changes: false # if true: only post the comment if coverage changes 17 | branches: # branch names that can post comment 18 | - "main" 19 | -------------------------------------------------------------------------------- /.github/mergify.yml: -------------------------------------------------------------------------------- 1 | merge_protections: 2 | - name: Enforce conventional commit 3 | description: Make sure that we follow https://www.conventionalcommits.org/en/v1.0.0/ 4 | if: 5 
| - base = main 6 | success_conditions: 7 | - "title ~= 8 | ^(fix|feat|docs|style|refactor|perf|test|build|ci|chore|revert)(?:\\(.+\ 9 | \\))?(!)?:" 10 | - name: Require two reviewers for test updates 11 | description: When test data is updated, we require two reviewers 12 | if: 13 | - base = main 14 | - files ~= ^test 15 | success_conditions: 16 | - "#approved-reviews-by >= 2" 17 | -------------------------------------------------------------------------------- /.github/scripts/release.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e # trigger failure on error - do not remove! 4 | set -x # display command on output 5 | 6 | if [ -z "${TARGET_VERSION}" ]; then 7 | >&2 echo "No TARGET_VERSION specified" 8 | exit 1 9 | fi 10 | CHGLOG_FILE="${CHGLOG_FILE:-CHANGELOG.md}" 11 | 12 | # update package version 13 | uvx --from=toml-cli toml set --toml-path=pyproject.toml project.version "${TARGET_VERSION}" 14 | uv lock --upgrade-package docling-core 15 | 16 | # collect release notes 17 | REL_NOTES=$(mktemp) 18 | uv run --no-sync semantic-release changelog --unreleased >> "${REL_NOTES}" 19 | 20 | # update changelog 21 | TMP_CHGLOG=$(mktemp) 22 | TARGET_TAG_NAME="v${TARGET_VERSION}" 23 | RELEASE_URL="$(gh repo view --json url -q ".url")/releases/tag/${TARGET_TAG_NAME}" 24 | printf "## [${TARGET_TAG_NAME}](${RELEASE_URL}) - $(date -Idate)\n\n" >> "${TMP_CHGLOG}" 25 | cat "${REL_NOTES}" >> "${TMP_CHGLOG}" 26 | if [ -f "${CHGLOG_FILE}" ]; then 27 | printf "\n" | cat - "${CHGLOG_FILE}" >> "${TMP_CHGLOG}" 28 | fi 29 | mv "${TMP_CHGLOG}" "${CHGLOG_FILE}" 30 | 31 | # push changes 32 | git config --global user.name 'github-actions[bot]' 33 | git config --global user.email 'github-actions[bot]@users.noreply.github.com' 34 | git add pyproject.toml uv.lock "${CHGLOG_FILE}" 35 | COMMIT_MSG="chore: bump version to ${TARGET_VERSION} [skip ci]" 36 | git commit -m "${COMMIT_MSG}" 37 | git push origin main 38 | 39 | # create GitHub release (incl.
Git tag) 40 | gh release create "${TARGET_TAG_NAME}" -F "${REL_NOTES}" 41 | -------------------------------------------------------------------------------- /.github/workflows/cd.yml: -------------------------------------------------------------------------------- 1 | name: "Run CD" 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | env: 7 | # disable keyring (https://github.com/actions/runner-images/issues/6185): 8 | PYTHON_KEYRING_BACKEND: keyring.backends.null.Keyring 9 | 10 | jobs: 11 | code-checks: 12 | uses: ./.github/workflows/checks.yml 13 | with: 14 | push_coverage: false 15 | pre-release-check: 16 | runs-on: ubuntu-latest 17 | outputs: 18 | TARGET_TAG_V: ${{ steps.version_check.outputs.TRGT_VERSION }} 19 | steps: 20 | - uses: actions/checkout@v4 21 | with: 22 | fetch-depth: 0 # for fetching tags, required for semantic-release 23 | - name: Install uv and set the python version 24 | uses: astral-sh/setup-uv@v5 25 | with: 26 | enable-cache: true 27 | - name: Install dependencies 28 | run: uv sync --only-dev 29 | - name: Check version of potential release 30 | id: version_check 31 | run: | 32 | TRGT_VERSION=$(uv run --no-sync semantic-release print-version) 33 | echo "TRGT_VERSION=${TRGT_VERSION}" >> "$GITHUB_OUTPUT" 34 | echo "${TRGT_VERSION}" 35 | - name: Check notes of potential release 36 | run: uv run --no-sync semantic-release changelog --unreleased 37 | release: 38 | needs: [code-checks, pre-release-check] 39 | if: needs.pre-release-check.outputs.TARGET_TAG_V != '' 40 | environment: auto-release 41 | runs-on: ubuntu-latest 42 | concurrency: release 43 | steps: 44 | - uses: actions/create-github-app-token@v1 45 | id: app-token 46 | with: 47 | app-id: ${{ vars.CI_APP_ID }} 48 | private-key: ${{ secrets.CI_PRIVATE_KEY }} 49 | - uses: actions/checkout@v4 50 | with: 51 | token: ${{ steps.app-token.outputs.token }} 52 | fetch-depth: 0 # for fetching tags, required for semantic-release 53 | - name: Install uv and set the python version 54 | uses: astral-sh/setup-uv@v5 55 | with: 56 | enable-cache: true 57 | - name: Install dependencies 58 | run: uv sync --only-dev 59 | - name: Run release script 60 | env: 61 | GH_TOKEN: ${{ steps.app-token.outputs.token }} 62 | TARGET_VERSION: ${{ needs.pre-release-check.outputs.TARGET_TAG_V }} 63 | CHGLOG_FILE: CHANGELOG.md 64 | run: ./.github/scripts/release.sh 65 | shell: bash 66 | -------------------------------------------------------------------------------- /.github/workflows/checks.yml: -------------------------------------------------------------------------------- 1 | on: 2 | workflow_call: 3 | inputs: 4 | push_coverage: 5 | type: boolean 6 | description: "If true, the coverage results are pushed to codecov.io." 
7 | default: true 8 | secrets: 9 | CODECOV_TOKEN: 10 | required: false 11 | 12 | env: 13 | HF_HUB_DOWNLOAD_TIMEOUT: "60" 14 | HF_HUB_ETAG_TIMEOUT: "60" 15 | 16 | jobs: 17 | run-checks: 18 | runs-on: ubuntu-latest 19 | strategy: 20 | matrix: 21 | python-version: ['3.9', '3.10', '3.11', '3.12', '3.13'] 22 | steps: 23 | - uses: actions/checkout@v4 24 | - name: Cache Hugging Face models 25 | uses: actions/cache@v4 26 | with: 27 | path: ~/.cache/huggingface 28 | key: huggingface-cache-py${{ matrix.python-version }} 29 | - name: Install uv and set the python version 30 | uses: astral-sh/setup-uv@v5 31 | with: 32 | python-version: ${{ matrix.python-version }} 33 | enable-cache: true 34 | - name: pre-commit cache key 35 | run: echo "PY=$(python -VV | sha256sum | cut -d' ' -f1)" >> "$GITHUB_ENV" 36 | - uses: actions/cache@v4 37 | with: 38 | path: ~/.cache/pre-commit 39 | key: pre-commit|${{ env.PY }}|${{ hashFiles('.pre-commit-config.yaml') }} 40 | - name: Install dependencies 41 | run: uv sync --frozen --all-extras 42 | - name: Check style and run tests 43 | run: pre-commit run --all-files 44 | - name: Upload coverage to Codecov 45 | if: inputs.push_coverage 46 | uses: codecov/codecov-action@v5 47 | with: 48 | token: ${{ secrets.CODECOV_TOKEN }} 49 | files: ./coverage.xml 50 | 51 | build-package: 52 | runs-on: ubuntu-latest 53 | strategy: 54 | matrix: 55 | python-version: ['3.12'] 56 | steps: 57 | - uses: actions/checkout@v4 58 | - name: Install uv and set the python version 59 | uses: astral-sh/setup-uv@v5 60 | with: 61 | python-version: ${{ matrix.python-version }} 62 | enable-cache: true 63 | - name: Install dependencies 64 | run: uv sync --all-extras 65 | - name: Build package 66 | run: uv build 67 | - name: Check content of wheel 68 | run: unzip -l dist/*.whl 69 | - name: Store the distribution packages 70 | uses: actions/upload-artifact@v4 71 | with: 72 | name: python-package-distributions 73 | path: dist/ 74 | 75 | test-package: 76 | needs: 77 | - build-package 78 | runs-on: ubuntu-latest 79 | strategy: 80 | matrix: 81 | python-version: ['3.12'] 82 | steps: 83 | - name: Download all the dists 84 | uses: actions/download-artifact@v4 85 | with: 86 | name: python-package-distributions 87 | path: dist/ 88 | - name: Install uv and set the python version 89 | uses: astral-sh/setup-uv@v5 90 | with: 91 | python-version: ${{ matrix.python-version }} 92 | enable-cache: true 93 | - name: Install package 94 | run: uv pip install dist/*.whl 95 | - name: Load the DoclingDocument package 96 | run: python -c 'from docling_core.types.doc import DoclingDocument' 97 | - name: Check if package data is present 98 | run: python -c 'from importlib import resources; from pathlib import Path; p=Path(resources.files("docling_core").joinpath("resources/schemas/doc/DOC.json")); assert p.exists()' 99 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: "Run CI" 2 | 3 | on: 4 | pull_request: 5 | types: [opened, reopened, synchronize] 6 | push: 7 | branches: 8 | - "**" 9 | - "!gh-pages" 10 | 11 | jobs: 12 | code-checks: 13 | if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name != 'docling-project/docling-core' }} 14 | uses: ./.github/workflows/checks.yml 15 | secrets: 16 | CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} 17 | 18 |
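# Illustrative sketch (not part of the repository's files): checks.yml above is a
# reusable workflow (`on: workflow_call`), so a caller can toggle the Codecov
# upload via its `push_coverage` input; cd.yml in this repo calls it with the
# flag disabled. A minimal caller job (job name arbitrary) would be:
#
#   jobs:
#     code-checks:
#       uses: ./.github/workflows/checks.yml
#       with:
#         push_coverage: false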
-------------------------------------------------------------------------------- /.github/workflows/pypi.yml: -------------------------------------------------------------------------------- 1 | name: "Build and publish package" 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | permissions: 8 | contents: read 9 | 10 | jobs: 11 | build-and-publish: 12 | runs-on: ubuntu-latest 13 | strategy: 14 | matrix: 15 | python-version: ['3.12'] 16 | environment: 17 | name: pypi 18 | url: https://pypi.org/p/docling-core 19 | permissions: 20 | id-token: write # IMPORTANT: mandatory for trusted publishing 21 | steps: 22 | - uses: actions/checkout@v4 23 | - name: Install uv and set the python version 24 | uses: astral-sh/setup-uv@v5 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | enable-cache: true 28 | - name: Install dependencies 29 | run: uv sync --all-extras 30 | - name: Build package 31 | run: uv build 32 | - name: Publish distribution 📦 to PyPI 33 | uses: pypa/gh-action-pypi-publish@release/v1 34 | with: 35 | attestations: true 36 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | 3 | .idea/ 4 | *~ 5 | *.DS_Store 6 | test/data/constructed_images* 7 | test/data/doc/constructed_doc*.html 8 | test/data/doc/constructed_doc*.yaml 9 | test/data/doc/constructed_doc*.json 10 | test/data/doc/constructed_doc*.dt 11 | test/data/doc/constructed_doc*.md 12 | 13 | # Byte-compiled / optimized / DLL files 14 | __pycache__/ 15 | *.py[cod] 16 | *$py.class 17 | 18 | # C extensions 19 | *.so 20 | 21 | # Distribution / packaging 22 | .Python 23 | build/ 24 | develop-eggs/ 25 | dist/ 26 | downloads/ 27 | eggs/ 28 | .eggs/ 29 | lib/ 30 | lib64/ 31 | parts/ 32 | sdist/ 33 | var/ 34 | wheels/ 35 | *.egg-info/ 36 | .installed.cfg 37 | *.egg 38 | MANIFEST 39 | 40 | # PyInstaller 41 | # Usually these files are written by a python script from a template 42 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
43 | *.manifest 44 | *.spec 45 | 46 | # Installer logs 47 | pip-log.txt 48 | pip-delete-this-directory.txt 49 | 50 | # Unit test / coverage reports 51 | htmlcov/ 52 | .tox/ 53 | .coverage 54 | .coverage.* 55 | .cache 56 | nosetests.xml 57 | coverage.xml 58 | *.cover 59 | .hypothesis/ 60 | .pytest_cache/ 61 | 62 | # Translations 63 | *.mo 64 | *.pot 65 | 66 | # Django stuff: 67 | *.log 68 | local_settings.py 69 | db.sqlite3 70 | 71 | # Flask stuff: 72 | instance/ 73 | .webassets-cache 74 | 75 | # Scrapy stuff: 76 | .scrapy 77 | 78 | # Sphinx documentation 79 | docs/_build/ 80 | 81 | # PyBuilder 82 | target/ 83 | 84 | # Jupyter Notebook 85 | .ipynb_checkpoints 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # celery beat schedule file 91 | celerybeat-schedule 92 | 93 | # SageMath parsed files 94 | *.sage.py 95 | 96 | # Environments 97 | .env 98 | .venv 99 | env/ 100 | venv/ 101 | ENV/ 102 | env.bak/ 103 | venv.bak/ 104 | 105 | # Spyder project settings 106 | .spyderproject 107 | .spyproject 108 | 109 | # Rope project settings 110 | .ropeproject 111 | 112 | # mkdocs documentation 113 | /site 114 | 115 | # mypy 116 | .mypy_cache/ 117 | 118 | # VisualStudioCode 119 | .vscode/ -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | fail_fast: true 2 | repos: 3 | - repo: local 4 | hooks: 5 | - id: black 6 | name: Black 7 | entry: uv run --no-sync black docling_core test 8 | pass_filenames: false 9 | language: system 10 | files: '\.py$' 11 | - repo: local 12 | hooks: 13 | - id: isort 14 | name: isort 15 | entry: uv run --no-sync isort docling_core test 16 | pass_filenames: false 17 | language: system 18 | files: '\.py$' 19 | - repo: local 20 | hooks: 21 | - id: autoflake 22 | name: autoflake 23 | entry: uv run --no-sync autoflake docling_core test 24 | pass_filenames: false 25 | language: system 26 | files: '\.py$' 27 | - repo: local 28 | hooks: 29 | - id: mypy 30 | name: MyPy 31 | entry: uv run --no-sync mypy docling_core test 32 | pass_filenames: false 33 | language: system 34 | files: '\.py$' 35 | - repo: local 36 | hooks: 37 | - id: flake8 38 | name: Flake8 39 | entry: uv run --no-sync flake8 docling_core 40 | pass_filenames: false 41 | language: system 42 | files: '\.py$' 43 | - repo: local 44 | hooks: 45 | - id: pytest 46 | name: Pytest 47 | entry: uv run --no-sync pytest --cov=docling_core --cov-report=xml test 48 | pass_filenames: false 49 | language: system 50 | files: '\.py$' 51 | - repo: local 52 | hooks: 53 | - id: docs 54 | name: Docs 55 | entry: uv run --no-sync python -m docling_core.utils.generate_docs docs 56 | pass_filenames: false 57 | language: system 58 | files: '\.py$' 59 | - repo: https://github.com/astral-sh/uv-pre-commit 60 | rev: 0.7.8 61 | hooks: 62 | - id: uv-lock 63 | -------------------------------------------------------------------------------- /.whitesource: -------------------------------------------------------------------------------- 1 | { 2 | "settingsInheritedFrom": "whitesource-config/whitesource-config@master" 3 | } -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | 
our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the [project team](./MAINTAINERS.md). All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 
67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## Contributing In General 2 | Our project welcomes external contributions. If you have an itch, please feel 3 | free to scratch it. 4 | 5 | For more details on the contributing guidelines head to the Docling Project [community repository](https://github.com/docling-project/community). 6 | 7 | ## Developing 8 | 9 | ### Usage of uv 10 | 11 | We use [uv](https://docs.astral.sh/uv/) as package and project manager. 12 | 13 | #### Installation 14 | 15 | To install `uv`, check the documentation on [Installing uv](https://docs.astral.sh/uv/getting-started/installation/). 16 | 17 | #### Create an environment and sync it 18 | 19 | You can use the `uv sync` command to create a project virtual environment (if it does not already exist) and sync 20 | the project's dependencies with the environment. 21 | 22 | ```bash 23 | uv sync 24 | ``` 25 | 26 | #### Use a specific Python version (optional) 27 | 28 | If you need to work with a specific version of Python, you can create a new virtual environment for that version 29 | and run the sync command: 30 | 31 | ```bash 32 | uv venv --python 3.12 33 | uv sync 34 | ``` 35 | 36 | More detailed options are described on the [Using Python environments](https://docs.astral.sh/uv/pip/environments/) documentation. 37 | 38 | #### Add a new dependency 39 | 40 | Simply use the `uv add` command. The `pyproject.toml` and `uv.lock` files will be updated. 41 | 42 | ```bash 43 | uv add [OPTIONS] <PACKAGES> 44 | ``` 45 | 46 | ### Code style guidelines 47 | 48 | We use the following tools to enforce code style: 49 | 50 | - isort, to sort imports 51 | - Black, to format code 52 | - Flake8, to lint code 53 | - autoflake, to remove unused variables and imports 54 | - [MyPy](https://mypy.readthedocs.io), as static type checker 55 | 56 | A set of styling checks, as well as regression tests, are defined and managed through the [pre-commit](https://pre-commit.com/) framework. To ensure that those scripts run automatically before a commit is finalized, install `pre-commit` on your local repository: 57 | 58 | ```bash 59 | uv run pre-commit install 60 | ``` 61 | 62 | To run the checks on-demand, type: 63 | 64 | ```bash 65 | uv run pre-commit run --all-files 66 | ``` 67 | 68 | Note: Checks like `Black` and `isort` will _fail_ if they modify files. This is because `pre-commit` doesn't like to see files modified by their hooks. In these cases, `git add` the modified files and `git commit` again. 69 | 70 | 71 | ### Documentation 72 | 73 | We use [JSON Schema for Humans](https://github.com/coveooss/json-schema-for-humans) to generate Markdown pages documenting the JSON schema of the Docling objects. 74 | 75 | The documentation pages are stored in the [docs](./docs/) folder and are updated at every commit, as part of the `pre-commit` check hooks.
76 | To generate the documentation on-demand, run: 77 | 78 | ```bash 79 | uv run python -m docling_core.utils.generate_docs docs 80 | ``` 81 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 International Business Machines 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /MAINTAINERS.md: -------------------------------------------------------------------------------- 1 | # MAINTAINERS 2 | 3 | - Cesar Berrospi Ramis - [@ceberam](https://github.com/ceberam) 4 | - Michele Dolfi - [@dolfim-ibm](https://github.com/dolfim-ibm) 5 | - Christoph Auer - [@cau-git](https://github.com/cau-git) 6 | - Panos Vagenas - [@vagenas](https://github.com/vagenas) 7 | - Peter Staar - [@PeterStaar-IBM](https://github.com/PeterStaar-IBM) 8 | 9 | Maintainers can be contacted at [deepsearch-core@zurich.ibm.com](mailto:deepsearch-core@zurich.ibm.com). -------------------------------------------------------------------------------- /docling_core/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Main package.""" 7 | -------------------------------------------------------------------------------- /docling_core/cli/__init__.py: -------------------------------------------------------------------------------- 1 | """CLI package.""" 2 | -------------------------------------------------------------------------------- /docling_core/cli/view.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 
2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """CLI for docling viewer.""" 7 | import importlib 8 | import tempfile 9 | import webbrowser 10 | from pathlib import Path 11 | from typing import Annotated, Optional 12 | 13 | import typer 14 | 15 | from docling_core.types.doc import DoclingDocument 16 | from docling_core.types.doc.base import ImageRefMode 17 | from docling_core.utils.file import resolve_source_to_path 18 | 19 | app = typer.Typer( 20 | name="Docling", 21 | no_args_is_help=True, 22 | add_completion=False, 23 | pretty_exceptions_enable=False, 24 | ) 25 | 26 | 27 | def version_callback(value: bool): 28 | """Callback for version inspection.""" 29 | if value: 30 | docling_core_version = importlib.metadata.version("docling-core") 31 | print(f"Docling Core version: {docling_core_version}") 32 | raise typer.Exit() 33 | 34 | 35 | @app.command(no_args_is_help=True) 36 | def view( 37 | source: Annotated[ 38 | str, 39 | typer.Argument( 40 | ..., 41 | metavar="source", 42 | help="Docling JSON file to view.", 43 | ), 44 | ], 45 | version: Annotated[ 46 | Optional[bool], 47 | typer.Option( 48 | "--version", 49 | callback=version_callback, 50 | is_eager=True, 51 | help="Show version information.", 52 | ), 53 | ] = None, 54 | ): 55 | """Display a Docling JSON file in the default browser.""" 56 | path = resolve_source_to_path(source=source) 57 | doc = DoclingDocument.load_from_json(filename=path) 58 | target_path = Path(tempfile.mkdtemp()) / "out.html" 59 | html_output = doc.export_to_html(image_mode=ImageRefMode.EMBEDDED) 60 | with open(target_path, "w", encoding="utf-8") as f: 61 | f.write(html_output) 62 | webbrowser.open(url=f"file://{target_path.absolute().resolve()}") 63 | 64 | 65 | click_app = typer.main.get_command(app) 66 | 67 | if __name__ == "__main__": 68 | app() 69 | -------------------------------------------------------------------------------- /docling_core/experimental/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp.
2024 - 2025 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Experimental features.""" 7 | -------------------------------------------------------------------------------- /docling_core/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/docling-project/docling-core/878aafd1b78f7f3d730ccb4bd519650230651498/docling_core/py.typed -------------------------------------------------------------------------------- /docling_core/resources/schemas/search/search_doc_mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "mappings": { 3 | "dynamic": false, 4 | "_size": { 5 | "enabled": true 6 | }, 7 | "_meta": { 8 | "$ref": "ccs:schemas#/Document" 9 | }, 10 | "properties": { 11 | "description": { 12 | "type": "object", 13 | "properties": { 14 | "abstract": { 15 | "type": "text" 16 | }, 17 | "affiliations": { 18 | "type": "keyword" 19 | }, 20 | "authors": { 21 | "type": "keyword" 22 | }, 23 | "title": { 24 | "type": "text" 25 | } 26 | } 27 | }, 28 | "figures": { 29 | "type": "object", 30 | "properties": { 31 | "text": { 32 | "type": "text" 33 | }, 34 | "type": { 35 | "type": "keyword" 36 | }, 37 | "prov": { 38 | "type": "object", 39 | "properties": { 40 | "page": { 41 | "type": "integer" 42 | } 43 | } 44 | } 45 | } 46 | }, 47 | "file-info": { 48 | "type": "object", 49 | "properties": { 50 | "filename": { 51 | "type": "text" 52 | } 53 | } 54 | }, 55 | "main-text": { 56 | "type": "object", 57 | "properties": { 58 | "text": { 59 | "type": "text" 60 | }, 61 | "type": { 62 | "type": "keyword" 63 | }, 64 | "name": { 65 | "type": "keyword" 66 | }, 67 | "prov": { 68 | "type": "object", 69 | "properties": { 70 | "page": { 71 | "type": "integer" 72 | } 73 | } 74 | } 75 | } 76 | }, 77 | "_name": { 78 | "type": "keyword" 79 | }, 80 | "tables": { 81 | "type": "object", 82 | "properties": { 83 | "text": { 84 | "type": "text" 85 | }, 86 | "type": { 87 | "type": "keyword" 88 | }, 89 | "prov": { 90 | "type": "object", 91 | "properties": { 92 | "page": { 93 | "type": "integer" 94 | } 95 | } 96 | } 97 | } 98 | }, 99 | "type": { 100 | "type": "keyword" 101 | } 102 | } 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /docling_core/search/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Package for models and utility functions for search database mappings.""" 7 | -------------------------------------------------------------------------------- /docling_core/search/mapping.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 
2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Methods to define fields in an index mapping of a search database.""" 7 | from typing import Any, Optional 8 | 9 | 10 | def es_field( 11 | *, 12 | type: Optional[str] = None, 13 | ignore_above: Optional[int] = None, 14 | term_vector: Optional[str] = None, 15 | **kwargs: Any, 16 | ): 17 | """Create x-es kwargs to be passed to a `pydantic.Field` via unpacking.""" 18 | all_kwargs = {**kwargs} 19 | 20 | if type is not None: 21 | all_kwargs["type"] = type 22 | 23 | if ignore_above is not None: 24 | all_kwargs["ignore_above"] = ignore_above 25 | 26 | if term_vector is not None: 27 | all_kwargs["term_vector"] = term_vector 28 | 29 | return {f"x-es-{k}": v for k, v in all_kwargs.items()} 30 | -------------------------------------------------------------------------------- /docling_core/search/meta.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Models and methods to define the metadata fields in database index mappings.""" 7 | from pathlib import Path 8 | from typing import Generic, Optional, TypeVar 9 | 10 | from pydantic import BaseModel, Field, StrictStr, ValidationInfo, field_validator 11 | 12 | from docling_core.search.package import Package 13 | from docling_core.types.base import CollectionTypeEnum, StrictDateTime, UniqueList 14 | from docling_core.utils.alias import AliasModel 15 | 16 | ClassificationT = TypeVar("ClassificationT", bound=str) 17 | DomainT = TypeVar("DomainT", bound=str) 18 | 19 | 20 | class S3Path(BaseModel, extra="forbid"): 21 | """The path details within a cloud object storage for CCS-parsed files.""" 22 | 23 | bucket: StrictStr 24 | prefix: StrictStr 25 | infix: StrictStr 26 | 27 | def __hash__(self): 28 | """Return the hash value for this S3Path object.""" 29 | return hash((type(self),) + tuple(self.__dict__.values())) 30 | 31 | 32 | class S3CcsData(BaseModel, extra="forbid"): 33 | """The access details to a cloud object storage for CCS-parsed files.""" 34 | 35 | endpoint: StrictStr 36 | paths: UniqueList[S3Path] = Field(min_length=1) 37 | 38 | 39 | class DocumentLicense(BaseModel, extra="forbid"): 40 | """Document license for a search database index within the index mappings.""" 41 | 42 | code: Optional[list[StrictStr]] = None 43 | text: Optional[list[StrictStr]] = None 44 | 45 | 46 | class Meta(AliasModel, Generic[ClassificationT, DomainT], extra="forbid"): 47 | """Metadata of a search database index within the index mappings.""" 48 | 49 | aliases: Optional[list[StrictStr]] = None 50 | created: StrictDateTime 51 | description: Optional[StrictStr] = None 52 | source: StrictStr 53 | storage: Optional[StrictStr] = None 54 | display_name: Optional[StrictStr] = None 55 | type: CollectionTypeEnum 56 | classification: Optional[list[ClassificationT]] = None 57 | version: UniqueList[Package] = Field(min_length=1) 58 | license: Optional[StrictStr] = None 59 | filename: Optional[Path] = None 60 | domain: Optional[list[DomainT]] = None 61 | reference: Optional[StrictStr] = Field(default=None, alias="$ref") 62 | ccs_s3_data: Optional[S3CcsData] = None 63 | document_license: Optional[DocumentLicense] = None 64 | index_key: Optional[StrictStr] = None 65 | project_key: Optional[StrictStr] = None 66 | 67 | @field_validator("reference") 68 | @classmethod 69 | def reference_for_document(cls, v, info: ValidationInfo): 70 | """Validate the reference field for indexes of type Document.""" 71 | if 
"type" in info.data and info.data["type"] == "Document": 72 | if v and v != "ccs:schemas#/Document": 73 | raise ValueError("wrong reference value for Document type") 74 | else: 75 | return "ccs:schemas#/Document" 76 | else: 77 | return v 78 | 79 | @field_validator("version") 80 | @classmethod 81 | def version_has_schema(cls, v): 82 | """Validate that the docling-core library is always set in version field.""" 83 | docling_core = [item for item in v if item.name == "docling-core"] 84 | if not docling_core: 85 | raise ValueError( 86 | "the version should include at least a valid docling-core package" 87 | ) 88 | elif len(docling_core) > 1: 89 | raise ValueError( 90 | "the version must not include more than 1 docling-core package" 91 | ) 92 | else: 93 | return v 94 | -------------------------------------------------------------------------------- /docling_core/search/package.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Models and methods to define a package model.""" 7 | 8 | import importlib.metadata 9 | import re 10 | from typing import Final 11 | 12 | from pydantic import BaseModel, StrictStr, StringConstraints 13 | from typing_extensions import Annotated 14 | 15 | VERSION_PATTERN: Final = ( 16 | r"^(?P0|[1-9]\d*)\.(?P0|[1-9]\d*)\.(?P0|[1-9]\d*)" 17 | r"(?:-(?P(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)" 18 | r"(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\+" 19 | r"(?P[0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?$" 20 | ) 21 | 22 | 23 | class Package(BaseModel, extra="forbid"): 24 | """Representation of a software package. 25 | 26 | The version needs to comply with Semantic Versioning 2.0.0. 27 | """ 28 | 29 | name: StrictStr = "docling-core" 30 | version: Annotated[str, StringConstraints(strict=True, pattern=VERSION_PATTERN)] = ( 31 | importlib.metadata.version("docling-core") 32 | ) 33 | 34 | def __hash__(self): 35 | """Return the hash value for this S3Path object.""" 36 | return hash((type(self),) + tuple(self.__dict__.values())) 37 | 38 | def get_major(self): 39 | """Get the major version of this package.""" 40 | return re.match(VERSION_PATTERN, self.version)["major"] 41 | 42 | def get_minor(self): 43 | """Get the major version of this package.""" 44 | return re.match(VERSION_PATTERN, self.version)["minor"] 45 | 46 | def get_patch(self): 47 | """Get the major version of this package.""" 48 | return re.match(VERSION_PATTERN, self.version)["patch"] 49 | 50 | def get_pre_release(self): 51 | """Get the pre-release version of this package.""" 52 | return re.match(VERSION_PATTERN, self.version)["prerelease"] 53 | 54 | def get_build_metadata(self): 55 | """Get the build metadata version of this package.""" 56 | return re.match(VERSION_PATTERN, self.version)["buildmetadata"] 57 | -------------------------------------------------------------------------------- /docling_core/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Data transformations package.""" 7 | -------------------------------------------------------------------------------- /docling_core/transforms/chunker/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 
2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Define the chunker types.""" 7 | 8 | from docling_core.transforms.chunker.base import BaseChunk, BaseChunker, BaseMeta 9 | from docling_core.transforms.chunker.hierarchical_chunker import ( 10 | DocChunk, 11 | DocMeta, 12 | HierarchicalChunker, 13 | ) 14 | -------------------------------------------------------------------------------- /docling_core/transforms/chunker/base.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Define base classes for chunking.""" 7 | import json 8 | from abc import ABC, abstractmethod 9 | from typing import Any, ClassVar, Iterator 10 | 11 | from pydantic import BaseModel 12 | from typing_extensions import deprecated 13 | 14 | from docling_core.types.doc import DoclingDocument as DLDocument 15 | 16 | DFLT_DELIM = "\n" 17 | 18 | 19 | class BaseMeta(BaseModel): 20 | """Chunk metadata base class.""" 21 | 22 | excluded_embed: ClassVar[list[str]] = [] 23 | excluded_llm: ClassVar[list[str]] = [] 24 | 25 | def export_json_dict(self) -> dict[str, Any]: 26 | """Helper method for exporting non-None keys to JSON mode. 27 | 28 | Returns: 29 | dict[str, Any]: The exported dictionary. 30 | """ 31 | return self.model_dump(mode="json", by_alias=True, exclude_none=True) 32 | 33 | 34 | class BaseChunk(BaseModel): 35 | """Chunk base class.""" 36 | 37 | text: str 38 | meta: BaseMeta 39 | 40 | def export_json_dict(self) -> dict[str, Any]: 41 | """Helper method for exporting non-None keys to JSON mode. 42 | 43 | Returns: 44 | dict[str, Any]: The exported dictionary. 45 | """ 46 | return self.model_dump(mode="json", by_alias=True, exclude_none=True) 47 | 48 | 49 | class BaseChunker(BaseModel, ABC): 50 | """Chunker base class.""" 51 | 52 | delim: str = DFLT_DELIM 53 | 54 | @abstractmethod 55 | def chunk(self, dl_doc: DLDocument, **kwargs: Any) -> Iterator[BaseChunk]: 56 | """Chunk the provided document. 57 | 58 | Args: 59 | dl_doc (DLDocument): document to chunk 60 | 61 | Raises: 62 | NotImplementedError: in this abstract implementation 63 | 64 | Yields: 65 | Iterator[BaseChunk]: iterator over extracted chunks 66 | """ 67 | raise NotImplementedError() 68 | 69 | def contextualize(self, chunk: BaseChunk) -> str: 70 | """Contextualize the given chunk. This implementation is embedding-targeted. 71 | 72 | Args: 73 | chunk: chunk to serialize 74 | 75 | Returns: 76 | str: the serialized form of the chunk 77 | """ 78 | meta = chunk.meta.export_json_dict() 79 | 80 | items = [] 81 | for k in meta: 82 | if k not in chunk.meta.excluded_embed: 83 | if isinstance(meta[k], list): 84 | items.append( 85 | self.delim.join( 86 | [ 87 | d if isinstance(d, str) else json.dumps(d) 88 | for d in meta[k] 89 | ] 90 | ) 91 | ) 92 | else: 93 | items.append(json.dumps(meta[k])) 94 | items.append(chunk.text) 95 | 96 | return self.delim.join(items) 97 | 98 | @deprecated("Use contextualize() instead.") 99 | def serialize(self, chunk: BaseChunk) -> str: 100 | """Contextualize the given chunk. 
This implementation is embedding-targeted.""" 101 | return self.contextualize(chunk=chunk) 102 | -------------------------------------------------------------------------------- /docling_core/transforms/chunker/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | """Define the tokenizer types.""" 2 | -------------------------------------------------------------------------------- /docling_core/transforms/chunker/tokenizer/base.py: -------------------------------------------------------------------------------- 1 | """Define base classes for tokenization.""" 2 | 3 | from abc import ABC, abstractmethod 4 | from typing import Any 5 | 6 | from pydantic import BaseModel 7 | 8 | 9 | class BaseTokenizer(BaseModel, ABC): 10 | """Base tokenizer class.""" 11 | 12 | @abstractmethod 13 | def count_tokens(self, text: str) -> int: 14 | """Get number of tokens for given text.""" 15 | ... 16 | 17 | @abstractmethod 18 | def get_max_tokens(self) -> int: 19 | """Get maximum number of tokens allowed.""" 20 | ... 21 | 22 | @abstractmethod 23 | def get_tokenizer(self) -> Any: 24 | """Get underlying tokenizer object.""" 25 | ... 26 | -------------------------------------------------------------------------------- /docling_core/transforms/chunker/tokenizer/huggingface.py: -------------------------------------------------------------------------------- 1 | """HuggingFace tokenization.""" 2 | 3 | import json 4 | from os import PathLike 5 | from typing import Optional, Union 6 | 7 | from huggingface_hub import hf_hub_download 8 | from pydantic import ConfigDict, model_validator 9 | from typing_extensions import Self 10 | 11 | from docling_core.transforms.chunker.tokenizer.base import BaseTokenizer 12 | 13 | try: 14 | from transformers import AutoTokenizer, PreTrainedTokenizerBase 15 | except ImportError: 16 | raise RuntimeError( 17 | "Module requires 'chunking' extra; to install, run: " 18 | "`pip install 'docling-core[chunking]'`" 19 | ) 20 | 21 | 22 | class HuggingFaceTokenizer(BaseTokenizer): 23 | """HuggingFace tokenizer.""" 24 | 25 | model_config = ConfigDict(arbitrary_types_allowed=True) 26 | 27 | tokenizer: PreTrainedTokenizerBase 28 | max_tokens: int = None # type: ignore[assignment] 29 | 30 | @model_validator(mode="after") 31 | def _patch(self) -> Self: 32 | if self.max_tokens is None: 33 | try: 34 | # try to use SentenceTransformers-specific config as that seems to be 35 | # reliable (whenever available) 36 | config_name = "sentence_bert_config.json" 37 | config_path = hf_hub_download( 38 | repo_id=self.tokenizer.name_or_path, 39 | filename=config_name, 40 | ) 41 | with open(config_path) as f: 42 | data = json.load(f) 43 | self.max_tokens = int(data["max_seq_length"]) 44 | except Exception as e: 45 | raise RuntimeError( 46 | "max_tokens could not be determined automatically; please set " 47 | "explicitly." 
48 | ) from e 49 | return self 50 | 51 | def count_tokens(self, text: str): 52 | """Get number of tokens for given text.""" 53 | return len(self.tokenizer.tokenize(text=text)) 54 | 55 | def get_max_tokens(self): 56 | """Get maximum number of tokens allowed.""" 57 | return self.max_tokens 58 | 59 | @classmethod 60 | def from_pretrained( 61 | cls, 62 | model_name: Union[str, PathLike], 63 | max_tokens: Optional[int] = None, 64 | **kwargs, 65 | ) -> Self: 66 | """Create tokenizer from model name.""" 67 | my_kwargs = { 68 | "tokenizer": AutoTokenizer.from_pretrained( 69 | pretrained_model_name_or_path=model_name, **kwargs 70 | ), 71 | } 72 | if max_tokens is not None: 73 | my_kwargs["max_tokens"] = max_tokens 74 | return cls(**my_kwargs) 75 | 76 | def get_tokenizer(self): 77 | """Get underlying tokenizer object.""" 78 | return self.tokenizer 79 | -------------------------------------------------------------------------------- /docling_core/transforms/chunker/tokenizer/openai.py: -------------------------------------------------------------------------------- 1 | """OpenAI tokenization.""" 2 | 3 | from pydantic import ConfigDict 4 | 5 | from docling_core.transforms.chunker.hybrid_chunker import BaseTokenizer 6 | 7 | try: 8 | import tiktoken 9 | except ImportError: 10 | raise RuntimeError( 11 | "Module requires 'chunking-openai' extra; to install, run: " 12 | "`pip install 'docling-core[chunking-openai]'`" 13 | ) 14 | 15 | 16 | class OpenAITokenizer(BaseTokenizer): 17 | """OpenAI tokenizer.""" 18 | 19 | model_config = ConfigDict(arbitrary_types_allowed=True) 20 | 21 | tokenizer: tiktoken.Encoding 22 | max_tokens: int 23 | 24 | def count_tokens(self, text: str) -> int: 25 | """Get number of tokens for given text.""" 26 | return len(self.tokenizer.encode(text=text)) 27 | 28 | def get_max_tokens(self) -> int: 29 | """Get maximum number of tokens allowed.""" 30 | return self.max_tokens 31 | 32 | def get_tokenizer(self) -> tiktoken.Encoding: 33 | """Get underlying tokenizer object.""" 34 | return self.tokenizer 35 | -------------------------------------------------------------------------------- /docling_core/transforms/serializer/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 
2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Define the serializer types.""" 7 | -------------------------------------------------------------------------------- /docling_core/transforms/visualizer/__init__.py: -------------------------------------------------------------------------------- 1 | """Define the visualizer types.""" 2 | -------------------------------------------------------------------------------- /docling_core/transforms/visualizer/base.py: -------------------------------------------------------------------------------- 1 | """Define base classes for visualization.""" 2 | 3 | from abc import ABC, abstractmethod 4 | from typing import Optional 5 | 6 | from PIL.Image import Image 7 | from pydantic import BaseModel 8 | 9 | from docling_core.types.doc import DoclingDocument 10 | 11 | 12 | class BaseVisualizer(BaseModel, ABC): 13 | """Visualize base class.""" 14 | 15 | @abstractmethod 16 | def get_visualization( 17 | self, 18 | *, 19 | doc: DoclingDocument, 20 | **kwargs, 21 | ) -> dict[Optional[int], Image]: 22 | """Get visualization of the document as images by page.""" 23 | raise NotImplementedError() 24 | -------------------------------------------------------------------------------- /docling_core/types/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Define the main types.""" 7 | 8 | from docling_core.types.doc.document import DoclingDocument 9 | from docling_core.types.gen.generic import Generic 10 | from docling_core.types.rec.record import Record 11 | -------------------------------------------------------------------------------- /docling_core/types/doc/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Package for models defined by the Document type.""" 7 | 8 | from .base import BoundingBox, CoordOrigin, ImageRefMode, Size 9 | from .document import ( 10 | CodeItem, 11 | DocItem, 12 | DoclingDocument, 13 | DocumentOrigin, 14 | FloatingItem, 15 | GroupItem, 16 | ImageRef, 17 | KeyValueItem, 18 | NodeItem, 19 | PageItem, 20 | PictureClassificationClass, 21 | PictureClassificationData, 22 | PictureDataType, 23 | PictureItem, 24 | ProvenanceItem, 25 | RefItem, 26 | SectionHeaderItem, 27 | TableCell, 28 | TableData, 29 | TableItem, 30 | TextItem, 31 | ) 32 | from .labels import DocItemLabel, GroupLabel, TableCellLabel 33 | -------------------------------------------------------------------------------- /docling_core/types/doc/utils.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Utils for document types.""" 7 | 8 | import unicodedata 9 | from pathlib import Path 10 | 11 | 12 | def relative_path(src: Path, target: Path) -> Path: 13 | """Compute the relative path from `src` to `target`. 14 | 15 | Args: 16 | src (str | Path): The source directory or file path (must be absolute). 17 | target (str | Path): The target directory or file path (must be absolute). 18 | 19 | Returns: 20 | Path: The relative path from `src` to `target`. 21 | 22 | Raises: 23 | ValueError: If either `src` or `target` is not an absolute path. 
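Example (illustrative, hypothetical paths): `relative_path(Path("/a/b/c"), Path("/a/d/e.txt"))` walks two levels up from `src` to the common ancestor `/a` and back down to `target`, returning `Path("../../d/e.txt")`.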
24 | """ 25 | src = Path(src).resolve() 26 | target = Path(target).resolve() 27 | 28 | # Ensure both paths are absolute 29 | if not src.is_absolute(): 30 | raise ValueError(f"The source path must be absolute: {src}") 31 | if not target.is_absolute(): 32 | raise ValueError(f"The target path must be absolute: {target}") 33 | 34 | # Find the common ancestor 35 | common_parts = [] 36 | for src_part, target_part in zip(src.parts, target.parts): 37 | if src_part == target_part: 38 | common_parts.append(src_part) 39 | else: 40 | break 41 | 42 | # Determine the path to go up from src to the common ancestor 43 | up_segments = [".."] * (len(src.parts) - len(common_parts)) 44 | 45 | # Add the path from the common ancestor to the target 46 | down_segments = target.parts[len(common_parts) :] 47 | 48 | # Combine and return the result 49 | return Path(*up_segments, *down_segments) 50 | 51 | 52 | def get_html_tag_with_text_direction(html_tag: str, text: str) -> str: 53 | """Form the HTML element with tag, text, and optional dir attribute.""" 54 | text_dir = get_text_direction(text) 55 | 56 | if text_dir == "ltr": 57 | return f"<{html_tag}>{text}" 58 | else: 59 | return f'<{html_tag} dir="{text_dir}">{text}' 60 | 61 | 62 | def get_text_direction(text: str) -> str: 63 | """Determine the text direction of a given string as LTR or RTL script.""" 64 | if not text: 65 | return "ltr" # Default for empty input 66 | 67 | rtl_scripts = {"R", "AL"} 68 | rtl_chars = sum(unicodedata.bidirectional(c) in rtl_scripts for c in text) 69 | 70 | return ( 71 | "rtl" 72 | if unicodedata.bidirectional(text[0]) in rtl_scripts 73 | or rtl_chars > len(text) / 2 74 | else "ltr" 75 | ) 76 | -------------------------------------------------------------------------------- /docling_core/types/gen/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Package for models defined by the Generic type.""" 7 | -------------------------------------------------------------------------------- /docling_core/types/gen/generic.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Define a generic Docling type.""" 7 | 8 | from typing import Optional 9 | 10 | from pydantic import Field, StrictStr 11 | 12 | from docling_core.search.mapping import es_field 13 | from docling_core.types.base import FileInfoObject 14 | from docling_core.utils.alias import AliasModel 15 | 16 | 17 | class Generic(AliasModel): 18 | """A representation of a generic document.""" 19 | 20 | name: Optional[StrictStr] = Field( 21 | default=None, 22 | description="A short description or summary of the document.", 23 | alias="_name", 24 | json_schema_extra=es_field(type="text"), 25 | ) 26 | 27 | file_info: FileInfoObject = Field( 28 | title="Document information", 29 | description=( 30 | "Minimal identification information of the document within a collection." 31 | ), 32 | alias="file-info", 33 | ) 34 | -------------------------------------------------------------------------------- /docling_core/types/io/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 
2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Models for io.""" 7 | 8 | from io import BytesIO 9 | 10 | from pydantic import BaseModel, ConfigDict 11 | 12 | 13 | class DocumentStream(BaseModel): 14 | """Wrapper class for a bytes stream with a filename.""" 15 | 16 | model_config = ConfigDict(arbitrary_types_allowed=True) 17 | 18 | name: str 19 | stream: BytesIO 20 | -------------------------------------------------------------------------------- /docling_core/types/legacy_doc/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Package for models defined by the Document type.""" 7 | -------------------------------------------------------------------------------- /docling_core/types/legacy_doc/doc_ann.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Models for annotations and predictions in CCS.""" 7 | from typing import Any 8 | 9 | from pydantic import BaseModel 10 | 11 | from docling_core.types.legacy_doc.base import BoundingBox 12 | 13 | AnnotationReport = Any # TODO 14 | 15 | 16 | class Cell(BaseModel): 17 | """Cell.""" 18 | 19 | id: int 20 | rawcell_id: int 21 | label: str 22 | 23 | 24 | class Cluster(BaseModel): 25 | """Cluster.""" 26 | 27 | model: str 28 | type: str 29 | bbox: BoundingBox 30 | cell_ids: list[int] 31 | merged: bool 32 | id: int 33 | 34 | 35 | class Table(BaseModel): 36 | """Table.""" 37 | 38 | cell_id: int 39 | label: str 40 | rows: list[int] 41 | cols: list[int] 42 | 43 | 44 | class Info(BaseModel): 45 | """Info.""" 46 | 47 | display_name: str 48 | model_name: str 49 | model_class: str 50 | model_version: str 51 | model_id: str 52 | 53 | 54 | class Source(BaseModel): 55 | """Source.""" 56 | 57 | type: str 58 | timestamp: float 59 | info: Info 60 | 61 | 62 | class AnnotPredItem(BaseModel): 63 | """Annotation or prediction item.""" 64 | 65 | cells: list[Cell] 66 | clusters: list[Cluster] 67 | tables: list[Table] 68 | source: Source 69 | 70 | 71 | class Annotation(BaseModel): 72 | """Annotations.""" 73 | 74 | annotations: list[AnnotPredItem] 75 | predictions: list[AnnotPredItem] 76 | reports: list[AnnotationReport] 77 | -------------------------------------------------------------------------------- /docling_core/types/legacy_doc/doc_ocr.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 
2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Models for CCS objects with OCR.""" 7 | from typing import Any, Dict, List, Literal 8 | 9 | from pydantic import BaseModel, Field 10 | 11 | from docling_core.types.legacy_doc.base import BoundingBox 12 | from docling_core.utils.alias import AliasModel 13 | 14 | CoordsOrder = Literal["x1", "y1", "x2", "y2"] 15 | 16 | CoordsOrigin = Literal["top-left"] # TODO 17 | 18 | Info = Dict[str, Any] # TODO 19 | 20 | 21 | class Page(BaseModel): 22 | """Page.""" 23 | 24 | width: float 25 | height: float 26 | 27 | 28 | class Meta(AliasModel): 29 | """Meta.""" 30 | 31 | page: Page 32 | coords_order: List[CoordsOrder] = Field(..., alias="coords-order") 33 | coords_origin: CoordsOrigin = Field(..., alias="coords-origin") 34 | 35 | 36 | class Dimension(BaseModel): 37 | """Dimension.""" 38 | 39 | width: float 40 | height: float 41 | 42 | 43 | class Word(BaseModel): 44 | """Word.""" 45 | 46 | confidence: float 47 | bbox: BoundingBox 48 | content: str 49 | 50 | 51 | class Cell(BaseModel): 52 | """Cell.""" 53 | 54 | confidence: float 55 | bbox: BoundingBox 56 | content: str 57 | 58 | 59 | class Box(BaseModel): 60 | """Box.""" 61 | 62 | confidence: float 63 | bbox: BoundingBox 64 | content: str 65 | 66 | 67 | class Path(BaseModel): 68 | """Path.""" 69 | 70 | x: List[float] 71 | y: List[float] 72 | 73 | 74 | class OcrOutput(AliasModel): 75 | """OCR output.""" 76 | 77 | meta: Meta = Field(..., alias="_meta") 78 | info: Info 79 | dimension: Dimension 80 | words: List[Word] 81 | cells: List[Cell] 82 | boxes: List[Box] 83 | paths: List[Path] 84 | -------------------------------------------------------------------------------- /docling_core/types/nlp/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Package for models defining NLP artifacts.""" 7 | -------------------------------------------------------------------------------- /docling_core/types/nlp/qa.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Define the model for Q&A pairs.""" 7 | from typing import Generic, Optional 8 | 9 | from pydantic import BaseModel, Field, StrictBool, StrictStr 10 | 11 | from docling_core.search.mapping import es_field 12 | from docling_core.types.base import DescriptionAdvancedT, StrictDateTime, UniqueList 13 | from docling_core.types.nlp.qa_labels import QALabelling 14 | 15 | 16 | class QAPair(BaseModel, Generic[DescriptionAdvancedT]): 17 | """A representation of a question-answering (QA) pair.""" 18 | 19 | context: StrictStr = Field( 20 | description=( 21 | "A single string containing the context of the question enabling the" 22 | " presentation of the answer." 23 | ) 24 | ) 25 | question: StrictStr = Field(description="A question on the given context.") 26 | answer: StrictStr = Field( 27 | description="The answer to the question from the context." 28 | ) 29 | short_answer: Optional[StrictStr] = Field( 30 | default=None, description="Alternative and concise answer." 31 | ) 32 | retrieved_context: Optional[StrictBool] = Field( 33 | default=False, 34 | description="Whether the context was retrieved from the question.", 35 | ) 36 | generated_question: Optional[StrictBool] = Field( 37 | default=False, description="Whether the question was generated by an AI model." 
38 | ) 39 | generated_answer: Optional[StrictBool] = Field( 40 | default=False, description="Whether the answer was generated by an AI model." 41 | ) 42 | created: StrictDateTime = Field( 43 | description="Datetime when the QA pair was created." 44 | ) 45 | user: Optional[StrictStr] = Field( 46 | default=None, 47 | description=( 48 | "Unique identifier of the user that created or curated this QA pair." 49 | ), 50 | json_schema_extra=es_field(type="keyword", ignore_above=8191), 51 | ) 52 | model: Optional[StrictStr] = Field( 53 | default=None, 54 | description="Unique identifier of the model used to generate this QA pair.", 55 | json_schema_extra=es_field(type="keyword", ignore_above=8191), 56 | ) 57 | paths: UniqueList[StrictStr] = Field( 58 | description=( 59 | "One or more references to a document that identify the provenance of the" 60 | " QA pair context." 61 | ), 62 | examples=[ 63 | "badce7c84d0ba7ba0fb5e94492b0d91e2506a7cb48e4524ad572c546a35f768e#/" 64 | "main-text/4" 65 | ], 66 | json_schema_extra=es_field(type="keyword", ignore_above=8191), 67 | ) 68 | advanced: Optional[DescriptionAdvancedT] = Field( 69 | default=None, 70 | description="Document metadata to provide more details on the context.", 71 | ) 72 | labels: Optional[QALabelling] = Field( 73 | default=None, description="QA pair labelling axes." 74 | ) 75 | -------------------------------------------------------------------------------- /docling_core/types/rec/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Package for models defined by the Record type.""" 7 | -------------------------------------------------------------------------------- /docling_core/types/rec/attribute.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 
2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Define the model Attribute.""" 7 | from typing import Generic, Optional 8 | 9 | from pydantic import Field 10 | from typing_extensions import Annotated 11 | 12 | from docling_core.search.mapping import es_field 13 | from docling_core.types.base import ( 14 | IdentifierTypeT, 15 | PredicateKeyNameT, 16 | PredicateKeyTypeT, 17 | PredicateValueTypeT, 18 | ProvenanceTypeT, 19 | ) 20 | from docling_core.types.rec.base import ProvenanceItem 21 | from docling_core.types.rec.predicate import Predicate 22 | from docling_core.utils.alias import AliasModel 23 | 24 | 25 | class Attribute( 26 | AliasModel, 27 | Generic[ 28 | IdentifierTypeT, 29 | PredicateValueTypeT, 30 | PredicateKeyNameT, 31 | PredicateKeyTypeT, 32 | ProvenanceTypeT, 33 | ], 34 | extra="forbid", 35 | ): 36 | """Attribute model that describes a list of characteristics.""" 37 | 38 | conf: Annotated[float, Field(strict=True, ge=0.0, le=1.0, allow_inf_nan=False)] = ( 39 | Field( 40 | ..., 41 | title="Confidence", 42 | description="The confidence level of this attribute's characteristics.", 43 | json_schema_extra=es_field(type="float"), 44 | ) 45 | ) 46 | 47 | prov: Optional[list[ProvenanceItem[IdentifierTypeT, ProvenanceTypeT]]] = Field( 48 | default=None, 49 | title="Provenance", 50 | description="The sources of this attribute's characteristics.", 51 | ) 52 | 53 | predicates: list[ 54 | Predicate[PredicateValueTypeT, PredicateKeyNameT, PredicateKeyTypeT] 55 | ] = Field(..., description="A list of characteristics (type, value, and name).") 56 | -------------------------------------------------------------------------------- /docling_core/types/rec/base.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Define the base models for the Record type.""" 7 | from typing import Generic, List, Optional 8 | 9 | from pydantic import Field, StrictInt, StrictStr 10 | from typing_extensions import Annotated 11 | 12 | from docling_core.search.mapping import es_field 13 | from docling_core.types.base import Identifier, IdentifierTypeT, ProvenanceTypeT 14 | from docling_core.utils.alias import AliasModel 15 | 16 | 17 | class ProvenanceItem( 18 | AliasModel, Generic[IdentifierTypeT, ProvenanceTypeT], extra="forbid" 19 | ): 20 | """A representation of an object's provenance.""" 21 | 22 | type_: Optional[ProvenanceTypeT] = Field( 23 | default=None, 24 | alias="type", 25 | title="The provenance type", 26 | description=( 27 | "Any string representing the type of provenance, e.g. `sentence`, " 28 | "`table`, or `doi`." 29 | ), 30 | json_schema_extra=es_field(type="keyword", ignore_above=8191), 31 | ) 32 | 33 | text: Optional[StrictStr] = Field( 34 | default=None, 35 | title="Evidence of the provenance", 36 | description=( 37 | "A text representing the evidence of the provenance, e.g. the sentence " 38 | "text or the content of a table cell" 39 | ), 40 | json_schema_extra=es_field(type="keyword", ignore_above=8191), 41 | ) 42 | 43 | reference: Optional[Identifier[IdentifierTypeT]] = Field( 44 | default=None, 45 | title="Reference to the provenance object", 46 | description=( 47 | "Reference to another object, e.g. 
record, statement, URL, or any other " 48 | "object that identifies the provenance" 49 | ), 50 | ) 51 | 52 | path: Optional[StrictStr] = Field( 53 | default=None, 54 | title="The location of the provenance within the referenced object", 55 | description=( 56 | "A path that locates the evidence within the provenance object identified " 57 | "by the `reference` field using a JSON pointer notation, e.g., " 58 | "`#/main-text/5` to locate the `main-text` paragraph at index 5" 59 | ), 60 | json_schema_extra=es_field(type="keyword", ignore_above=8191), 61 | ) 62 | 63 | span: Optional[Annotated[List[StrictInt], Field(min_length=2, max_length=2)]] = ( 64 | Field( 65 | default=None, 66 | title="The location of the item in the text/table", 67 | description=( 68 | "location of the item in the text/table referenced by the `path`," 69 | " e.g., `[34, 67]`" 70 | ), 71 | ) 72 | ) 73 | 74 | 75 | class Provenance(AliasModel, Generic[IdentifierTypeT, ProvenanceTypeT]): 76 | """A representation of evidence, as a list of provenance objects.""" 77 | 78 | conf: Annotated[float, Field(strict=True, ge=0.0, le=1.0)] = Field( 79 | ..., 80 | title="The confidence of the evidence", 81 | description=( 82 | "This value represents a score assigned to the data item. Items" 83 | " originating from databases will typically have a score of 1.0, while" 84 | " items resulting from an NLP model may have a value between 0.0 and 1.0." 85 | ), 86 | json_schema_extra=es_field(type="float"), 87 | ) 88 | prov: list[ProvenanceItem[IdentifierTypeT, ProvenanceTypeT]] = Field( 89 | title="Provenance", description="A list of provenance items." 90 | ) 91 | -------------------------------------------------------------------------------- /docling_core/types/rec/record.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Define the model Record.""" 7 | from typing import Generic, Optional 8 | 9 | from pydantic import BaseModel, Field, StrictStr 10 | 11 | from docling_core.search.mapping import es_field 12 | from docling_core.types.base import ( 13 | Acquisition, 14 | CollectionNameTypeT, 15 | CollectionRecordInfo, 16 | FileInfoObject, 17 | Identifier, 18 | IdentifierTypeT, 19 | Log, 20 | PredicateKeyNameT, 21 | PredicateKeyTypeT, 22 | PredicateValueTypeT, 23 | StrictDateTime, 24 | SubjectNameTypeT, 25 | SubjectTypeT, 26 | ) 27 | from docling_core.types.rec.attribute import Attribute 28 | from docling_core.types.rec.base import Provenance, ProvenanceTypeT 29 | from docling_core.types.rec.subject import Subject 30 | 31 | 32 | class RecordDescription(BaseModel, Generic[CollectionNameTypeT]): 33 | """Additional record metadata, including optional collection-specific fields.""" 34 | 35 | logs: list[Log] = Field( 36 | description="Logs that describe the ETL tasks applied to this record." 37 | ) 38 | publication_date: Optional[StrictDateTime] = Field( 39 | default=None, 40 | title="Publication date", 41 | description=( 42 | "The date that best represents the last publication time of a record." 43 | ), 44 | ) 45 | collection: Optional[CollectionRecordInfo[CollectionNameTypeT]] = Field( 46 | default=None, description="The collection information of this record." 47 | ) 48 | acquisition: Optional[Acquisition] = Field( 49 | default=None, 50 | description=( 51 | "Information on how the document was obtained, for data governance" 52 | " purposes."
53 | ), 54 | ) 55 | 56 | 57 | class Record( 58 | Provenance, 59 | Generic[ 60 | IdentifierTypeT, 61 | PredicateValueTypeT, 62 | PredicateKeyNameT, 63 | PredicateKeyTypeT, 64 | ProvenanceTypeT, 65 | SubjectTypeT, 66 | SubjectNameTypeT, 67 | CollectionNameTypeT, 68 | ], 69 | ): 70 | """A representation of a structured record in a database.""" 71 | 72 | file_info: FileInfoObject = Field(alias="file-info") 73 | description: RecordDescription 74 | subject: Subject[IdentifierTypeT, SubjectTypeT, SubjectNameTypeT] 75 | attributes: Optional[ 76 | list[ 77 | Attribute[ 78 | IdentifierTypeT, 79 | PredicateValueTypeT, 80 | PredicateKeyNameT, 81 | PredicateKeyTypeT, 82 | ProvenanceTypeT, 83 | ] 84 | ] 85 | ] = None 86 | name: Optional[StrictStr] = Field( 87 | default=None, 88 | description="A short description or summary of the record.", 89 | alias="_name", 90 | json_schema_extra=es_field(type="text"), 91 | ) 92 | identifiers: Optional[list[Identifier[IdentifierTypeT]]] = Field( 93 | default=None, 94 | description="A list of unique identifiers of this record in a database.", 95 | ) 96 | -------------------------------------------------------------------------------- /docling_core/types/rec/statement.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Define the model Statement.""" 7 | from enum import Enum 8 | from typing import Generic 9 | 10 | from pydantic import Field 11 | 12 | from docling_core.types.base import ( 13 | IdentifierTypeT, 14 | PredicateKeyNameT, 15 | PredicateKeyTypeT, 16 | PredicateValueTypeT, 17 | ProvenanceTypeT, 18 | SubjectNameTypeT, 19 | SubjectTypeT, 20 | ) 21 | from docling_core.types.rec.attribute import Attribute 22 | from docling_core.types.rec.subject import Subject 23 | 24 | 25 | class StatementToken(Enum): 26 | """Class to represent an LLM friendly representation of statements.""" 27 | 28 | BEG_STATEMENTS = "<statements>" 29 | END_STATEMENTS = "</statements>" 30 | 31 | BEG_STATEMENT = "<statement>" 32 | END_STATEMENT = "</statement>" 33 | 34 | BEG_PROV = "<prov>" 35 | END_PROV = "</prov>" 36 | 37 | BEG_SUBJECT = "<subject>" 38 | END_SUBJECT = "</subject>" 39 | 40 | BEG_PREDICATE = "<predicate>" 41 | END_PREDICATE = "</predicate>" 42 | 43 | BEG_PROPERTY = "<property>" 44 | END_PROPERTY = "</property>" 45 | 46 | BEG_VALUE = "<value>" 47 | END_VALUE = "</value>" 48 | 49 | BEG_UNIT = "<unit>" 50 | END_UNIT = "</unit>" 51 | 52 | @classmethod 53 | def get_special_tokens(cls): 54 | """Function to get all special statements tokens.""" 55 | return [token.value for token in cls] 56 | 57 | 58 | class Statement( 59 | Attribute, 60 | Generic[ 61 | IdentifierTypeT, 62 | PredicateValueTypeT, 63 | PredicateKeyNameT, 64 | PredicateKeyTypeT, 65 | ProvenanceTypeT, 66 | SubjectTypeT, 67 | SubjectNameTypeT, 68 | ], 69 | extra="allow", 70 | ): 71 | """A representation of a statement on a subject.""" 72 | 73 | subject: Subject[IdentifierTypeT, SubjectTypeT, SubjectNameTypeT] = Field( 74 | description="The subject (entity) of this statement." 75 | ) 76 | -------------------------------------------------------------------------------- /docling_core/types/rec/subject.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 
2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Define the model Subject.""" 7 | from typing import Generic, Optional 8 | 9 | from pydantic import Field, StrictStr 10 | 11 | from docling_core.search.mapping import es_field 12 | from docling_core.types.base import ( 13 | Identifier, 14 | IdentifierTypeT, 15 | SubjectNameTypeT, 16 | SubjectTypeT, 17 | ) 18 | from docling_core.types.legacy_doc.base import S3Reference 19 | from docling_core.utils.alias import AliasModel 20 | 21 | 22 | class SubjectNameIdentifier(Identifier[SubjectNameTypeT], Generic[SubjectNameTypeT]): 23 | """Identifier of subject names.""" 24 | 25 | 26 | class Subject( 27 | AliasModel, 28 | Generic[IdentifierTypeT, SubjectTypeT, SubjectNameTypeT], 29 | extra="forbid", 30 | ): 31 | """A representation of a subject.""" 32 | 33 | display_name: StrictStr = Field( 34 | title="Display Name", 35 | description=( 36 | "Name of the subject in natural language. It can be used for end-user " 37 | "applications to display a human-readable name. For instance, `B(2) Mg(1)` " 38 | "for `MgB2` or `International Business Machines` for `IBM`" 39 | ), 40 | json_schema_extra=es_field(type="keyword", ignore_above=8191), 41 | ) 42 | display_image: Optional[S3Reference] = Field( 43 | default=None, 44 | title="Display Image", 45 | description=( 46 | "Image representing the subject. It can be used for end-user applications. " 47 | "For example, the chemical structure drawing of a compound " 48 | "or the eight bar IBM logo for IBM." 49 | ), 50 | json_schema_extra=es_field(suppress=True), 51 | ) 52 | type_: SubjectTypeT = Field( 53 | alias="type", 54 | description=( 55 | "Main subject type. For instance, `material`, `material-class`, " 56 | "`material-device`, `company`, or `person`." 57 | ), 58 | json_schema_extra=es_field(type="keyword", ignore_above=8191), 59 | ) 60 | names: list[SubjectNameIdentifier[SubjectNameTypeT]] = Field( 61 | description=( 62 | "List of given names for this subject. They may not be unique across " 63 | "different subjects." 64 | ) 65 | ) 66 | identifiers: Optional[list[Identifier[IdentifierTypeT]]] = Field( 67 | default=None, 68 | description=( 69 | "List of unique identifiers in database. For instance, the `PubChem ID` " 70 | "of a record in the PubChem database." 71 | ), 72 | ) 73 | labels: Optional[list[StrictStr]] = Field( 74 | default=None, 75 | description="List of labels or categories for this subject.", 76 | json_schema_extra=es_field(type="keyword", ignore_above=8191), 77 | ) 78 | -------------------------------------------------------------------------------- /docling_core/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Package for modules to support data models.""" 7 | -------------------------------------------------------------------------------- /docling_core/utils/alias.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 
2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Define utility models and types related to field aliases.""" 7 | from pydantic import BaseModel, ConfigDict 8 | 9 | 10 | class AliasModel(BaseModel): 11 | """Model for alias fields to ensure instantiation and serialization by alias.""" 12 | 13 | model_config = ConfigDict(populate_by_name=True) 14 | 15 | def model_dump(self, **kwargs) -> dict: 16 | """Generate a dictionary representation of the model using field aliases.""" 17 | if "by_alias" not in kwargs: 18 | kwargs = {**kwargs, "by_alias": True} 19 | 20 | return super().model_dump(**kwargs) 21 | 22 | def model_dump_json(self, **kwargs) -> str: 23 | """Generate a JSON representation of the model using field aliases.""" 24 | if "by_alias" not in kwargs: 25 | kwargs = {**kwargs, "by_alias": True} 26 | 27 | return super().model_dump_json(**kwargs) 28 | -------------------------------------------------------------------------------- /docling_core/utils/generate_docs.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Generate documentation of Docling types as JSON schema. 7 | 8 | Example: 9 | python docling_core/utils/generate_docs.py /tmp/docling_core_files 10 | """ 11 | import argparse 12 | import json 13 | import os 14 | from argparse import BooleanOptionalAction 15 | from pathlib import Path 16 | from shutil import rmtree 17 | from typing import Final 18 | 19 | from docling_core.utils.generate_jsonschema import generate_json_schema 20 | 21 | MODELS: Final = ["DoclingDocument", "Record", "Generic"] 22 | 23 | 24 | def _prepare_directory(folder: str, clean: bool = False) -> None: 25 | """Create a directory or empty its content if it already exists. 26 | 27 | Args: 28 | folder: The name of the directory. 29 | clean: Whether any existing content in the directory should be removed. 30 | """ 31 | if os.path.isdir(folder): 32 | if clean: 33 | for path in Path(folder).glob("**/*"): 34 | if path.is_file(): 35 | path.unlink() 36 | elif path.is_dir(): 37 | rmtree(path) 38 | else: 39 | os.makedirs(folder, exist_ok=True) 40 | 41 | 42 | def generate_collection_jsonschema(folder: str): 43 | """Generate the JSON schema of Docling collections and export them to a folder. 44 | 45 | Args: 46 | folder: The name of the directory. 47 | """ 48 | for item in MODELS: 49 | json_schema = generate_json_schema(item) 50 | with open( 51 | os.path.join(folder, f"{item}.json"), mode="w", encoding="utf8" 52 | ) as json_file: 53 | json.dump(json_schema, json_file, ensure_ascii=False, indent=2) 54 | 55 | 56 | def main() -> None: 57 | """Generate the JSON Schema of Docling collections and export documentation.""" 58 | argparser = argparse.ArgumentParser() 59 | argparser.add_argument( 60 | "directory", 61 | help=( 62 | "Directory to generate files. If it exists, any existing content will be" 63 | " removed when --clean is set."
64 | ), 65 | ) 66 | argparser.add_argument( 67 | "--clean", 68 | help="Whether any existing content in directory should be removed.", 69 | action=BooleanOptionalAction, 70 | dest="clean", 71 | default=False, 72 | required=False, 73 | ) 74 | args = argparser.parse_args() 75 | 76 | _prepare_directory(args.directory, args.clean) 77 | 78 | generate_collection_jsonschema(args.directory) 79 | 80 | 81 | if __name__ == "__main__": 82 | main() 83 | -------------------------------------------------------------------------------- /docling_core/utils/generate_jsonschema.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Generate the JSON Schema of pydantic models and export them to files. 7 | 8 | Example: 9 | python docling_core/utils/generate_jsonschema.py doc.document.TableCell 10 | 11 | """ 12 | import argparse 13 | import json 14 | from typing import Any, Union 15 | 16 | from pydantic import BaseModel 17 | 18 | 19 | def _import_class(class_reference: str) -> Any: 20 | components = class_reference.split(".") 21 | module_ref = ".".join(components[:-1]) 22 | class_name = components[-1] 23 | mod = __import__(module_ref, fromlist=[class_name]) 24 | class_type = getattr(mod, class_name) 25 | 26 | return class_type 27 | 28 | 29 | def generate_json_schema(class_reference: str) -> Union[dict, None]: 30 | """Generate a jsonable dict of a model's schema from a data type. 31 | 32 | Args: 33 | class_reference: The reference to a class in 'docling_core.types'. 34 | 35 | Returns: 36 | A jsonable dict of the model's schema. 37 | """ 38 | if not class_reference.startswith("docling_core.types."): 39 | class_reference = "docling_core.types." + class_reference 40 | class_type = _import_class(class_reference) 41 | if issubclass(class_type, BaseModel): 42 | return class_type.model_json_schema() 43 | else: 44 | return None 45 | 46 | 47 | def main() -> None: 48 | """Print the JSON Schema of a model.""" 49 | argparser = argparse.ArgumentParser() 50 | argparser.add_argument( 51 | "class_ref", help="Class reference, e.g., doc.document.TableCell" 52 | ) 53 | args = argparser.parse_args() 54 | 55 | json_schema = generate_json_schema(args.class_ref) 56 | print( 57 | json.dumps(json_schema, ensure_ascii=False, indent=2).encode("utf-8").decode() 58 | ) 59 | 60 | 61 | if __name__ == "__main__": 62 | main() 63 | -------------------------------------------------------------------------------- /docling_core/utils/validate.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 
2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Validation of Document-related files against their data schemas.""" 7 | import argparse 8 | import json 9 | import logging 10 | 11 | from docling_core.utils.validators import ( 12 | validate_ann_schema, 13 | validate_ocr_schema, 14 | validate_raw_schema, 15 | ) 16 | 17 | logger = logging.getLogger("docling-core") 18 | 19 | 20 | def parse_arguments(): 21 | """Parse the arguments from the command line.""" 22 | argparser = argparse.ArgumentParser(description="validate example-file with schema") 23 | 24 | argparser.add_argument( 25 | "-f", "--format", required=True, help="format of the file [RAW, ANN, OCR]" 26 | ) 27 | 28 | argparser.add_argument( 29 | "-i", "--input-file", required=True, help="JSON filename to be validated" 30 | ) 31 | 32 | pargs = argparser.parse_args() 33 | 34 | return pargs.format, pargs.input_file 35 | 36 | 37 | def run(): 38 | """Run the validation of a file containing a Document.""" 39 | file_format, input_file = parse_arguments() 40 | 41 | with open(input_file, "r", encoding="utf-8") as fd: 42 | file_ = json.load(fd) 43 | 44 | result = (False, "Empty result") 45 | 46 | if file_format == "RAW": 47 | result = validate_raw_schema(file_) 48 | 49 | elif file_format == "ANN": 50 | result = validate_ann_schema(file_) 51 | 52 | elif file_format == "OCR": 53 | result = validate_ocr_schema(file_) 54 | 55 | else: 56 | logger.error("format of the file needs to be `RAW`, `ANN` or `OCR`") 57 | 58 | if result[0]: 59 | logger.info("Done!") 60 | else: 61 | logger.error("invalid schema: {}".format(result[1])) 62 | 63 | 64 | def main(): 65 | """Set up the environment and run the validation of a Document.""" 66 | logger.setLevel(logging.DEBUG) 67 | 68 | # create console handler and set level to debug 69 | ch = logging.StreamHandler() 70 | ch.setLevel(logging.DEBUG) 71 | 72 | # create formatter 73 | formatter = logging.Formatter("[%(asctime)s] [%(levelname)s] %(message)s") 74 | 75 | # add formatter to ch 76 | ch.setFormatter(formatter) 77 | 78 | # add ch to logger 79 | # logger.addHandler(ch) 80 | 81 | logging.basicConfig(handlers=[ch]) 82 | run() 83 | 84 | 85 | if __name__ == "__main__": 86 | main() 87 | -------------------------------------------------------------------------------- /docling_core/utils/validators.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Module for custom type validators.""" 7 | import json 8 | import logging 9 | from datetime import datetime 10 | from importlib import resources 11 | from typing import Hashable, TypeVar 12 | 13 | import jsonschema 14 | from pydantic_core import PydanticCustomError 15 | 16 | logger = logging.getLogger("docling-core") 17 | 18 | T = TypeVar("T", bound=Hashable) 19 | 20 | 21 | def validate_schema(file_: dict, schema: dict) -> tuple[bool, str]: 22 | """Check whether the file is properly formatted JSON and contains valid keys. 23 | 24 | Where possible, this also checks a few basic dependencies between properties, but 25 | this functionality is limited. 26 | """ 27 | try: 28 | jsonschema.validate(file_, schema) 29 | return (True, "All good!") 30 | 31 | except jsonschema.ValidationError as err: 32 | return (False, err.message) 33 | 34 | 35 | def validate_raw_schema(file_: dict) -> tuple[bool, str]: 36 | """Validate a RAW file.""" 37 | logger.debug("validate RAW schema ... 
") 38 | 39 | schema_txt = ( 40 | resources.files("docling_core") 41 | .joinpath("resources/schemas/legacy_doc/RAW.json") 42 | .read_text("utf-8") 43 | ) 44 | schema = json.loads(schema_txt) 45 | 46 | return validate_schema(file_, schema) 47 | 48 | 49 | def validate_ann_schema(file_: dict) -> tuple[bool, str]: 50 | """Validate an annotated (ANN) file.""" 51 | logger.debug("validate ANN schema ... ") 52 | 53 | schema_txt = ( 54 | resources.files("docling_core") 55 | .joinpath("resources/schemas/legacy_doc/ANN.json") 56 | .read_text("utf-8") 57 | ) 58 | schema = json.loads(schema_txt) 59 | 60 | return validate_schema(file_, schema) 61 | 62 | 63 | def validate_ocr_schema(file_: dict) -> tuple[bool, str]: 64 | """Validate an OCR file.""" 65 | logger.debug("validate OCR schema ... ") 66 | 67 | schema_txt = ( 68 | resources.files("docling_core") 69 | .joinpath("resources/schemas/legacy_doc/OCR-output.json") 70 | .read_text("utf-8") 71 | ) 72 | schema = json.loads(schema_txt) 73 | 74 | return validate_schema(file_, schema) 75 | 76 | 77 | def validate_unique_list(v: list[T]) -> list[T]: 78 | """Validate that a list has unique values. 79 | 80 | Validator for list types, since pydantic V2 does not support the `unique_items` 81 | parameter from V1. More information on 82 | https://github.com/pydantic/pydantic-core/pull/820#issuecomment-1670475909 83 | 84 | Args: 85 | v: any list of hashable types 86 | 87 | Returns: 88 | The list, after checking for unique items. 89 | """ 90 | if len(v) != len(set(v)): 91 | raise PydanticCustomError("unique_list", "List must be unique") 92 | return v 93 | 94 | 95 | def validate_datetime(v, handler): 96 | """Validate that a value is a datetime or a non-numeric string.""" 97 | if type(v) is datetime or (type(v) is str and not v.isnumeric()): 98 | return handler(v) 99 | else: 100 | raise ValueError("Value type must be a datetime or a non-numeric string") 101 | -------------------------------------------------------------------------------- /docs/Generic.json: -------------------------------------------------------------------------------- 1 | { 2 | "$defs": { 3 | "FileInfoObject": { 4 | "description": "Filing information for any data object to be stored in a Docling database.", 5 | "properties": { 6 | "filename": { 7 | "description": "The name of a persistent object that created this data object", 8 | "title": "Filename", 9 | "type": "string", 10 | "x-es-ignore_above": 8191, 11 | "x-es-type": "keyword" 12 | }, 13 | "filename-prov": { 14 | "anyOf": [ 15 | { 16 | "type": "string" 17 | }, 18 | { 19 | "type": "null" 20 | } 21 | ], 22 | "default": null, 23 | "description": "The provenance of this data object, e.g. 
an archive file, a URL, or any other repository.", 24 | "title": "Filename-Prov", 25 | "x-es-ignore_above": 8191, 26 | "x-es-type": "keyword" 27 | }, 28 | "document-hash": { 29 | "description": "A unique identifier of this data object within a collection of a Docling database", 30 | "title": "Document-Hash", 31 | "type": "string", 32 | "x-es-ignore_above": 8191, 33 | "x-es-type": "keyword" 34 | } 35 | }, 36 | "required": [ 37 | "filename", 38 | "document-hash" 39 | ], 40 | "title": "FileInfoObject", 41 | "type": "object" 42 | } 43 | }, 44 | "description": "A representation of a generic document.", 45 | "properties": { 46 | "_name": { 47 | "anyOf": [ 48 | { 49 | "type": "string" 50 | }, 51 | { 52 | "type": "null" 53 | } 54 | ], 55 | "default": null, 56 | "description": "A short description or summary of the document.", 57 | "title": "Name", 58 | "x-es-type": "text" 59 | }, 60 | "file-info": { 61 | "$ref": "#/$defs/FileInfoObject", 62 | "description": "Minimal identification information of the document within a collection.", 63 | "title": "Document information" 64 | } 65 | }, 66 | "required": [ 67 | "file-info" 68 | ], 69 | "title": "Generic", 70 | "type": "object" 71 | } -------------------------------------------------------------------------------- /examples/chunking_and_serialization.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "
\n", 8 | " 👉 INFO: This notebook has moved to the Docling docs, check it out \n", 9 | " \n", 10 | " here.\n", 11 | "
" 12 | ] 13 | } 14 | ], 15 | "metadata": { 16 | "kernelspec": { 17 | "display_name": ".venv", 18 | "language": "python", 19 | "name": "python3" 20 | }, 21 | "language_info": { 22 | "codemirror_mode": { 23 | "name": "ipython", 24 | "version": 3 25 | }, 26 | "file_extension": ".py", 27 | "mimetype": "text/x-python", 28 | "name": "python", 29 | "nbconvert_exporter": "python", 30 | "pygments_lexer": "ipython3", 31 | "version": "3.12.4" 32 | } 33 | }, 34 | "nbformat": 4, 35 | "nbformat_minor": 2 36 | } 37 | -------------------------------------------------------------------------------- /test/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | -------------------------------------------------------------------------------- /test/data/doc/01030000000083.dt: -------------------------------------------------------------------------------- 1 | CategoryNumber of clauses in Union lawsIn percentNumber of clauses in State lawsIn percentCommercial52910.1%8173.9%Environment, Health and Safety83415.9%3451.7%Finance & Taxation410.8%8884.2%General751.4%3601.7%Industry Specific297956.9%12005.7%Labour53410.2%1728582.7%Secretarial2474.7%00.0%TABLE 35: UNION-STATE BREAKDOWN OF IMPRISONMENT CLAUSES BY CATEGORIES 2 | CompliancesSmallMediumLargeTotal Applicable Compliances6693,1095,796Compliances with imprisonment4612,1724,085Percentage of imprisonment clauses69%70%70%TABLE 36: THREE CASE STUDIES ON MANUFACTURING COMPLIANCES* 3 | * These are real data from three companies operating in the automotive components business 4 | SmallMediumLargeLess than 3 months25821853 months to less than 1 year1876991,2201 year to less than 3 years1781,0701,9643 years to less than 5 years592455055 years to 10 years1276211TABLE 37: BREAKDOWN OF IMPRISONMENT CLAUSES IN MANUFACTURING CASE STUDIES* 5 | * In Table 36 6 | 85 7 | Appendices 8 | -------------------------------------------------------------------------------- /test/data/doc/01030000000083.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/docling-project/docling-core/878aafd1b78f7f3d730ccb4bd519650230651498/test/data/doc/01030000000083.png -------------------------------------------------------------------------------- /test/data/doc/01030000000111.dt: -------------------------------------------------------------------------------- 1 | Figure 8.1: a) P6238 CUSSONS free and forced vortex apparatus, b) push-in orifices, c) free vortex measuring caliper, d) force vortex measuring probes 2 | 7. THEORY 3 | Two types of vortices are distinguished in the dynamics of the motion: forced and free vortices. The forced vortex is caused by external forces on the fluid, such as the impeller of a pump, and the free vortex naturally occurs in the flow and can be observed in a drain or in the atmosphere of a tornado. 4 | 7.1. FREE VORTEX 5 | A free vortex is formed when water flows out of a vessel through a central hole in the base (Figure 8.2). The degree of the rotation depends on the initial disturbance. In a free cylindrical vortex, the velocity varies inversely with the distance from the axis of rotation (Figure 8.3). 
6 | \upsilon = \frac { k } { r } \quad \quad ( 1 ) 7 | The equation governing the surface profile is derived from the Bernoulli's theorem: 8 | \frac { \upsilon ^ { 2 } } { 2 g } + \, z = C \quad \quad ( 2 ) 9 | Substituting Equation (1) into (2) will give a new expression: 10 | \frac { k ^ { 2 } } { 2 g r ^ { 2 } } + \, z = C \quad \quad ( 3 ) 11 | or: 12 | 68 APPLIED FLUID MECHANICS LAB MANUAL 13 | -------------------------------------------------------------------------------- /test/data/doc/01030000000111.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/docling-project/docling-core/878aafd1b78f7f3d730ccb4bd519650230651498/test/data/doc/01030000000111.png -------------------------------------------------------------------------------- /test/data/doc/2206.01062.yaml.et: -------------------------------------------------------------------------------- 1 | 0: unspecified with name=_root_ 2 | 1: section_header 3 | 2: text 4 | 3: text 5 | 4: text 6 | 5: text 7 | 6: text 8 | 7: section_header 9 | 8: text 10 | 9: section_header 11 | 10: text 12 | 11: text 13 | 12: text 14 | 13: text 15 | 14: text 16 | 15: text 17 | 16: picture 18 | 17: caption 19 | 18: section_header 20 | 19: text 21 | 20: section_header 22 | 21: text 23 | 22: section_header 24 | 23: text 25 | 24: text 26 | 25: text 27 | 26: list with name=list 28 | 27: list_item 29 | 28: list_item 30 | 29: list_item 31 | 30: list_item 32 | 31: footnote 33 | 32: text 34 | 33: list with name=list 35 | 34: list_item 36 | 35: text 37 | 36: text 38 | 37: section_header 39 | 38: text 40 | 39: text 41 | 40: section_header 42 | 41: text 43 | 42: text 44 | 43: picture 45 | 44: caption 46 | 45: text 47 | 46: text 48 | 47: text 49 | 48: text 50 | 49: footnote 51 | 50: text 52 | 51: text 53 | 52: text 54 | 53: section_header 55 | 54: text 56 | 55: table 57 | 56: caption 58 | 57: picture 59 | 58: caption 60 | 59: text 61 | 60: text 62 | 61: text 63 | 62: text 64 | 63: footnote 65 | 64: text 66 | 65: text 67 | 66: text 68 | 67: list with name=list 69 | 68: list_item 70 | 69: list_item 71 | 70: list_item 72 | 71: list_item 73 | 72: list_item 74 | 73: list_item 75 | 74: text 76 | 75: text 77 | 76: picture 78 | 77: text 79 | 78: caption 80 | 79: text 81 | 80: text 82 | 81: text 83 | 82: table 84 | 83: text 85 | 84: section_header 86 | 85: text 87 | 86: picture 88 | 87: caption 89 | 88: text 90 | 89: text 91 | 90: section_header 92 | 91: text 93 | 92: text 94 | 93: table 95 | 94: section_header 96 | 95: text 97 | 96: section_header 98 | 97: text 99 | 98: text 100 | 99: table 101 | 100: text 102 | 101: section_header 103 | 102: text 104 | 103: section_header 105 | 104: text 106 | 105: text 107 | 106: table 108 | 107: text 109 | 108: text 110 | 109: section_header 111 | 110: text 112 | 111: section_header 113 | 112: text 114 | 113: text 115 | 114: text 116 | 115: section_header 117 | 116: list with name=list 118 | 117: list_item 119 | 118: list_item 120 | 119: list_item 121 | 120: list_item 122 | 121: list_item 123 | 122: list_item 124 | 123: list_item 125 | 124: list_item 126 | 125: list_item 127 | 126: list_item 128 | 127: list_item 129 | 128: list_item 130 | 129: list_item 131 | 130: picture 132 | 131: caption 133 | 132: text 134 | 133: text 135 | 134: list with name=list 136 | 135: list_item 137 | 136: list_item 138 | 137: list_item 139 | 138: list_item 140 | 139: list_item 141 | 140: list_item 142 | 141: list_item 143 | 142: list_item 144 | 143: list_item 145 | 144: list_item 146 | 
-------------------------------------------------------------------------------- /test/data/doc/2408.09869v3_enriched_p1_include_annotations_false.gt.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |
6 |

Docling Technical Report

7 |

Version 1.0

8 |

Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar

9 |

AI4K Group, IBM Research R¨ uschlikon, Switzerland

10 |

Abstract

11 |

This technical report introduces Docling , an easy to use, self-contained, MITlicensed open-source package for PDF document conversion. It is powered by state-of-the-art specialized AI models for layout analysis (DocLayNet) and table structure recognition (TableFormer), and runs efficiently on commodity hardware in a small resource budget. The code interface allows for easy extensibility and addition of new features and models.

12 |

1 Introduction

13 |

Converting PDF documents back into a machine-processable format has been a major challenge for decades due to their huge variability in formats, weak standardization and printing-optimized characteristic, which discards most structural features and metadata. With the advent of LLMs and popular application patterns such as retrieval-augmented generation (RAG), leveraging the rich content embedded in PDFs has become ever more relevant. In the past decade, several powerful document understanding solutions have emerged on the market, most of which are commercial software, cloud offerings [3] and most recently, multi-modal vision-language models. As of today, only a handful of open-source tools cover PDF conversion, leaving a significant feature and quality gap to proprietary solutions.

14 |

With Docling , we open-source a very capable and efficient document conversion tool which builds on the powerful, specialized AI models and datasets for layout analysis and table structure recognition we developed and presented in the recent past [12, 13, 9]. Docling is designed as a simple, self-contained python library with permissive license, running entirely locally on commodity hardware. Its code architecture allows for easy extensibility and addition of new features and models.

15 |
16 | 17 | 18 | -------------------------------------------------------------------------------- /test/data/doc/2408.09869v3_enriched_p1_include_annotations_true.gt.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |
6 |

Docling Technical Report

7 |
In this image we can see a cartoon image of a duck holding a paper.
8 |

Version 1.0

9 |

Christoph Auer Maksym Lysak Ahmed Nassar Michele Dolfi Nikolaos Livathinos Panos Vagenas Cesar Berrospi Ramis Matteo Omenetti Fabian Lindlbauer Kasper Dinkla Lokesh Mishra Yusik Kim Shubham Gupta Rafael Teixeira de Lima Valery Weber Lucas Morin Ingmar Meijer Viktor Kuropiatnyk Peter W. J. Staar

10 |

AI4K Group, IBM Research R¨ uschlikon, Switzerland

11 |

Abstract

12 |

This technical report introduces Docling , an easy to use, self-contained, MITlicensed open-source package for PDF document conversion. It is powered by state-of-the-art specialized AI models for layout analysis (DocLayNet) and table structure recognition (TableFormer), and runs efficiently on commodity hardware in a small resource budget. The code interface allows for easy extensibility and addition of new features and models.

13 |

1 Introduction

14 |

Converting PDF documents back into a machine-processable format has been a major challenge for decades due to their huge variability in formats, weak standardization and printing-optimized characteristic, which discards most structural features and metadata. With the advent of LLMs and popular application patterns such as retrieval-augmented generation (RAG), leveraging the rich content embedded in PDFs has become ever more relevant. In the past decade, several powerful document understanding solutions have emerged on the market, most of which are commercial software, cloud offerings [3] and most recently, multi-modal vision-language models. As of today, only a handful of open-source tools cover PDF conversion, leaving a significant feature and quality gap to proprietary solutions.

15 |

With Docling , we open-source a very capable and efficient document conversion tool which builds on the powerful, specialized AI models and datasets for layout analysis and table structure recognition we developed and presented in the recent past [12, 13, 9]. Docling is designed as a simple, self-contained python library with permissive license, running entirely locally on commodity hardware. Its code architecture allows for easy extensibility and addition of new features and models.

16 |
17 | 18 | 19 | -------------------------------------------------------------------------------- /test/data/doc/activities.gt.md: -------------------------------------------------------------------------------- 1 | ## Summer activities 2 | 3 | ## Swimming in the lake 4 | 5 | Duck 6 | 7 | Figure 1: This is a cute duckling 8 | 9 | ## Let's swim! 10 | 11 | To get started with swimming, first lay down in a water and try not to drown: 12 | 13 | - ∞ You can relax and look around 14 | - ∞ Paddle about 15 | - ∞ Enjoy summer warmth 16 | 17 | Also, don't forget: 18 | 19 | - 1. Wear sunglasses 20 | - 2. Don't forget to drink water 21 | - 3. Use sun cream 22 | 23 | Hmm, what else… 24 | 25 | - -Another activity item 26 | 27 | - -Yet another one 28 | - -Stopping it here 29 | 30 | Some text. 31 | 32 | 33 | 34 | - -Starting the next page with a list item. 35 | - -Second item. 36 | -------------------------------------------------------------------------------- /test/data/doc/activities_p2.gt.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | activities 6 | 7 | 124 | 125 | 126 |
127 |
    128 |
  • -Yet another one
  • 129 |
  • -Stopping it here
  • 130 |
131 |

Some text.

132 |
133 | 134 | 135 | -------------------------------------------------------------------------------- /test/data/doc/activities_p2.gt.md: -------------------------------------------------------------------------------- 1 | - -Yet another one 2 | - -Stopping it here 3 | 4 | Some text. 5 | -------------------------------------------------------------------------------- /test/data/doc/activities_pb_empty.gt.md: -------------------------------------------------------------------------------- 1 | ## Summer activities 2 | 3 | ## Swimming in the lake 4 | 5 | Duck 6 | 7 | Figure 1: This is a cute duckling 8 | 9 | ## Let's swim! 10 | 11 | To get started with swimming, first lay down in a water and try not to drown: 12 | 13 | - ∞ You can relax and look around 14 | - ∞ Paddle about 15 | - ∞ Enjoy summer warmth 16 | 17 | Also, don't forget: 18 | 19 | - 1. Wear sunglasses 20 | - 2. Don't forget to drink water 21 | - 3. Use sun cream 22 | 23 | Hmm, what else… 24 | 25 | - -Another activity item 26 | 27 | - -Yet another one 28 | - -Stopping it here 29 | 30 | Some text. 31 | 32 | 33 | 34 | - -Starting the next page with a list item. 35 | - -Second item. 36 | -------------------------------------------------------------------------------- /test/data/doc/activities_pb_non_empty.gt.md: -------------------------------------------------------------------------------- 1 | ## Summer activities 2 | 3 | ## Swimming in the lake 4 | 5 | Duck 6 | 7 | Figure 1: This is a cute duckling 8 | 9 | ## Let's swim! 10 | 11 | To get started with swimming, first lay down in a water and try not to drown: 12 | 13 | - ∞ You can relax and look around 14 | - ∞ Paddle about 15 | - ∞ Enjoy summer warmth 16 | 17 | Also, don't forget: 18 | 19 | - 1. Wear sunglasses 20 | - 2. Don't forget to drink water 21 | - 3. Use sun cream 22 | 23 | Hmm, what else… 24 | 25 | - -Another activity item 26 | 27 | - -Yet another one 28 | - -Stopping it here 29 | 30 | Some text. 31 | 32 | 33 | 34 | - -Starting the next page with a list item. 35 | - -Second item. 36 | -------------------------------------------------------------------------------- /test/data/doc/activities_pb_none.gt.md: -------------------------------------------------------------------------------- 1 | ## Summer activities 2 | 3 | ## Swimming in the lake 4 | 5 | Duck 6 | 7 | Figure 1: This is a cute duckling 8 | 9 | ## Let's swim! 10 | 11 | To get started with swimming, first lay down in a water and try not to drown: 12 | 13 | - ∞ You can relax and look around 14 | - ∞ Paddle about 15 | - ∞ Enjoy summer warmth 16 | 17 | Also, don't forget: 18 | 19 | - 1. Wear sunglasses 20 | - 2. Don't forget to drink water 21 | - 3. Use sun cream 22 | 23 | Hmm, what else… 24 | 25 | - -Another activity item 26 | - -Yet another one 27 | - -Stopping it here 28 | 29 | Some text. 30 | 31 | - -Starting the next page with a list item. 32 | - -Second item. 
33 | -------------------------------------------------------------------------------- /test/data/doc/bad_doc.yaml.dt: -------------------------------------------------------------------------------- 1 | This is the title 2 | This is the first section 3 | 4 | -------------------------------------------------------------------------------- /test/data/doc/bad_doc.yaml.et: -------------------------------------------------------------------------------- 1 | 0: unspecified with name=_root_ 2 | 1: title 3 | 2: unspecified with name=chapter 1 4 | 3: section_header 5 | -------------------------------------------------------------------------------- /test/data/doc/bad_doc.yaml.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | 8 | Powered by Docling 9 | 10 | 71 | 72 |

This is the title

73 |

This is the first section

74 | 75 | -------------------------------------------------------------------------------- /test/data/doc/bad_doc.yaml.md: -------------------------------------------------------------------------------- 1 | # This is the title 2 | 3 | ### This is the first section 4 | -------------------------------------------------------------------------------- /test/data/doc/barchart.dt: -------------------------------------------------------------------------------- 1 | Probability, Combinatorics and Control 2 | Number of impellerssingle-frequencymulti-frequency10.060.1620.120.2630.160.2740.140.2650.160.2560.240.24 3 | -------------------------------------------------------------------------------- /test/data/doc/barchart.gt.md: -------------------------------------------------------------------------------- 1 | bar chart 2 | 3 | 4 | 5 | | Number of impellers | single-frequency | multi-frequency | 6 | |-----------------------|--------------------|-------------------| 7 | | 1 | 0.06 | 0.16 | 8 | | 2 | 0.12 | 0.26 | 9 | | 3 | 0.16 | 0.27 | 10 | | 4 | 0.14 | 0.26 | 11 | | 5 | 0.16 | 0.25 | 12 | | 6 | 0.24 | 0.24 | 13 | -------------------------------------------------------------------------------- /test/data/doc/barchart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/docling-project/docling-core/878aafd1b78f7f3d730ccb4bd519650230651498/test/data/doc/barchart.png -------------------------------------------------------------------------------- /test/data/doc/constructed_doc.dt: -------------------------------------------------------------------------------- 1 | item of leading list 2 | 3 | Title of the Document 4 | Author 1 5 | Affiliation 1 6 | Author 2 7 | Affiliation 2 8 | 1. Introduction 9 | This paper introduces the biggest invention ever made. ... 10 | list item 1 11 | list item 2 12 | list item 3 13 | list item 3.a 14 | list item 3.b 15 | list item 3.c 16 | list item 3.c.i 17 | 18 | 19 | list item 4 20 | 21 | ProductYears20162017Apple49823695944This is the caption of table 1. 22 | This is the caption of figure 1. 23 | This is the caption of figure 2. 24 | item 1 of list 25 | 26 | item 1 of list after empty list 27 | item 2 of list after empty list 28 | 29 | item 1 of neighboring list 30 | item 2 of neighboring list 31 | item 1 of sub list 32 | Here a code snippet: 33 | <_unknown_>print("Hello world") 34 | (to be displayed inline) 35 | 36 | Here a formula: 37 | E=mc^2 38 | (to be displayed inline) 39 | 40 | 41 | 42 | Here a code block: 43 | <_unknown_>print("Hello world") 44 | Here a formula block: 45 | E=mc^2 46 | number1 47 | Some formatting chops: 48 | bold 49 | italic 50 | underline 51 | strikethrough 52 | subscript 53 | superscript 54 | hyperlink 55 | & 56 | everything at the same time. 57 | 58 | Item 1 in A 59 | Item 2 in A 60 | Item 3 in A 61 | Item 1 in B 62 | Item 2 in B 63 | Item 1 in C 64 | Item 2 in C 65 | 66 | Item 3 in B 67 | 68 | Item 4 in A 69 | 70 | The end. 71 | -------------------------------------------------------------------------------- /test/data/doc/constructed_doc.dt.gt: -------------------------------------------------------------------------------- 1 | item of leading list 2 | 3 | Title of the Document 4 | Author 1 5 | Affiliation 1 6 | Author 2 7 | Affiliation 2 8 | 1. Introduction 9 | This paper introduces the biggest invention ever made. ... 
10 | list item 1 11 | list item 2 12 | list item 3 13 | list item 3.a 14 | list item 3.b 15 | list item 3.c 16 | list item 3.c.i 17 | 18 | 19 | list item 4 20 | 21 | ProductYears20162017Apple49823695944This is the caption of table 1. 22 | This is the caption of figure 1. 23 | This is the caption of figure 2. 24 | item 1 of list 25 | 26 | item 1 of list after empty list 27 | item 2 of list after empty list 28 | 29 | item 1 of neighboring list 30 | item 2 of neighboring list 31 | item 1 of sub list 32 | Here a code snippet: 33 | <_unknown_>print("Hello world") 34 | (to be displayed inline) 35 | 36 | Here a formula: 37 | E=mc^2 38 | (to be displayed inline) 39 | 40 | 41 | 42 | Here a code block: 43 | <_unknown_>print("Hello world") 44 | Here a formula block: 45 | E=mc^2 46 | number1 47 | Some formatting chops: 48 | bold 49 | italic 50 | underline 51 | strikethrough 52 | subscript 53 | superscript 54 | hyperlink 55 | & 56 | everything at the same time. 57 | 58 | Item 1 in A 59 | Item 2 in A 60 | Item 3 in A 61 | Item 1 in B 62 | Item 2 in B 63 | Item 1 in C 64 | Item 2 in C 65 | 66 | Item 3 in B 67 | 68 | Item 4 in A 69 | 70 | The end. 71 | -------------------------------------------------------------------------------- /test/data/doc/constructed_doc.embedded.md.gt: -------------------------------------------------------------------------------- 1 | - item of leading list 2 | 3 | # Title of the Document 4 | 5 | Author 1 6 | Affiliation 1 7 | 8 | Author 2 9 | Affiliation 2 10 | 11 | ## 1. Introduction 12 | 13 | This paper introduces the biggest invention ever made. ... 14 | 15 | - list item 1 16 | - list item 2 17 | - list item 3 18 | 1. list item 3.a 19 | 2. list item 3.b 20 | 3. list item 3.c 21 | 1. list item 3.c.i 22 | - list item 4 23 | 24 | This is the caption of table 1. 25 | 26 | | Product | Years | Years | 27 | |-----------|---------|---------| 28 | | Product | 2016 | 2017 | 29 | | Apple | 49823 | 695944 | 30 | 31 | This is the caption of figure 1. 32 | 33 | 34 | 35 | This is the caption of figure 2. 36 | 37 | ![Image](data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAEAAAABACAIAAAAlC+aJAAAAIklEQVR4nO3BAQ0AAADCoPdPbQ8HFAAAAAAAAAAAAAAA8G4wQAABiwCo9wAAAABJRU5ErkJggg==) 38 | 39 | - item 1 of list 40 | 41 | - item 1 of list after empty list 42 | - item 2 of list after empty list 43 | 44 | - item 1 of neighboring list 45 | - item 2 of neighboring list 46 | - item 1 of sub list 47 | - Here a code snippet: `print("Hello world")` (to be displayed inline) 48 | - Here a formula: $E=mc^2$ (to be displayed inline) 49 | 50 | Here a code block: 51 | 52 | ``` 53 | print("Hello world") 54 | ``` 55 | 56 | Here a formula block: 57 | 58 | $$E=mc^2$$ 59 | 60 | 61 | 62 | 63 | 64 | Some formatting chops: **bold** *italic* underline ~~strikethrough~~ subscript superscript [hyperlink](.) & [~~***everything at the same time.***~~](https://github.com/DS4SD/docling) 65 | 66 | 1. Item 1 in A 67 | 2. Item 2 in A 68 | 3. Item 3 in A 69 | 1. Item 1 in B 70 | 2. Item 2 in B 71 | 1. Item 1 in C 72 | 2. Item 2 in C 73 | 3. Item 3 in B 74 | 4. Item 4 in A 75 | 76 | The end. -------------------------------------------------------------------------------- /test/data/doc/constructed_doc.placeholder.md.gt: -------------------------------------------------------------------------------- 1 | - item of leading list 2 | 3 | # Title of the Document 4 | 5 | Author 1 6 | Affiliation 1 7 | 8 | Author 2 9 | Affiliation 2 10 | 11 | ## 1. Introduction 12 | 13 | This paper introduces the biggest invention ever made. ... 
14 | 15 | - list item 1 16 | - list item 2 17 | - list item 3 18 | 1. list item 3.a 19 | 2. list item 3.b 20 | 3. list item 3.c 21 | 1. list item 3.c.i 22 | - list item 4 23 | 24 | This is the caption of table 1. 25 | 26 | | Product | Years | Years | 27 | |-----------|---------|---------| 28 | | Product | 2016 | 2017 | 29 | | Apple | 49823 | 695944 | 30 | 31 | This is the caption of figure 1. 32 | 33 | 34 | 35 | This is the caption of figure 2. 36 | 37 | 38 | 39 | - item 1 of list 40 | 41 | - item 1 of list after empty list 42 | - item 2 of list after empty list 43 | 44 | - item 1 of neighboring list 45 | - item 2 of neighboring list 46 | - item 1 of sub list 47 | - Here a code snippet: `print("Hello world")` (to be displayed inline) 48 | - Here a formula: $E=mc^2$ (to be displayed inline) 49 | 50 | Here a code block: 51 | 52 | ``` 53 | print("Hello world") 54 | ``` 55 | 56 | Here a formula block: 57 | 58 | $$E=mc^2$$ 59 | 60 | 61 | 62 | 63 | 64 | Some formatting chops: **bold** *italic* underline ~~strikethrough~~ subscript superscript [hyperlink](.) & [~~***everything at the same time.***~~](https://github.com/DS4SD/docling) 65 | 66 | 1. Item 1 in A 67 | 2. Item 2 in A 68 | 3. Item 3 in A 69 | 1. Item 1 in B 70 | 2. Item 2 in B 71 | 1. Item 1 in C 72 | 2. Item 2 in C 73 | 3. Item 3 in B 74 | 4. Item 4 in A 75 | 76 | The end. -------------------------------------------------------------------------------- /test/data/doc/constructed_doc.referenced.md.gt: -------------------------------------------------------------------------------- 1 | - item of leading list 2 | 3 | # Title of the Document 4 | 5 | Author 1 6 | Affiliation 1 7 | 8 | Author 2 9 | Affiliation 2 10 | 11 | ## 1. Introduction 12 | 13 | This paper introduces the biggest invention ever made. ... 14 | 15 | - list item 1 16 | - list item 2 17 | - list item 3 18 | 1. list item 3.a 19 | 2. list item 3.b 20 | 3. list item 3.c 21 | 1. list item 3.c.i 22 | - list item 4 23 | 24 | This is the caption of table 1. 25 | 26 | | Product | Years | Years | 27 | |-----------|---------|---------| 28 | | Product | 2016 | 2017 | 29 | | Apple | 49823 | 695944 | 30 | 31 | This is the caption of figure 1. 32 | 33 | 34 | 35 | This is the caption of figure 2. 36 | 37 | ![Image](constructed_images/image_000001_f3cc103136423a57975750907ebc1d367e2985ac6338976d4d5a439f50323f4a.png) 38 | 39 | - item 1 of list 40 | 41 | - item 1 of list after empty list 42 | - item 2 of list after empty list 43 | 44 | - item 1 of neighboring list 45 | - item 2 of neighboring list 46 | - item 1 of sub list 47 | - Here a code snippet: `print("Hello world")` (to be displayed inline) 48 | - Here a formula: $E=mc^2$ (to be displayed inline) 49 | 50 | Here a code block: 51 | 52 | ``` 53 | print("Hello world") 54 | ``` 55 | 56 | Here a formula block: 57 | 58 | $$E=mc^2$$ 59 | 60 | 61 | 62 | 63 | 64 | Some formatting chops: **bold** *italic* underline ~~strikethrough~~ subscript superscript [hyperlink](.) & [~~***everything at the same time.***~~](https://github.com/DS4SD/docling) 65 | 66 | 1. Item 1 in A 67 | 2. Item 2 in A 68 | 3. Item 3 in A 69 | 1. Item 1 in B 70 | 2. Item 2 in B 71 | 1. Item 1 in C 72 | 2. Item 2 in C 73 | 3. Item 3 in B 74 | 4. Item 4 in A 75 | 76 | The end. 
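The three ground truths above differ only in how picture items are serialized: `embedded` inlines each image as a base64 data URI, `placeholder` stands in for the image with a comment, and `referenced` links a PNG written under `constructed_images/`. A minimal sketch of how such fixtures might be regenerated with docling-core — assuming `DoclingDocument.load_from_json` and `save_as_markdown(image_mode=...)` behave as in recent releases; the input path is illustrative:

```python
# Sketch: regenerate the three image-mode markdown ground truths.
# Assumes docling-core's DoclingDocument.load_from_json and
# save_as_markdown(image_mode=...) APIs; the input path is illustrative.
from pathlib import Path

from docling_core.types.doc import DoclingDocument, ImageRefMode

doc = DoclingDocument.load_from_json("test/data/doc/activities.json")

for mode in (ImageRefMode.EMBEDDED, ImageRefMode.PLACEHOLDER, ImageRefMode.REFERENCED):
    # EMBEDDED -> base64 data URIs inline, PLACEHOLDER -> an image comment,
    # REFERENCED -> PNGs saved alongside the file and linked by relative path.
    doc.save_as_markdown(Path(f"out.{mode.value}.md"), image_mode=mode)
```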
-------------------------------------------------------------------------------- /test/data/doc/constructed_document.yaml.dt: -------------------------------------------------------------------------------- 1 | item of leading list 2 | 3 | Title of the Document 4 | Author 1 5 | Affiliation 1 6 | Author 2 7 | Affiliation 2 8 | 1. Introduction 9 | This paper introduces the biggest invention ever made. ... 10 | list item 1 11 | list item 2 12 | list item 3 13 | list item 3.a 14 | list item 3.b 15 | list item 3.c 16 | list item 3.c.i 17 | 18 | 19 | list item 4 20 | 21 | ProductYears20162017Apple49823695944This is the caption of table 1. 22 | This is the caption of figure 1. 23 | This is the caption of figure 2. 24 | item 1 of list 25 | 26 | item 1 of list after empty list 27 | item 2 of list after empty list 28 | 29 | item 1 of neighboring list 30 | item 2 of neighboring list 31 | item 1 of sub list 32 | Here a code snippet: 33 | <_unknown_>print("Hello world") 34 | (to be displayed inline) 35 | 36 | Here a formula: 37 | E=mc^2 38 | (to be displayed inline) 39 | 40 | 41 | 42 | Here a code block: 43 | <_unknown_>print("Hello world") 44 | Here a formula block: 45 | E=mc^2 46 | number1 47 | Some formatting chops: 48 | bold 49 | italic 50 | underline 51 | strikethrough 52 | subscript 53 | superscript 54 | hyperlink 55 | & 56 | everything at the same time. 57 | 58 | Item 1 in A 59 | Item 2 in A 60 | Item 3 in A 61 | Item 1 in B 62 | Item 2 in B 63 | Item 1 in C 64 | Item 2 in C 65 | 66 | Item 3 in B 67 | 68 | Item 4 in A 69 | 70 | The end. 71 | 72 | -------------------------------------------------------------------------------- /test/data/doc/constructed_document.yaml.et: -------------------------------------------------------------------------------- 1 | 0: unspecified with name=_root_ 2 | 1: list with name=group 3 | 2: list_item 4 | 3: title 5 | 4: text 6 | 5: text 7 | 6: chapter with name=Introduction 8 | 7: section_header 9 | 8: text 10 | 9: list with name=group 11 | 10: list_item 12 | 11: list_item 13 | 12: list_item 14 | 13: ordered_list with name=group 15 | 14: list_item 16 | 15: list_item 17 | 16: list_item 18 | 17: ordered_list with name=group 19 | 18: list_item 20 | 19: list_item 21 | 20: caption 22 | 21: table 23 | 22: caption 24 | 23: picture 25 | 24: caption 26 | 25: picture 27 | 26: list with name=group 28 | 27: list_item 29 | 28: list with name=group 30 | 29: list with name=group 31 | 30: list_item 32 | 31: list_item 33 | 32: list with name=group 34 | 33: list_item 35 | 34: list_item 36 | 35: list with name=group 37 | 36: list_item 38 | 37: inline with name=group 39 | 38: text 40 | 39: code 41 | 40: text 42 | 41: inline with name=group 43 | 42: text 44 | 43: formula 45 | 44: text 46 | 45: text 47 | 46: code 48 | 47: text 49 | 48: formula 50 | 49: key_value_region 51 | 50: form 52 | 51: inline with name=group 53 | 52: text 54 | 53: text 55 | 54: text 56 | 55: text 57 | 56: text 58 | 57: text 59 | 58: text 60 | 59: text 61 | 60: text 62 | 61: text 63 | 62: ordered_list with name=list A 64 | 63: list_item 65 | 64: list_item 66 | 65: list_item 67 | 66: ordered_list with name=list B 68 | 67: list_item 69 | 68: list_item 70 | 69: ordered_list with name=list C 71 | 70: list_item 72 | 71: list_item 73 | 72: list_item 74 | 73: list_item 75 | 74: text 76 | -------------------------------------------------------------------------------- /test/data/doc/constructed_document.yaml.md: -------------------------------------------------------------------------------- 1 | - item of leading list 2 | 3 
| # Title of the Document 4 | 5 | Author 1 6 | Affiliation 1 7 | 8 | Author 2 9 | Affiliation 2 10 | 11 | ## 1. Introduction 12 | 13 | This paper introduces the biggest invention ever made. ... 14 | 15 | - list item 1 16 | - list item 2 17 | - list item 3 18 | 1. list item 3.a 19 | 2. list item 3.b 20 | 3. list item 3.c 21 | 1. list item 3.c.i 22 | - list item 4 23 | 24 | This is the caption of table 1. 25 | 26 | | Product | Years | Years | 27 | |-----------|---------|---------| 28 | | Product | 2016 | 2017 | 29 | | Apple | 49823 | 695944 | 30 | 31 | This is the caption of figure 1. 32 | 33 | 34 | 35 | This is the caption of figure 2. 36 | 37 | 38 | 39 | - item 1 of list 40 | 41 | - item 1 of list after empty list 42 | - item 2 of list after empty list 43 | 44 | - item 1 of neighboring list 45 | - item 2 of neighboring list 46 | - item 1 of sub list 47 | - Here a code snippet: `print("Hello world")` (to be displayed inline) 48 | - Here a formula: $E=mc^2$ (to be displayed inline) 49 | 50 | Here a code block: 51 | 52 | ``` 53 | print("Hello world") 54 | ``` 55 | 56 | Here a formula block: 57 | 58 | $$E=mc^2$$ 59 | 60 | 61 | 62 | 63 | 64 | Some formatting chops: **bold** *italic* underline ~~strikethrough~~ subscript superscript [hyperlink](.) & [~~***everything at the same time.***~~](https://github.com/DS4SD/docling) 65 | 66 | 1. Item 1 in A 67 | 2. Item 2 in A 68 | 3. Item 3 in A 69 | 1. Item 1 in B 70 | 2. Item 2 in B 71 | 1. Item 1 in C 72 | 2. Item 2 in C 73 | 3. Item 3 in B 74 | 4. Item 4 in A 75 | 76 | The end. 77 | -------------------------------------------------------------------------------- /test/data/doc/constructed_images/image_000001_797618e862d279d4e3e92f4b6313175f67e08fc36051dfda092bf63220568703.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/docling-project/docling-core/878aafd1b78f7f3d730ccb4bd519650230651498/test/data/doc/constructed_images/image_000001_797618e862d279d4e3e92f4b6313175f67e08fc36051dfda092bf63220568703.png -------------------------------------------------------------------------------- /test/data/doc/constructed_images/image_000001_ccb4cbe7039fe17892f3d611cfb71eafff1d4d230b19b10779334cc4b63c98bc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/docling-project/docling-core/878aafd1b78f7f3d730ccb4bd519650230651498/test/data/doc/constructed_images/image_000001_ccb4cbe7039fe17892f3d611cfb71eafff1d4d230b19b10779334cc4b63c98bc.png -------------------------------------------------------------------------------- /test/data/doc/constructed_images/image_000001_f3cc103136423a57975750907ebc1d367e2985ac6338976d4d5a439f50323f4a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/docling-project/docling-core/878aafd1b78f7f3d730ccb4bd519650230651498/test/data/doc/constructed_images/image_000001_f3cc103136423a57975750907ebc1d367e2985ac6338976d4d5a439f50323f4a.png -------------------------------------------------------------------------------- /test/data/doc/doc_with_kv.dt: -------------------------------------------------------------------------------- 1 | TO:FROM:8623474Mrs. K. A. SparrowR. G. RyanJUNE7AUG.2OCT.7SUBMISSION DATE:NEWPORT LIGHTS HEAVY UP PROGRESS REPORTEFFECTIVENESS OF DISTRIBUTION ALLOWANCE:DIRECT ACCOUNT/ WHOLESALERS:Distribution allowance was very effective in accomplishing our objectives. 
All accounts have purchased introductory products.DIRECT ACCOUNT CHAINS:Eagle Foods is the only Void.NON- DIRECT ACCOUNT CHAINS:Reception from these accounts is most positive with a solid incentitive to purchase.EFFECTIVENESS OF THE RETAIL (1 00 OFF CARTON) DISTRIBUTION ALLOWANCE:Has been most helpful in acquiring desireable distribution when needed by Sales Reps.PROMOTIONAL ACTIVITY40c OFF PACK- GENERAL MARKET:The 40c off promotions continue to be well received at the retail stores and by consumers, as well. -------------------------------------------------------------------------------- /test/data/doc/doc_with_kv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/docling-project/docling-core/878aafd1b78f7f3d730ccb4bd519650230651498/test/data/doc/doc_with_kv.png -------------------------------------------------------------------------------- /test/data/doc/dummy_doc.yaml.dt: -------------------------------------------------------------------------------- 1 | <loc_42><loc_26><loc_406><loc_46>DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis 2 | CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1Figure 1: Four examples of complex page layouts across different document categories 3 | 4 | 5 | -------------------------------------------------------------------------------- /test/data/doc/dummy_doc.yaml.et: -------------------------------------------------------------------------------- 1 | 0: unspecified with name=_root_ 2 | 1: title 3 | 2: picture 4 | 3: caption 5 | 4: table 6 | -------------------------------------------------------------------------------- /test/data/doc/dummy_doc.yaml.md: -------------------------------------------------------------------------------- 1 | # DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis 2 | 3 | Figure 1: Four examples of complex page layouts across different document categories 4 | 5 | bar chart 6 | 7 | ... 8 | 9 | CC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1 10 | 11 | 12 | 13 | A description annotation for this table. 
14 | -------------------------------------------------------------------------------- /test/data/doc/dummy_doc.yaml.min.dt: -------------------------------------------------------------------------------- 1 | <loc_42><loc_26><loc_406><loc_46>DocLayNet: A Large Human-Annotated Dataset for Document-Layout AnalysisCC1=NNC(C2=CN3C=CN=C3C(CC3=CC(F)=CC(F)=C3)=N2)=N1Figure 1: Four examples of complex page layouts across different document categories 2 | -------------------------------------------------------------------------------- /test/data/doc/misplaced_list_items.out.yaml: -------------------------------------------------------------------------------- 1 | body: 2 | children: 3 | - $ref: '#/groups/1' 4 | - $ref: '#/texts/0' 5 | - $ref: '#/groups/0' 6 | content_layer: body 7 | label: unspecified 8 | name: _root_ 9 | self_ref: '#/body' 10 | form_items: [] 11 | furniture: 12 | children: [] 13 | content_layer: furniture 14 | label: unspecified 15 | name: _root_ 16 | self_ref: '#/furniture' 17 | groups: 18 | - children: 19 | - $ref: '#/texts/1' 20 | - $ref: '#/texts/2' 21 | content_layer: body 22 | label: list 23 | name: group 24 | parent: 25 | $ref: '#/body' 26 | self_ref: '#/groups/0' 27 | - children: 28 | - $ref: '#/texts/3' 29 | content_layer: body 30 | label: ordered_list 31 | name: group 32 | parent: 33 | $ref: '#/body' 34 | self_ref: '#/groups/1' 35 | key_value_items: [] 36 | name: '' 37 | pages: {} 38 | pictures: [] 39 | schema_name: DoclingDocument 40 | tables: [] 41 | texts: 42 | - children: [] 43 | content_layer: body 44 | label: text 45 | orig: bar 46 | parent: 47 | $ref: '#/body' 48 | prov: [] 49 | self_ref: '#/texts/0' 50 | text: bar 51 | - children: [] 52 | content_layer: body 53 | enumerated: false 54 | label: list_item 55 | marker: '-' 56 | orig: here 57 | parent: 58 | $ref: '#/groups/0' 59 | prov: [] 60 | self_ref: '#/texts/1' 61 | text: here 62 | - children: [] 63 | content_layer: body 64 | enumerated: false 65 | label: list_item 66 | marker: '-' 67 | orig: there 68 | parent: 69 | $ref: '#/groups/0' 70 | prov: [] 71 | self_ref: '#/texts/2' 72 | text: there 73 | - children: [] 74 | content_layer: body 75 | enumerated: true 76 | label: list_item 77 | marker: '1.' 78 | orig: foo 79 | parent: 80 | $ref: '#/groups/1' 81 | prov: [] 82 | self_ref: '#/texts/3' 83 | text: foo 84 | version: 1.4.0 85 | -------------------------------------------------------------------------------- /test/data/doc/misplaced_list_items.yaml: -------------------------------------------------------------------------------- 1 | body: 2 | children: 3 | - $ref: '#/texts/0' 4 | - $ref: '#/texts/1' 5 | - $ref: '#/texts/2' 6 | - $ref: '#/texts/3' 7 | content_layer: body 8 | label: unspecified 9 | name: _root_ 10 | self_ref: '#/body' 11 | form_items: [] 12 | furniture: 13 | children: [] 14 | content_layer: furniture 15 | label: unspecified 16 | name: _root_ 17 | self_ref: '#/furniture' 18 | groups: [] 19 | key_value_items: [] 20 | name: '' 21 | pages: {} 22 | pictures: [] 23 | schema_name: DoclingDocument 24 | tables: [] 25 | texts: 26 | - children: [] 27 | content_layer: body 28 | enumerated: true 29 | label: list_item 30 | marker: '1.' 
31 | orig: foo 32 | parent: 33 | $ref: '#/body' 34 | prov: [] 35 | self_ref: '#/texts/0' 36 | text: foo 37 | - children: [] 38 | content_layer: body 39 | label: text 40 | orig: bar 41 | parent: 42 | $ref: '#/body' 43 | prov: [] 44 | self_ref: '#/texts/1' 45 | text: bar 46 | - children: [] 47 | content_layer: body 48 | enumerated: false 49 | label: list_item 50 | marker: '-' 51 | orig: here 52 | parent: 53 | $ref: '#/body' 54 | prov: [] 55 | self_ref: '#/texts/2' 56 | text: here 57 | - children: [] 58 | content_layer: body 59 | enumerated: false 60 | label: list_item 61 | marker: '-' 62 | orig: there 63 | parent: 64 | $ref: '#/body' 65 | prov: [] 66 | self_ref: '#/texts/3' 67 | text: there 68 | version: 1.3.0 69 | -------------------------------------------------------------------------------- /test/data/doc/misplaced_list_items.yaml.dt: -------------------------------------------------------------------------------- 1 | foo 2 | 3 | bar 4 | here 5 | there 6 | 7 | 8 | -------------------------------------------------------------------------------- /test/data/doc/page_with_pic.dt: -------------------------------------------------------------------------------- 1 | Assistant: Optimized Table Tokenization for Table Structure Recognition 2 | 7 3 | Fig. 3. OTSL description of table structure: A - table example; B - graphical representation of table structure; C - mapping structure on a grid; D - OTSL structure encoding; E - explanation on cell encoding 4 | 4.2 Language Syntax 5 | The OTSL representation follows these syntax rules: 6 | 1. Left-looking cell rule: The left neighbour of an "L" cell must be either another "L" cell or a "C" cell. 7 | 2. Up-looking cell rule: The upper neighbour of a "U" cell must be either another "U" cell or a "C" cell. 8 | 3. Cross cell rule: 9 | 10 | The left neighbour of an "X" cell must be either another "X" cell or a "U" cell, and the upper neighbour of an "X" cell must be either another "X" cell or an "L" cell. 11 | 4. First row rule: Only "L" cells and "C" cells are allowed in the first row. 12 | 5. First column rule: Only "U" cells and "C" cells are allowed in the first column. 13 | 6. Rectangular rule: The table representation is always rectangular - all rows must have an equal number of tokens, terminated with "NL" token. 14 | 15 | The application of these rules gives OTSL a set of unique properties. First of all, the OTSL enforces a strictly rectangular structure representation, where every new-line token starts a new row. As a consequence, all rows and all columns have exactly the same number of tokens, irrespective of cell spans. Secondly, the OTSL representation is unambiguous: Every table structure is represented in one way. In this representation every table cell corresponds to a "C"-cell token, which in case of spans is always located in the top-left corner of the table cell definition. Third, OTSL syntax rules are only backward-looking. As a consequence, every predicted token can be validated straight during sequence generation by looking at the previously predicted sequence. As such, OTSL can guarantee that every predicted sequence is syntactically valid. 16 | These characteristics can be easily learned by sequence generator networks, as we demonstrate further below. 
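The six syntax rules quoted above are all backward-looking, so a token stream can be validated in a single left-to-right pass. A minimal sketch of such a checker — not the paper's reference implementation; only the token names `C`, `L`, `U`, `X`, and `NL` are taken from the text:

```python
def otsl_is_valid(tokens: list[str]) -> bool:
    """Single-pass check of the six OTSL syntax rules (illustrative sketch)."""
    if any(t not in {"C", "L", "U", "X", "NL"} for t in tokens):
        return False
    if not tokens or tokens[-1] != "NL":
        return False  # rule 6: every row is terminated with an "NL" token
    rows, row = [], []
    for tok in tokens:
        if tok == "NL":
            rows.append(row)
            row = []
        else:
            row.append(tok)
    width = len(rows[0])
    for i, r in enumerate(rows):
        if len(r) != width:  # rule 6: all rows have equally many tokens
            return False
        for j, cell in enumerate(r):
            left = r[j - 1] if j > 0 else None
            up = rows[i - 1][j] if i > 0 else None
            if i == 0 and cell not in {"L", "C"}:  # rule 4: first row
                return False
            if j == 0 and cell not in {"U", "C"}:  # rule 5: first column
                return False
            if cell == "L" and left not in {"L", "C"}:  # rule 1: left-looking
                return False
            if cell == "U" and up not in {"U", "C"}:  # rule 2: up-looking
                return False
            if cell == "X" and (left not in {"X", "U"} or up not in {"X", "L"}):
                return False  # rule 3: cross cell
    return True


# A 2x2 grid whose top cell spans both columns, and an invalid stream:
assert otsl_is_valid(["C", "L", "NL", "C", "C", "NL"])
assert not otsl_is_valid(["L", "C", "NL"])  # "L" cannot start the first row
```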
We find strong indications that this pattern 17 | -------------------------------------------------------------------------------- /test/data/doc/page_with_pic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/docling-project/docling-core/878aafd1b78f7f3d730ccb4bd519650230651498/test/data/doc/page_with_pic.png -------------------------------------------------------------------------------- /test/data/docling_document/export/formula_mathml.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |
6 |
\frac{1}{x}
7 |
8 | 9 | 10 | -------------------------------------------------------------------------------- /test/data/docling_document/unit/CodeItem.yaml: -------------------------------------------------------------------------------- 1 | children: [] 2 | captions: [] 3 | footnotes: [] 4 | references: [] 5 | image: null 6 | code_language: Python 7 | content_layer: body 8 | label: code 9 | orig: whatever 10 | parent: null 11 | prov: [] 12 | self_ref: '#' 13 | text: print(Hello World!) 14 | formatting: null 15 | hyperlink: null 16 | -------------------------------------------------------------------------------- /test/data/docling_document/unit/FloatingItem.yaml: -------------------------------------------------------------------------------- 1 | captions: [] 2 | children: [] 3 | footnotes: [] 4 | image: null 5 | label: text 6 | parent: null 7 | prov: [] 8 | references: [] 9 | self_ref: '#' 10 | content_layer: body -------------------------------------------------------------------------------- /test/data/docling_document/unit/FormItem.yaml: -------------------------------------------------------------------------------- 1 | captions: [] 2 | children: [] 3 | content_layer: body 4 | footnotes: [] 5 | graph: 6 | cells: 7 | - cell_id: 0 8 | item_ref: null 9 | label: key 10 | orig: '#' 11 | prov: null 12 | text: number 13 | - cell_id: 1 14 | item_ref: null 15 | label: value 16 | orig: '1' 17 | prov: null 18 | text: '1' 19 | links: 20 | - label: to_value 21 | source_cell_id: 0 22 | target_cell_id: 1 23 | - label: to_key 24 | source_cell_id: 1 25 | target_cell_id: 0 26 | image: null 27 | label: form 28 | parent: null 29 | prov: [] 30 | references: [] 31 | self_ref: '#' -------------------------------------------------------------------------------- /test/data/docling_document/unit/FormulaItem.yaml: -------------------------------------------------------------------------------- 1 | children: [] 2 | label: formula 3 | orig: whatever 4 | parent: null 5 | prov: [] 6 | self_ref: '#' 7 | text: E=mc^2 8 | content_layer: body 9 | formatting: null 10 | hyperlink: null 11 | -------------------------------------------------------------------------------- /test/data/docling_document/unit/KeyValueItem.yaml: -------------------------------------------------------------------------------- 1 | captions: [] 2 | children: [] 3 | content_layer: body 4 | footnotes: [] 5 | graph: 6 | cells: 7 | - cell_id: 0 8 | item_ref: null 9 | label: key 10 | orig: '#' 11 | prov: null 12 | text: number 13 | - cell_id: 1 14 | item_ref: null 15 | label: value 16 | orig: '1' 17 | prov: null 18 | text: '1' 19 | links: 20 | - label: to_value 21 | source_cell_id: 0 22 | target_cell_id: 1 23 | - label: to_key 24 | source_cell_id: 1 25 | target_cell_id: 0 26 | image: null 27 | label: key_value_region 28 | parent: null 29 | prov: [] 30 | references: [] 31 | self_ref: '#' -------------------------------------------------------------------------------- /test/data/docling_document/unit/ListItem.yaml: -------------------------------------------------------------------------------- 1 | children: [] 2 | enumerated: true 3 | label: list_item 4 | marker: (1) 5 | orig: whatever 6 | parent: null 7 | prov: [] 8 | self_ref: '#' 9 | text: whatever 10 | content_layer: body 11 | formatting: null 12 | hyperlink: null 13 | -------------------------------------------------------------------------------- /test/data/docling_document/unit/PictureItem.yaml: -------------------------------------------------------------------------------- 1 | annotations: [] 2 | captions: [] 
3 | children: [] 4 | footnotes: [] 5 | image: null 6 | label: picture 7 | parent: null 8 | prov: [] 9 | references: [] 10 | self_ref: '#' 11 | content_layer: body -------------------------------------------------------------------------------- /test/data/docling_document/unit/SectionHeaderItem.yaml: -------------------------------------------------------------------------------- 1 | children: [] 2 | label: section_header 3 | level: 2 4 | orig: whatever 5 | parent: null 6 | prov: [] 7 | self_ref: '#' 8 | text: whatever 9 | content_layer: body 10 | formatting: null 11 | hyperlink: null 12 | -------------------------------------------------------------------------------- /test/data/docling_document/unit/TextItem.yaml: -------------------------------------------------------------------------------- 1 | children: [] 2 | label: text 3 | orig: whatever 4 | parent: null 5 | prov: [] 6 | self_ref: '#' 7 | text: whatever 8 | content_layer: body 9 | formatting: null 10 | hyperlink: null 11 | -------------------------------------------------------------------------------- /test/data/docling_document/unit/TitleItem.yaml: -------------------------------------------------------------------------------- 1 | children: [] 2 | label: title 3 | orig: whatever 4 | parent: null 5 | prov: [] 6 | self_ref: '#' 7 | text: whatever 8 | content_layer: body 9 | formatting: null 10 | hyperlink: null 11 | -------------------------------------------------------------------------------- /test/data/json_schemas/base_identifier.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Identifier", 3 | "description": "Unique identifier of a Docling data object.", 4 | "type": "object", 5 | "properties": { 6 | "type": { 7 | "title": "Type", 8 | "description": "A string representing a collection or database that contains this data object.", 9 | "x-es-type": "keyword", 10 | "x-es-ignore_above": 8191, 11 | "type": "string" 12 | }, 13 | "value": { 14 | "title": "Value", 15 | "description": "The identifier value of the data object within a collection or database.", 16 | "x-es-type": "keyword", 17 | "x-es-ignore_above": 8191, 18 | "type": "string" 19 | }, 20 | "_name": { 21 | "title": "_Name", 22 | "description": "A unique identifier of the data object across Docling, consisting of the concatenation of type and value in lower case, separated by hash (#).", 23 | "x-es-type": "keyword", 24 | "x-es-ignore_above": 8191, 25 | "pattern": "^.+#.+$", 26 | "type": "string" 27 | } 28 | }, 29 | "required": [ 30 | "type", 31 | "value", 32 | "_name" 33 | ], 34 | "additionalProperties": false 35 | } -------------------------------------------------------------------------------- /test/data/json_schemas/base_log.json: -------------------------------------------------------------------------------- 1 | { 2 | "additionalProperties": false, 3 | "description": "Log entry to describe an ETL task on a document.", 4 | "properties": { 5 | "task": { 6 | "anyOf": [ 7 | { 8 | "type": "string" 9 | }, 10 | { 11 | "type": "null" 12 | } 13 | ], 14 | "default": null, 15 | "description": "An identifier of this task. 
It may be used to identify this task from other tasks of the same agent and type.", 16 | "title": "Task", 17 | "x-es-ignore_above": 8191, 18 | "x-es-type": "keyword" 19 | }, 20 | "agent": { 21 | "description": "The Docling agent that performed the task, e.g., CCS or CXS.", 22 | "title": "Agent", 23 | "type": "string", 24 | "x-es-ignore_above": 8191, 25 | "x-es-type": "keyword" 26 | }, 27 | "type": { 28 | "description": "A task category.", 29 | "title": "Type", 30 | "type": "string", 31 | "x-es-ignore_above": 8191, 32 | "x-es-type": "keyword" 33 | }, 34 | "comment": { 35 | "anyOf": [ 36 | { 37 | "type": "string" 38 | }, 39 | { 40 | "type": "null" 41 | } 42 | ], 43 | "default": null, 44 | "description": "A description of the task or any comments in natural language.", 45 | "title": "Comment" 46 | }, 47 | "date": { 48 | "description": "A string representation of the task execution datetime in ISO 8601 format.", 49 | "format": "date-time", 50 | "title": "Date", 51 | "type": "string" 52 | } 53 | }, 54 | "required": [ 55 | "agent", 56 | "type", 57 | "date" 58 | ], 59 | "title": "Log", 60 | "type": "object" 61 | } -------------------------------------------------------------------------------- /test/data/legacy_doc/doc-4.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": { 3 | "publication_date": "2024-07-01T12:00:00.000+00:00", 4 | "languages": [ 5 | "en" 6 | ], 7 | "url_refs": [ 8 | "https://www.link-to-pdf-626144176a8a0616ce8c111ecda4bc30b4a.com/file.pdf" 9 | ], 10 | "title": "Lorem ipsum", 11 | "affiliations": [ 12 | { 13 | "name": "Affiliation Name" 14 | } 15 | ], 16 | "abstract": [ 17 | "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum." 
18 | ], 19 | "authors": [ 20 | { 21 | "name": "Author 1" 22 | } 23 | ], 24 | "publication": [ 25 | { 26 | "identifiers": [ 27 | { 28 | "_name": "collection#12345", 29 | "type": "collection", 30 | "value": "12345" 31 | } 32 | ], 33 | "name": "International Conference", 34 | "type": [ 35 | "conference" 36 | ], 37 | "alternate_names": [ 38 | "Int Conference", 39 | "IC" 40 | ], 41 | "url": "https://en.wikipedia.org/wiki/Lorem_ipsum" 42 | }, 43 | { 44 | "name": "Lorem ipsum", 45 | "pages": "130-189", 46 | "volume": "87" 47 | } 48 | ], 49 | "reference_count": 15, 50 | "citation_count": 3, 51 | "citation_date": "2023-05-23T12:00:00.000+00:00", 52 | "logs": [ 53 | { 54 | "agent": "CXS", 55 | "type": "parsing", 56 | "comment": "parsing of documents", 57 | "date": "2022-11-09T21:22:19.248+00:00" 58 | } 59 | ], 60 | "acquisition": { 61 | "type": "Download", 62 | "date": "2022-11-06T07:13:09.317+00:00", 63 | "link": "https://en.wikipedia.org/wiki/Lorem_ipsum", 64 | "size": 102356 65 | }, 66 | "collection": { 67 | "name": "Sample Collection", 68 | "type": "Document", 69 | "version": "1.2.3", 70 | "alias": [ 71 | "SC" 72 | ] 73 | } 74 | }, 75 | "main-text": [ 76 | { 77 | "name": "title", 78 | "type": "title", 79 | "text": "Lorem ipsum" 80 | }, 81 | { 82 | "name": "text", 83 | "type": "paragraph", 84 | "text": "Author" 85 | }, 86 | { 87 | "name": "abstract", 88 | "type": "paragraph", 89 | "text": "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum." 90 | } 91 | ], 92 | "file-info": { 93 | "document-hash": "9cdad4912f0b81298c96478626144176a8a0616fe8c101ecda4bc30b4a518374", 94 | "filename": "12345", 95 | "filename-prov": "12345.zip" 96 | }, 97 | "identifiers": [ 98 | { 99 | "_name": "collection#12345", 100 | "type": "collection", 101 | "value": "12345" 102 | } 103 | ], 104 | "type": "article", 105 | "_name": "Lorem ipsum" 106 | } -------------------------------------------------------------------------------- /test/data/legacy_doc/doc-5.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": { 3 | "publication_date": "2024-07-01T12:00:00.000+00:00", 4 | "languages": [ 5 | "en" 6 | ], 7 | "url_refs": [ 8 | "https://www.link-to-pdf-626144176a8a0616ce8c111ecda4bc30b4a.com/file.pdf" 9 | ], 10 | "title": "Lorem ipsum", 11 | "affiliations": [ 12 | { 13 | "name": "Affiliation Name" 14 | } 15 | ], 16 | "abstract": [ 17 | "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum." 
18 | ], 19 | "authors": [ 20 | { 21 | "name": "Author 1" 22 | } 23 | ], 24 | "publication": [ 25 | { 26 | "identifiers": [ 27 | { 28 | "_name": "collection#12345", 29 | "type": "collection", 30 | "value": "12345" 31 | } 32 | ], 33 | "name": "International Conference", 34 | "type": [ 35 | "conference" 36 | ], 37 | "alternate_names": [ 38 | "Int Conference", 39 | "IC" 40 | ], 41 | "url": "https://en.wikipedia.org/wiki/Lorem_ipsum" 42 | }, 43 | { 44 | "name": "Lorem ipsum", 45 | "pages": "130-189", 46 | "volume": "87" 47 | } 48 | ], 49 | "reference_count": 15, 50 | "citation_count": 3, 51 | "citation_date": "2023-05-23T12:00:00.000+00:00", 52 | "logs": [ 53 | { 54 | "agent": "CXS", 55 | "type": "parsing", 56 | "comment": "parsing of documents", 57 | "date": "2022-11-09T21:22:19.248+00:00" 58 | } 59 | ], 60 | "collection": { 61 | "name": "Sample Collection", 62 | "type": "Document", 63 | "version": "1.2.3", 64 | "alias": [ 65 | "SC" 66 | ] 67 | } 68 | }, 69 | "file-info": { 70 | "document-hash": "9cdad4912f0b81298c96478626144176a8a0616fe8c101ecda4bc30b4a518374", 71 | "filename": "12345", 72 | "filename-prov": "12345.zip" 73 | }, 74 | "identifiers": [ 75 | { 76 | "_name": "collection#12345", 77 | "type": "collection", 78 | "value": "12345" 79 | } 80 | ], 81 | "type": "article", 82 | "_name": "Lorem ipsum" 83 | } -------------------------------------------------------------------------------- /test/data/legacy_doc/doc-8.json_table_0.dt.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | Letter from Our Chairman and CEO5 4 | Our ESG Goals7 5 | Accountability for ESG at IBM9 6 | Human Rights at IBM10 7 |
8 | -------------------------------------------------------------------------------- /test/data/legacy_doc/error-3.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": { 3 | "publication_date": "2024-07-01T12:00:00.000+00:00", 4 | "languages": [ 5 | "en" 6 | ], 7 | "url_refs": [ 8 | "https://www.link-to-pdf-626144176a8a0616ce8c111ecda4bc30b4a.com/file.pdf" 9 | ], 10 | "title": "Lorem ipsum", 11 | "affiliations": [ 12 | { 13 | "name": "Affiliation Name" 14 | } 15 | ], 16 | "abstract": [ 17 | "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum." 18 | ], 19 | "authors": [ 20 | { 21 | "name": "Author 1" 22 | } 23 | ], 24 | "publication": [ 25 | { 26 | "id": "publication-id-12345", 27 | "name": "International Conference", 28 | "type": [ 29 | "conference" 30 | ], 31 | "alternate_names": [ 32 | "Int Conference", 33 | "IC" 34 | ], 35 | "url": "https://en.wikipedia.org/wiki/Lorem_ipsum" 36 | }, 37 | { 38 | "name": "Lorem ipsum", 39 | "pages": "130-189", 40 | "volume": "87" 41 | } 42 | ], 43 | "reference_count": -1, 44 | "citation_count": "3", 45 | "logs": [ 46 | { 47 | "agent": "CXS", 48 | "type": "parsing", 49 | "comment": "parsing of documents", 50 | "date": "2022-11-09T21:22:19.248+00:00" 51 | } 52 | ], 53 | "collection": { 54 | "name": "Sample Collection", 55 | "type": "Document", 56 | "version": "1.2.3", 57 | "alias": [ 58 | "SC" 59 | ] 60 | } 61 | }, 62 | "main-text": [ 63 | { 64 | "name": "title", 65 | "type": "title", 66 | "text": "Lorem ipsum" 67 | }, 68 | { 69 | "name": "text", 70 | "type": "paragraph", 71 | "text": "Author" 72 | }, 73 | { 74 | "name": "abstract", 75 | "type": "paragraph", 76 | "text": "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum." 
77 | } 78 | ], 79 | "file-info": { 80 | "document-hash": "9cdad4912f0b81298c96478626144176a8a0616fe8c101ecda4bc30b4a518374", 81 | "filename": "12345", 82 | "filename-prov": "12345.zip" 83 | }, 84 | "identifiers": [ 85 | { 86 | "_name": "collection#12345", 87 | "type": "collection", 88 | "value": "12345" 89 | } 90 | ], 91 | "type": "article", 92 | "_name": "Lorem ipsum" 93 | } -------------------------------------------------------------------------------- /test/data/legacy_doc/intermediates/ann.01.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": [], 3 | "predictions": [ 4 | { 5 | "cells": [ 6 | { 7 | "id": 0, 8 | "rawcell_id": 0, 9 | "label": "List-identifier" 10 | }, 11 | { 12 | "id": 1, 13 | "rawcell_id": 1, 14 | "label": "List-item" 15 | }, 16 | { 17 | "id": 2, 18 | "rawcell_id": 2, 19 | "label": "List-identifier" 20 | }, 21 | { 22 | "id": 3, 23 | "rawcell_id": 3, 24 | "label": "List-item" 25 | }, 26 | { 27 | "id": 4, 28 | "rawcell_id": 4, 29 | "label": "Footnote" 30 | }, 31 | { 32 | "id": 5, 33 | "rawcell_id": 5, 34 | "label": "Footnote" 35 | }, 36 | { 37 | "id": 6, 38 | "rawcell_id": 6, 39 | "label": "Footnote" 40 | } 41 | ], 42 | "clusters": [ 43 | { 44 | "model": "RRF-image", 45 | "type": "Picture", 46 | "bbox": [ 47 | 72.0, 48 | 366.100006, 49 | 612.0, 50 | 720.099976 51 | ], 52 | "cell_ids": [], 53 | "merged": false, 54 | "id": 0 55 | } 56 | ], 57 | "tables": [], 58 | "source": { 59 | "type": "model", 60 | "info": { 61 | "display_name": "Applied predictions of collection model", 62 | "model_name": "CollectionModel", 63 | "model_class": "models.interactive", 64 | "model_version": "20171024-16:04", 65 | "model_id": "a423918e-77b9-025d-a06e-56a02f2z4f3b" 66 | }, 67 | "timestamp": 1549956870.877 68 | } 69 | } 70 | ], 71 | "reports": [] 72 | } -------------------------------------------------------------------------------- /test/data/legacy_doc/intermediates/pdf.meta.01.json: -------------------------------------------------------------------------------- 1 | { 2 | "_id": "5bd03fdcdeff5a006862ee70", 3 | "file-info": { 4 | "filename": "test.pdf", 5 | "page-no": 5, 6 | "#-pages": 10, 7 | "document-hash": "a91d9bd6083c5adf1738589e12569f4f1e04f895aaa9a92d03e8a52137753fa5", 8 | "page-hash": "05956039dc5ea674f57cce469a3e86365c1047df9821b9ec55d5e16dbd4e9dcd", 9 | "description": {} 10 | }, 11 | "_parse-status": "SUCCESS", 12 | "lastModified": 1540374535.437 13 | } -------------------------------------------------------------------------------- /test/data/legacy_doc/intermediates/publication_journal.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Journal of Environment", 3 | "pages": "314-391", 4 | "issue": "3", 5 | "volume": "125", 6 | "type": ["JournalArticle"], 7 | "url": "https://www.ibm.com" 8 | } -------------------------------------------------------------------------------- /test/data/legacy_doc/intermediates/publication_venue.json: -------------------------------------------------------------------------------- 1 | { 2 | "identifiers": [ 3 | { 4 | "_name": "venue#12345", 5 | "type": "venue", 6 | "value": "12345" 7 | } 8 | ], 9 | "name": "International Conference on Technology", 10 | "type": [ 11 | "conference" 12 | ], 13 | "alternate_names": [ 14 | "ICoT", 15 | "Random Conference on Technology" 16 | ], 17 | "url": "http://www.ibm.com" 18 | } -------------------------------------------------------------------------------- 
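The identifiers in these intermediates follow the `base_identifier.json` schema shown earlier: `_name` must be the lower-cased concatenation of `type` and `value`, separated by `#` (pattern `^.+#.+$`). A small sketch of building a conforming identifier — `make_identifier` is a hypothetical helper, not part of docling-core:

```python
def make_identifier(type_: str, value: str) -> dict:
    # Hypothetical helper: _name is "type#value" in lower case, per the
    # base_identifier.json schema quoted above.
    return {"type": type_, "value": value, "_name": f"{type_}#{value}".lower()}


ident = make_identifier("venue", "12345")
assert ident["_name"] == "venue#12345"  # matches publication_venue.json above
```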
/test/data/legacy_doc/intermediates/raw.meta.01.json: -------------------------------------------------------------------------------- 1 | { 2 | "file-info" : 3 | { 4 | "#-pages" : 10, 5 | "description" : 6 | { 7 | }, 8 | "document-hash" : "a91d9bd6083c5adf1738589e12569f4f1e04f895aaa9a92d03e8a52137753fa5", 9 | "filename" : "ocr.pdf", 10 | "page-hash" : "05956039dc5ea674f57cce469a3e86365c1047df9821b9ec55d5e16dbd4e9dcd", 11 | "page-no" : 5 12 | } 13 | } -------------------------------------------------------------------------------- /test/data/nlp/error-qa-1.json: -------------------------------------------------------------------------------- 1 | { 2 | "context": "International Business Machines Corporation (using the trademark IBM), nicknamed Big Blue, is an American multinational technology company headquartered in Armonk, New York and present in over 175 countries. IBM is the largest industrial research organization in the world, with 19 research facilities across a dozen countries, having held the record for most annual U.S. patents generated by a business for 29 consecutive years from 1993 to 2021.", 3 | "question": null, 4 | "answer": "Armonk, New York.", 5 | "short_answer": null, 6 | "created": "2024-02-19T12:00:00.000+00:00", 7 | "generated_question": true, 8 | "generated_answer": true, 9 | "model": "model-name/model-task", 10 | "paths": [ 11 | "3c57cc90136eed2007b40835bb88f22fc2e81b81c9ddd8ca25265ae7ea154393#main-text/35" 12 | ], 13 | "advanced": { 14 | "submitter": "Wikipedia" 15 | } 16 | } -------------------------------------------------------------------------------- /test/data/nlp/error-qa-3.json: -------------------------------------------------------------------------------- 1 | { 2 | "context": "International Business Machines Corporation (using the trademark IBM), nicknamed Big Blue, is an American multinational technology company headquartered in Armonk, New York and present in over 175 countries. IBM is the largest industrial research organization in the world, with 19 research facilities across a dozen countries, having held the record for most annual U.S. patents generated by a business for 29 consecutive years from 1993 to 2021.", 3 | "question": "Where is the headquarters of IBM located?", 4 | "answer": "IBM is headquartered in Armonk, New York.", 5 | "short_answer": null, 6 | "created": "2024-02-19T12:00:00.000+00:00", 7 | "generated_question": true, 8 | "generated_answer": true, 9 | "model": "model-name/model-task", 10 | "paths": [ 11 | "3c57cc90136eed2007b40835bb88f22fc2e81b81c9ddd8ca25265ae7ea154393#main-text/35", 12 | "3c57cc90136eed2007b40835bb88f22fc2e81b81c9ddd8ca25265ae7ea154393#main-text/21", 13 | "3c57cc90136eed2007b40835bb88f22fc2e81b81c9ddd8ca25265ae7ea154393#main-text/35" 14 | ], 15 | "advanced": { 16 | "submitter": "Wikipedia" 17 | } 18 | } -------------------------------------------------------------------------------- /test/data/nlp/qa-1.json: -------------------------------------------------------------------------------- 1 | { 2 | "context": "International Business Machines Corporation (using the trademark IBM), nicknamed Big Blue, is an American multinational technology company headquartered in Armonk, New York and present in over 175 countries. IBM is the largest industrial research organization in the world, with 19 research facilities across a dozen countries, having held the record for most annual U.S. 
patents generated by a business for 29 consecutive years from 1993 to 2021.", 3 | "question": "Where is the headquarters of IBM located?", 4 | "answer": "IBM is headquartered in Armonk, New York.", 5 | "short_answer": null, 6 | "created": "2024-02-19T12:00:00.000+00:00", 7 | "generated_question": true, 8 | "generated_answer": true, 9 | "model": "model-name/model-task", 10 | "paths": [ 11 | "3c57cc90136eed2007b40835bb88f22fc2e81b81c9ddd8ca25265ae7ea154393#main-text/35" 12 | ], 13 | "advanced": { 14 | "submitter": "Wikipedia" 15 | } 16 | } -------------------------------------------------------------------------------- /test/data/nlp/qa-2.json: -------------------------------------------------------------------------------- 1 | { 2 | "context": "International Business Machines Corporation (using the trademark IBM), nicknamed Big Blue, is an American multinational technology company headquartered in Armonk, New York and present in over 175 countries. IBM is the largest industrial research organization in the world, with 19 research facilities across a dozen countries, having held the record for most annual U.S. patents generated by a business for 29 consecutive years from 1993 to 2021.", 3 | "question": "Where is the headquarters of IBM located?", 4 | "answer": "IBM is headquartered in Armonk, New York.", 5 | "short_answer": null, 6 | "created": "2024-02-19T12:00:00.000+00:00", 7 | "generated_question": true, 8 | "generated_answer": true, 9 | "model": "model-name/model-task", 10 | "paths": [], 11 | "advanced": { 12 | "submitter": "Wikipedia" 13 | } 14 | } -------------------------------------------------------------------------------- /test/data/nlp/qa-3.json: -------------------------------------------------------------------------------- 1 | { 2 | "context": "International Business Machines Corporation (using the trademark IBM), nicknamed Big Blue, is an American multinational technology company headquartered in Armonk, New York and present in over 175 countries. IBM is the largest industrial research organization in the world, with 19 research facilities across a dozen countries, having held the record for most annual U.S. patents generated by a business for 29 consecutive years from 1993 to 2021.", 3 | "question": "Where is the headquarters of IBM located?", 4 | "answer": "IBM is headquartered in Armonk, New York.", 5 | "short_answer": null, 6 | "created": "2024-02-19T12:00:00.000+00:00", 7 | "generated_question": true, 8 | "generated_answer": true, 9 | "model": "model-name/model-task", 10 | "paths": [], 11 | "advanced": { 12 | "submitter": "Wikipedia" 13 | }, 14 | "labels": { 15 | "scope": "document", 16 | "alignment": "aligned", 17 | "correctness": "entailed", 18 | "completeness": "complete", 19 | "information": "procedure" 20 | } 21 | } -------------------------------------------------------------------------------- /test/data/rec/attribute-01.json: -------------------------------------------------------------------------------- 1 | { 2 | "conf": 1.0, 3 | "prov": [ 4 | { 5 | "type": "sentence", 6 | "text": "FeSe has a Tc of 1K and 5K at 1GP and 5GPa respectivly." 
7 | } 8 | ], 9 | "predicates": [ 10 | { 11 | "key": { 12 | "type": "property", 13 | "name": "Tc" 14 | }, 15 | "value": { 16 | "type": "property-value", 17 | "name": "5K" 18 | } 19 | }, 20 | { 21 | "key": { 22 | "type": "property", 23 | "name": "pressure" 24 | }, 25 | "value": { 26 | "type": "property-value", 27 | "name": "5GPa" 28 | } 29 | } 30 | ] 31 | } -------------------------------------------------------------------------------- /test/data/rec/attribute-02.json: -------------------------------------------------------------------------------- 1 | { 2 | "conf": 0.799, 3 | "prov": [ 4 | { 5 | "type": "sentence", 6 | "text": "provenance in a sentence." 7 | }, 8 | { 9 | "type": "table", 10 | "text": "provenance in a table." 11 | } 12 | ], 13 | "predicates": [ 14 | { 15 | "key": { 16 | "type": "property", 17 | "name": "Tc" 18 | }, 19 | "value": { 20 | "type": "property-value", 21 | "name": "5K" 22 | } 23 | } 24 | ] 25 | } -------------------------------------------------------------------------------- /test/data/rec/attribute-03.json: -------------------------------------------------------------------------------- 1 | { 2 | "conf": 0.799, 3 | "prov": [ 4 | { 5 | "type": "sentence", 6 | "text": "provenance in a sentence." 7 | }, 8 | { 9 | "type": "table", 10 | "text": "provenance in a table." 11 | } 12 | ], 13 | "predicates": [] 14 | } -------------------------------------------------------------------------------- /test/data/rec/error-attribute-01.json: -------------------------------------------------------------------------------- 1 | { 2 | "conf": 1.1, 3 | "prov": [ 4 | { 5 | "type": "sentence", 6 | "text": "FeSe has a Tc of 1K and 5K at 1GP and 5GPa respectivly." 7 | } 8 | ], 9 | "predicates": [ 10 | { 11 | "key": { 12 | "type": "property", 13 | "name": "Tc" 14 | }, 15 | "value": { 16 | "type": "property-value", 17 | "name": "5K" 18 | } 19 | }, 20 | { 21 | "key": { 22 | "type": "property", 23 | "name": "pressure" 24 | }, 25 | "value": { 26 | "type": "property-value", 27 | "name": "5GPa" 28 | } 29 | } 30 | ] 31 | } -------------------------------------------------------------------------------- /test/data/rec/error-attribute-02.json: -------------------------------------------------------------------------------- 1 | { 2 | "conf": 1.0, 3 | "prov": [ 4 | { 5 | "type": "sentence", 6 | "text": "FeSe has a Tc of 1K and 5K at 1GP and 5GPa respectivly." 
7 | } 8 | ] 9 | } -------------------------------------------------------------------------------- /test/data/rec/error-predicate-01.json: -------------------------------------------------------------------------------- 1 | { 2 | "key": { 3 | "type": "property", 4 | "name": "geopoint" 5 | }, 6 | "value": { 7 | "type": "property-value", 8 | "name": "91.203494,-73.7238702" 9 | }, 10 | "geopoint_value": { 11 | "value": [ 12 | -73.7238702, 13 | 91.203494 14 | ], 15 | "conf": 0.902875 16 | } 17 | } -------------------------------------------------------------------------------- /test/data/rec/error-predicate-02.json: -------------------------------------------------------------------------------- 1 | { 2 | "key": { 3 | "type": "property", 4 | "name": "geopoint" 5 | }, 6 | "value": { 7 | "type": "property-value", 8 | "name": "41.1096169,-73.7238702" 9 | }, 10 | "geopoint_value": { 11 | "value": [ 12 | -73.7238702, 13 | 41.1096169 14 | ], 15 | "conf": 2.902875 16 | } 17 | } -------------------------------------------------------------------------------- /test/data/rec/predicate-01.json: -------------------------------------------------------------------------------- 1 | { 2 | "key": { 3 | "type": "property", 4 | "name": "geopoint" 5 | }, 6 | "value": { 7 | "type": "property-value", 8 | "name": "41.1096169,-73.7238702" 9 | }, 10 | "geopoint_value": { 11 | "value": [ 12 | -73.7238702, 13 | 41.1096169 14 | ], 15 | "conf": 0.902875 16 | } 17 | } -------------------------------------------------------------------------------- /test/data/rec/predicate-02.json: -------------------------------------------------------------------------------- 1 | { 2 | "key": { 3 | "type": "property", 4 | "name": "legal entity creation date" 5 | }, 6 | "value": { 7 | "type": "property-value", 8 | "name": "2012-11-29T00:00:00.000Z" 9 | }, 10 | "datetime_value": { 11 | "value": "2012-11-29T00:00:00.000Z" 12 | } 13 | } -------------------------------------------------------------------------------- /test/data/rec/record-01.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name": "some text here", 3 | "file-info": { 4 | "filename": "filename.pdf", 5 | "document-hash": "qwertyuiop1234567890" 6 | }, 7 | "description": { 8 | "logs": [] 9 | }, 10 | "conf": 1.0, 11 | "prov": [ 12 | { 13 | "type": "sentence", 14 | "text": "FeSe has a Tc of 1K and 5K at 1GP and 5GPa respectivily." 15 | } 16 | ], 17 | "identifiers": [ 18 | { 19 | "_name": "db#1234567", 20 | "type": "db", 21 | "value": "1234567" 22 | } 23 | ], 24 | "subject": { 25 | "display_name": "FeSe", 26 | "type": "material", 27 | "names": [ 28 | { 29 | "type": "chemical_name", 30 | "value": "FeSe", 31 | "_name": "chemical_name#fese" 32 | }, 33 | { 34 | "type": "sum_formula", 35 | "value": "Fe(1) Se(1)", 36 | "_name": "sum_formula#fe(1) se(1)" 37 | } 38 | ], 39 | "identifiers": [ 40 | { 41 | "_name": "db#1234567", 42 | "type": "db", 43 | "value": "1234567" 44 | } 45 | ] 46 | }, 47 | "attributes": [ 48 | { 49 | "predicates": [ 50 | { 51 | "key": { 52 | "type": "property", 53 | "name": "Tc" 54 | }, 55 | "value": { 56 | "type": "property-value", 57 | "name": "5K" 58 | } 59 | }, 60 | { 61 | "key": { 62 | "type": "property", 63 | "name": "pressure" 64 | }, 65 | "value": { 66 | "type": "property-value", 67 | "name": "5GPa" 68 | } 69 | } 70 | ], 71 | "conf": 1.0, 72 | "prov": [ 73 | { 74 | "type": "sentence", 75 | "text": "FeSe has a Tc of 1K and 5K at 1GP and 5GPa respectivily." 
76 | } 77 | ] 78 | } 79 | ] 80 | } -------------------------------------------------------------------------------- /test/data/rec/record-02.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name": "some text here", 3 | "file-info": { 4 | "filename": "filename.pdf", 5 | "document-hash": "qwertyuiop1234567890" 6 | }, 7 | "description": { 8 | "logs": [] 9 | }, 10 | "conf": 1.0, 11 | "prov": [ 12 | { 13 | "type": "sentence", 14 | "text": "FeSe has a Tc of 1K and 5K at 1GP and 5GPa respectivily." 15 | } 16 | ], 17 | "identifiers": [ 18 | { 19 | "_name": "db#1234567", 20 | "type": "db", 21 | "value": "1234567" 22 | } 23 | ], 24 | "extra": "Extra field temporarlly allowed", 25 | "subject": { 26 | "display_name": "FeSe", 27 | "type": "material", 28 | "names": [ 29 | { 30 | "type": "chemical_name", 31 | "value": "FeSe", 32 | "_name": "chemical_name#fese" 33 | }, 34 | { 35 | "type": "sum_formula", 36 | "value": "Fe(1) Se(1)", 37 | "_name": "sum_formula#fe(1) se(1)" 38 | } 39 | ], 40 | "identifiers": [ 41 | { 42 | "_name": "db#1234567", 43 | "type": "db", 44 | "value": "1234567" 45 | } 46 | ] 47 | }, 48 | "attributes": [ 49 | { 50 | "predicates": [ 51 | { 52 | "key": { 53 | "type": "property", 54 | "name": "Tc" 55 | }, 56 | "value": { 57 | "type": "property-value", 58 | "name": "5K" 59 | } 60 | }, 61 | { 62 | "key": { 63 | "type": "property", 64 | "name": "pressure" 65 | }, 66 | "value": { 67 | "type": "property-value", 68 | "name": "5GPa" 69 | } 70 | } 71 | ], 72 | "conf": 1.0, 73 | "prov": [ 74 | { 75 | "type": "sentence", 76 | "text": "FeSe has a Tc of 1K and 5K at 1GP and 5GPa respectivily." 77 | } 78 | ] 79 | } 80 | ] 81 | } -------------------------------------------------------------------------------- /test/data/rec/record-03.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name": "some text here", 3 | "file-info": { 4 | "filename": "filename.pdf", 5 | "document-hash": "qwertyuiop1234567890" 6 | }, 7 | "description": { 8 | "logs": [] 9 | }, 10 | "conf": 1.0, 11 | "prov": [ 12 | { 13 | "type": "sentence", 14 | "text": "FeSe has a Tc of 1K and 5K at 1GP and 5GPa respectivily." 
15 | } 16 | ], 17 | "identifiers": [ 18 | { 19 | "_name": "db#1234567", 20 | "type": "db", 21 | "value": "1234567" 22 | } 23 | ], 24 | "subject": { 25 | "display_name": "FeSe", 26 | "type": "material", 27 | "names": [ 28 | { 29 | "type": "chemical_name", 30 | "value": "FeSe", 31 | "_name": "chemical_name#fese" 32 | }, 33 | { 34 | "type": "sum_formula", 35 | "value": "Fe(1) Se(1)", 36 | "_name": "sum_formula#fe(1) se(1)" 37 | } 38 | ], 39 | "identifiers": [ 40 | { 41 | "_name": "db#1234567", 42 | "type": "db", 43 | "value": "1234567" 44 | } 45 | ] 46 | } 47 | } -------------------------------------------------------------------------------- /test/data/rec/record-04.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name": "some text here", 3 | "file-info": { 4 | "filename": "filename.pdf", 5 | "document-hash": "qwertyuiop1234567890" 6 | }, 7 | "description": { 8 | "logs": [ 9 | { 10 | "date": "2023-03-01T19:32:20.000000Z", 11 | "agent": "CXS", 12 | "type": "parsing" 13 | } 14 | ], 15 | "collection": { 16 | "name": "DB", 17 | "type": "Record", 18 | "version": "3.2.0", 19 | "alias": [ 20 | "db" 21 | ] 22 | }, 23 | "publication_date": "2023-03-01T18:32:20.416449Z" 24 | }, 25 | "conf": 1.0, 26 | "prov": [ 27 | { 28 | "type": "sentence", 29 | "text": "FeSe has a Tc of 1K and 5K at 1GP and 5GPa respectivily." 30 | } 31 | ], 32 | "identifiers": [ 33 | { 34 | "_name": "db#1234567", 35 | "type": "db", 36 | "value": "1234567" 37 | } 38 | ], 39 | "subject": { 40 | "display_name": "FeSe", 41 | "type": "material", 42 | "names": [ 43 | { 44 | "type": "chemical_name", 45 | "value": "FeSe", 46 | "_name": "chemical_name#fese" 47 | }, 48 | { 49 | "type": "sum_formula", 50 | "value": "Fe(1) Se(1)", 51 | "_name": "sum_formula#fe(1) se(1)" 52 | } 53 | ], 54 | "identifiers": [ 55 | { 56 | "_name": "db#1234567", 57 | "type": "db", 58 | "value": "1234567" 59 | } 60 | ] 61 | }, 62 | "attributes": [ 63 | { 64 | "predicates": [ 65 | { 66 | "key": { 67 | "type": "property", 68 | "name": "temperature" 69 | }, 70 | "value": { 71 | "type": "property-value", 72 | "name": "5K" 73 | } 74 | }, 75 | { 76 | "key": { 77 | "type": "property", 78 | "name": "pressure" 79 | }, 80 | "value": { 81 | "type": "property-value", 82 | "name": "5GPa" 83 | } 84 | } 85 | ], 86 | "conf": 1.0, 87 | "prov": [ 88 | { 89 | "type": "sentence", 90 | "text": "FeSe has a Tc of 1K and 5K at 1GP and 5GPa respectivily." 
91 | } 92 | ] 93 | } 94 | ] 95 | } -------------------------------------------------------------------------------- /test/data/rec/record-05.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name": "some text here", 3 | "file-info": { 4 | "filename": "filename.pdf", 5 | "document-hash": "qwertyuiop1234567890" 6 | }, 7 | "description": { 8 | "logs": [], 9 | "collection": { 10 | "name": "DB", 11 | "type": "Record", 12 | "version": "3.2.0", 13 | "alias": [ 14 | "db" 15 | ] 16 | }, 17 | "publication_date": "2023-03-01T18:32:20.416449Z" 18 | }, 19 | "conf": 1.0, 20 | "prov": [], 21 | "identifiers": [ 22 | { 23 | "_name": "db#1234567", 24 | "type": "db", 25 | "value": "1234567" 26 | } 27 | ], 28 | "subject": { 29 | "display_name": "FeSe", 30 | "display_image": { 31 | "__ref_s3_data": "#/_s3_data/pdf_pages/0" 32 | }, 33 | "type": "material", 34 | "names": [ 35 | { 36 | "type": "chemical_name", 37 | "value": "FeSe", 38 | "_name": "chemical_name#fese" 39 | } 40 | ], 41 | "identifiers": [ 42 | { 43 | "_name": "db#1234567", 44 | "type": "db", 45 | "value": "1234567" 46 | } 47 | ] 48 | }, 49 | "attributes": [], 50 | "_s3_data": { 51 | "pdf_pages": [ 52 | { 53 | "mime": "application/png", 54 | "path": "PDFImages/3d201262771eb38591c1112c0ad52bfdc7ef5a352.png", 55 | "page": 9, 56 | "url": "https://s3.somecosurl.com/3d201262771eb38591c1112c0ad52bfdc7ef5a352" 57 | } 58 | ] 59 | } 60 | } -------------------------------------------------------------------------------- /test/data/rec/record-gleif-01.json: -------------------------------------------------------------------------------- 1 | { 2 | "_name": "some text here", 3 | "file-info": { 4 | "filename": "VGRQXHF3J8VDLUA7XE92", 5 | "document-hash": "f54e9703c17364ea904d2a07d3e2739ed7c7de0f86b2800ed25bdeea5871eb88", 6 | "filename-prov": "20221013-0800-gleif-goldencopy-lei2-golden-copy.csv.zip" 7 | }, 8 | "description": { 9 | "publication_date": "2021-06-05T18:00:00.000Z", 10 | "logs": [ 11 | { 12 | "date": "2022-09-16T09:36:35.741+00:00", 13 | "agent": "CXS", 14 | "comment": "gleif parsing", 15 | "type": "parsing" 16 | } 17 | ] 18 | }, 19 | "conf": 1.0, 20 | "prov": [ 21 | { 22 | "type": "database", 23 | "text": "Gleif golden copy" 24 | } 25 | ], 26 | "identifiers": [ 27 | { 28 | "type": "some_type", 29 | "value": "my_id", 30 | "_name": "some_type#my_id" 31 | } 32 | ], 33 | "subject": { 34 | "display_name": "INTERNATIONAL BUSINESS MACHINES CORPORATION", 35 | "type": "company", 36 | "names": [ 37 | { 38 | "type": "company", 39 | "value": "INTERNATIONAL BUSINESS MACHINES CORPORATION", 40 | "_name": "company#international business machines corporation" 41 | } 42 | ], 43 | "identifiers": [ 44 | { 45 | "type": "ticker", 46 | "value": "IBM:NYSE", 47 | "_name": "ticker#ibm:nyse" 48 | } 49 | ] 50 | }, 51 | "attributes": [ 52 | { 53 | "predicates": [ 54 | { 55 | "key": { 56 | "type": "property", 57 | "name": "legal address country" 58 | }, 59 | "value": { 60 | "type": "property-value", 61 | "name": "US" 62 | } 63 | }, 64 | { 65 | "key": { 66 | "type": "property", 67 | "name": "legal address city" 68 | }, 69 | "value": { 70 | "type": "property-value", 71 | "name": "Armonk" 72 | } 73 | } 74 | ], 75 | "conf": 1.0, 76 | "prov": [ 77 | { 78 | "type": "database record", 79 | "text": "VGRQXHF3J8VDLUA7XE92" 80 | } 81 | ] 82 | } 83 | ] 84 | } -------------------------------------------------------------------------------- /test/data/rec/statement-01.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "conf": 0.25, 3 | "prov": [ 4 | { 5 | "type": "sentence", 6 | "text": "FeSe has a Tc of 1K and 5K at 1GP and 5GPa respectivily." 7 | } 8 | ], 9 | "subject": { 10 | "display_name": "FeSe", 11 | "type": "material", 12 | "names": [ 13 | { 14 | "type": "material", 15 | "value": "FeSe", 16 | "_name": "material#fese" 17 | } 18 | ], 19 | "identifiers": [ 20 | { 21 | "type": "material", 22 | "value": "Fe(1) Se(1)", 23 | "_name": "material#fe(1) se(1)" 24 | } 25 | ] 26 | }, 27 | "predicates": [ 28 | { 29 | "key": { 30 | "type": "property", 31 | "name": "Tc" 32 | }, 33 | "value": { 34 | "type": "property-value", 35 | "name": "5K" 36 | } 37 | }, 38 | { 39 | "key": { 40 | "type": "property", 41 | "name": "pressure" 42 | }, 43 | "value": { 44 | "type": "property-value", 45 | "name": "5GPa" 46 | } 47 | } 48 | ] 49 | } -------------------------------------------------------------------------------- /test/data/rec/statement-02.json: -------------------------------------------------------------------------------- 1 | { 2 | "subject": { 3 | "names": [ 4 | { 5 | "_name": "chemical_name#bi2sr2cacu2o8", 6 | "type": "chemical_name", 7 | "value": "Bi2Sr2CaCu2O8" 8 | } 9 | ], 10 | "identifiers": [ 11 | { 12 | "_name": "ent_id#b94dls9d", 13 | "type": "ent_id", 14 | "value": "b94dls9d" 15 | }, 16 | { 17 | "_name": "sum_formula#bi(2) ca(1) cu(2) o(8) sr(2)", 18 | "type": "sum_formula", 19 | "value": "Bi(2) Ca(1) Cu(2) O(8) Sr(2)" 20 | } 21 | ], 22 | "display_name": "Bi(2) Ca(1) Cu(2) O(8) Sr(2)", 23 | "type": "material" 24 | }, 25 | "conf": 1.0, 26 | "predicates": [ 27 | { 28 | "numerical_value": { 29 | "val": 0.23, 30 | "unit": "dimensionless", 31 | "min": 0.11, 32 | "err": 0.05, 33 | "max": 0.35 34 | }, 35 | "numerical_value_si": { 36 | "val": 0.23, 37 | "unit": "dimensionless", 38 | "min": 0.11, 39 | "err": 0.05, 40 | "max": 0.35 41 | }, 42 | "value": { 43 | "name": "0.11 to 0.35", 44 | "type": "property-value" 45 | }, 46 | "key": { 47 | "name": "hole concentration", 48 | "type": "property" 49 | } 50 | } 51 | ], 52 | "type": "statement", 53 | "subtype": "mat_to_prop_to_pvls", 54 | "model": "Docling Model 0.0.0", 55 | "source": "sentence.3", 56 | "match": "89f0d4058c2483678b2cc4f515acf463", 57 | "range": [ 58 | 430, 59 | 668 60 | ], 61 | "prov": [ 62 | { 63 | "text": "Here is a sentence with measurements with high-Tc superconductor, on Bi2Sr2CaCu2O8 samples with different Tc values (hole concentration of 0.11 to 0.35).", 64 | "type": "sentence", 65 | "reference": { 66 | "_name": "arxivid#0706.0214", 67 | "type": "arxivid", 68 | "value": "0706.0214" 69 | } 70 | } 71 | ], 72 | "_name": "sentence.3", 73 | "identifiers": [ 74 | { 75 | "_name": "ent_id#b94dls9d", 76 | "type": "ent_id", 77 | "value": "b94dls9d" 78 | }, 79 | { 80 | "_name": "sum_formula#bi(2) ca(1) cu(2) o(8) sr(2)", 81 | "type": "sum_formula", 82 | "value": "Bi(2) Ca(1) Cu(2) O(8) Sr(2)" 83 | } 84 | ], 85 | "file-info": { 86 | "document-hash": "ff94fd78199fe714f2bf6143ab4af8379a581587fd7049aa1a62167196f8f07e", 87 | "filename": "db-file.pdf", 88 | "filename-prov": "db-archive.zip" 89 | }, 90 | "description": { 91 | "provenance": { 92 | "source": "arXiv abstracts" 93 | }, 94 | "publication_date": "2007-06-01T20:36:32.000+00:00", 95 | "logs": [ 96 | { 97 | "agent": "CXS", 98 | "type": "parsing", 99 | "comment": "statement extraction", 100 | "date": "2022-12-15T13:24:51.778+00:00" 101 | } 102 | ] 103 | } 104 | } 
-------------------------------------------------------------------------------- /test/data/rec/statement-gleif-01.json: -------------------------------------------------------------------------------- 1 | { 2 | "conf": 0.723, 3 | "prov": [ 4 | { 5 | "type": "sentence", 6 | "reference": { 7 | "_name": "report#nyse:ibm-2022q2", 8 | "type": "report", 9 | "value": "NYSE:IBM-2022Q2" 10 | }, 11 | "path": "#/main-text/30", 12 | "span": [ 13 | 23, 14 | 67 15 | ], 16 | "text": "IBM Q2 2022 revenue reached $15.5 billion in the period ending June 30" 17 | } 18 | ], 19 | "subject": { 20 | "display_name": "IBM", 21 | "type": "company", 22 | "names": [ 23 | { 24 | "type": "company", 25 | "value": "IBM", 26 | "_name": "company#ibm" 27 | } 28 | ], 29 | "identifiers": [ 30 | { 31 | "type": "ticker", 32 | "value": "IBM:NYSE", 33 | "_name": "ticker#ibm:nyse" 34 | } 35 | ] 36 | }, 37 | "predicates": [ 38 | { 39 | "key": { 40 | "type": "property", 41 | "name": "kpi" 42 | }, 43 | "value": { 44 | "type": "property-value", 45 | "name": "$15.5 billion" 46 | } 47 | }, 48 | { 49 | "key": { 50 | "type": "property", 51 | "name": "date" 52 | }, 53 | "value": { 54 | "type": "property-value", 55 | "name": "Q2 2022" 56 | } 57 | }, 58 | { 59 | "key": { 60 | "type": "property", 61 | "name": "reporting_period" 62 | }, 63 | "value": { 64 | "type": "property-value", 65 | "name": "ending June 30" 66 | } 67 | } 68 | ] 69 | } -------------------------------------------------------------------------------- /test/data/rec/subject-01.json: -------------------------------------------------------------------------------- 1 | { 2 | "display_name": "FeSe", 3 | "type": "material", 4 | "names": [ 5 | { 6 | "type": "chemical_name", 7 | "value": "FeSe", 8 | "_name": "chemical_name#fese" 9 | }, 10 | { 11 | "type": "sum_formula", 12 | "value": "Fe(1) Se(1)", 13 | "_name": "sum_formula#fe(1) se(1)" 14 | } 15 | ], 16 | "identifiers": [ 17 | { 18 | "_name": "db#1234567", 19 | "type": "db", 20 | "value": "1234567" 21 | } 22 | ] 23 | } -------------------------------------------------------------------------------- /test/data/rec/subject-02.json: -------------------------------------------------------------------------------- 1 | { 2 | "display_name": "FeSe", 3 | "display_image": { 4 | "__ref_s3_data": "#/s3_data/figures/0" 5 | }, 6 | "type": "material", 7 | "names": [ 8 | { 9 | "type": "chemical_name", 10 | "value": "FeSe", 11 | "_name": "chemical_name#fese" 12 | } 13 | ], 14 | "identifiers": [ 15 | { 16 | "_name": "db#1234567", 17 | "type": "db", 18 | "value": "1234567" 19 | } 20 | ] 21 | } -------------------------------------------------------------------------------- /test/data/search/error-meta-01.json: -------------------------------------------------------------------------------- 1 | { 2 | "aliases": [ 3 | ".production", 4 | "arxiv" 5 | ], 6 | "created": "2022-08-15T14:10:32.768+00:00", 7 | "description": "arXiv® is a curated research-sharing platform open to anyone. 
It stores scholarly articles in the fields of physics, mathematics, computer science, quantitative biology, quantitative finance, statistics, electrical engineering and systems science, and economics.", 8 | "source": "https://arxiv.org", 9 | "storage": "crn:v1:bluemix:public:cloud-object-storage:global:a/a01ec3a49b79bf6abe91ea574f0f4715:fbb68e6e-2371-48e0-bfe3-4960a445df21:bucket:foc-deepsearch-arxiv-data", 10 | "display_name": "arXiv full documents", 11 | "type": "Reference", 12 | "classification": [ 13 | "Public", 14 | "PI" 15 | ], 16 | "license": "https://arxiv.org/about", 17 | "filename": "arxiv-gs.json", 18 | "domain": [ 19 | "Science", 20 | "Literature" 21 | ], 22 | "$ref": "ccs:schemas#/Document", 23 | "ccs_s3_data": { 24 | "endpoint": "s3.eu-de.cloud-object-storage.appdomain.cloud", 25 | "paths": [ 26 | { 27 | "bucket": "foc-deepsearch-s3-elastic", 28 | "prefix": "deepsearch-elastic-dataplatform", 29 | "infix": "cxs-8a1925c96f7b49508855ab270d0f3281" 30 | } 31 | ] 32 | } 33 | } -------------------------------------------------------------------------------- /test/data/search/error-meta-02.json: -------------------------------------------------------------------------------- 1 | { 2 | "aliases": [ 3 | ".production", 4 | "arxiv" 5 | ], 6 | "created": "2022-08-15T14:10:32.768+00:00", 7 | "description": "arXiv® is a curated research-sharing platform open to anyone. It stores scholarly articles in the fields of physics, mathematics, computer science, quantitative biology, quantitative finance, statistics, electrical engineering and systems science, and economics.", 8 | "source": "https://arxiv.org", 9 | "storage": "crn:v1:bluemix:public:cloud-object-storage:global:a/a01ec3a49b79bf6abe91ea574f0f4715:fbb68e6e-2371-48e0-bfe3-4960a445df21:bucket:foc-deepsearch-arxiv-data", 10 | "display_name": "arXiv full documents", 11 | "type": "Document", 12 | "classification": [ 13 | "Public", 14 | "PI" 15 | ], 16 | "version": [ 17 | { 18 | "name": "docling-core", 19 | "version": "beta" 20 | } 21 | ], 22 | "license": "https://arxiv.org/about", 23 | "filename": "arxiv-gs.json", 24 | "domain": [ 25 | "Science", 26 | "Banking & Finance" 27 | ], 28 | "$ref": "cps:schemas#/Record", 29 | "ccs_s3_data": { 30 | "endpoint": "s3.eu-de.cloud-object-storage.appdomain.cloud", 31 | "paths": [ 32 | { 33 | "bucket": "foc-deepsearch-s3-elastic", 34 | "prefix": "deepsearch-elastic-dataplatform", 35 | "infix": "cxs-8a1925c96f7b49508855ab270d0f3281" 36 | } 37 | ] 38 | } 39 | } -------------------------------------------------------------------------------- /test/data/search/error-meta-03.json: -------------------------------------------------------------------------------- 1 | { 2 | "created": "2022-08-15T14:10:32.768+00:00", 3 | "version": [ 4 | { 5 | "name": "docling-core", 6 | "version": "1.0.0" 7 | } 8 | ], 9 | "type": "Record", 10 | "$ref": "cps:schemas#/Record", 11 | "extra": "an extra field" 12 | } -------------------------------------------------------------------------------- /test/data/search/meta-01.json: -------------------------------------------------------------------------------- 1 | { 2 | "aliases": [ 3 | ".production", 4 | "arxiv" 5 | ], 6 | "created": "2022-08-15T14:10:32.768+00:00", 7 | "description": "arXiv® is a curated research-sharing platform open to anyone. 
It stores scholarly articles in the fields of physics, mathematics, computer science, quantitative biology, quantitative finance, statistics, electrical engineering and systems science, and economics.", 8 | "source": "https://arxiv.org", 9 | "storage": "crn:v1:bluemix:public:cloud-object-storage:global:a/a01ec3a49b79bf6abe91ea574f0f4715:fbb68e6e-2371-48e0-bfe3-4960a445df21:bucket:foc-deepsearch-arxiv-data", 10 | "display_name": "arXiv full documents", 11 | "type": "Document", 12 | "classification": [ 13 | "Public", 14 | "PI" 15 | ], 16 | "version": [ 17 | { 18 | "name": "docling-core", 19 | "version": "1.0.1" 20 | } 21 | ], 22 | "license": "https://arxiv.org/about", 23 | "filename": "arxiv-gs.json", 24 | "domain": [ 25 | "Science" 26 | ], 27 | "$ref": "ccs:schemas#/Document", 28 | "ccs_s3_data": { 29 | "endpoint": "s3.eu-de.cloud-object-storage.appdomain.cloud", 30 | "paths": [ 31 | { 32 | "bucket": "foc-deepsearch-s3-elastic", 33 | "prefix": "deepsearch-elastic-dataplatform", 34 | "infix": "cxs-8a1925c96f7b49508855ab270d0f3281" 35 | } 36 | ] 37 | } 38 | } -------------------------------------------------------------------------------- /test/data/search/meta-02.json: -------------------------------------------------------------------------------- 1 | { 2 | "aliases": [ 3 | ".production", 4 | "patent-uspto" 5 | ], 6 | "created": "2022-10-09T20:57:06.860+00:00", 7 | "description": "Patents from the US Patent and Trade Office (USPTO)", 8 | "source": "https://www.uspto.gov/", 9 | "storage": "crn:v1:bluemix:public:cloud-object-storage:global:a/a01ec3a49b79bf6abe91ea574f0f4715:fbb68e6e-2371-48e0-bfe3-4960a445df21:bucket:foc-deepsearch-uspto-data", 10 | "display_name": "Patents from USPTO", 11 | "type": "Document", 12 | "classification": [ 13 | "Public", 14 | "PI" 15 | ], 16 | "version": [ 17 | { 18 | "name": "docling-core", 19 | "version": "1.0.1" 20 | }, 21 | { 22 | "name": "deepsearch-cxs", 23 | "version": "1.0.0-alpha0.valid" 24 | } 25 | ], 26 | "license": "https://www.uspto.gov/terms-use-uspto-websites", 27 | "domain": [ 28 | "Science", 29 | "Technology" 30 | ], 31 | "$ref": "ccs:schemas#/Document" 32 | } -------------------------------------------------------------------------------- /test/data/search/meta-03.json: -------------------------------------------------------------------------------- 1 | { 2 | "aliases": [ 3 | ".production", 4 | "osm" 5 | ], 6 | "created": "2022-10-09T20:57:06.860+00:00", 7 | "description": "OpenStreetMap data", 8 | "source": "https://www.openstreetmap.org", 9 | "storage": "crn:v1:bluemix:public:cloud-object-storage:global:a/a01ec3a49b79bf6abe91ea574f0f4715:fbb68e6e-2371-48e0-bfe3-4960a445df21:bucket:foc-deepsearch-osm-data", 10 | "display_name": "OpenStreetMap", 11 | "type": "Generic", 12 | "classification": [ 13 | "Public" 14 | ], 15 | "version": [ 16 | { 17 | "name": "docling-core", 18 | "version": "1.0.1" 19 | }, 20 | { 21 | "name": "deepsearch-cxs", 22 | "version": "1.0.0-alpha0.valid" 23 | } 24 | ], 25 | "license": "https://www.openstreetmap.org/copyright", 26 | "domain": [ 27 | "Geography" 28 | ] 29 | } -------------------------------------------------------------------------------- /test/data/search/meta-04.json: -------------------------------------------------------------------------------- 1 | { 2 | "license": "", 3 | "index_key": "07a32fab8e3cf827a1d6691fd530941485282dd7", 4 | "created": "2023-11-28T15:10:08.226942+00:00", 5 | "project_key": "9cd28d76ca3d0cc853edb23976f32b44f5739839", 6 | "description": "", 7 | "source": "", 8 | "display_name": "", 9
| "type": "Document", 10 | "version": [ 11 | { 12 | "name": "docling-core", 13 | "version": "1.4.0" 14 | } 15 | ], 16 | "$ref": "ccs:schemas#/Document" 17 | } -------------------------------------------------------------------------------- /test/data/viz/2408.09869v3_enriched.dt_viz_p2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/docling-project/docling-core/878aafd1b78f7f3d730ccb4bd519650230651498/test/data/viz/2408.09869v3_enriched.dt_viz_p2.png -------------------------------------------------------------------------------- /test/data/viz/2408.09869v3_enriched_viz_p1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/docling-project/docling-core/878aafd1b78f7f3d730ccb4bd519650230651498/test/data/viz/2408.09869v3_enriched_viz_p1.png -------------------------------------------------------------------------------- /test/data/viz/2408.09869v3_enriched_viz_p2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/docling-project/docling-core/878aafd1b78f7f3d730ccb4bd519650230651498/test/data/viz/2408.09869v3_enriched_viz_p2.png -------------------------------------------------------------------------------- /test/data/viz/2408.09869v3_enriched_viz_p3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/docling-project/docling-core/878aafd1b78f7f3d730ccb4bd519650230651498/test/data/viz/2408.09869v3_enriched_viz_p3.png -------------------------------------------------------------------------------- /test/data/viz/2408.09869v3_enriched_viz_wout_lbl_p1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/docling-project/docling-core/878aafd1b78f7f3d730ccb4bd519650230651498/test/data/viz/2408.09869v3_enriched_viz_wout_lbl_p1.png -------------------------------------------------------------------------------- /test/data/viz/2408.09869v3_enriched_viz_wout_lbl_p2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/docling-project/docling-core/878aafd1b78f7f3d730ccb4bd519650230651498/test/data/viz/2408.09869v3_enriched_viz_wout_lbl_p2.png -------------------------------------------------------------------------------- /test/data/viz/2408.09869v3_enriched_viz_wout_lbl_p3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/docling-project/docling-core/878aafd1b78f7f3d730ccb4bd519650230651498/test/data/viz/2408.09869v3_enriched_viz_wout_lbl_p3.png -------------------------------------------------------------------------------- /test/test_data_gen_flag.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from pydantic import TypeAdapter 4 | 5 | GEN_TEST_DATA = TypeAdapter(bool).validate_python(os.getenv("DOCLING_GEN_TEST_DATA", 0)) 6 | 7 | 8 | def test_gen_test_data_flag(): 9 | assert not GEN_TEST_DATA 10 | -------------------------------------------------------------------------------- /test/test_doc_base.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 
2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | 7 | import pytest 8 | from pydantic import ValidationError 9 | 10 | from docling_core.types.legacy_doc.base import Prov, S3Reference 11 | 12 | 13 | def test_s3_reference(): 14 | """Validate data with the S3Reference model.""" 15 | gold_dict = {"__ref_s3_data": "#/s3_data/figures/0"} 16 | data = S3Reference(__ref_s3_data="#/s3_data/figures/0") 17 | 18 | assert data.model_dump() == gold_dict 19 | assert data.model_dump(by_alias=True) == gold_dict 20 | 21 | with pytest.raises(ValidationError, match="required"): 22 | S3Reference() 23 | 24 | 25 | def test_prov(): 26 | prov = { 27 | "bbox": [ 28 | 48.19645328521729, 29 | 644.2883926391602, 30 | 563.6185592651367, 31 | 737.4546043395997, 32 | ], 33 | "page": 2, 34 | "span": [0, 0], 35 | } 36 | 37 | assert Prov(**prov) 38 | 39 | with pytest.raises(ValidationError, match="valid integer"): 40 | prov["span"] = ["foo", 0] 41 | Prov(**prov) 42 | 43 | with pytest.raises(ValidationError, match="at least 2 items"): 44 | prov["span"] = [0] 45 | Prov(**prov) 46 | -------------------------------------------------------------------------------- /test/test_doc_legacy_convert.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import yaml 4 | 5 | from docling_core.types.doc import DoclingDocument 6 | from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument 7 | from docling_core.utils.legacy import ( 8 | docling_document_to_legacy, 9 | legacy_to_docling_document, 10 | ) 11 | 12 | GENERATE = False 13 | 14 | 15 | def test_new_to_old(): 16 | filename = "test/data/doc/2206.01062.yaml" 17 | 18 | with open(filename, "r", encoding="utf-8") as fp: 19 | dict_from_yaml = yaml.safe_load(fp) 20 | 21 | doc = DoclingDocument.model_validate(dict_from_yaml) 22 | 23 | docling_document_to_legacy(doc) 24 | 25 | 26 | def test_old_to_new(): 27 | filepath = Path("test/data/legacy_doc/doc-export.json") 28 | leg_doc = DsDocument.model_validate_json(filepath.read_text()) 29 | 30 | doc = legacy_to_docling_document(leg_doc) 31 | 32 | gt_filepath = Path(filepath.with_suffix(".docling.yaml.gt")) 33 | if GENERATE: 34 | doc.save_as_yaml(gt_filepath) 35 | 36 | with gt_filepath.open() as gt_fp: 37 | gt_dict = yaml.safe_load(gt_fp) 38 | gt_doc = DoclingDocument.model_validate(gt_dict) 39 | 40 | assert doc == gt_doc 41 | -------------------------------------------------------------------------------- /test/test_doc_schema_extractor.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 
2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Test the pydantic models in module docling_core.types.legacy_doc.document.""" 7 | import json 8 | 9 | from pydantic import ValidationError 10 | 11 | from docling_core.types.legacy_doc.document import CCSDocument 12 | 13 | 14 | def test_ccs_document_update(): 15 | """Validate data with CCSDocument extract.""" 16 | filename = "test/data/legacy_doc/ext-1.json" 17 | try: 18 | with open(filename, encoding="utf-8") as f: 19 | raw_doc = json.load(f) 20 | for item in raw_doc["main-text"]: 21 | if "$ref" in item: 22 | assert False, f"$ref should not be in file {filename}" 23 | 24 | doc = CCSDocument.model_validate(raw_doc) 25 | 26 | if doc.description.abstract: 27 | assert False, "Abstract should not be present" 28 | 29 | except ValidationError as e: 30 | print(f"Validation error in file {filename}:\n{e.json()}") 31 | raise 32 | -------------------------------------------------------------------------------- /test/test_hierarchical_chunker.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | import json 7 | 8 | from docling_core.transforms.chunker import HierarchicalChunker 9 | from docling_core.transforms.chunker.hierarchical_chunker import ( 10 | ChunkingDocSerializer, 11 | ChunkingSerializerProvider, 12 | DocChunk, 13 | ) 14 | from docling_core.transforms.serializer.markdown import MarkdownTableSerializer 15 | from docling_core.types.doc import DoclingDocument as DLDocument 16 | from docling_core.types.doc.document import DoclingDocument 17 | 18 | from .test_data_gen_flag import GEN_TEST_DATA 19 | 20 | 21 | def _process(act_data, exp_path_str): 22 | if GEN_TEST_DATA: 23 | with open(exp_path_str, mode="w", encoding="utf-8") as f: 24 | json.dump(act_data, fp=f, indent=4) 25 | f.write("\n") 26 | else: 27 | with open(exp_path_str, encoding="utf-8") as f: 28 | exp_data = json.load(fp=f) 29 | assert exp_data == act_data 30 | 31 | 32 | def test_chunk(): 33 | with open("test/data/chunker/0_inp_dl_doc.json", encoding="utf-8") as f: 34 | data_json = f.read() 35 | dl_doc = DLDocument.model_validate_json(data_json) 36 | chunker = HierarchicalChunker( 37 | merge_list_items=True, 38 | ) 39 | chunks = chunker.chunk(dl_doc=dl_doc) 40 | act_data = dict( 41 | root=[DocChunk.model_validate(n).export_json_dict() for n in chunks] 42 | ) 43 | _process( 44 | act_data=act_data, 45 | exp_path_str="test/data/chunker/0_out_chunks.json", 46 | ) 47 | 48 | 49 | def test_chunk_custom_serializer(): 50 | with open("test/data/chunker/0_inp_dl_doc.json", encoding="utf-8") as f: 51 | data_json = f.read() 52 | dl_doc = DLDocument.model_validate_json(data_json) 53 | 54 | class MySerializerProvider(ChunkingSerializerProvider): 55 | def get_serializer(self, doc: DoclingDocument): 56 | return ChunkingDocSerializer( 57 | doc=doc, 58 | table_serializer=MarkdownTableSerializer(), 59 | ) 60 | 61 | chunker = HierarchicalChunker( 62 | merge_list_items=True, 63 | serializer_provider=MySerializerProvider(), 64 | ) 65 | 66 | chunks = chunker.chunk(dl_doc=dl_doc) 67 | act_data = dict( 68 | root=[DocChunk.model_validate(n).export_json_dict() for n in chunks] 69 | ) 70 | _process( 71 | act_data=act_data, 72 | exp_path_str="test/data/chunker/0b_out_chunks.json", 73 | ) 74 | -------------------------------------------------------------------------------- /test/test_nlp_qa.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 
2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Test the pydantic models in module data_types.nlp.qa.py""" 7 | import glob 8 | import unittest 9 | 10 | import pytest 11 | from pydantic import ValidationError 12 | 13 | from docling_core.types.nlp.qa import QAPair 14 | 15 | 16 | class TestQAPair(unittest.TestCase): 17 | """Test QAPair model.""" 18 | 19 | def test_qapair_read(self): 20 | """Validate data read from files.""" 21 | for filename in glob.glob("test/data/nlp/qa-*.json"): 22 | try: 23 | with open(filename, encoding="utf-8") as file_obj: 24 | file_json = file_obj.read() 25 | QAPair.model_validate_json(file_json) 26 | except ValidationError as e: 27 | print(f"Validation error in file {filename}", e.json()) 28 | raise 29 | 30 | def test_qapair_wrong(self): 31 | """Validates wrong format from files.""" 32 | filename = "test/data/nlp/error-qa-1.json" 33 | with ( 34 | pytest.raises(ValidationError, match="Input should be a valid string"), 35 | open(filename, encoding="utf-8") as file_obj, 36 | ): 37 | file_json = file_obj.read() 38 | QAPair.model_validate_json(file_json) 39 | 40 | filename = "test/data/nlp/error-qa-3.json" 41 | with ( 42 | pytest.raises(ValidationError, match="List must be unique"), 43 | open(filename, encoding="utf-8") as file_obj, 44 | ): 45 | file_json = file_obj.read() 46 | QAPair.model_validate_json(file_json) 47 | -------------------------------------------------------------------------------- /test/test_page.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import numpy as np 4 | import pytest 5 | 6 | from docling_core.types.doc.page import BoundingRectangle 7 | 8 | SQRT_2 = math.sqrt(2) 9 | 10 | R_0 = BoundingRectangle(r_x0=0, r_y0=0, r_x1=1, r_y1=0, r_x2=1, r_y2=1, r_x3=0, r_y3=1) 11 | R_45 = BoundingRectangle( 12 | r_x0=0, 13 | r_y0=0, 14 | r_x1=SQRT_2 / 2, 15 | r_y1=SQRT_2 / 2, 16 | r_x2=0, 17 | r_y2=SQRT_2, 18 | r_x3=-SQRT_2 / 2, 19 | r_y3=SQRT_2 / 2, 20 | ) 21 | R_90 = BoundingRectangle( 22 | r_x0=0, r_y0=0, r_x1=0, r_y1=1, r_x2=-1, r_y2=1, r_x3=-1, r_y3=0 23 | ) 24 | R_135 = BoundingRectangle( 25 | r_x0=0, 26 | r_y0=0, 27 | r_x1=-SQRT_2 / 2, 28 | r_y1=SQRT_2 / 2, 29 | r_x2=-SQRT_2, 30 | r_y2=0, 31 | r_x3=-SQRT_2 / 2, 32 | r_y3=-SQRT_2 / 2, 33 | ) 34 | R_180 = BoundingRectangle( 35 | r_x0=0, r_y0=0, r_x1=-0, r_y1=0, r_x2=-1, r_y2=-1, r_x3=0, r_y3=-1 36 | ) 37 | R_MINUS_135 = BoundingRectangle( 38 | r_x0=0, 39 | r_y0=0, 40 | r_x1=-SQRT_2 / 2, 41 | r_y1=-SQRT_2 / 2, 42 | r_x2=0, 43 | r_y2=-SQRT_2, 44 | r_x3=SQRT_2 / 2, 45 | r_y3=-SQRT_2 / 2, 46 | ) 47 | R_MINUS_90 = BoundingRectangle( 48 | r_x0=0, r_y0=0, r_x1=0, r_y1=-1, r_x2=1, r_y2=-1, r_x3=1, r_y3=0 49 | ) 50 | R_MINUS_45 = BoundingRectangle( 51 | r_x0=0, 52 | r_y0=0, 53 | r_x1=SQRT_2 / 2, 54 | r_y1=-SQRT_2 / 2, 55 | r_x2=SQRT_2, 56 | r_y2=0, 57 | r_x3=SQRT_2 / 2, 58 | r_y3=SQRT_2 / 2, 59 | ) 60 | 61 | 62 | @pytest.mark.parametrize( 63 | ("rectangle", "expected_angle", "expected_angle_360"), 64 | [ 65 | (R_0, 0, 0.0), 66 | (R_45, np.pi / 4, 45), 67 | (R_90, np.pi / 2, 90), 68 | (R_135, 3 * np.pi / 4, 135), 69 | (R_180, np.pi, 180), 70 | (R_MINUS_135, 5 * np.pi / 4, 225), 71 | (R_MINUS_90, 3 * np.pi / 2, 270), 72 | (R_MINUS_45, 7 * np.pi / 4, 315), 73 | ], 74 | ) 75 | def test_bounding_rectangle_angle( 76 | rectangle: BoundingRectangle, expected_angle: float, expected_angle_360: int 77 | ): 78 | assert pytest.approx(rectangle.angle, abs=1e-6) == expected_angle 79 | assert pytest.approx(rectangle.angle_360, abs=1e-6) == 
expected_angle_360 80 | -------------------------------------------------------------------------------- /test/test_search_meta.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Test the pydantic models in module search.metadata.py.""" 7 | import glob 8 | import os 9 | from typing import Literal 10 | 11 | from pydantic import ValidationError 12 | 13 | from docling_core.search.meta import Meta 14 | 15 | 16 | def test_meta(): 17 | """Validate data with Meta schema.""" 18 | taxonomy = Literal["Public", "PI"] 19 | domain = Literal[ 20 | "Science", "Technology", "History", "Art", "Literature", "Geography" 21 | ] 22 | 23 | for filename in glob.glob("test/data/search/meta-*.json"): 24 | try: 25 | with open(filename, encoding="utf-8") as file_obj: 26 | file_json = file_obj.read() 27 | Meta[taxonomy, domain].model_validate_json(file_json) 28 | except ValidationError as e: 29 | print(f"Validation error in file {filename}", e.json()) 30 | raise 31 | 32 | # test invalid documents 33 | gold_errors = { 34 | "error-meta-01.json": ["type", "version"], 35 | "error-meta-02.json": ["version", "domain", "$ref"], 36 | "error-meta-03.json": ["source", "extra"], 37 | } 38 | 39 | for filename in glob.glob("test/data/search/error-meta-*.json"): 40 | gold = gold_errors[os.path.basename(filename)] 41 | try: 42 | with open(filename, encoding="utf-8") as file_obj: 43 | file_json = file_obj.read() 44 | Meta[taxonomy, domain].model_validate_json(file_json) 45 | assert False, f"File {filename} should be an invalid metadata" 46 | except ValidationError as e: 47 | errors = e.errors() 48 | assert len(errors) == len(gold), f"Wrong number of errors in {filename}" 49 | assert all(errors[zdx]["loc"][0] == gold[zdx] for zdx in range(len(errors))) 50 | -------------------------------------------------------------------------------- /test/test_utils.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright IBM Corp. 
2024 - 2024 3 | # SPDX-License-Identifier: MIT 4 | # 5 | 6 | """Test the pydantic models in package utils.""" 7 | import json 8 | 9 | from pydantic import Field 10 | from requests import Response 11 | 12 | from docling_core.utils.alias import AliasModel 13 | from docling_core.utils.file import resolve_source_to_path, resolve_source_to_stream 14 | 15 | 16 | def test_alias_model(): 17 | """Test the functionality of AliasModel.""" 18 | 19 | class AliasModelChild(AliasModel): 20 | foo: str = Field(alias="boo") 21 | 22 | data = {"foo": "lorem ipsum"} 23 | data_alias = {"boo": "lorem ipsum"} 24 | 25 | # data validated from dict, JSON, and constructor can use field names or aliases 26 | 27 | AliasModelChild.model_validate(data_alias) 28 | AliasModelChild.model_validate(data) 29 | 30 | AliasModelChild.model_validate_json(json.dumps(data_alias)) 31 | AliasModelChild.model_validate_json(json.dumps(data)) 32 | 33 | AliasModelChild(boo="lorem ipsum") 34 | AliasModelChild(foo="lorem ipsum") 35 | 36 | # child classes will also inherit the populate_by_name setting 37 | 38 | class AliasModelGrandChild(AliasModelChild): 39 | var: int 40 | 41 | AliasModelGrandChild(boo="lorem ipsum", var=3) 42 | AliasModelGrandChild(foo="lorem ipsum", var=3) 43 | 44 | # serialized data will always use aliases 45 | 46 | obj = AliasModelChild.model_validate(data_alias) 47 | assert obj.model_dump() == data_alias 48 | assert obj.model_dump() != data 49 | 50 | assert obj.model_dump_json() == json.dumps(data_alias, separators=(",", ":")) 51 | assert obj.model_dump_json() != json.dumps(data, separators=(",", ":")) 52 | 53 | 54 | def test_resolve_source_to_path_url_wout_path(monkeypatch): 55 | expected_str = "foo" 56 | expected_bytes = bytes(expected_str, "utf-8") 57 | 58 | def get_dummy_response(*args, **kwargs): 59 | r = Response() 60 | r.status_code = 200 61 | r._content = expected_bytes 62 | return r 63 | 64 | monkeypatch.setattr("requests.get", get_dummy_response) 65 | monkeypatch.setattr( 66 | "requests.models.Response.iter_content", 67 | lambda *args, **kwargs: [expected_bytes], 68 | ) 69 | path = resolve_source_to_path("https://pypi.org") 70 | with open(path, encoding="utf-8") as f: 71 | text = f.read() 72 | assert text == expected_str 73 | 74 | 75 | def test_resolve_source_to_stream_url_wout_path(monkeypatch): 76 | expected_str = "foo" 77 | expected_bytes = bytes(expected_str, "utf-8") 78 | 79 | def get_dummy_response(*args, **kwargs): 80 | r = Response() 81 | r.status_code = 200 82 | r._content = expected_bytes 83 | return r 84 | 85 | monkeypatch.setattr("requests.get", get_dummy_response) 86 | monkeypatch.setattr( 87 | "requests.models.Response.iter_content", 88 | lambda *args, **kwargs: [expected_bytes], 89 | ) 90 | doc_stream = resolve_source_to_stream("https://pypi.org") 91 | assert doc_stream.name == "file" 92 | 93 | text = doc_stream.stream.read().decode("utf8") 94 | assert text == expected_str 95 | -------------------------------------------------------------------------------- /test/test_visualization.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import PIL.Image 4 | 5 | from docling_core.types.doc.document import DoclingDocument 6 | 7 | from .test_data_gen_flag import GEN_TEST_DATA 8 | 9 | VIZ_TEST_DATA_PATH = Path("./test/data/viz") 10 | 11 | 12 | def verify(exp_file: Path, actual: PIL.Image.Image): 13 | if GEN_TEST_DATA: 14 | # regenerating: save the reference image directly to the expected path 15 | actual.save(exp_file) 16 | else: 17 | with 
PIL.Image.open(exp_file) as expected: 18 | assert actual == expected 19 | 20 | 21 | def test_doc_visualization(): 22 | src = Path("./test/data/doc/2408.09869v3_enriched.json") 23 | doc = DoclingDocument.load_from_json(src) 24 | viz_pages = doc.get_visualization() 25 | for k in viz_pages: 26 | if k <= 3: 27 | verify( 28 | exp_file=VIZ_TEST_DATA_PATH / f"{src.stem}_viz_p{k}.png", 29 | actual=viz_pages[k], 30 | ) 31 | 32 | 33 | def test_doc_visualization_inline_circumscribed_bbox(): 34 | src = Path("./test/data/doc/2408.09869v3_enriched.dt.json") 35 | doc = DoclingDocument.load_from_json(src) 36 | viz_pages = doc.get_visualization() 37 | for k in viz_pages: 38 | if k == 2: 39 | verify( 40 | exp_file=VIZ_TEST_DATA_PATH / f"{src.stem}_viz_p{k}.png", 41 | actual=viz_pages[k], 42 | ) 43 | 44 | 45 | def test_doc_visualization_no_label(): 46 | src = Path("./test/data/doc/2408.09869v3_enriched.json") 47 | doc = DoclingDocument.load_from_json(src) 48 | viz_pages = doc.get_visualization(show_label=False) 49 | for k in viz_pages: 50 | if k <= 3: 51 | verify( 52 | exp_file=VIZ_TEST_DATA_PATH / f"{src.stem}_viz_wout_lbl_p{k}.png", 53 | actual=viz_pages[k], 54 | ) 55 | --------------------------------------------------------------------------------