├── .github ├── dependabot.yml └── workflows │ ├── publish.yml │ └── test_workflow.yml ├── .gitignore ├── LICENSE.txt ├── Makefile ├── README.md ├── activate_venv ├── examples ├── Basic_usage.ipynb ├── Implicit.ipynb ├── borderless.ipynb ├── data │ ├── borderless.jpg │ ├── borderless │ │ ├── 1.png │ │ ├── 2.png │ │ ├── 3.png │ │ └── 4.png │ ├── implicit.png │ ├── tables.pdf │ ├── tables.png │ └── tables.xlsx └── utils.py ├── pyproject.toml ├── pytest.ini ├── requirements-dev.txt ├── requirements.txt ├── setup.cfg ├── setup.py ├── src └── img2table │ ├── __init__.py │ ├── document │ ├── __init__.py │ ├── base │ │ ├── __init__.py │ │ └── rotation.py │ ├── image.py │ └── pdf.py │ ├── ocr │ ├── __init__.py │ ├── aws_textract.py │ ├── azure.py │ ├── base.py │ ├── data.py │ ├── doctr.py │ ├── easyocr.py │ ├── google_vision.py │ ├── paddle.py │ ├── pdf.py │ ├── surya.py │ └── tesseract.py │ └── tables │ ├── __init__.py │ ├── image.py │ ├── metrics.py │ ├── objects │ ├── __init__.py │ ├── cell.py │ ├── extraction.py │ ├── line.py │ ├── row.py │ └── table.py │ └── processing │ ├── __init__.py │ ├── bordered_tables │ ├── __init__.py │ ├── cells │ │ ├── __init__.py │ │ ├── deduplication.py │ │ └── identification.py │ ├── lines.py │ └── tables │ │ ├── __init__.py │ │ ├── cell_clustering.py │ │ ├── consecutive.py │ │ ├── implicit.py │ │ ├── semi_bordered.py │ │ └── table_creation.py │ ├── borderless_tables │ ├── __init__.py │ ├── columns.py │ ├── layout │ │ ├── __init__.py │ │ ├── column_segments.py │ │ ├── image_elements.py │ │ ├── rlsa.py │ │ └── table_segments.py │ ├── model.py │ ├── rows.py │ ├── table │ │ ├── __init__.py │ │ ├── coherency.py │ │ └── table_creation.py │ └── whitespaces.py │ ├── common.py │ └── text │ ├── __init__.py │ └── titles.py └── tests ├── __init__.py ├── _mock_data ├── azure.pkl ├── surya.pkl ├── tesseract_hocr.html ├── textract.json ├── vision.json └── vision.pkl ├── conftest.py ├── document ├── __init__.py ├── base │ ├── __init__.py │ ├── test_data 
│ │ └── test.png │ └── test_rotation.py ├── image │ ├── __init__.py │ ├── test_data │ │ ├── blank.png │ │ ├── dark.png │ │ ├── expected.xlsx │ │ └── test.png │ └── test_image.py └── pdf │ ├── __init__.py │ ├── test_data │ └── test.pdf │ └── test_pdf.py ├── ocr ├── __init__.py ├── aws_textract │ ├── __init__.py │ ├── test_aws_textract.py │ └── test_data │ │ ├── content.json │ │ ├── ocr_df.csv │ │ └── test.png ├── azure │ ├── __init__.py │ ├── test_azure.py │ └── test_data │ │ ├── ocr_df.csv │ │ └── test.png ├── data │ ├── __init__.py │ ├── test_data │ │ ├── expected_table.json │ │ ├── ocr_df.csv │ │ └── table.json │ └── test_ocr_data.py ├── doctr │ ├── __init__.py │ ├── test_data │ │ ├── ocr.pkl │ │ ├── ocr_df.csv │ │ └── test.png │ └── test_doctr.py ├── easyocr │ ├── __init__.py │ ├── test_data │ │ ├── ocr.json │ │ ├── ocr_df.csv │ │ └── test.png │ └── test_easyocr.py ├── google_vision │ ├── __init__.py │ ├── test_data │ │ ├── expected_content.json │ │ ├── ocr_df.csv │ │ └── test.png │ └── test_google_vision.py ├── paddle │ ├── __init__.py │ ├── test_data │ │ ├── hocr.json │ │ ├── ocr_df.csv │ │ └── test.png │ └── test_paddle.py ├── pdf │ ├── __init__.py │ ├── test_data │ │ ├── content.json │ │ ├── ocr_df.csv │ │ └── test.pdf │ └── test_pdf_ocr.py ├── surya │ ├── __init__.py │ ├── test_data │ │ ├── ocr_df.csv │ │ └── test.png │ └── test_surya.py └── tesseract │ ├── __init__.py │ ├── test_data │ ├── ocr_df.csv │ └── test.png │ └── test_tesseract.py └── tables ├── __init__.py ├── image ├── __init__.py ├── test_data │ ├── blank.png │ ├── ocr.csv │ └── test.png ├── test_image.py └── test_metrics.py ├── objects ├── __init__.py ├── test_data │ ├── expected_tables.json │ ├── ocr.csv │ ├── table.html │ └── tables.json ├── test_extraction.py ├── test_line.py ├── test_row.py └── test_table.py └── processing ├── __init__.py ├── bordered_tables ├── __init__.py ├── cells │ ├── __init__.py │ ├── test_cells.py │ ├── test_data │ │ ├── expected.csv │ │ ├── expected_ident_cells.csv 
│ │ ├── expected_potential_cells.csv │ │ ├── expected_vertical_dedup.csv │ │ └── lines.json │ ├── test_deduplication_cells.py │ └── test_identification_cells.py ├── lines │ ├── __init__.py │ ├── test_data │ │ ├── contours.json │ │ ├── expected.json │ │ └── test.png │ └── test_lines.py └── tables │ ├── __init__.py │ ├── test_cell_clustering.py │ ├── test_data │ ├── cell_clusters_normalized.json │ ├── cells.json │ ├── cells_clustered.json │ ├── contours.json │ ├── contours_implicit.json │ ├── expected.json │ ├── lines.json │ ├── table_implicit.json │ ├── tables_from_cells.json │ └── word_image.png │ ├── test_implicit.py │ ├── test_semi_bordered.py │ ├── test_table_creation.py │ └── test_tables.py ├── borderless_tables ├── __init__.py ├── borderless_tables │ ├── __init__.py │ ├── test_borderless_tables.py │ ├── test_data │ │ ├── contours.json │ │ ├── image_segment.json │ │ ├── lines.json │ │ └── test.png │ └── test_whitespaces.py ├── columns │ ├── __init__.py │ ├── test_columns.py │ └── test_data │ │ ├── delimiter_group.json │ │ └── table_segment.json ├── layout │ ├── __init__.py │ ├── test_column_segments.py │ ├── test_data │ │ ├── elements.json │ │ ├── lines.json │ │ ├── test.bmp │ │ └── text_thresh.bmp │ ├── test_image_elements.py │ ├── test_layout.py │ ├── test_rlsa.py │ └── test_table_segments.py ├── rows │ ├── __init__.py │ ├── test_data │ │ ├── contours.json │ │ ├── delimiter_group.json │ │ ├── h_whitespaces.json │ │ └── rows.json │ └── test_rows.py └── table │ ├── __init__.py │ ├── test_data │ ├── contours.json │ ├── delimiter_group.json │ └── rows.json │ ├── test_table.py │ └── test_table_creation.py ├── common ├── __init__.py ├── test_common.py └── test_data │ └── test.jpg └── text ├── __init__.py ├── test_data ├── ocr.csv ├── table.json └── test.jpg └── test_titles.py /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "pip" 4 | directory: "/" 5 | 
schedule: 6 | interval: "daily" 7 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish to Pypi 2 | 3 | on: 4 | release: 5 | types: [released] 6 | 7 | jobs: 8 | publish: 9 | runs-on: ubuntu-latest 10 | 11 | steps: 12 | - uses: actions/checkout@v3 13 | - name: Set up Python 14 | uses: actions/setup-python@v3 15 | with: 16 | python-version: 3.8 17 | - name: Build package 18 | run: make build 19 | - name: Publish to Pypi 20 | uses: pypa/gh-action-pypi-publish@release/v1 21 | with: 22 | password: ${{ secrets.PYPI_TOKEN }} 23 | 24 | -------------------------------------------------------------------------------- /.github/workflows/test_workflow.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | strategy: 10 | fail-fast: false 11 | matrix: 12 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] 13 | 14 | steps: 15 | - uses: actions/checkout@v3 16 | - name: Set up Python ${{ matrix.python-version }} 17 | uses: actions/setup-python@v3 18 | with: 19 | python-version: ${{ matrix.python-version }} 20 | cache: 'pip' 21 | - name: Install dependencies 22 | run: make venv 23 | - name: Perform tests 24 | run: make test 25 | env: 26 | NUMBA_DISABLE_JIT: 1 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .ipynb_checkpoints 3 | __pycache__ 4 | .pytest_cache 5 | .coverage 6 | *.egg-info 7 | dist 8 | build 9 | 10 | certs 11 | venv 12 | profiling* 13 | examples/testing -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 
MIT License 2 | 3 | Copyright (c) 2023 Xavier Canton 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 6 | copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 7 | 8 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 9 | 10 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 11 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | VENV = ./activate_venv 4 | DIR := $(shell pwd) 5 | export PYTHONPATH := $(DIR)/src 6 | 7 | # Virtual environment commands 8 | venv: 9 | python -m venv ./venv || true 10 | . $(VENV); python -m pip install pip wheel --upgrade; 11 | . $(VENV); python -m pip install -r requirements-dev.txt --extra-index-url https://download.pytorch.org/whl/cpu 12 | 13 | update: 14 | . $(VENV); python -m pip install -U -r requirements-dev.txt --extra-index-url https://download.pytorch.org/whl/cpu 15 | 16 | # Test commands 17 | test: 18 | . $(VENV); pytest --cov-report term --cov=src 19 | 20 | # Examples commands 21 | jupyter-examples: 22 | . 
$(VENV); cd examples && jupyter notebook 23 | 24 | update-examples: 25 | . $(VENV); 26 | for f in $(PWD)/examples/*.ipynb; do \ 27 | jupyter nbconvert --to notebook --execute $$f --inplace; \ 28 | done 29 | 30 | # Build commands 31 | build: venv 32 | . $(VENV); python setup.py sdist bdist_wheel 33 | 34 | 35 | .PHONY: venv -------------------------------------------------------------------------------- /activate_venv: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ "$(uname)" == "Darwin" ]; then 4 | source .venv/bin/activate 5 | elif [ "$(expr substr $(uname -s) 1 5)" == "Linux" ]; then 6 | source .venv/bin/activate 7 | elif [ "$(expr substr $(uname -s) 1 10)" == "MINGW32_NT" ]; then 8 | source venv/Scripts/activate 9 | elif [ "$(expr substr $(uname -s) 1 10)" == "MINGW64_NT" ]; then 10 | source venv/Scripts/activate 11 | fi -------------------------------------------------------------------------------- /examples/data/borderless.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/examples/data/borderless.jpg -------------------------------------------------------------------------------- /examples/data/borderless/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/examples/data/borderless/1.png -------------------------------------------------------------------------------- /examples/data/borderless/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/examples/data/borderless/2.png -------------------------------------------------------------------------------- /examples/data/borderless/3.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/examples/data/borderless/3.png -------------------------------------------------------------------------------- /examples/data/borderless/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/examples/data/borderless/4.png -------------------------------------------------------------------------------- /examples/data/implicit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/examples/data/implicit.png -------------------------------------------------------------------------------- /examples/data/tables.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/examples/data/tables.pdf -------------------------------------------------------------------------------- /examples/data/tables.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/examples/data/tables.png -------------------------------------------------------------------------------- /examples/data/tables.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/examples/data/tables.xlsx -------------------------------------------------------------------------------- /examples/utils.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import cv2 4 | import numpy as np 5 | 
6 | from img2table.document import Image 7 | from img2table.ocr.base import OCRInstance 8 | 9 | 10 | def display_borderless_tables(img: Image, ocr: OCRInstance) -> np.ndarray: 11 | """ 12 | Create display of borderless table extraction 13 | :param img: Image object 14 | :param ocr: OCRInstance object 15 | :return: display image 16 | """ 17 | # Extract tables 18 | extracted_tables = img.extract_tables(ocr=ocr, 19 | borderless_tables=True) 20 | 21 | # Create image displaying extracted tables 22 | display_image = list(img.images)[0].copy() 23 | for tb in extracted_tables: 24 | for row in tb.content.values(): 25 | for cell in row: 26 | cv2.rectangle(display_image, (cell.bbox.x1, cell.bbox.y1), (cell.bbox.x2, cell.bbox.y2), 27 | (255, 0, 0), 2) 28 | 29 | # Create white separator image 30 | width = min(display_image.shape[1] // 10, 100) 31 | white_img = cv2.cvtColor(255 * np.ones((display_image.shape[0], width), dtype=np.uint8), cv2.COLOR_GRAY2RGB) 32 | 33 | # Stack images 34 | final_image = np.hstack([list(img.images)[0].copy(), 35 | white_img, 36 | display_image]) 37 | 38 | return final_image 39 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["pbr>=5.7.0", "setuptools>=42"] 3 | build-backend = "pbr.build" 4 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | minversion = 6.0 3 | pythonpath = . 
src 4 | testpaths = tests 5 | log_level = INFO 6 | python_files = test_*.py 7 | filterwarnings = ignore::UserWarning -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | 3 | # GCP 4 | google-cloud-vision 5 | 6 | # AWS 7 | boto3 8 | 9 | # Azure 10 | azure-cognitiveservices-vision-computervision 11 | 12 | # Paddle 13 | paddlepaddle; python_version < '3.13' 14 | paddleocr>=2.0.6; python_version < '3.13' 15 | 16 | # EasyOCR 17 | easyocr >= 1.7.0 18 | pillow>=10.0.1 19 | 20 | # docTR 21 | python-doctr>=0.8; python_version < '3.12' 22 | 23 | # Surya 24 | surya-ocr>=0.9; python_version >= '3.10' 25 | 26 | # Test dependencies 27 | pytest >= 6 28 | pytest-cov 29 | pytest-xdist 30 | openpyxl 31 | sewar 32 | pipdeptree 33 | pyinstrument 34 | 35 | # Examples dependencies 36 | jupyter 37 | ipython-autotime 38 | Pillow 39 | 40 | # Build tools 41 | wheel 42 | setuptools 43 | pbr 44 | twine -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | polars[pandas]>=1.2 2 | pyarrow>=7 3 | numpy 4 | pypdfium2==4.30.0 5 | opencv-contrib-python>=4 6 | numba 7 | beautifulsoup4 8 | xlsxwriter>=3.0.6 9 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = img2table 3 | author = Xavier Canton 4 | summary = img2table is a table identification and extraction Python Library for PDF and images, based on OpenCV image processing 5 | license = MIT 6 | description_file = README.md 7 | description_content_type = text/markdown 8 | home_page = https://github.com/xavctn/img2table 9 | python_requires = >=3.8, <3.14 10 | classifiers= 11 | Programming Language :: Python :: 3 :: 
Only 12 | License :: OSI Approved :: MIT License 13 | Operating System :: OS Independent 14 | 15 | [options] 16 | package_dir = src/ 17 | packages = img2table 18 | 19 | 20 | [extras] 21 | gcp = 22 | google-cloud-vision 23 | requests 24 | aws = 25 | boto3 26 | azure = 27 | azure-cognitiveservices-vision-computervision 28 | paddle = 29 | paddlepaddle 30 | paddleocr>=2.0.6 31 | easyocr = 32 | easyocr>=1.7.0 33 | pillow>=10.0.1 34 | surya = 35 | surya-ocr>=0.9:python_version>='3.10' 36 | 37 | 38 | [pbr] 39 | skip_authors = True 40 | skip_changelog = True 41 | skip_reno = True 42 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | setup_requires=['pbr'], 5 | package_dir={'': 'src'}, 6 | pbr=True, 7 | ) 8 | -------------------------------------------------------------------------------- /src/img2table/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | 4 | class Validations: 5 | def __post_init__(self): 6 | """Run validation methods if declared. 7 | The validation method can be a simple check 8 | that raises ValueError or a transformation to 9 | the field value. 
10 | The validation is performed by calling a function named: 11 | `validate_(self, value, field) -> field.type` 12 | """ 13 | for name, field in self.__dataclass_fields__.items(): 14 | method = getattr(self, f"validate_{name}", None) 15 | setattr(self, name, method(getattr(self, name), field=field)) 16 | -------------------------------------------------------------------------------- /src/img2table/document/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from img2table.document.image import Image 4 | from img2table.document.pdf import PDF 5 | -------------------------------------------------------------------------------- /src/img2table/document/image.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import typing 3 | from dataclasses import dataclass 4 | from functools import cached_property 5 | from typing import List 6 | 7 | import cv2 8 | import numpy as np 9 | 10 | from img2table.document.base import Document 11 | from img2table.document.base.rotation import fix_rotation_image 12 | from img2table.tables.objects.extraction import ExtractedTable 13 | 14 | if typing.TYPE_CHECKING: 15 | from img2table.ocr.base import OCRInstance 16 | 17 | 18 | @dataclass 19 | class Image(Document): 20 | detect_rotation: bool = False 21 | 22 | def __post_init__(self): 23 | self.pages = None 24 | 25 | super(Image, self).__post_init__() 26 | 27 | @cached_property 28 | def images(self) -> List[np.ndarray]: 29 | img = cv2.imdecode(np.frombuffer(self.bytes, np.uint8), cv2.IMREAD_COLOR) 30 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 31 | if self.detect_rotation: 32 | rotated_img, _ = fix_rotation_image(img=img) 33 | return [rotated_img] 34 | else: 35 | return [img] 36 | 37 | def extract_tables(self, ocr: "OCRInstance" = None, implicit_rows: bool = False, implicit_columns: bool = False, 38 | borderless_tables: bool = False, min_confidence: int = 50) -> 
List[ExtractedTable]: 39 | """ 40 | Extract tables from document 41 | :param ocr: OCRInstance object used to extract table content 42 | :param implicit_rows: boolean indicating if implicit rows are splitted 43 | :param implicit_columns: boolean indicating if implicit columns are splitted 44 | :param borderless_tables: boolean indicating if borderless tables should be detected 45 | :param min_confidence: minimum confidence level from OCR in order to process text, from 0 (worst) to 99 (best) 46 | :return: list of extracted tables 47 | """ 48 | extracted_tables = super(Image, self).extract_tables(ocr=ocr, 49 | implicit_rows=implicit_rows, 50 | implicit_columns=implicit_columns, 51 | borderless_tables=borderless_tables, 52 | min_confidence=min_confidence) 53 | return extracted_tables.get(0) 54 | -------------------------------------------------------------------------------- /src/img2table/document/pdf.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import typing 3 | from dataclasses import dataclass 4 | from typing import Dict, List, Optional 5 | 6 | import cv2 7 | import numpy as np 8 | import pypdfium2 9 | 10 | from img2table.document.base import Document 11 | from img2table.document.base.rotation import fix_rotation_image 12 | from img2table.ocr.pdf import PdfOCR 13 | 14 | if typing.TYPE_CHECKING: 15 | from img2table.ocr.base import OCRInstance 16 | from img2table.tables.objects.extraction import ExtractedTable 17 | from img2table.tables.objects.table import Table 18 | 19 | 20 | @dataclass 21 | class PDF(Document): 22 | pages: List[int] = None 23 | detect_rotation: bool = False 24 | pdf_text_extraction: bool = True 25 | _rotated: bool = False 26 | _images: List[np.ndarray] = None 27 | 28 | def validate_pages(self, value, **_) -> Optional[List[int]]: 29 | if value is not None: 30 | if not isinstance(value, list): 31 | raise TypeError(f"Invalid type {type(value)} for pages argument") 32 | if not 
all(isinstance(x, int) for x in value): 33 | raise TypeError("All values in pages argument should be integers") 34 | return value 35 | 36 | def validate_pdf_text_extraction(self, value, **_) -> int: 37 | if not isinstance(value, bool): 38 | raise TypeError(f"Invalid type {type(value)} for pdf_text_extraction argument") 39 | return value 40 | 41 | def validate__rotated(self, value, **_) -> int: 42 | return value 43 | 44 | def validate__images(self, value, **_) -> int: 45 | return value 46 | 47 | @property 48 | def images(self) -> List[np.ndarray]: 49 | if self._images is not None: 50 | return self._images 51 | 52 | doc = pypdfium2.PdfDocument(input=self.bytes) 53 | 54 | # Get all images 55 | images = list() 56 | for page_number in self.pages or range(len(doc)): 57 | page = doc[page_number] 58 | img = cv2.cvtColor(page.render(scale=200 / 72).to_numpy(), cv2.COLOR_BGR2RGB) 59 | # Handle rotation if needed 60 | if self.detect_rotation: 61 | final, self._rotated = fix_rotation_image(img=img) 62 | else: 63 | final, self._rotated = img, False 64 | images.append(final) 65 | 66 | self._images = images 67 | doc.close() 68 | return images 69 | 70 | def get_table_content(self, tables: Dict[int, List["Table"]], ocr: "OCRInstance", 71 | min_confidence: int) -> Dict[int, List["ExtractedTable"]]: 72 | if not self._rotated and self.pdf_text_extraction: 73 | # Get pages where tables have been detected 74 | table_pages = [self.pages[k] if self.pages else k for k, v in tables.items() if len(v) > 0] 75 | images = [self.images[k] for k, v in tables.items() if len(v) > 0] 76 | 77 | if table_pages: 78 | # Create PDF object for OCR 79 | pdf_ocr = PDF(src=self.bytes, 80 | pages=table_pages, 81 | _images=images, 82 | _rotated=self._rotated) 83 | 84 | # Try to get OCRDataframe from PDF 85 | self.ocr_df = PdfOCR().of(document=pdf_ocr) 86 | 87 | return super(PDF, self).get_table_content(tables=tables, ocr=ocr, min_confidence=min_confidence) 88 | 
-------------------------------------------------------------------------------- /src/img2table/ocr/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from img2table.ocr.aws_textract import TextractOCR 4 | from img2table.ocr.azure import AzureOCR 5 | from img2table.ocr.doctr import DocTR 6 | from img2table.ocr.easyocr import EasyOCR 7 | from img2table.ocr.google_vision import VisionOCR 8 | from img2table.ocr.paddle import PaddleOCR 9 | from img2table.ocr.surya import SuryaOCR 10 | from img2table.ocr.tesseract import TesseractOCR 11 | -------------------------------------------------------------------------------- /src/img2table/ocr/base.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from typing import Any 3 | 4 | import polars as pl 5 | 6 | from img2table.document.base import Document 7 | from img2table.ocr.data import OCRDataframe 8 | 9 | 10 | class OCRInstance: 11 | @property 12 | def pl_schema(self): 13 | schema = { 14 | "page": pl.Int64, 15 | "class": str, 16 | "id": str, 17 | "parent": str, 18 | "value": str, 19 | "confidence": pl.Int64, 20 | "x1": pl.Int64, 21 | "y1": pl.Int64, 22 | "x2": pl.Int64, 23 | "y2": pl.Int64 24 | } 25 | return schema 26 | 27 | def content(self, document: Document) -> Any: 28 | raise NotImplementedError 29 | 30 | def to_ocr_dataframe(self, content: Any) -> OCRDataframe: 31 | raise NotImplementedError 32 | 33 | def of(self, document: Document) -> OCRDataframe: 34 | """ 35 | Extract text from Document to OCRDataframe object 36 | :param document: Document object 37 | :return: OCRDataframe object 38 | """ 39 | # Extract content from document 40 | content = self.content(document=document) 41 | 42 | # Create OCRDataframe from content 43 | return self.to_ocr_dataframe(content=content) 44 | -------------------------------------------------------------------------------- /src/img2table/ocr/doctr.py: 
-------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import typing 4 | 5 | import polars as pl 6 | 7 | from img2table.document.base import Document 8 | from img2table.ocr.base import OCRInstance 9 | from img2table.ocr.data import OCRDataframe 10 | 11 | if typing.TYPE_CHECKING: 12 | import doctr 13 | 14 | 15 | class DocTR(OCRInstance): 16 | """ 17 | DocTR instance 18 | """ 19 | def __init__(self, detect_language: bool = False, kw: typing.Dict = None): 20 | """ 21 | Initialization of EasyOCR instance 22 | """ 23 | try: 24 | from doctr.models import ocr_predictor 25 | except ModuleNotFoundError: 26 | raise ModuleNotFoundError("Missing dependencies, please install doctr to use this class.") 27 | 28 | # Create kwargs dict for constructor 29 | kw = kw or {} 30 | kw["detect_language"] = detect_language 31 | kw["pretrained"] = kw.get("pretrained") if kw.get("pretrained") is not None else True 32 | 33 | self.model = ocr_predictor(**kw) 34 | 35 | def content(self, document: Document) -> "doctr.io.elements.Document": 36 | # Get OCR of all images 37 | ocrs = self.model(document.images) 38 | 39 | return ocrs 40 | 41 | def to_ocr_dataframe(self, content: "doctr.io.elements.Document") -> OCRDataframe: 42 | """ 43 | Convert docTR Document object to OCRDataframe object 44 | :param content: docTR Document object 45 | :return: OCRDataframe object corresponding to content 46 | """ 47 | # Create list of elements 48 | list_elements = list() 49 | 50 | for page_id, page in enumerate(content.pages): 51 | dimensions = page.dimensions 52 | word_id = 0 53 | for block in page.blocks: 54 | for line_id, line in enumerate(block.lines): 55 | for word in line.words: 56 | word_id += 1 57 | dict_word = { 58 | "page": page_id, 59 | "class": "ocrx_word", 60 | "id": f"word_{page_id + 1}_{line_id}_{word_id}", 61 | "parent": f"word_{page_id + 1}_{line_id}", 62 | "value": word.value, 63 | "confidence": round(100 * word.confidence), 64 | "x1": 
round(word.geometry[0][0] * dimensions[1]), 65 | "y1": round(word.geometry[0][1] * dimensions[0]), 66 | "x2": round(word.geometry[1][0] * dimensions[1]), 67 | "y2": round(word.geometry[1][1] * dimensions[0]) 68 | } 69 | 70 | list_elements.append(dict_word) 71 | 72 | return OCRDataframe(df=pl.DataFrame(list_elements)) if list_elements else None 73 | -------------------------------------------------------------------------------- /src/img2table/ocr/easyocr.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from typing import List, Tuple, Dict 4 | 5 | import polars as pl 6 | 7 | from img2table.document.base import Document 8 | from img2table.ocr.base import OCRInstance 9 | from img2table.ocr.data import OCRDataframe 10 | 11 | 12 | class EasyOCR(OCRInstance): 13 | """ 14 | EAsyOCR instance 15 | """ 16 | def __init__(self, lang: List[str] = ['en'], kw: Dict = None): 17 | """ 18 | Initialization of EasyOCR instance 19 | :param lang: lang parameter used in EasyOCR 20 | :param kw: dictionary containing kwargs for EasyOCR constructor 21 | """ 22 | try: 23 | from easyocr import Reader 24 | except ModuleNotFoundError: 25 | raise ModuleNotFoundError("Missing dependencies, please install 'img2table[easyocr]' to use this class.") 26 | 27 | if isinstance(lang, list): 28 | if all([isinstance(lng, str) for lng in lang]): 29 | self.lang = lang 30 | else: 31 | raise TypeError(f"Invalid type {type(lang)} for lang argument") 32 | 33 | # Create kwargs dict for constructor 34 | kw = kw or {} 35 | kw["lang_list"] = self.lang 36 | kw["verbose"] = kw.get("verbose") or False 37 | 38 | self.reader = Reader(**kw) 39 | 40 | def content(self, document: Document) -> List[List[Tuple]]: 41 | # Get OCR of all images 42 | ocrs = [self.reader.readtext(image) for image in document.images] 43 | 44 | return ocrs 45 | 46 | def to_ocr_dataframe(self, content: List[List]) -> OCRDataframe: 47 | """ 48 | Convert hOCR HTML to OCRDataframe object 49 
class PaddleOCR(OCRInstance):
    """
    Paddle-OCR instance
    """
    def __init__(self, lang: str = 'en', kw: Dict = None):
        """
        Initialization of Paddle OCR instance
        :param lang: lang parameter used in Paddle
        :param kw: dictionary containing kwargs for PaddleOCR constructor
        """
        try:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                from paddleocr import PaddleOCR as OCR
        except ModuleNotFoundError:
            raise ModuleNotFoundError("Missing dependencies, please install 'img2table[paddle]' to use this class.")

        if isinstance(lang, str):
            self.lang = lang
        else:
            raise TypeError(f"Invalid type {type(lang)} for lang argument")

        # Create kwargs dict for constructor
        # Copy the provided dict so the caller's argument is not mutated
        kw = dict(kw) if kw else {}
        kw["lang"] = self.lang
        kw["use_angle_cls"] = kw.get("use_angle_cls") or False
        kw["show_log"] = kw.get("show_log") or False

        self.ocr = OCR(**kw)

    def hocr(self, image: np.ndarray) -> List:
        """
        Get OCR of an image using Paddle
        :param image: numpy array representing the image
        :return: Paddle OCR result as a list of [bbox, (text, confidence)] entries
        """
        with NamedTemporaryFile(suffix='.jpg', delete=False) as tmp_f:
            tmp_file = tmp_f.name
            # Write image to temporary file
            cv2.imwrite(tmp_file, image)

        # Get OCR
        ocr_result = self.ocr.ocr(img=tmp_file, cls=False)

        # Remove temporary file with a bounded number of retries: on Windows the
        # file can still be locked right after use (PermissionError). The previous
        # unbounded loop could spin forever; worst case here is a leftover tmp file.
        for _ in range(100):
            try:
                os.remove(tmp_file)
                break
            except FileNotFoundError:
                break
            except PermissionError:
                continue

        # Get result
        ocr_result = ocr_result.pop()
        return [[bbox, (word[0], round(word[1], 2))] for bbox, word in ocr_result] if ocr_result else []

    def content(self, document: Document) -> List[List]:
        """
        Run OCR on every image of the document
        :param document: Document object
        :return: list of per-page Paddle OCR results
        """
        # Get OCR of all images
        ocrs = [self.hocr(image=image) for image in document.images]

        return ocrs

    def to_ocr_dataframe(self, content: List[List]) -> OCRDataframe:
        """
        Convert Paddle OCR results to OCRDataframe object
        :param content: list of per-page Paddle OCR results
        :return: OCRDataframe object corresponding to content, or None if no text was detected
        """
        # Create list of elements
        list_elements = list()

        for page, ocr_result in enumerate(content):
            word_id = 0
            for bbox, word in ocr_result:
                word_id += 1
                # Bounding box is a polygon: take the enclosing axis-aligned rectangle
                dict_word = {
                    "page": page,
                    "class": "ocrx_word",
                    "id": f"word_{page + 1}_{word_id}",
                    "parent": f"word_{page + 1}_{word_id}",
                    "value": word[0],
                    "confidence": 100 * word[1],
                    "x1": round(min([edge[0] for edge in bbox])),
                    "y1": round(min([edge[1] for edge in bbox])),
                    "x2": round(max([edge[0] for edge in bbox])),
                    "y2": round(max([edge[1] for edge in bbox]))
                }

                list_elements.append(dict_word)

        return OCRDataframe(df=pl.DataFrame(list_elements, schema=self.pl_schema)) if list_elements else None
round(min([edge[1] for edge in bbox])), 100 | "x2": round(max([edge[0] for edge in bbox])), 101 | "y2": round(max([edge[1] for edge in bbox])) 102 | } 103 | 104 | list_elements.append(dict_word) 105 | 106 | return OCRDataframe(df=pl.DataFrame(list_elements, schema=self.pl_schema)) if list_elements else None 107 | -------------------------------------------------------------------------------- /src/img2table/ocr/surya.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import typing 4 | 5 | import polars as pl 6 | from PIL import Image 7 | 8 | from img2table.document.base import Document 9 | from img2table.ocr.base import OCRInstance 10 | from img2table.ocr.data import OCRDataframe 11 | 12 | if typing.TYPE_CHECKING: 13 | import surya 14 | 15 | 16 | class SuryaOCR(OCRInstance): 17 | """ 18 | DocTR instance 19 | """ 20 | def __init__(self, langs: typing.List[str] = None): 21 | """ 22 | Initialization of EasyOCR instance 23 | """ 24 | try: 25 | from surya.recognition import RecognitionPredictor 26 | from surya.detection import DetectionPredictor 27 | 28 | except ModuleNotFoundError: 29 | raise ModuleNotFoundError("Missing dependencies, please install 'img2table[surya]' to use this class.") 30 | 31 | if isinstance(langs, list): 32 | if all([isinstance(lng, str) for lng in langs]): 33 | self.langs = langs or ["en"] 34 | else: 35 | raise TypeError(f"All values should be strings for langs argument") 36 | else: 37 | raise TypeError(f"Invalid type {type(langs)} for langs argument") 38 | 39 | # Initialize model 40 | self.det_predictor = DetectionPredictor() 41 | self.rec_predictor = RecognitionPredictor() 42 | 43 | def content(self, document: Document) -> typing.List["surya.recognition.schema.OCRResult"]: 44 | # Get OCR of all images 45 | ocrs = self.rec_predictor(images=[Image.fromarray(img) for img in document.images], 46 | langs=[self.langs], 47 | det_predictor=self.det_predictor) 48 | 49 | return ocrs 50 | 51 | 
    def to_ocr_dataframe(self, content: typing.List["surya.recognition.schema.OCRResult"]) -> OCRDataframe:
        """
        Convert list of surya OCRResult objects to OCRDataframe object
        :param content: list of surya OCRResult objects, one per page
        :return: OCRDataframe object corresponding to content, or None if no text line was detected
        """
        # Create list of elements
        list_elements = list()

        for page_id, ocr_result in enumerate(content):
            line_id = 0
            for text_line in ocr_result.text_lines:
                line_id += 1
                # Each surya text line is mapped to a single word-level entry
                dict_word = {
                    "page": page_id,
                    "class": "ocrx_word",
                    "id": f"word_{page_id + 1}_{line_id}_0",
                    "parent": f"word_{page_id + 1}_{line_id}",
                    "value": text_line.text,
                    "confidence": round(100 * text_line.confidence),
                    "x1": int(text_line.bbox[0]),
                    "y1": int(text_line.bbox[1]),
                    "x2": int(text_line.bbox[2]),
                    "y2": int(text_line.bbox[3])
                }

                list_elements.append(dict_word)

        return OCRDataframe(df=pl.DataFrame(list_elements)) if list_elements else None
@dataclass
class Cell(TableObject):
    """Rectangular table cell defined by top-left / bottom-right coordinates and optional content."""
    x1: int
    y1: int
    x2: int
    y2: int
    content: str = None

    @property
    def table_cell(self) -> TableCell:
        """Export the cell as an extraction-level TableCell object."""
        return TableCell(bbox=BBox(x1=self.x1, x2=self.x2, y1=self.y1, y2=self.y2),
                         value=self.content)

    def __hash__(self):
        return hash(repr(self))
class Row(TableObject):
    """Table row, i.e. a horizontal group of cells."""

    def __init__(self, cells: Union[Cell, List[Cell]]):
        """
        :param cells: Cell object or list of Cell objects composing the row
        :raises ValueError: if cells is None
        """
        if cells is None:
            raise ValueError("cells parameter is null")
        elif isinstance(cells, Cell):
            self._items = [cells]
        else:
            self._items = cells
        self._contours = []

    @property
    def items(self) -> List[Cell]:
        """Cells composing the row"""
        return self._items

    @property
    def nb_columns(self) -> int:
        """Number of cells (columns) in the row"""
        return len(self.items)

    @property
    def x1(self) -> int:
        return min(map(lambda x: x.x1, self.items))

    @property
    def x2(self) -> int:
        return max(map(lambda x: x.x2, self.items))

    @property
    def y1(self) -> int:
        return min(map(lambda x: x.y1, self.items))

    @property
    def y2(self) -> int:
        return max(map(lambda x: x.y2, self.items))

    @property
    def v_consistent(self) -> bool:
        """
        Indicate if the row is vertically consistent (i.e all cells in row have the same vertical position)
        :return: boolean indicating if the row is vertically consistent
        """
        return all(map(lambda x: (x.y1 == self.y1) and (x.y2 == self.y2), self.items))

    def add_cells(self, cells: Union[Cell, List[Cell]]) -> "Row":
        """
        Add cells to existing row items
        :param cells: Cell object or list
        :return: Row object with cells added
        """
        if isinstance(cells, Cell):
            self._items += [cells]
        else:
            self._items += cells

        return self

    def split_in_rows(self, vertical_delimiters: List[int]) -> List["Row"]:
        """
        Split Row object into multiple objects based on vertical delimiters values
        :param vertical_delimiters: list of vertical delimiters values
        :return: list of splitted Row objects according to delimiters
        """
        # Create list of tuples for vertical boundaries
        row_delimiters = [self.y1] + vertical_delimiters + [self.y2]
        row_boundaries = [(i, j) for i, j in zip(row_delimiters, row_delimiters[1:])]

        # Create new list of rows, deep-copying cells so originals are untouched
        l_new_rows = list()
        for boundary in row_boundaries:
            cells = list()
            for cell in self.items:
                _cell = copy.deepcopy(cell)
                _cell.y1, _cell.y2 = boundary
                cells.append(_cell)
            l_new_rows.append(Row(cells=cells))

        return l_new_rows

    def __eq__(self, other) -> bool:
        # NOTE: the previous implementation compared items via `assert`, which is
        # stripped under `python -O` and would make all same-class rows compare equal
        if isinstance(other, self.__class__):
            return self.items == other.items
        return False
def get_cells(horizontal_lines: List[Line], vertical_lines: List[Line]) -> List[Cell]:
    """
    Identify cells from horizontal and vertical rows
    :param horizontal_lines: list of horizontal rows
    :param vertical_lines: list of vertical rows
    :return: list of all cells in image
    """
    # Detect all candidate cells from the line grid
    candidate_cells = get_cells_dataframe(horizontal_lines=horizontal_lines,
                                          vertical_lines=vertical_lines)

    # Remove nested duplicates, keeping the smallest cells
    return deduplicate_cells(cells=candidate_cells)
def deduplicate_cells(cells: List[Cell]) -> List[Cell]:
    """
    Deduplicate nested cells in order to keep the smallest ones
    :param cells: list of cells
    :return: cells after deduplication of the nested ones
    """
    # Coverage map of the area spanned by all cells (1 = not yet covered)
    width = max([c.x2 for c in cells] + [0])
    height = max([c.y2 for c in cells] + [0])
    remaining = np.ones((height, width), dtype=np.uint8)

    kept_cells = list()
    # Process smallest cells first so nested cells win over their containers
    for cell in sorted(cells, key=lambda c: c.area):
        area_view = remaining[cell.y1:cell.y2, cell.x1:cell.x2]
        # Keep the cell if at least 25% of its area is not yet covered
        if np.sum(area_view) >= 0.25 * cell.area:
            kept_cells.append(cell)
            remaining[cell.y1:cell.y2, cell.x1:cell.x2] = 0

    return kept_cells
prange(h_lines_arr.shape[0]): 22 | x1i, y1i, x2i, y2i = h_lines_arr[i][:] 23 | for j in prange(h_lines_arr.shape[0]): 24 | x1j, y1j, x2j, y2j = h_lines_arr[j][:] 25 | 26 | if y1i >= y1j: 27 | continue 28 | 29 | # Check correspondence between lines 30 | l_corresponds = -0.02 <= (x1i - x1j) / ((x2i - x1i) or 1) <= 0.02 31 | r_corresponds = -0.02 <= (x2i - x2j) / ((x2i - x1i) or 1) <= 0.02 32 | l_contained = (x1i <= x1j <= x2i) or (x1j <= x1i <= x2j) 33 | r_contained = (x1i <= x2j <= x2i) or (x1j <= x2i <= x2j) 34 | 35 | if (l_corresponds or l_contained) and (r_corresponds or r_contained): 36 | potential_cells.append([max(x1i, x1j), min(x2i, x2j), y1i, y2j]) 37 | 38 | if len(potential_cells) == 0: 39 | return np.empty((0, 4), dtype=np.int64) 40 | 41 | # Deduplicate on upper bound 42 | potential_cells = sorted(potential_cells) 43 | dedup_upper = list() 44 | prev_x1, prev_x2, prev_y1 = 0, 0, 0 45 | for idx in range(len(potential_cells)): 46 | x1, x2, y1, y2 = potential_cells[idx] 47 | 48 | if not (x1 == prev_x1 and x2 == prev_x2 and y1 == prev_y1): 49 | dedup_upper.append([x1, x2, y2, -y1]) 50 | prev_x1, prev_x2, prev_y1 = x1, x2, y1 51 | 52 | # Deduplicate on lower bound 53 | dedup_upper = sorted(dedup_upper) 54 | dedup_lower = list() 55 | prev_x1, prev_x2, prev_y2 = 0, 0, 0 56 | for idx in range(len(dedup_upper)): 57 | x1, x2, y2, _y1 = dedup_upper[idx] 58 | y1 = -_y1 59 | 60 | if not (x1 == prev_x1 and x2 == prev_x2 and y2 == prev_y2): 61 | dedup_lower.append([x1, x2, y1, y2]) 62 | prev_x1, prev_x2, prev_y2 = x1, x2, y2 63 | 64 | # Create array of potential cells 65 | cells_array = np.array(dedup_lower) 66 | cells = list() 67 | 68 | for i in prange(cells_array.shape[0]): 69 | x1, x2, y1, y2 = cells_array[i][:] 70 | 71 | # Compute horizontal margin 72 | margin = max(5, (x2 - x1) * 0.025) 73 | 74 | delimiters = list() 75 | for j in range(v_lines_arr.shape[0]): 76 | x1v, y1v, x2v, y2v = v_lines_arr[j][:] 77 | 78 | if x1 - margin <= x1v <= x2 + margin: 79 | # Check 
def get_cells_dataframe(horizontal_lines: List[Line], vertical_lines: List[Line]) -> List[Cell]:
    """
    Create dataframe of all possible cells from horizontal and vertical rows
    :param horizontal_lines: list of horizontal rows
    :param vertical_lines: list of vertical rows
    :return: list of detected cells
    """
    # No cell can exist without both horizontal and vertical rows
    if not horizontal_lines or not vertical_lines:
        return []

    # Build coordinate arrays expected by the numba-compiled kernel
    h_lines_array = np.array([[line.x1, line.y1, line.x2, line.y2] for line in horizontal_lines],
                             dtype=np.int64)
    v_lines_array = np.array([[line.x1, line.y1, line.x2, line.y2] for line in vertical_lines],
                             dtype=np.int64)

    # Compute cell coordinates
    cells_array = identify_cells(h_lines_arr=h_lines_array,
                                 v_lines_arr=v_lines_array)

    # Wrap coordinates into Cell objects
    return [Cell(x1=coords[0], y1=coords[1], x2=coords[2], y2=coords[3]) for coords in cells_array]
def get_tables(cells: List[Cell], elements: List[Cell], lines: List[Line], char_length: float) -> List[Table]:
    """
    Identify and create Table object from list of image cells
    :param cells: list of cells found in image
    :param elements: list of image elements
    :param lines: list of image lines
    :param char_length: average character length
    :return: list of Table objects inferred from cells
    """
    # Group cells into clusters, each cluster corresponding to a potential table
    cell_clusters = cluster_cells_in_tables(cells=cells)

    tables = list()
    for cluster in cell_clusters:
        # Normalize cell coordinates within the cluster
        normalized_cluster = normalize_table_cells(cluster_cells=cluster)
        if not normalized_cluster:
            continue
        # Complete the cluster with semi-bordered cells before building the table
        completed_cluster = add_semi_bordered_cells(cluster=normalized_cluster,
                                                    lines=lines,
                                                    char_length=char_length)
        tables.append(cluster_to_table(cluster_cells=completed_cluster, elements=elements))

    # Discard degenerate tables (fewer than 2 cells overall)
    return [tb for tb in tables if tb.nb_rows * tb.nb_columns >= 2]
sets of adjacent cells indexes 15 | """ 16 | if len(cells) == 0: 17 | return [] 18 | 19 | df_cells = pl.DataFrame([{"idx": idx, "x1": c.x1, "y1": c.y1, "x2": c.x2, "y2": c.y2, "height": c.height, 20 | "width": c.width} 21 | for idx, c in enumerate(cells)]) 22 | 23 | # Crossjoin and identify adjacent cells 24 | df_adjacent_cells = ( 25 | df_cells.join(df_cells, how='cross') 26 | # Compute horizontal and vertical overlap 27 | .with_columns((pl.min_horizontal(['x2', 'x2_right']) - pl.max_horizontal(['x1', 'x1_right'])).alias("x_overlap"), 28 | (pl.min_horizontal(['y2', 'y2_right']) - pl.max_horizontal(['y1', 'y1_right'])).alias("y_overlap") 29 | ) 30 | # Compute horizontal and vertical differences 31 | .with_columns( 32 | pl.min_horizontal((pl.col('x1') - pl.col('x1_right')).abs(), 33 | (pl.col('x1') - pl.col('x2_right')).abs(), 34 | (pl.col('x2') - pl.col('x1_right')).abs(), 35 | (pl.col('x2') - pl.col('x2_right')).abs() 36 | ).alias('diff_x'), 37 | pl.min_horizontal((pl.col('y1') - pl.col('y1_right')).abs(), 38 | (pl.col('y1') - pl.col('y2_right')).abs(), 39 | (pl.col('y2') - pl.col('y1_right')).abs(), 40 | (pl.col('y2') - pl.col('y2_right')).abs() 41 | ).alias('diff_y') 42 | ) 43 | # Compute thresholds for horizontal and vertical differences 44 | .with_columns( 45 | pl.min_horizontal(pl.lit(5), 0.05 * pl.min_horizontal(pl.col('width'), pl.col('width_right'))).alias('thresh_x'), 46 | pl.min_horizontal(pl.lit(5), 0.05 * pl.min_horizontal(pl.col('height'), pl.col('height_right'))).alias('thresh_y') 47 | ) 48 | # Filter adjacent cells 49 | .filter( 50 | ((pl.col('y_overlap') > 5) & (pl.col('diff_x') <= pl.col('thresh_x'))) 51 | | ((pl.col('x_overlap') > 5) & (pl.col('diff_y') <= pl.col('thresh_y'))) 52 | ) 53 | .select("idx", "idx_right") 54 | .unique() 55 | .sort(by=['idx', 'idx_right']) 56 | ) 57 | 58 | # Get sets of adjacent cells indexes 59 | adjacent_cells = [{row.get('idx'), row.get('idx_right')} for row in df_adjacent_cells.to_dicts()] 60 | 61 | return 
def cluster_cells_in_tables(cells: List[Cell]) -> List[List[Cell]]:
    """
    Based on adjacent cells, create clusters of cells that corresponds to tables
    :param cells: list cells in image
    :return: list of list of cells, representing several clusters of cells that form a table
    """
    # Build adjacency relationships between cells
    edges = get_adjacent_cells(cells=cells)

    # Extract connected components, each one being a table candidate
    components = find_components(edges=edges)

    # Map index clusters back to Cell objects
    return [[cells[i] for i in component] for component in components]
def segment_image(thresh: np.ndarray, lines: List[Line], char_length: float,
                  median_line_sep: float, existing_tables: Optional[List[Table]] = None) -> List[TableSegment]:
    """
    Segment image and its elements
    :param thresh: threshold image array
    :param lines: list of Line objects of the image
    :param char_length: average character length
    :param median_line_sep: median line separation
    :param existing_tables: list of detected bordered tables
    :return: list of TableSegment objects with corresponding elements
    """
    # Build a mask keeping only textual content
    text_thresh = identify_text_mask(thresh=thresh,
                                     lines=lines,
                                     char_length=char_length,
                                     existing_tables=existing_tables)

    # Extract elementary components from the text mask
    elements = get_image_elements(thresh=text_thresh,
                                  char_length=char_length,
                                  median_line_sep=median_line_sep)

    if not elements:
        return []

    # Segment spanning the full image width and the vertical extent of all elements
    top = min(el.y1 for el in elements)
    bottom = max(el.y2 for el in elements)
    full_segment = ImageSegment(x1=0, y1=top, x2=thresh.shape[1], y2=bottom, elements=elements)

    # Split the image into column segments
    col_segments = segment_image_columns(image_segment=full_segment,
                                         char_length=char_length,
                                         lines=lines)

    # Within each column, identify segments that can correspond to tables
    table_segments = list()
    for col_segment in col_segments:
        table_segments += get_table_segments(segment=col_segment,
                                             char_length=char_length,
                                             median_line_sep=median_line_sep)

    return table_segments
def identify_table(columns: ColumnGroup, row_delimiters: List[Cell], contours: List[Cell], median_line_sep: float,
                   char_length: float) -> Optional[Table]:
    """
    Identify table from column delimiters and rows
    :param columns: column delimiters group
    :param row_delimiters: list of table row delimiters corresponding to columns
    :param contours: list of image contours
    :param median_line_sep: median line separation
    :param char_length: average character length
    :return: Table object, or None if no coherent table could be built
    """
    # Build a candidate table from the row and column delimiters
    candidate = get_table(columns=columns,
                          row_delimiters=row_delimiters,
                          contours=contours)

    if not candidate:
        return None

    # Keep the table only if its dimensions are coherent
    is_coherent = check_table_coherency(table=candidate,
                                        median_line_sep=median_line_sep,
                                        char_length=char_length)
    return candidate if is_coherent else None
def check_row_coherency(table: Table, median_line_sep: float) -> bool:
    """
    Check row coherency of table
    :param table: Table object
    :param median_line_sep: median line separation
    :return: boolean indicating if table row heights are coherent
    """
    # A single row cannot be validated
    if table.nb_rows < 2:
        return False

    # Median vertical distance between centers of consecutive rows
    row_separations = [(lower_row.y1 + lower_row.y2 - upper_row.y1 - upper_row.y2) / 2
                       for upper_row, lower_row in zip(table.items, table.items[1:])]

    return np.median(row_separations) >= median_line_sep / 3


def check_column_coherency(table: Table, char_length: float) -> bool:
    """
    Check column coherency of table
    :param table: Table object
    :param char_length: average character length
    :return: boolean indicating if table column widths are coherent
    """
    # A single column cannot be validated
    if table.nb_columns < 2:
        return False

    # Width of each column = intersection of its cells' horizontal extents
    col_widths = list()
    for col_idx in range(table.nb_columns):
        col_cells = [row.items[col_idx] for row in table.items]
        right_bound = min(cell.x2 for cell in col_cells)
        left_bound = max(cell.x1 for cell in col_cells)
        col_widths.append(right_bound - left_bound)

    return np.median(col_widths) >= 3 * char_length
coherency of table 53 | row_coherency = check_row_coherency(table=table, 54 | median_line_sep=median_line_sep) 55 | 56 | # Check column coherency of table 57 | column_coherency = check_column_coherency(table=table, 58 | char_length=char_length) 59 | 60 | return row_coherency and column_coherency 61 | -------------------------------------------------------------------------------- /src/img2table/tables/processing/borderless_tables/table/table_creation.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from typing import List 3 | 4 | from img2table.tables.objects.cell import Cell 5 | from img2table.tables.objects.line import Line 6 | from img2table.tables.objects.table import Table 7 | from img2table.tables.processing.bordered_tables.cells import get_cells 8 | from img2table.tables.processing.bordered_tables.tables import cluster_to_table 9 | from img2table.tables.processing.borderless_tables.model import ColumnGroup 10 | 11 | 12 | def get_table(columns: ColumnGroup, row_delimiters: List[Cell], contours: List[Cell]) -> Table: 13 | """ 14 | Create table object from column delimiters and rows 15 | :param columns: column delimiters group 16 | :param row_delimiters: list of table row delimiters 17 | :param contours: list of image contours 18 | :return: Table object 19 | """ 20 | # Convert delimiters to lines 21 | v_lines = list() 22 | for col in columns.columns: 23 | seq = iter(sorted([c for v_ws in col.whitespaces for c in v_ws.ws.cells], 24 | key=lambda c: c.y1 + c.y2)) 25 | line_groups = [[next(seq)]] 26 | for c in seq: 27 | if c.y1 > line_groups[-1][-1].y2: 28 | line_groups.append([]) 29 | line_groups[-1].append(c) 30 | 31 | v_lines += [Line(x1=(gp[0].x1 + gp[0].x2) // 2, 32 | y1=gp[0].y1, 33 | x2=(gp[0].x1 + gp[0].x2) // 2, 34 | y2=gp[-1].y2) for gp in line_groups] 35 | 36 | h_lines = [Line(x1=d.x1, x2=d.x2, y1=d.y1, y2=d.y2) for d in row_delimiters] 37 | 38 | # Identify cells 39 | cells = 
get_cells(horizontal_lines=h_lines, vertical_lines=v_lines) 40 | 41 | # Create table object 42 | table = cluster_to_table(cluster_cells=cells, elements=contours, borderless=True) 43 | 44 | return table if table.nb_columns >= 3 and table.nb_rows >= 2 else None 45 | -------------------------------------------------------------------------------- /src/img2table/tables/processing/text/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/src/img2table/tables/processing/text/__init__.py -------------------------------------------------------------------------------- /src/img2table/tables/processing/text/titles.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import copy 3 | from typing import List 4 | 5 | import numpy as np 6 | 7 | from img2table.ocr.data import OCRDataframe 8 | from img2table.tables.objects.cell import Cell 9 | from img2table.tables.objects.table import Table 10 | from img2table.tables.processing.common import get_contours_cell 11 | 12 | 13 | def get_title_tables(img: np.ndarray, tables: List[Table], ocr_df: OCRDataframe, margin: int = 5) -> List[Table]: 14 | """ 15 | Retrieve titles of cell areas 16 | :param img: image array 17 | :param tables: list of Table objects 18 | :param ocr_df: OCRDataframe object 19 | :param margin: margin used 20 | :return: list of tables with title extracted 21 | """ 22 | height, width = img.shape[:2] 23 | 24 | if len(tables) == 0: 25 | return [] 26 | 27 | # Sort tables 28 | sorted_tables = sorted(tables, key=lambda tb: (tb.y1, tb.x1, tb.x2)) 29 | 30 | # Cluster table vertically 31 | seq = iter(sorted_tables) 32 | tb_cl = [[next(seq)]] 33 | for tb in seq: 34 | if tb.y1 > tb_cl[-1][-1].y2: 35 | tb_cl.append([]) 36 | tb_cl[-1].append(tb) 37 | 38 | # Identify relative zones for each title corresponding to each cluster 39 | final_tables = 
list() 40 | for id_cl, cluster in enumerate(tb_cl): 41 | # Compute horizontal boundaries of title 42 | x_delimiters = [int(round((tb_1.x2 + tb_2.x1) / 2)) for tb_1, tb_2 in zip(cluster, cluster[1:])] 43 | x_delimiters = [max(10, int(round(cluster[0].x1 - 0.2 * cluster[0].width)))] + x_delimiters + [width - 10] 44 | x_delimiters = x_delimiters + [min(width - 10, int(round(cluster[-1].x2 + 0.2 * cluster[-1].width)))] 45 | x_bounds = [(del_1, del_2) for del_1, del_2 in zip(x_delimiters, x_delimiters[1:])] 46 | 47 | # Compute vertical boundaries of title 48 | y_bounds = (max([tb.y2 for tb in tb_cl[id_cl - 1]]) if id_cl > 0 else 0, min([tb.y1 for tb in cluster])) 49 | 50 | # Fetch title for each table 51 | for id_tb, table in enumerate(cluster): 52 | # Get contours in title area 53 | cell_title = Cell(x1=x_bounds[id_tb][0], x2=x_bounds[id_tb][1], y1=y_bounds[0], y2=y_bounds[1]) 54 | contours = get_contours_cell(img=copy.deepcopy(img), 55 | cell=cell_title, 56 | margin=0, 57 | blur_size=5, 58 | kernel_size=9) 59 | 60 | # Get text from OCR 61 | title = ocr_df.get_text_cell(cell=contours[-1], margin=margin) if contours else None 62 | 63 | table.set_title(title=title) 64 | final_tables.append(table) 65 | 66 | return final_tables 67 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import os 4 | import subprocess 5 | 6 | CWD = os.path.dirname(__file__) 7 | MOCK_DIR = os.path.join(CWD, "_mock_data") 8 | 9 | TESSERACT_INSTALL = subprocess.run("tesseract --version", shell=True).returncode == 0 10 | -------------------------------------------------------------------------------- /tests/_mock_data/azure.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/_mock_data/azure.pkl 
-------------------------------------------------------------------------------- /tests/_mock_data/surya.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/_mock_data/surya.pkl -------------------------------------------------------------------------------- /tests/_mock_data/vision.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/_mock_data/vision.pkl -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import json 3 | import os 4 | import pickle 5 | import subprocess 6 | import sys 7 | from typing import NamedTuple, Dict 8 | 9 | import azure.cognitiveservices.vision.computervision 10 | import boto3 11 | import pytest 12 | import requests 13 | from google.cloud import vision 14 | 15 | from tests import MOCK_DIR 16 | 17 | 18 | @pytest.fixture(autouse=True) 19 | def change_test_dir(request, monkeypatch): 20 | monkeypatch.chdir(request.fspath.dirname) 21 | 22 | 23 | @pytest.fixture 24 | def mock_tesseract(monkeypatch): 25 | def mock_check_output(*args, **kwargs): 26 | if "tesseract --list-langs" in args: 27 | return "Langs\neng".encode("utf-8") 28 | else: 29 | with open(os.path.join(MOCK_DIR, "tesseract_hocr.html"), "r") as f: 30 | return f.read().encode("utf-8") 31 | 32 | def mock_run(*args, **kwargs): 33 | class MResp: 34 | @property 35 | def returncode(self): 36 | return 0 37 | return MResp() 38 | 39 | monkeypatch.setattr(subprocess, "check_output", mock_check_output) 40 | monkeypatch.setattr(subprocess, "run", mock_run) 41 | 42 | 43 | @pytest.fixture 44 | def mock_vision(monkeypatch): 45 | class MockPost: 46 | def json(self, *args, **kwargs): 47 | 
with open(os.path.join(MOCK_DIR, "vision.json"), "r") as f: 48 | return json.load(f) 49 | 50 | def mock_post(*args, **kwargs): 51 | return MockPost() 52 | 53 | # Mock post to API 54 | monkeypatch.setattr(requests, "post", mock_post) 55 | 56 | def mock_init(*args, **kwargs): 57 | pass 58 | 59 | def mock_annotate(*args, **kwargs): 60 | with open(os.path.join(MOCK_DIR, "vision.pkl"), "rb") as f: 61 | resp = pickle.load(f) 62 | 63 | return resp 64 | 65 | # Mock Vision API annotate 66 | monkeypatch.setattr(vision.ImageAnnotatorClient, "__init__", mock_init) 67 | monkeypatch.setattr(vision.ImageAnnotatorClient, "batch_annotate_images", mock_annotate) 68 | 69 | 70 | @pytest.fixture 71 | def mock_textract(monkeypatch): 72 | class MockClient: 73 | def __init__(self, *args, **kwargs): 74 | pass 75 | 76 | def detect_document_text(*args, **kwargs): 77 | with open(os.path.join(MOCK_DIR, "textract.json"), "r") as f: 78 | resp = json.load(f) 79 | 80 | return resp 81 | 82 | # Mock boto3 client 83 | monkeypatch.setattr(boto3, "client", MockClient) 84 | 85 | 86 | @pytest.fixture 87 | def mock_azure(monkeypatch): 88 | class MockRead(NamedTuple): 89 | headers: Dict 90 | 91 | def mock_read_in_stream(*args, **kwargs): 92 | return MockRead(headers={"Operation-Location": "zz/zz"}) 93 | 94 | def mock_get_read_result(*args, **kwargs): 95 | with open(os.path.join(MOCK_DIR, "azure.pkl"), "rb") as f: 96 | resp = pickle.load(f) 97 | return resp 98 | 99 | # Mock azure client 100 | monkeypatch.setattr(azure.cognitiveservices.vision.computervision.ComputerVisionClient, 101 | "read_in_stream", 102 | mock_read_in_stream) 103 | monkeypatch.setattr(azure.cognitiveservices.vision.computervision.ComputerVisionClient, 104 | "get_read_result", 105 | mock_get_read_result) 106 | 107 | 108 | @pytest.fixture 109 | def mock_surya(monkeypatch): 110 | def mock_run_ocr(*args, **kwargs): 111 | with open(os.path.join(MOCK_DIR, "surya.pkl"), "rb") as f: 112 | resp = pickle.load(f) 113 | return resp 114 | 115 | if 
sys.version_info >= (3, 10): 116 | import surya.recognition 117 | # Mock surya 118 | monkeypatch.setattr(surya.recognition.RecognitionPredictor, 119 | "__call__", 120 | mock_run_ocr) 121 | 122 | -------------------------------------------------------------------------------- /tests/document/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/document/__init__.py -------------------------------------------------------------------------------- /tests/document/base/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/document/base/__init__.py -------------------------------------------------------------------------------- /tests/document/base/test_data/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/document/base/test_data/test.png -------------------------------------------------------------------------------- /tests/document/base/test_rotation.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import cv2 3 | import numpy as np 4 | from sewar import ssim 5 | 6 | from img2table.document.base.rotation import rotate_img_with_border, fix_rotation_image, get_connected_components, \ 7 | get_relevant_angles, angle_dixon_q_test 8 | 9 | 10 | def test_get_connected_components(): 11 | img = cv2.imread("test_data/test.png", cv2.IMREAD_GRAYSCALE) 12 | 13 | cc, ref_height, thresh = get_connected_components(img=img) 14 | 15 | assert len(cc) == 98 16 | 17 | 18 | def test_get_relevant_angles(): 19 | centroids = [[35.8676, 5473.6768], 20 | [45.4648, 8734.32], 21 | [476.386, 98.437], 22 | [9834.4648, 468.47], 23 | 
[746.746, 7348.43], 24 | [846.462, 8474.48], 25 | [2983.846, 94483.46], 26 | [1093.46, 8473.46], 27 | [3676.77, 84783.64]] 28 | 29 | result = get_relevant_angles(centroids=np.array(centroids), ref_height=1000, n_max=5) 30 | 31 | assert len(result) == 5 32 | 33 | 34 | def test_angle_dixon_q_test(): 35 | result = angle_dixon_q_test(angles=[12.23, 12.78, 12.79, 12.82], confidence=0.9) 36 | 37 | assert round(result, 3) == 12.797 38 | 39 | 40 | def test_fix_rotation_image(): 41 | def crop_to_orig_img(img, orig_img): 42 | # Get original dimensions 43 | orig_height, orig_width = orig_img.shape[:2] 44 | 45 | # Get center of img 46 | center = (img.shape[0] // 2, img.shape[1] // 2) 47 | # Crop img around centre 48 | cropped = img[center[0] - orig_height // 2: center[0] + orig_height // 2 + 1, 49 | center[1] - orig_width // 2: center[1] + orig_width // 2 + 1] 50 | 51 | return cropped 52 | 53 | img = cv2.imread("test_data/test.png") 54 | 55 | similarities = list() 56 | for angle in range(-30, 30, 3): 57 | # Create test image by rotating it 58 | test_img = rotate_img_with_border(img=img.copy(), angle=angle) 59 | result = crop_to_orig_img(img=fix_rotation_image(img=test_img)[0], 60 | orig_img=img) 61 | 62 | # Compute similarity between original image and result 63 | similarities.append(ssim(GT=img, P=result)[0]) 64 | 65 | assert np.mean(similarities) >= 0.85 66 | -------------------------------------------------------------------------------- /tests/document/image/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/document/image/__init__.py -------------------------------------------------------------------------------- /tests/document/image/test_data/blank.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/document/image/test_data/blank.png -------------------------------------------------------------------------------- /tests/document/image/test_data/dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/document/image/test_data/dark.png -------------------------------------------------------------------------------- /tests/document/image/test_data/expected.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/document/image/test_data/expected.xlsx -------------------------------------------------------------------------------- /tests/document/image/test_data/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/document/image/test_data/test.png -------------------------------------------------------------------------------- /tests/document/image/test_image.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from io import BytesIO 3 | 4 | import pytest 5 | from openpyxl import load_workbook 6 | 7 | from img2table.document.image import Image 8 | from img2table.ocr import TesseractOCR 9 | from img2table.tables.objects.extraction import BBox 10 | 11 | 12 | def test_validators(): 13 | with pytest.raises(TypeError) as e_info: 14 | img = Image(src=1) 15 | 16 | with pytest.raises(TypeError) as e_info: 17 | img = Image(src="img", detect_rotation=3) 18 | 19 | 20 | def test_load_image(): 21 | # Load from path 22 | img_from_path = Image(src="test_data/test.png") 23 | 24 | # Load from bytes 25 | with open("test_data/test.png", "rb") 
as f: 26 | img_from_bytes = Image(src=f.read()) 27 | 28 | # Load from BytesIO 29 | with open("test_data/test.png", "rb") as f: 30 | img_from_bytesio = Image(src=BytesIO(f.read())) 31 | 32 | assert img_from_path.bytes == img_from_bytes.bytes == img_from_bytesio.bytes 33 | 34 | assert list(img_from_path.images)[0].shape == (417, 1365, 3) 35 | 36 | 37 | def test_blank_image(mock_tesseract): 38 | ocr = TesseractOCR() 39 | img = Image(src="test_data/blank.png", 40 | detect_rotation=True) 41 | 42 | result = img.extract_tables(ocr=ocr, 43 | implicit_rows=True, 44 | borderless_tables=True, 45 | min_confidence=50) 46 | 47 | assert result == [] 48 | 49 | 50 | def test_blank_no_ocr(): 51 | img = Image(src="test_data/blank.png", 52 | detect_rotation=True) 53 | 54 | result = img.extract_tables(implicit_rows=True, 55 | borderless_tables=True, 56 | min_confidence=50) 57 | 58 | assert result == [] 59 | 60 | 61 | def test_image_tables(mock_tesseract): 62 | ocr = TesseractOCR() 63 | img = Image(src="test_data/test.png", 64 | detect_rotation=True) 65 | 66 | result = img.extract_tables(ocr=ocr, implicit_rows=True, min_confidence=50) 67 | 68 | assert len(result) == 2 69 | 70 | assert result[0].title is None 71 | assert result[0].bbox == BBox(x1=36, y1=21, x2=770, y2=327) 72 | assert len(result[0].content) == 6 73 | assert len(result[0].content[0]) == 3 74 | 75 | assert result[1].title is None 76 | assert result[1].bbox == BBox(x1=962, y1=21, x2=1154, y2=123) 77 | assert len(result[1].content) == 2 78 | assert len(result[1].content[0]) == 2 79 | 80 | 81 | def test_no_ocr(): 82 | img = Image(src="test_data/dark.png", 83 | detect_rotation=True) 84 | 85 | result = img.extract_tables(implicit_rows=True, min_confidence=50) 86 | 87 | assert len(result) == 1 88 | 89 | assert result[0].title is None 90 | assert result[0].bbox == BBox(x1=46, y1=37, x2=836, y2=529) 91 | assert len(result[0].content) == 19 92 | assert len(result[0].content[0]) == 5 93 | 94 | 95 | def 
test_image_excel(mock_tesseract): 96 | ocr = TesseractOCR() 97 | img = Image(src="test_data/test.png", 98 | detect_rotation=True) 99 | 100 | result = img.to_xlsx(dest=BytesIO(), ocr=ocr, implicit_rows=True, min_confidence=50) 101 | 102 | expected = load_workbook(filename="test_data/expected.xlsx") 103 | result_wb = load_workbook(filename=result) 104 | 105 | for idx, ws in enumerate(result_wb.worksheets): 106 | assert ws.title == expected.worksheets[idx].title 107 | assert list(ws.values) == list(expected.worksheets[idx].values) 108 | -------------------------------------------------------------------------------- /tests/document/pdf/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/document/pdf/__init__.py -------------------------------------------------------------------------------- /tests/document/pdf/test_data/test.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/document/pdf/test_data/test.pdf -------------------------------------------------------------------------------- /tests/document/pdf/test_pdf.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys 3 | from io import BytesIO 4 | 5 | import pytest 6 | 7 | from img2table.document.pdf import PDF 8 | from img2table.ocr import TesseractOCR 9 | from img2table.tables.objects.extraction import BBox 10 | 11 | 12 | def test_validators(): 13 | with pytest.raises(TypeError) as e_info: 14 | pdf = PDF(src=1) 15 | 16 | with pytest.raises(TypeError) as e_info: 17 | pdf = PDF(src="img", pages=12) 18 | 19 | with pytest.raises(TypeError) as e_info: 20 | pdf = PDF(src="img", pages=["12"]) 21 | 22 | with pytest.raises(TypeError) as e_info: 23 | pdf = PDF(src="img", pages=[1], 
detect_rotation="a") 24 | 25 | 26 | def test_load_pdf(): 27 | # Load from path 28 | pdf_from_path = PDF(src="test_data/test.pdf") 29 | 30 | # Load from bytes 31 | with open("test_data/test.pdf", "rb") as f: 32 | pdf_from_bytes = PDF(src=f.read()) 33 | 34 | # Load from BytesIO 35 | with open("test_data/test.pdf", "rb") as f: 36 | pdf_from_bytesio = PDF(src=BytesIO(f.read())) 37 | 38 | assert pdf_from_path.bytes == pdf_from_bytes.bytes == pdf_from_bytesio.bytes 39 | 40 | assert list(pdf_from_path.images)[0].shape == (2200, 1700, 3) 41 | 42 | 43 | def test_pdf_pages(): 44 | assert len(list(PDF(src="test_data/test.pdf").images)) == 2 45 | assert len(list(PDF(src="test_data/test.pdf", pages=[0]).images)) == 1 46 | 47 | 48 | def test_pdf_tables(mock_tesseract): 49 | ocr = TesseractOCR() 50 | pdf = PDF(src="test_data/test.pdf") 51 | 52 | result = pdf.extract_tables(ocr=ocr, implicit_rows=True, min_confidence=50) 53 | 54 | assert result[0][0].title == "Example of Data Table 1" 55 | if sys.version_info.minor < 11: 56 | assert result[0][0].bbox == BBox(x1=235, y1=249, x2=1442, y2=543) 57 | assert (len(result[0][0].content), len(result[0][0].content[0])) == (5, 4) 58 | 59 | assert result[0][1].title == "Example of Data Table 2" 60 | if sys.version_info.minor < 11: 61 | assert result[0][1].bbox == BBox(x1=236, y1=672, x2=1452, y2=972) 62 | assert (len(result[0][1].content), len(result[0][1].content[0])) == (5, 4) 63 | 64 | assert result[1][0].title == "Example of Data Table 3" 65 | if sys.version_info.minor < 11: 66 | assert result[1][0].bbox == BBox(x1=235, y1=249, x2=1442, y2=543) 67 | assert (len(result[1][0].content), len(result[1][0].content[0])) == (5, 4) 68 | 69 | assert result[1][1].title == "Example of Data Table 4" 70 | if sys.version_info.minor < 11: 71 | assert result[1][1].bbox == BBox(x1=236, y1=672, x2=1452, y2=972) 72 | assert (len(result[1][1].content), len(result[1][1].content[0])) == (5, 4) 73 | 
-------------------------------------------------------------------------------- /tests/ocr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/ocr/__init__.py -------------------------------------------------------------------------------- /tests/ocr/aws_textract/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/ocr/aws_textract/__init__.py -------------------------------------------------------------------------------- /tests/ocr/aws_textract/test_aws_textract.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import json 3 | import os 4 | 5 | import polars as pl 6 | 7 | from img2table.document import Image 8 | from img2table.ocr import TextractOCR 9 | from img2table.ocr.data import OCRDataframe 10 | from tests import MOCK_DIR 11 | 12 | 13 | def test_map_response(mock_textract): 14 | img = Image("test_data/test.png") 15 | 16 | with open(os.path.join(MOCK_DIR, "textract.json"), "r") as f: 17 | resp = json.load(f) 18 | 19 | result = TextractOCR().map_response(response=resp, 20 | image=list(img.images)[0], 21 | page=0) 22 | 23 | with open("test_data/content.json", "r") as f: 24 | expected = json.load(f) 25 | 26 | assert result == expected 27 | 28 | 29 | def test_content(mock_textract): 30 | img = Image("test_data/test.png") 31 | ocr = TextractOCR() 32 | 33 | result = ocr.content(document=img) 34 | 35 | with open("test_data/content.json", "r") as f: 36 | expected = json.load(f) 37 | 38 | assert list(result) == [expected] 39 | 40 | 41 | def test_to_ocr_df(mock_textract): 42 | ocr = TextractOCR() 43 | with open("test_data/content.json", "r") as f: 44 | content = json.load(f) 45 | 46 | result = ocr.to_ocr_dataframe(content=[content]) 47 
| 48 | expected = OCRDataframe(df=pl.read_csv("test_data/ocr_df.csv", separator=";")) 49 | 50 | assert result == expected 51 | 52 | 53 | def test_textract_ocr(mock_textract): 54 | img = Image("test_data/test.png") 55 | ocr = TextractOCR(aws_access_key_id="aws_access_key_id", 56 | aws_secret_access_key="aws_secret_access_key", 57 | aws_session_token="aws_session_token", 58 | region="eu-west-1") 59 | 60 | result = ocr.of(document=img) 61 | 62 | expected = OCRDataframe(df=pl.read_csv("test_data/ocr_df.csv", separator=";")) 63 | 64 | assert result == expected 65 | -------------------------------------------------------------------------------- /tests/ocr/aws_textract/test_data/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/ocr/aws_textract/test_data/test.png -------------------------------------------------------------------------------- /tests/ocr/azure/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/ocr/azure/__init__.py -------------------------------------------------------------------------------- /tests/ocr/azure/test_azure.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import os 3 | import pickle 4 | 5 | import polars as pl 6 | import pytest 7 | 8 | from img2table.document import Image 9 | from img2table.ocr import AzureOCR 10 | from img2table.ocr.data import OCRDataframe 11 | from tests import MOCK_DIR 12 | 13 | 14 | def test_content(mock_azure): 15 | img = Image("test_data/test.png") 16 | ocr = AzureOCR(endpoint="aa", subscription_key="bb") 17 | 18 | result = ocr.content(document=img) 19 | 20 | with open(os.path.join(MOCK_DIR, "azure.pkl"), "rb") as f: 21 | expected = pickle.load(f) 22 | 23 | assert list(result) == 
[expected] 24 | 25 | 26 | def test_to_ocr_df(mock_azure): 27 | ocr = AzureOCR(endpoint="aa", subscription_key="bb") 28 | with open(os.path.join(MOCK_DIR, "azure.pkl"), "rb") as f: 29 | content = pickle.load(f) 30 | 31 | result = ocr.to_ocr_dataframe(content=[content]) 32 | 33 | expected = OCRDataframe(df=pl.read_csv("test_data/ocr_df.csv", separator=";")) 34 | 35 | assert result == expected 36 | 37 | 38 | def test_azure_ocr(mock_azure): 39 | # Test init error 40 | with pytest.raises(TypeError) as e_info: 41 | AzureOCR(subscription_key=8, endpoint="a") 42 | 43 | with pytest.raises(TypeError) as e_info: 44 | AzureOCR(subscription_key="a", endpoint=0) 45 | 46 | with pytest.raises(ValueError) as e_info: 47 | AzureOCR(subscription_key="a") 48 | 49 | with pytest.raises(ValueError) as e_info: 50 | AzureOCR(subscription_key="a") 51 | 52 | img = Image("test_data/test.png") 53 | ocr = AzureOCR(endpoint="aa", subscription_key="bb") 54 | 55 | result = ocr.of(document=img) 56 | 57 | expected = OCRDataframe(df=pl.read_csv("test_data/ocr_df.csv", separator=";")) 58 | 59 | assert result == expected 60 | -------------------------------------------------------------------------------- /tests/ocr/azure/test_data/ocr_df.csv: -------------------------------------------------------------------------------- 1 | page;class;id;parent;value;confidence;x1;y1;x2;y2 2 | 0;ocrx_word;word_1_1;word_1_1;Title;100;383;38;422;56 3 | 0;ocrx_word;word_1_2;word_1_2;Test;99;965;39;1001;57 4 | 0;ocrx_word;word_1_3;word_1_2;1;100;1004;39;1015;57 5 | 0;ocrx_word;word_1_4;word_1_3;Test;99;1061;38;1096;57 6 | 0;ocrx_word;word_1_5;word_1_3;2;100;1100;38;1111;57 7 | 0;ocrx_word;word_1_6;word_1_4;Line;98;38;89;75;108 8 | 0;ocrx_word;word_1_7;word_1_4;1;100;80;89;91;108 9 | 0;ocrx_word;word_1_8;word_1_4;-;99;94;89;104;109 10 | 0;ocrx_word;word_1_9;word_1_4;Col;100;107;89;137;109 11 | 0;ocrx_word;word_1_10;word_1_4;1;99;141;88;153;109 12 | 0;ocrx_word;word_1_11;word_1_5;Line;98;278;89;315;109 13 | 
0;ocrx_word;word_1_12;word_1_5;1;100;319;89;331;109 14 | 0;ocrx_word;word_1_13;word_1_5;-;100;335;89;343;109 15 | 0;ocrx_word;word_1_14;word_1_5;Col;100;347;88;377;109 16 | 0;ocrx_word;word_1_15;word_1_5;2;100;381;88;393;109 17 | 0;ocrx_word;word_1_16;word_1_6;Line;99;499;89;533;108 18 | 0;ocrx_word;word_1_17;word_1_6;1;99;538;89;549;108 19 | 0;ocrx_word;word_1_18;word_1_6;-;100;553;89;562;108 20 | 0;ocrx_word;word_1_19;word_1_6;Col;100;566;89;596;108 21 | 0;ocrx_word;word_1_20;word_1_6;3;100;599;88;611;108 22 | 0;ocrx_word;word_1_21;word_1_7;Test;99;964;89;1000;108 23 | 0;ocrx_word;word_1_22;word_1_7;3;100;1003;89;1014;108 24 | 0;ocrx_word;word_1_23;word_1_8;Test;99;1060;89;1096;108 25 | 0;ocrx_word;word_1_24;word_1_8;4;100;1099;89;1111;109 26 | 0;ocrx_word;word_1_25;word_1_9;Line;99;39;140;74;159 27 | 0;ocrx_word;word_1_26;word_1_9;2;99;79;141;91;159 28 | 0;ocrx_word;word_1_27;word_1_9;-;100;94;141;103;159 29 | 0;ocrx_word;word_1_28;word_1_9;Col;100;107;141;138;160 30 | 0;ocrx_word;word_1_29;word_1_9;1;100;142;140;152;160 31 | 0;ocrx_word;word_1_30;word_1_10;Line;99;497;140;533;159 32 | 0;ocrx_word;word_1_31;word_1_10;2;99;537;141;550;159 33 | 0;ocrx_word;word_1_32;word_1_10;-;100;554;141;563;159 34 | 0;ocrx_word;word_1_33;word_1_10;Col;100;566;140;596;159 35 | 0;ocrx_word;word_1_34;word_1_10;3;100;599;140;610;159 36 | 0;ocrx_word;word_1_35;word_1_11;Line;98;38;191;74;210 37 | 0;ocrx_word;word_1_36;word_1_11;3;100;80;191;91;210 38 | 0;ocrx_word;word_1_37;word_1_11;-;100;94;191;103;210 39 | 0;ocrx_word;word_1_38;word_1_11;Col;100;107;191;139;211 40 | 0;ocrx_word;word_1_39;word_1_11;1;100;142;190;153;211 41 | 0;ocrx_word;word_1_40;word_1_12;Merged;99;327;191;396;213 42 | 0;ocrx_word;word_1_41;word_1_12;Cells;100;400;190;444;213 43 | 0;ocrx_word;word_1_42;word_1_13;Line;99;498;191;533;210 44 | 0;ocrx_word;word_1_43;word_1_13;3;100;537;191;548;210 45 | 0;ocrx_word;word_1_44;word_1_13;-;100;553;191;562;210 46 | 0;ocrx_word;word_1_45;word_1_13;Col;100;566;191;595;210 
47 | 0;ocrx_word;word_1_46;word_1_13;3;100;598;191;610;210 48 | 0;ocrx_word;word_1_47;word_1_14;Line;98;38;242;75;261 49 | 0;ocrx_word;word_1_48;word_1_14;4;100;79;242;91;262 50 | 0;ocrx_word;word_1_49;word_1_14;-;99;94;242;104;262 51 | 0;ocrx_word;word_1_50;word_1_14;Col;100;107;242;138;262 52 | 0;ocrx_word;word_1_51;word_1_14;1;100;142;242;153;262 53 | 0;ocrx_word;word_1_52;word_1_15;Line;99;497;242;533;262 54 | 0;ocrx_word;word_1_53;word_1_15;4;100;538;242;549;261 55 | 0;ocrx_word;word_1_54;word_1_15;-;100;554;242;562;261 56 | 0;ocrx_word;word_1_55;word_1_15;Col;100;566;242;596;261 57 | 0;ocrx_word;word_1_56;word_1_15;3;100;600;242;611;261 58 | 0;ocrx_word;word_1_57;word_1_16;Line;99;38;293;74;313 59 | 0;ocrx_word;word_1_58;word_1_16;5;98;78;293;91;313 60 | 0;ocrx_word;word_1_59;word_1_16;-;100;95;293;103;313 61 | 0;ocrx_word;word_1_60;word_1_16;Col;100;106;293;138;313 62 | 0;ocrx_word;word_1_61;word_1_16;1;100;141;293;153;313 63 | 0;ocrx_word;word_1_62;word_1_17;Line;98;278;293;314;313 64 | 0;ocrx_word;word_1_63;word_1_17;5;100;319;293;330;313 65 | 0;ocrx_word;word_1_64;word_1_17;-;100;335;293;343;313 66 | 0;ocrx_word;word_1_65;word_1_17;Col;100;347;292;376;313 67 | 0;ocrx_word;word_1_66;word_1_17;2;100;380;292;392;314 68 | 0;ocrx_word;word_1_67;word_1_18;Line;98;497;293;533;313 69 | 0;ocrx_word;word_1_68;word_1_18;5;98;537;293;550;313 70 | 0;ocrx_word;word_1_69;word_1_18;-;100;554;293;562;313 71 | 0;ocrx_word;word_1_70;word_1_18;Col;100;566;293;596;313 72 | 0;ocrx_word;word_1_71;word_1_18;3;100;599;293;611;313 73 | -------------------------------------------------------------------------------- /tests/ocr/azure/test_data/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/ocr/azure/test_data/test.png -------------------------------------------------------------------------------- /tests/ocr/data/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/ocr/data/__init__.py -------------------------------------------------------------------------------- /tests/ocr/data/test_data/expected_table.json: -------------------------------------------------------------------------------- 1 | [[{"x1": 439, "y1": 1581, "x2": 950, "y2": 1658, "content": "Number of Coils"}, {"x1": 950, "y1": 1581, "x2": 1580, "y2": 1658, "content": "Number of Paperclips"}], [{"x1": 439, "y1": 1658, "x2": 950, "y2": 1733, "content": "Craig\n5"}, {"x1": 950, "y1": 1658, "x2": 1580, "y2": 1733, "content": "Spirit of America,\n3, 5, 4"}], [{"x1": 439, "y1": 1733, "x2": 950, "y2": 1808, "content": "Gary Gabelich\n10"}, {"x1": 950, "y1": 1733, "x2": 1580, "y2": 1808, "content": "Blue Flame\n7, 8, 6"}], [{"x1": 439, "y1": 1808, "x2": 950, "y2": 1883, "content": "Richard Noble\n15"}, {"x1": 950, "y1": 1808, "x2": 1580, "y2": 1883, "content": "Thrust 2\n11, 10, 12"}], [{"x1": 439, "y1": 1883, "x2": 950, "y2": 1956, "content": "20\nAndy Green"}, {"x1": 950, "y1": 1883, "x2": 1580, "y2": 1956, "content": "15, 13, 14\nThrust SSC"}]] -------------------------------------------------------------------------------- /tests/ocr/data/test_data/table.json: -------------------------------------------------------------------------------- 1 | [[{"x1": 439, "y1": 1581, "x2": 950, "y2": 1658}, {"x1": 950, "y1": 1581, "x2": 1580, "y2": 1658}], [{"x1": 439, "y1": 1658, "x2": 950, "y2": 1733}, {"x1": 950, "y1": 1658, "x2": 1580, "y2": 1733}], [{"x1": 439, "y1": 1733, "x2": 950, "y2": 1808}, {"x1": 950, "y1": 1733, "x2": 1580, "y2": 1808}], [{"x1": 439, "y1": 1808, "x2": 950, "y2": 1883}, {"x1": 950, "y1": 1808, "x2": 1580, "y2": 1883}], [{"x1": 439, "y1": 1883, "x2": 950, "y2": 1956}, {"x1": 950, "y1": 1883, "x2": 1580, "y2": 1956}]] 
-------------------------------------------------------------------------------- /tests/ocr/data/test_ocr_data.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import json 3 | 4 | import polars as pl 5 | 6 | from img2table.ocr.data import OCRDataframe 7 | from img2table.tables.objects.cell import Cell 8 | from img2table.tables.objects.row import Row 9 | from img2table.tables.objects.table import Table 10 | 11 | 12 | def test_pages(): 13 | ocr_df = OCRDataframe(df=pl.read_csv("test_data/ocr_df.csv", separator=";")) 14 | 15 | ocr_df_page_0 = ocr_df.page(page_number=0) 16 | ocr_df_page_1 = ocr_df.page(page_number=1) 17 | 18 | assert isinstance(ocr_df_page_0, OCRDataframe) 19 | assert isinstance(ocr_df_page_1, OCRDataframe) 20 | 21 | assert not ocr_df_page_0 == ocr_df_page_1 22 | assert len(ocr_df_page_0.df) + len(ocr_df_page_1.df) == len(ocr_df.df) 23 | 24 | 25 | def test_get_text_cell(): 26 | ocr_df = OCRDataframe(df=pl.read_csv("test_data/ocr_df.csv", separator=";")) 27 | cell = Cell(x1=200, x2=800, y1=700, y2=850) 28 | 29 | result = ocr_df.get_text_cell(cell=cell, 30 | min_confidence=50, 31 | page_number=0) 32 | 33 | assert result == "http://www.landspeed.com/lsrinfo.asp.)\nUse these data to create\nChecklist for a Data Table." 
34 | 35 | 36 | def test_get_text_table(): 37 | ocr_df = OCRDataframe(df=pl.read_csv("test_data/ocr_df.csv", separator=";")) 38 | 39 | with open("test_data/table.json", "r") as f: 40 | table = Table(rows=[Row(cells=[Cell(**el) for el in row]) for row in json.load(f)]) 41 | 42 | result = ocr_df.get_text_table(table=table, 43 | page_number=0, 44 | min_confidence=50) 45 | 46 | with open("test_data/expected_table.json", "r") as f: 47 | expected = Table(rows=[Row(cells=[Cell(**el) for el in row]) for row in json.load(f)]) 48 | 49 | assert result == expected 50 | -------------------------------------------------------------------------------- /tests/ocr/doctr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/ocr/doctr/__init__.py -------------------------------------------------------------------------------- /tests/ocr/doctr/test_data/ocr.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/ocr/doctr/test_data/ocr.pkl -------------------------------------------------------------------------------- /tests/ocr/doctr/test_data/ocr_df.csv: -------------------------------------------------------------------------------- 1 | page;class;id;parent;value;confidence;x1;y1;x2;y2 2 | 0;ocrx_word;word_1_0_1;word_1_0;Title;100;383;38;425;59 3 | 0;ocrx_word;word_1_1_2;word_1_1;Test;100;962;38;1004;59 4 | 0;ocrx_word;word_1_1_3;word_1_1;1;100;1005;38;1020;59 5 | 0;ocrx_word;word_1_1_4;word_1_1;Test;100;1058;38;1100;59 6 | 0;ocrx_word;word_1_1_5;word_1_1;2;100;1100;38;1116;59 7 | 0;ocrx_word;word_1_2_6;word_1_2;Line;100;37;87;81;113 8 | 0;ocrx_word;word_1_2_7;word_1_2;1;100;81;90;97;110 9 | 0;ocrx_word;word_1_2_8;word_1_2;-;99;95;95;108;109 10 | 0;ocrx_word;word_1_2_9;word_1_2;Col;100;108;89;141;110 11 | 
0;ocrx_word;word_1_2_10;word_1_2;1;100;141;89;156;110 12 | 0;ocrx_word;word_1_3_11;word_1_3;Line;100;277;87;321;113 13 | 0;ocrx_word;word_1_3_12;word_1_3;1;100;320;90;337;110 14 | 0;ocrx_word;word_1_3_13;word_1_3;-;98;335;95;347;106 15 | 0;ocrx_word;word_1_3_14;word_1_3;Col;100;348;89;380;110 16 | 0;ocrx_word;word_1_3_15;word_1_3;2;100;381;90;396;110 17 | 0;ocrx_word;word_1_4_16;word_1_4;Line;100;496;87;540;113 18 | 0;ocrx_word;word_1_4_17;word_1_4;1;100;540;90;556;110 19 | 0;ocrx_word;word_1_4_18;word_1_4;-;99;553;95;567;106 20 | 0;ocrx_word;word_1_4_19;word_1_4;Col;100;567;89;600;110 21 | 0;ocrx_word;word_1_4_20;word_1_4;3;100;600;89;616;110 22 | 0;ocrx_word;word_1_5_21;word_1_5;Test;100;962;89;1004;110 23 | 0;ocrx_word;word_1_5_22;word_1_5;3;100;1004;89;1020;110 24 | 0;ocrx_word;word_1_5_23;word_1_5;Test;100;1058;89;1100;110 25 | 0;ocrx_word;word_1_5_24;word_1_5;4;100;1100;90;1116;110 26 | 0;ocrx_word;word_1_6_25;word_1_6;Line;100;39;139;79;162 27 | 0;ocrx_word;word_1_6_26;word_1_6;2;100;80;141;96;161 28 | 0;ocrx_word;word_1_6_27;word_1_6;-;94;95;147;107;158 29 | 0;ocrx_word;word_1_6_28;word_1_6;Col;97;108;141;141;161 30 | 0;ocrx_word;word_1_6_29;word_1_6;1;100;141;139;156;162 31 | 0;ocrx_word;word_1_7_30;word_1_7;Line;100;496;138;540;163 32 | 0;ocrx_word;word_1_7_31;word_1_7;2;100;540;141;556;162 33 | 0;ocrx_word;word_1_7_32;word_1_7;-;97;553;147;567;158 34 | 0;ocrx_word;word_1_7_33;word_1_7;Col;97;567;141;600;161 35 | 0;ocrx_word;word_1_7_34;word_1_7;3;100;600;139;616;161 36 | 0;ocrx_word;word_1_8_35;word_1_8;Line;100;39;191;79;212 37 | 0;ocrx_word;word_1_8_36;word_1_8;3;100;80;191;96;212 38 | 0;ocrx_word;word_1_8_37;word_1_8;-;100;96;197;109;210 39 | 0;ocrx_word;word_1_8_38;word_1_8;Col;100;108;191;140;212 40 | 0;ocrx_word;word_1_8_39;word_1_8;1;100;141;191;157;212 41 | 0;ocrx_word;word_1_9_40;word_1_9;Merged;100;328;191;400;215 42 | 0;ocrx_word;word_1_9_41;word_1_9;Cells;99;403;191;447;211 43 | 0;ocrx_word;word_1_10_42;word_1_10;Line;100;497;191;539;212 44 | 
0;ocrx_word;word_1_10_43;word_1_10;3;100;539;191;555;212 45 | 0;ocrx_word;word_1_10_44;word_1_10;-;100;555;197;568;210 46 | 0;ocrx_word;word_1_10_45;word_1_10;Col;100;567;191;600;212 47 | 0;ocrx_word;word_1_10_46;word_1_10;3;100;600;191;616;211 48 | 0;ocrx_word;word_1_11_47;word_1_11;Line;100;39;242;79;263 49 | 0;ocrx_word;word_1_11_48;word_1_11;4;100;80;242;96;262 50 | 0;ocrx_word;word_1_11_49;word_1_11;-;51;96;250;105;258 51 | 0;ocrx_word;word_1_11_50;word_1_11;Col;100;108;242;140;262 52 | 0;ocrx_word;word_1_11_51;word_1_11;1;100;141;242;156;263 53 | 0;ocrx_word;word_1_12_52;word_1_12;Line;100;497;242;539;263 54 | 0;ocrx_word;word_1_12_53;word_1_12;4;100;539;242;556;262 55 | 0;ocrx_word;word_1_12_54;word_1_12;=;53;555;250;563;258 56 | 0;ocrx_word;word_1_12_55;word_1_12;Col;100;567;242;600;262 57 | 0;ocrx_word;word_1_12_56;word_1_12;3;100;600;242;616;262 58 | 0;ocrx_word;word_1_13_57;word_1_13;Line;89;39;292;79;314 59 | 0;ocrx_word;word_1_13_58;word_1_13;5;100;80;292;96;312 60 | 0;ocrx_word;word_1_13_59;word_1_13;-;100;96;298;108;308 61 | 0;ocrx_word;word_1_13_60;word_1_13;Col;100;108;292;140;314 62 | 0;ocrx_word;word_1_13_61;word_1_13;1;100;141;292;157;314 63 | 0;ocrx_word;word_1_14_62;word_1_14;Line;89;279;292;319;314 64 | 0;ocrx_word;word_1_14_63;word_1_14;5;100;320;292;336;312 65 | 0;ocrx_word;word_1_14_64;word_1_14;-;99;336;299;344;307 66 | 0;ocrx_word;word_1_14_65;word_1_14;Col;100;348;292;380;314 67 | 0;ocrx_word;word_1_14_66;word_1_14;2;100;381;292;396;314 68 | 0;ocrx_word;word_1_15_67;word_1_15;Line;100;497;292;539;314 69 | 0;ocrx_word;word_1_15_68;word_1_15;5;100;540;292;555;312 70 | 0;ocrx_word;word_1_15_69;word_1_15;Col;100;567;292;600;314 71 | 0;ocrx_word;word_1_15_70;word_1_15;3;100;600;292;616;312 72 | -------------------------------------------------------------------------------- /tests/ocr/doctr/test_data/test.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/ocr/doctr/test_data/test.png -------------------------------------------------------------------------------- /tests/ocr/doctr/test_doctr.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import pickle 4 | import sys 5 | 6 | import polars as pl 7 | import pytest 8 | 9 | from img2table.document.image import Image 10 | from img2table.ocr import DocTR 11 | from img2table.ocr.data import OCRDataframe 12 | 13 | 14 | def format_content(content): 15 | output = { 16 | id_page: {id_line: [{"value": word.value, 17 | "confidence": round(word.confidence, 2), 18 | "geometry": word.geometry, 19 | } 20 | for word in line.words] 21 | for block in page.blocks for id_line, line in enumerate(block.lines) 22 | } 23 | for id_page, page in enumerate(content.pages) 24 | } 25 | 26 | return output 27 | 28 | 29 | @pytest.mark.skipif(sys.version_info >= (3, 12), reason="Error building with 3.12") 30 | @pytest.mark.skipif(sys.version_info < (3, 9), reason="Not supported anymore") 31 | def test_doctr_content(): 32 | instance = DocTR() 33 | doc = Image(src="test_data/test.png") 34 | 35 | result = instance.content(document=doc) 36 | 37 | with open("test_data/ocr.pkl", "rb") as f: 38 | expected = pickle.load(f) 39 | 40 | assert format_content(result) == format_content(expected) 41 | 42 | 43 | @pytest.mark.skipif(sys.version_info >= (3, 12), reason="Error building with 3.12") 44 | @pytest.mark.skipif(sys.version_info < (3, 9), reason="Not supported anymore") 45 | def test_doctr_ocr_df(): 46 | instance = DocTR() 47 | 48 | with open("test_data/ocr.pkl", "rb") as f: 49 | content = pickle.load(f) 50 | 51 | result = instance.to_ocr_dataframe(content=content) 52 | 53 | expected = OCRDataframe(df=pl.read_csv("test_data/ocr_df.csv", separator=";")) 54 | 55 | assert result == expected 56 | 57 | 58 | @pytest.mark.skipif(sys.version_info >= (3, 12), 
reason="Error building with 3.12") 59 | @pytest.mark.skipif(sys.version_info < (3, 9), reason="Not supported anymore") 60 | def test_doctr_document(): 61 | instance = DocTR() 62 | doc = Image(src="test_data/test.png") 63 | 64 | result = instance.of(document=doc) 65 | 66 | expected = OCRDataframe(df=pl.read_csv("test_data/ocr_df.csv", separator=";")) 67 | 68 | assert result.df.drop("confidence").equals(expected.df.drop("confidence")) 69 | -------------------------------------------------------------------------------- /tests/ocr/easyocr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/ocr/easyocr/__init__.py -------------------------------------------------------------------------------- /tests/ocr/easyocr/test_data/ocr.json: -------------------------------------------------------------------------------- 1 | [[[[[383, 37], [425, 37], [425, 57], [383, 57]], "Title", 0.9999929628110977], [[[962, 36], [1020, 36], [1020, 60], [962, 60]], "Test 1", 0.9936909608809552], [[[1058, 36], [1116, 36], [1116, 60], [1058, 60]], "Test 2", 0.8442916046434888], [[[39, 89], [97, 89], [97, 109], [39, 109]], "Line 1", 0.6476242997727373], [[[107, 89], [155, 89], [155, 109], [107, 109]], "Col 1", 0.9720920853320717], [[[279, 89], [395, 89], [395, 109], [279, 109]], "Line 1 - Col 2", 0.936260010219446], [[[497, 89], [615, 89], [615, 109], [497, 109]], "Line 1 - Col 3", 0.8419314287078425], [[[962, 86], [1020, 86], [1020, 110], [962, 110]], "Test 3", 0.5705159231431433], [[[1058, 86], [1116, 86], [1116, 110], [1058, 110]], "Test 4", 0.6396247875277982], [[[38, 138], [156, 138], [156, 162], [38, 162]], "Line 2 - Col 1", 0.8294489200341787], [[[496, 138], [616, 138], [616, 162], [496, 162]], "Line 2 - Col 3", 0.9406858946476558], [[[39, 191], [97, 191], [97, 211], [39, 211]], "Line 3", 0.9953379669670536], [[[107, 191], [155, 191], [155, 211], [107, 
211]], "Col 1", 0.9720920853320717], [[[327, 187], [448, 187], [448, 217], [327, 217]], "Merged Cells", 0.7461553269995271], [[[497, 191], [557, 191], [557, 211], [497, 211]], "Line 3", 0.9993883375538367], [[[567, 191], [615, 191], [615, 211], [567, 211]], "Col 3", 0.9479543226130566], [[[38, 240], [156, 240], [156, 264], [38, 264]], "Line 4 - Col 1", 0.7398924631614736], [[[496, 240], [556, 240], [556, 264], [496, 264]], "Line 4", 0.956354950430173], [[[566, 240], [616, 240], [616, 264], [566, 264]], "Col 3", 0.9885079303307873], [[[39, 293], [155, 293], [155, 313], [39, 313]], "Line 5 - Col 1", 0.9406334482299059], [[[279, 293], [395, 293], [395, 313], [279, 313]], "Line 5 - Col 2", 0.893512125952306], [[[497, 293], [615, 293], [615, 313], [497, 313]], "Line 5 - Col 3", 0.8414449973587579]]] -------------------------------------------------------------------------------- /tests/ocr/easyocr/test_data/ocr_df.csv: -------------------------------------------------------------------------------- 1 | page;class;id;parent;value;confidence;x1;y1;x2;y2 2 | 0;ocrx_word;word_1_1;word_1_1;Title;100;383;37;425;57 3 | 0;ocrx_word;word_1_2;word_1_2;Test 1;99;962;36;1020;60 4 | 0;ocrx_word;word_1_3;word_1_3;Test 2;84;1058;36;1116;60 5 | 0;ocrx_word;word_1_4;word_1_4;Line 1;65;39;89;97;109 6 | 0;ocrx_word;word_1_5;word_1_5;Col 1;97;107;89;155;109 7 | 0;ocrx_word;word_1_6;word_1_6;Line 1 - Col 2;94;279;89;395;109 8 | 0;ocrx_word;word_1_7;word_1_7;Line 1 - Col 3;84;497;89;615;109 9 | 0;ocrx_word;word_1_8;word_1_8;Test 3;57;962;86;1020;110 10 | 0;ocrx_word;word_1_9;word_1_9;Test 4;64;1058;86;1116;110 11 | 0;ocrx_word;word_1_10;word_1_10;Line 2 - Col 1;83;38;138;156;162 12 | 0;ocrx_word;word_1_11;word_1_11;Line 2 - Col 3;94;496;138;616;162 13 | 0;ocrx_word;word_1_12;word_1_12;Line 3;100;39;191;97;211 14 | 0;ocrx_word;word_1_13;word_1_13;Col 1;97;107;191;155;211 15 | 0;ocrx_word;word_1_14;word_1_14;Merged Cells;75;327;187;448;217 16 | 0;ocrx_word;word_1_15;word_1_15;Line 
3;100;497;191;557;211 17 | 0;ocrx_word;word_1_16;word_1_16;Col 3;95;567;191;615;211 18 | 0;ocrx_word;word_1_17;word_1_17;Line 4 - Col 1;74;38;240;156;264 19 | 0;ocrx_word;word_1_18;word_1_18;Line 4;96;496;240;556;264 20 | 0;ocrx_word;word_1_19;word_1_19;Col 3;99;566;240;616;264 21 | 0;ocrx_word;word_1_20;word_1_20;Line 5 - Col 1;94;39;293;155;313 22 | 0;ocrx_word;word_1_21;word_1_21;Line 5 - Col 2;89;279;293;395;313 23 | 0;ocrx_word;word_1_22;word_1_22;Line 5 - Col 3;84;497;293;615;313 24 | -------------------------------------------------------------------------------- /tests/ocr/easyocr/test_data/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/ocr/easyocr/test_data/test.png -------------------------------------------------------------------------------- /tests/ocr/easyocr/test_easyocr.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import json 4 | import sys 5 | from typing import Any 6 | 7 | import numpy as np 8 | import polars as pl 9 | import pytest 10 | 11 | from img2table.document.image import Image 12 | from img2table.ocr import EasyOCR 13 | from img2table.ocr.data import OCRDataframe 14 | 15 | 16 | def convert_np_types(obj: Any): 17 | if isinstance(obj, list): 18 | return [convert_np_types(element) for element in obj] 19 | elif isinstance(obj, dict): 20 | return {convert_np_types(k): convert_np_types(v) for k, v in obj.values()} 21 | elif isinstance(obj, tuple): 22 | return list(convert_np_types(element) for element in obj) 23 | elif isinstance(obj, np.int32): 24 | return int(obj) 25 | elif isinstance(obj, (np.float64, float)): 26 | return None 27 | else: 28 | return obj 29 | 30 | 31 | @pytest.mark.skipif(sys.version_info >= (3, 14), reason="Error building with 3.12") 32 | def test_validators(): 33 | with pytest.raises(TypeError) as e_info: 34 | ocr = 
EasyOCR(lang=12) 35 | 36 | 37 | @pytest.mark.skipif(sys.version_info >= (3, 14), reason="Error building with 3.12") 38 | def test_easyocr_content(): 39 | instance = EasyOCR() 40 | doc = Image(src="test_data/test.png") 41 | 42 | result = instance.content(document=doc) 43 | 44 | with open("test_data/ocr.json", "r") as f: 45 | expected = json.load(f) 46 | 47 | assert convert_np_types(result) == convert_np_types(expected) 48 | 49 | 50 | @pytest.mark.skipif(sys.version_info >= (3, 14), reason="Error building with 3.12") 51 | def test_easyocr_ocr_df(): 52 | instance = EasyOCR() 53 | 54 | with open("test_data/ocr.json", "r") as f: 55 | content = json.load(f) 56 | 57 | result = instance.to_ocr_dataframe(content=content) 58 | 59 | expected = OCRDataframe(df=pl.read_csv("test_data/ocr_df.csv", separator=";")) 60 | 61 | assert result == expected 62 | 63 | 64 | @pytest.mark.skipif(sys.version_info >= (3, 14), reason="Error building with 3.12") 65 | def test_easyocr_document(): 66 | instance = EasyOCR() 67 | doc = Image(src="test_data/test.png") 68 | 69 | result = instance.of(document=doc) 70 | 71 | expected = OCRDataframe(df=pl.read_csv("test_data/ocr_df.csv", separator=";")) 72 | 73 | assert result.df.drop("confidence").equals(expected.df.drop("confidence")) 74 | -------------------------------------------------------------------------------- /tests/ocr/google_vision/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/ocr/google_vision/__init__.py -------------------------------------------------------------------------------- /tests/ocr/google_vision/test_data/ocr_df.csv: -------------------------------------------------------------------------------- 1 | page;class;id;parent;value;confidence;x1;y1;x2;y2 2 | 0;ocrx_word;word_0_0_0_0;line_0_0_0;Line;99;41;90;74;105 3 | 0;ocrx_word;word_0_0_0_1;line_0_0_0;1;98;83;90;90;104 4 | 
0;ocrx_word;word_0_0_0_2;line_0_0_0;-;76;97;90;103;104 5 | 0;ocrx_word;word_0_0_0_3;line_0_0_0;Col;98;110;90;136;104 6 | 0;ocrx_word;word_0_0_0_4;line_0_0_0;1;98;142;90;151;104 7 | 0;ocrx_word;word_1_0_0_0;line_1_0_0;Line;99;40;142;75;157 8 | 0;ocrx_word;word_1_0_0_1;line_1_0_0;2;98;83;142;91;157 9 | 0;ocrx_word;word_1_0_0_2;line_1_0_0;-;85;97;142;104;157 10 | 0;ocrx_word;word_1_0_0_3;line_1_0_0;Col;97;110;142;137;157 11 | 0;ocrx_word;word_1_0_0_4;line_1_0_0;1;99;144;142;152;157 12 | 0;ocrx_word;word_2_0_0_0;line_2_0_0;Line;99;41;193;75;207 13 | 0;ocrx_word;word_2_0_0_1;line_2_0_0;3;97;82;193;91;207 14 | 0;ocrx_word;word_2_0_0_2;line_2_0_0;-;79;97;193;105;207 15 | 0;ocrx_word;word_2_0_0_3;line_2_0_0;Col;98;110;193;137;207 16 | 0;ocrx_word;word_2_0_0_4;line_2_0_0;1;99;144;193;152;207 17 | 0;ocrx_word;word_3_0_0_0;line_3_0_0;Line;99;41;243;75;258 18 | 0;ocrx_word;word_3_0_0_1;line_3_0_0;4;98;82;243;91;258 19 | 0;ocrx_word;word_3_0_0_2;line_3_0_0;Col;99;110;243;137;258 20 | 0;ocrx_word;word_3_0_0_3;line_3_0_0;1;98;143;243;152;258 21 | 0;ocrx_word;word_4_0_0_0;line_4_0_0;Line;99;40;295;75;309 22 | 0;ocrx_word;word_4_0_0_1;line_4_0_0;5;98;82;295;90;309 23 | 0;ocrx_word;word_4_0_0_2;line_4_0_0;-;78;97;295;104;309 24 | 0;ocrx_word;word_4_0_0_3;line_4_0_0;Col;97;109;295;136;309 25 | 0;ocrx_word;word_4_0_0_4;line_4_0_0;1;98;142;295;152;309 26 | 0;ocrx_word;word_5_0_0_0;line_5_0_0;Title;99;383;39;423;55 27 | 0;ocrx_word;word_6_0_0_0;line_6_0_0;Line;99;281;91;316;105 28 | 0;ocrx_word;word_6_0_0_1;line_6_0_0;1;98;323;91;330;105 29 | 0;ocrx_word;word_6_0_0_2;line_6_0_0;-;76;337;91;344;105 30 | 0;ocrx_word;word_6_0_0_3;line_6_0_0;Col;91;350;91;378;105 31 | 0;ocrx_word;word_6_0_0_4;line_6_0_0;2;98;383;91;392;105 32 | 0;ocrx_word;word_7_0_0_0;line_7_0_0;Merged;99;329;191;395;211 33 | 0;ocrx_word;word_7_0_0_1;line_7_0_0;Cells;97;402;191;442;209 34 | 0;ocrx_word;word_8_0_0_0;line_8_0_0;Line;98;280;295;316;309 35 | 0;ocrx_word;word_8_0_0_1;line_8_0_0;5;97;323;295;332;309 36 | 
0;ocrx_word;word_8_0_0_2;line_8_0_0;-;79;338;295;345;309 37 | 0;ocrx_word;word_8_0_0_3;line_8_0_0;Col;89;349;295;378;309 38 | 0;ocrx_word;word_8_0_0_4;line_8_0_0;2;98;383;295;392;309 39 | 0;ocrx_word;word_9_0_0_0;line_9_0_0;Line;99;499;91;534;105 40 | 0;ocrx_word;word_9_0_0_1;line_9_0_0;1;98;542;91;549;105 41 | 0;ocrx_word;word_9_0_0_2;line_9_0_0;-;75;556;91;563;105 42 | 0;ocrx_word;word_9_0_0_3;line_9_0_0;Col;94;569;91;596;105 43 | 0;ocrx_word;word_9_0_0_4;line_9_0_0;3;98;601;91;613;105 44 | 0;ocrx_word;word_10_0_0_0;line_10_0_0;Line;99;499;142;534;156 45 | 0;ocrx_word;word_10_0_0_1;line_10_0_0;2;98;541;142;550;156 46 | 0;ocrx_word;word_10_0_0_2;line_10_0_0;-;79;556;142;563;156 47 | 0;ocrx_word;word_10_0_0_3;line_10_0_0;Col;90;569;142;596;156 48 | 0;ocrx_word;word_10_0_0_4;line_10_0_0;3;98;603;142;612;156 49 | 0;ocrx_word;word_11_0_0_0;line_11_0_0;Line;99;500;193;534;207 50 | 0;ocrx_word;word_11_0_0_1;line_11_0_0;3;98;541;193;550;207 51 | 0;ocrx_word;word_11_0_0_2;line_11_0_0;-;79;556;193;564;207 52 | 0;ocrx_word;word_11_0_0_3;line_11_0_0;Col;92;569;193;597;207 53 | 0;ocrx_word;word_11_0_0_4;line_11_0_0;3;98;602;193;612;207 54 | 0;ocrx_word;word_12_0_0_0;line_12_0_0;Line;99;499;243;534;258 55 | 0;ocrx_word;word_12_0_0_1;line_12_0_0;4;97;541;243;550;258 56 | 0;ocrx_word;word_12_0_0_2;line_12_0_0;-;70;556;243;562;258 57 | 0;ocrx_word;word_12_0_0_3;line_12_0_0;Col;98;569;243;595;258 58 | 0;ocrx_word;word_12_0_0_4;line_12_0_0;3;98;600;244;611;259 59 | 0;ocrx_word;word_12_1_0_0;line_12_1_0;Line;99;500;294;534;309 60 | 0;ocrx_word;word_12_1_0_1;line_12_1_0;5;97;542;294;551;309 61 | 0;ocrx_word;word_12_1_0_2;line_12_1_0;-;72;556;294;563;309 62 | 0;ocrx_word;word_12_1_0_3;line_12_1_0;Col;97;569;294;597;309 63 | 0;ocrx_word;word_12_1_0_4;line_12_1_0;3;98;601;294;613;309 64 | 0;ocrx_word;word_13_0_0_0;line_13_0_0;Test;98;965;41;1001;54 65 | 0;ocrx_word;word_13_0_0_1;line_13_0_0;1;98;1006;41;1015;54 66 | 0;ocrx_word;word_14_0_0_0;line_14_0_0;Test;99;965;91;1001;106 67 | 
0;ocrx_word;word_14_0_0_1;line_14_0_0;3;99;1006;91;1016;106 68 | 0;ocrx_word;word_15_0_0_0;line_15_0_0;Test;98;1061;40;1095;54 69 | 0;ocrx_word;word_15_0_0_1;line_15_0_0;2;98;1101;40;1110;53 70 | 0;ocrx_word;word_16_0_0_0;line_16_0_0;Test;95;1061;91;1096;104 71 | 0;ocrx_word;word_16_0_0_1;line_16_0_0;4;98;1102;91;1112;104 72 | -------------------------------------------------------------------------------- /tests/ocr/google_vision/test_data/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/ocr/google_vision/test_data/test.png -------------------------------------------------------------------------------- /tests/ocr/google_vision/test_google_vision.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import json 3 | import os 4 | import pickle 5 | 6 | import polars as pl 7 | import pytest 8 | 9 | from img2table.document import Image 10 | from img2table.ocr.data import OCRDataframe 11 | from img2table.ocr.google_vision import VisionEndpointContent, VisionAPIContent, VisionOCR 12 | from tests import MOCK_DIR 13 | 14 | 15 | def test_vision_endpoint_content(mock_vision): 16 | image = Image("test_data/test.png") 17 | content = VisionEndpointContent(api_key="api_key", timeout=10) 18 | 19 | with open("test_data/expected_content.json", "r") as f: 20 | expected = json.load(f) 21 | 22 | # Test for map_response method 23 | with open(os.path.join(MOCK_DIR, "vision.json"), "r") as f: 24 | response = json.load(f) 25 | 26 | result_map_response = content.map_response(response=response, page=0, height=417, width=1365) 27 | assert result_map_response == expected[0] 28 | 29 | # Test for get_content method 30 | result_get_content = content.get_content(document=image) 31 | assert result_get_content == expected 32 | 33 | 34 | def test_vision_api_content(mock_vision): 35 | image = 
Image("test_data/test.png") 36 | content = VisionAPIContent(timeout=10) 37 | 38 | with open("test_data/expected_content.json", "r") as f: 39 | expected = json.load(f) 40 | 41 | # Test for map_response method 42 | with open(os.path.join(MOCK_DIR, "vision.pkl"), "rb") as f: 43 | response = pickle.load(f) 44 | 45 | result_map_response = content.map_response(response=response, shapes=[(417, 1365)]) 46 | assert result_map_response == expected 47 | 48 | # Test for get_content method 49 | result_get_content = content.get_content(document=image) 50 | assert result_get_content == expected 51 | 52 | 53 | def test_vision_ocr(mock_vision): 54 | image = Image("test_data/test.png") 55 | 56 | with open("test_data/expected_content.json", "r") as f: 57 | content = json.load(f) 58 | 59 | expected_ocr_df = OCRDataframe(df=pl.read_csv("test_data/ocr_df.csv", separator=";")) 60 | 61 | # Test init error 62 | with pytest.raises(TypeError) as e_info: 63 | VisionOCR(api_key=8) 64 | 65 | with pytest.raises(ValueError) as e_info: 66 | VisionOCR() 67 | 68 | # Test with api_key 69 | ocr_key = VisionOCR(timeout=10, api_key="api_key") 70 | 71 | result_to_ocr_df = ocr_key.to_ocr_dataframe(content=content) 72 | assert result_to_ocr_df == expected_ocr_df 73 | 74 | result_ocr_df = ocr_key.of(document=image) 75 | assert result_ocr_df == expected_ocr_df 76 | 77 | # Test with credentials 78 | os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "creds" 79 | ocr_creds = VisionOCR(timeout=10) 80 | 81 | result_to_ocr_df = ocr_creds.to_ocr_dataframe(content=content) 82 | assert result_to_ocr_df == expected_ocr_df 83 | 84 | result_ocr_df = ocr_creds.of(document=image) 85 | assert result_ocr_df == expected_ocr_df 86 | -------------------------------------------------------------------------------- /tests/ocr/paddle/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/ocr/paddle/__init__.py -------------------------------------------------------------------------------- /tests/ocr/paddle/test_data/hocr.json: -------------------------------------------------------------------------------- 1 | [[[[381.0, 33.0], [426.0, 37.0], [424.0, 60.0], [379.0, 56.0]], ["Title", 1.0]], [[[964.0, 38.0], [1019.0, 38.0], [1019.0, 58.0], [964.0, 58.0]], ["Test 1", 0.96]], [[[1061.0, 36.0], [1115.0, 36.0], [1115.0, 58.0], [1061.0, 58.0]], ["Test 2", 1.0]], [[[40.0, 88.0], [155.0, 88.0], [155.0, 109.0], [40.0, 109.0]], ["Line 1 - Col 1", 0.93]], [[[282.0, 88.0], [395.0, 88.0], [395.0, 109.0], [282.0, 109.0]], ["Line 1 -Col 2", 0.94]], [[[499.0, 88.0], [614.0, 88.0], [614.0, 109.0], [499.0, 109.0]], ["Line 1-Col 3", 0.94]], [[[962.0, 90.0], [1019.0, 86.0], [1020.0, 108.0], [964.0, 112.0]], ["Test 3", 1.0]], [[[1061.0, 88.0], [1116.0, 88.0], [1116.0, 110.0], [1061.0, 110.0]], ["Test 4", 1.0]], [[[41.0, 140.0], [155.0, 140.0], [155.0, 161.0], [41.0, 161.0]], ["Line 2-Col 1", 0.92]], [[[499.0, 139.0], [614.0, 139.0], [614.0, 159.0], [499.0, 159.0]], ["Line2-Col3", 0.92]], [[[40.0, 191.0], [155.0, 191.0], [155.0, 211.0], [40.0, 211.0]], ["Line 3-Col 1", 0.93]], [[[327.0, 188.0], [446.0, 188.0], [446.0, 213.0], [327.0, 213.0]], ["Merged Cells", 0.99]], [[[499.0, 191.0], [616.0, 191.0], [616.0, 211.0], [499.0, 211.0]], ["Line 3-Col 3", 0.88]], [[[40.0, 242.0], [155.0, 242.0], [155.0, 262.0], [40.0, 262.0]], ["Line4-Col 1", 0.91]], [[[498.0, 242.0], [616.0, 242.0], [616.0, 262.0], [498.0, 262.0]], ["Line 4-Col 3", 0.88]], [[[40.0, 293.0], [155.0, 289.0], [155.0, 311.0], [40.0, 314.0]], ["Line 5 - Col 1", 0.98]], [[[282.0, 292.0], [395.0, 292.0], [395.0, 313.0], [282.0, 313.0]], ["Line 5-Col 2", 0.89]], [[[498.0, 292.0], [614.0, 292.0], [614.0, 313.0], [498.0, 313.0]], ["Line 5-Col 3", 0.91]]] 
-------------------------------------------------------------------------------- /tests/ocr/paddle/test_data/ocr_df.csv: -------------------------------------------------------------------------------- 1 | page;class;id;parent;value;confidence;x1;y1;x2;y2 2 | 0;ocrx_word;word_1_1;word_1_1;Title;100;379;33;426;60 3 | 0;ocrx_word;word_1_2;word_1_2;Test 1;96;964;38;1019;58 4 | 0;ocrx_word;word_1_3;word_1_3;Test 2;100;1061;36;1115;58 5 | 0;ocrx_word;word_1_4;word_1_4;Line 1 - Col 1;93;40;88;155;109 6 | 0;ocrx_word;word_1_5;word_1_5;Line 1 -Col 2;94;282;88;395;109 7 | 0;ocrx_word;word_1_6;word_1_6;Line 1-Col 3;94;499;88;614;109 8 | 0;ocrx_word;word_1_7;word_1_7;Test 3;100;962;86;1020;112 9 | 0;ocrx_word;word_1_8;word_1_8;Test 4;100;1061;88;1116;110 10 | 0;ocrx_word;word_1_9;word_1_9;Line 2-Col 1;92;41;140;155;161 11 | 0;ocrx_word;word_1_10;word_1_10;Line2-Col3;92;499;139;614;159 12 | 0;ocrx_word;word_1_11;word_1_11;Line 3-Col 1;93;40;191;155;211 13 | 0;ocrx_word;word_1_12;word_1_12;Merged Cells;99;327;188;446;213 14 | 0;ocrx_word;word_1_13;word_1_13;Line 3-Col 3;88;499;191;616;211 15 | 0;ocrx_word;word_1_14;word_1_14;Line4-Col 1;91;40;242;155;262 16 | 0;ocrx_word;word_1_15;word_1_15;Line 4-Col 3;88;498;242;616;262 17 | 0;ocrx_word;word_1_16;word_1_16;Line 5 - Col 1;98;40;289;155;314 18 | 0;ocrx_word;word_1_17;word_1_17;Line 5-Col 2;89;282;292;395;313 19 | 0;ocrx_word;word_1_18;word_1_18;Line 5-Col 3;91;498;292;614;313 20 | -------------------------------------------------------------------------------- /tests/ocr/paddle/test_data/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/ocr/paddle/test_data/test.png -------------------------------------------------------------------------------- /tests/ocr/paddle/test_paddle.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | 
import json 4 | import sys 5 | 6 | import cv2 7 | import polars as pl 8 | import pytest 9 | 10 | from img2table.document.image import Image 11 | from img2table.ocr.data import OCRDataframe 12 | 13 | 14 | @pytest.mark.skipif(sys.version_info >= (3, 13), reason="Error building with 3.12") 15 | def test_validators(): 16 | from img2table.ocr import PaddleOCR 17 | 18 | with pytest.raises(TypeError) as e_info: 19 | ocr = PaddleOCR(lang=12) 20 | 21 | 22 | @pytest.mark.skipif(sys.version_info >= (3, 13), reason="Error building with 3.12") 23 | def test_paddle_hocr(): 24 | from img2table.ocr import PaddleOCR 25 | 26 | instance = PaddleOCR() 27 | img = cv2.cvtColor(cv2.imread("test_data/test.png"), cv2.COLOR_BGR2RGB) 28 | 29 | result = instance.hocr(image=img) 30 | 31 | with open("test_data/hocr.json", "r") as f: 32 | expected = [[element[0], tuple(element[1])] for element in json.load(f)] 33 | 34 | assert result == expected 35 | 36 | 37 | @pytest.mark.skipif(sys.version_info >= (3, 13), reason="Error building with 3.12") 38 | def test_paddle_content(): 39 | from img2table.ocr import PaddleOCR 40 | 41 | instance = PaddleOCR() 42 | doc = Image(src="test_data/test.png") 43 | 44 | result = instance.content(document=doc) 45 | 46 | with open("test_data/hocr.json", "r") as f: 47 | expected = [[[element[0], tuple(element[1])] for element in json.load(f)]] 48 | 49 | assert result == expected 50 | 51 | 52 | @pytest.mark.skipif(sys.version_info >= (3, 13), reason="Error building with 3.12") 53 | def test_paddle_ocr_df(): 54 | from img2table.ocr import PaddleOCR 55 | 56 | instance = PaddleOCR() 57 | 58 | with open("test_data/hocr.json", "r") as f: 59 | content = [[[element[0], tuple(element[1])] for element in json.load(f)]] 60 | 61 | result = instance.to_ocr_dataframe(content=content) 62 | 63 | expected = OCRDataframe(df=pl.read_csv("test_data/ocr_df.csv", separator=";")) 64 | 65 | assert result == expected 66 | 67 | 68 | @pytest.mark.skipif(sys.version_info >= (3, 13), reason="Error 
building with 3.12") 69 | def test_paddle_document(): 70 | from img2table.ocr import PaddleOCR 71 | 72 | instance = PaddleOCR() 73 | doc = Image(src="test_data/test.png") 74 | 75 | result = instance.of(document=doc) 76 | 77 | expected = OCRDataframe(df=pl.read_csv("test_data/ocr_df.csv", separator=";")) 78 | 79 | assert result == expected 80 | -------------------------------------------------------------------------------- /tests/ocr/pdf/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/ocr/pdf/__init__.py -------------------------------------------------------------------------------- /tests/ocr/pdf/test_data/test.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/ocr/pdf/test_data/test.pdf -------------------------------------------------------------------------------- /tests/ocr/pdf/test_pdf_ocr.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import json 3 | 4 | import polars as pl 5 | 6 | from img2table.document.pdf import PDF 7 | from img2table.ocr.data import OCRDataframe 8 | from img2table.ocr.pdf import PdfOCR 9 | 10 | 11 | def test_pdf_content(mock_tesseract): 12 | instance = PdfOCR() 13 | doc = PDF(src="test_data/test.pdf", pages=[0, 1]) 14 | 15 | result = instance.content(document=doc) 16 | 17 | with open("test_data/content.json", "r") as f: 18 | expected = json.load(f) 19 | 20 | assert result == expected 21 | 22 | 23 | def test_pdf_ocr_df(): 24 | instance = PdfOCR() 25 | 26 | with open("test_data/content.json", "r") as f: 27 | content = json.load(f) 28 | 29 | result = instance.to_ocr_dataframe(content=content) 30 | 31 | df_expected = pl.read_csv("test_data/ocr_df.csv", separator=";") 32 | expected = 
OCRDataframe(df=df_expected) 33 | 34 | assert result == expected 35 | 36 | 37 | def test_pdf_document(): 38 | instance = PdfOCR() 39 | doc = PDF(src="test_data/test.pdf", pages=[0, 1]) 40 | 41 | result = instance.of(document=doc) 42 | 43 | df_expected = pl.read_csv("test_data/ocr_df.csv", separator=";") 44 | expected = OCRDataframe(df=df_expected) 45 | 46 | assert result == expected 47 | -------------------------------------------------------------------------------- /tests/ocr/surya/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/ocr/surya/__init__.py -------------------------------------------------------------------------------- /tests/ocr/surya/test_data/ocr_df.csv: -------------------------------------------------------------------------------- 1 | page;class;id;parent;value;confidence;x1;y1;x2;y2 2 | 0;ocrx_word;word_1_1_0;word_1_1;20 PAYS LES PLUS PEUPLÉS (EN TEMPS RÉEL);99;206;33;862;64 3 | 0;ocrx_word;word_1_2_0;word_1_2;k;57;94;132;118;149 4 | 0;ocrx_word;word_1_3_0;word_1_3;Chine;93;163;132;229;153 5 | 0;ocrx_word;word_1_4_0;word_1_4;Japon;97;675;132;741;153 6 | 0;ocrx_word;word_1_5_0;word_1_5;1 448 949 124;98;360;133;504;153 7 | 0;ocrx_word;word_1_6_0;word_1_6;11;93;563;133;587;151 8 | 0;ocrx_word;word_1_7_0;word_1_7;125 815 814;94;856;133;979;153 9 | 0;ocrx_word;word_1_8_0;word_1_8;1;93;60;134;74;150 10 | 0;ocrx_word;word_1_9_0;word_1_9;Ethiopie;98;675;193;768;216 11 | 0;ocrx_word;word_1_10_0;word_1_10;Inde;97;161;194;214;215 12 | 0;ocrx_word;word_1_11_0;word_1_11;1 403 805 173;98;361;194;505;216 13 | 0;ocrx_word;word_1_12_0;word_1_12;120 125 587;93;856;194;979;215 14 | 0;ocrx_word;word_1_13_0;word_1_13;2;81;57;195;76;215 15 | 0;ocrx_word;word_1_14_0;word_1_14;12 ;88;563;195;595;215 16 | 0;ocrx_word;word_1_15_0;word_1_15;138;23;599;196;651;214 17 | 0;ocrx_word;word_1_16_0;word_1_16;(a);45;102;197;126;211 18 | 
0;ocrx_word;word_1_17_0;word_1_17;États-Unis;98;162;255;275;278 19 | 0;ocrx_word;word_1_18_0;word_1_18;Philippines;99;675;257;796;279 20 | 0;ocrx_word;word_1_19_0;word_1_19;3;87;59;258;75;278 21 | 0;ocrx_word;word_1_20_0;word_1_20;334 378 540;94;379;258;504;278 22 | 0;ocrx_word;word_1_21_0;word_1_21;112 159 001;94;857;258;979;278 23 | 0;ocrx_word;word_1_22_0;word_1_22;13;93;562;259;591;278 24 | 0;ocrx_word;word_1_23_0;word_1_23;A;40;603;259;629;275 25 | 0;ocrx_word;word_1_24_0;word_1_24;Égypte;95;672;315;755;343 26 | 0;ocrx_word;word_1_25_0;word_1_25;Indonésie;98;163;319;268;340 27 | 0;ocrx_word;word_1_26_0;word_1_26;105 787 594;91;856;319;980;341 28 | 0;ocrx_word;word_1_27_0;word_1_27;278 605 610;96;379;320;505;340 29 | 0;ocrx_word;word_1_28_0;word_1_28;14;96;564;320;588;340 30 | 0;ocrx_word;word_1_29_0;word_1_29;4;93;59;322;74;340 31 | 0;ocrx_word;word_1_30_0;word_1_30;Brésil;94;161;382;227;404 32 | 0;ocrx_word;word_1_31_0;word_1_31;5;83;60;383;75;402 33 | 0;ocrx_word;word_1_32_0;word_1_32;215 194 443;98;379;383;504;404 34 | 0;ocrx_word;word_1_33_0;word_1_33;l ર;73;562;383;592;402 35 | 0;ocrx_word;word_1_34_0;word_1_34;★;84;596;383;651;401 36 | 0;ocrx_word;word_1_35_0;word_1_35;Vietnam;98;676;383;768;403 37 | 0;ocrx_word;word_1_36_0;word_1_36;98 872 186;89;867;383;980;404 38 | 0;ocrx_word;word_1_37_0;word_1_37;94 560 352;91;867;443;981;467 39 | 0;ocrx_word;word_1_38_0;word_1_38;l C;61;91;444;142;463 40 | 0;ocrx_word;word_1_39_0;word_1_39;Pakistan;97;161;444;255;465 41 | 0;ocrx_word;word_1_40_0;word_1_40;Congo;97;674;444;750;466 42 | 0;ocrx_word;word_1_41_0;word_1_41;6;87;58;445;77;464 43 | 0;ocrx_word;word_1_42_0;word_1_42;228 575 462;96;379;445;504;465 44 | 0;ocrx_word;word_1_43_0;word_1_43;16;91;563;445;593;465 45 | 0;ocrx_word;word_1_44_0;word_1_44;"";26;599;445;652;462 46 | 0;ocrx_word;word_1_45_0;word_1_45;Allemagne;98;677;506;794;529 47 | 0;ocrx_word;word_1_46_0;word_1_46;7 ;91;57;507;141;528 48 | 0;ocrx_word;word_1_47_0;word_1_47;Nigeria;97;162;507;244;529 
49 | 0;ocrx_word;word_1_48_0;word_1_48;215 401 411;92;380;508;503;528 50 | 0;ocrx_word;word_1_49_0;word_1_49;17;88;563;508;592;527 51 | 0;ocrx_word;word_1_50_0;word_1_50;84 247 226;90;866;508;980;528 52 | 0;ocrx_word;word_1_51_0;word_1_51;Bangladesh;99;162;569;286;591 53 | 0;ocrx_word;word_1_52_0;word_1_52;167 568 471;94;379;570;503;590 54 | 0;ocrx_word;word_1_53_0;word_1_53;personal program;47;604;570;653;590 55 | 0;ocrx_word;word_1_54_0;word_1_54;Iran;97;676;570;722;591 56 | 0;ocrx_word;word_1_55_0;word_1_55;85 891 049;92;867;570;979;591 57 | 0;ocrx_word;word_1_56_0;word_1_56;8;79;58;571;76;589 58 | 0;ocrx_word;word_1_57_0;word_1_57;18;93;563;571;591;590 59 | 0;ocrx_word;word_1_58_0;word_1_58;o;72;100;572;137;587 60 | 0;ocrx_word;word_1_59_0;word_1_59;Russie;96;163;632;237;653 61 | 0;ocrx_word;word_1_60_0;word_1_60;(+;59;604;632;647;652 62 | 0;ocrx_word;word_1_61_0;word_1_61;146 042 034;98;380;633;505;654 63 | 0;ocrx_word;word_1_62_0;word_1_62;Turquie;97;677;633;762;654 64 | 0;ocrx_word;word_1_63_0;word_1_63;85 934 290;93;866;633;980;654 65 | 0;ocrx_word;word_1_64_0;word_1_64;9;89;59;634;75;651 66 | 0;ocrx_word;word_1_65_0;word_1_65;19;96;562;634;590;653 67 | 0;ocrx_word;word_1_66_0;word_1_66;Thaïlande;95;678;693;783;716 68 | 0;ocrx_word;word_1_67_0;word_1_67;Mexique;97;161;694;258;716 69 | 0;ocrx_word;word_1_68_0;word_1_68;70 102 414;90;867;694;980;718 70 | 0;ocrx_word;word_1_69_0;word_1_69;131 312 546;95;379;695;505;716 71 | 0;ocrx_word;word_1_70_0;word_1_70;10;93;48;696;75;714 72 | 0;ocrx_word;word_1_71_0;word_1_71;19;19;101;697;125;713 73 | 0;ocrx_word;word_1_72_0;word_1_72;20;91;562;697;589;715 74 | -------------------------------------------------------------------------------- /tests/ocr/surya/test_data/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/ocr/surya/test_data/test.png 
-------------------------------------------------------------------------------- /tests/ocr/surya/test_surya.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import os 3 | import pickle 4 | import sys 5 | 6 | import polars as pl 7 | import pytest 8 | 9 | from img2table.document import Image 10 | from img2table.ocr import SuryaOCR 11 | from img2table.ocr.data import OCRDataframe 12 | from tests import MOCK_DIR 13 | 14 | 15 | @pytest.mark.skipif(sys.version_info < (3, 10), reason="Library not available") 16 | def test_content(mock_surya): 17 | img = Image("test_data/test.png") 18 | ocr = SuryaOCR(langs=["en"]) 19 | 20 | result = ocr.content(document=img) 21 | 22 | with open(os.path.join(MOCK_DIR, "surya.pkl"), "rb") as f: 23 | expected = pickle.load(f) 24 | 25 | assert result == expected 26 | 27 | 28 | @pytest.mark.skipif(sys.version_info < (3, 10), reason="Library not available") 29 | def test_to_ocr_df(): 30 | ocr = SuryaOCR(langs=["en"]) 31 | with open(os.path.join(MOCK_DIR, "surya.pkl"), "rb") as f: 32 | content = pickle.load(f) 33 | 34 | result = ocr.to_ocr_dataframe(content=content) 35 | 36 | expected = OCRDataframe(df=pl.read_csv("test_data/ocr_df.csv", separator=";")) 37 | 38 | assert result == expected 39 | 40 | 41 | @pytest.mark.skipif(sys.version_info < (3, 10), reason="Library not available") 42 | def test_surya_ocr(mock_surya): 43 | # Test init error 44 | with pytest.raises(TypeError) as e_info: 45 | SuryaOCR(langs=1) 46 | 47 | with pytest.raises(TypeError) as e_info: 48 | SuryaOCR(langs=[1, 2]) 49 | 50 | img = Image("test_data/test.png") 51 | ocr = SuryaOCR(langs=["en"]) 52 | 53 | result = ocr.of(document=img) 54 | 55 | expected = OCRDataframe(df=pl.read_csv("test_data/ocr_df.csv", separator=";")) 56 | 57 | assert result == expected 58 | -------------------------------------------------------------------------------- /tests/ocr/tesseract/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/ocr/tesseract/__init__.py -------------------------------------------------------------------------------- /tests/ocr/tesseract/test_data/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/ocr/tesseract/test_data/test.png -------------------------------------------------------------------------------- /tests/ocr/tesseract/test_tesseract.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import os 3 | 4 | import cv2 5 | import polars as pl 6 | import pytest 7 | 8 | from img2table.document.image import Image 9 | from img2table.ocr import TesseractOCR 10 | from img2table.ocr.data import OCRDataframe 11 | from tests import MOCK_DIR, TESSERACT_INSTALL 12 | 13 | 14 | def test_validators(): 15 | with pytest.raises(TypeError) as e_info: 16 | ocr = TesseractOCR(n_threads=[8]) 17 | 18 | with pytest.raises(TypeError) as e_info: 19 | ocr = TesseractOCR(lang=12) 20 | 21 | with pytest.raises(TypeError) as e_info: 22 | ocr = TesseractOCR(psm="r") 23 | 24 | 25 | @pytest.mark.skipif(TESSERACT_INSTALL, reason="Tesseract installed locally") 26 | def test_installed(): 27 | with pytest.raises(EnvironmentError) as e_info: 28 | ocr = TesseractOCR() 29 | 30 | 31 | def test_lang_validators(mock_tesseract): 32 | with pytest.raises(EnvironmentError) as e_info: 33 | ocr = TesseractOCR(lang="zzz") 34 | 35 | 36 | def test_tesseract_hocr(mock_tesseract): 37 | instance = TesseractOCR() 38 | img = cv2.imread("test_data/test.png", cv2.IMREAD_GRAYSCALE) 39 | 40 | result = instance.hocr(image=img) 41 | 42 | with open(os.path.join(MOCK_DIR, "tesseract_hocr.html"), 'r') as f: 43 | assert result == f.read() 44 | 45 | 46 | def 
test_tesseract_content(mock_tesseract): 47 | instance = TesseractOCR() 48 | doc = Image(src="test_data/test.png") 49 | 50 | result = instance.content(document=doc) 51 | 52 | with open(os.path.join(MOCK_DIR, "tesseract_hocr.html"), 'r') as f: 53 | assert list(result) == [f.read()] 54 | 55 | 56 | def test_tesseract_ocr_df(mock_tesseract): 57 | instance = TesseractOCR() 58 | 59 | with open(os.path.join(MOCK_DIR, "tesseract_hocr.html"), 'r') as f: 60 | content = [f.read()] 61 | 62 | result = instance.to_ocr_dataframe(content=content) 63 | 64 | expected = OCRDataframe(df=pl.read_csv("test_data/ocr_df.csv", separator=";")) 65 | 66 | assert result == expected 67 | 68 | 69 | def test_tesseract_document(mock_tesseract): 70 | instance = TesseractOCR() 71 | doc = Image(src="test_data/test.png") 72 | 73 | result = instance.of(document=doc) 74 | 75 | expected = OCRDataframe(df=pl.read_csv("test_data/ocr_df.csv", separator=";")) 76 | 77 | assert result == expected 78 | -------------------------------------------------------------------------------- /tests/tables/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/__init__.py -------------------------------------------------------------------------------- /tests/tables/image/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/image/__init__.py -------------------------------------------------------------------------------- /tests/tables/image/test_data/blank.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/image/test_data/blank.png 
-------------------------------------------------------------------------------- /tests/tables/image/test_data/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/image/test_data/test.png -------------------------------------------------------------------------------- /tests/tables/image/test_image.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import cv2 4 | 5 | from img2table.tables.image import TableImage 6 | 7 | 8 | def test_table_image(): 9 | image = cv2.cvtColor(cv2.imread("test_data/test.png"), cv2.COLOR_BGR2RGB) 10 | 11 | tb_image = TableImage(img=image, 12 | min_confidence=50) 13 | 14 | result = tb_image.extract_tables(implicit_rows=True) 15 | result = sorted(result, key=lambda tb: tb.x1 + tb.x2) 16 | 17 | assert (result[0].x1, result[0].y1, result[0].x2, result[0].y2) == (36, 21, 770, 327) 18 | assert (result[0].nb_rows, result[0].nb_columns) == (6, 3) 19 | 20 | assert (result[1].x1, result[1].y1, result[1].x2, result[1].y2) == (962, 21, 1154, 123) 21 | assert (result[1].nb_rows, result[1].nb_columns) == (2, 2) 22 | -------------------------------------------------------------------------------- /tests/tables/image/test_metrics.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import cv2 3 | 4 | from img2table.tables import threshold_dark_areas 5 | from img2table.tables.metrics import compute_char_length, compute_median_line_sep, compute_img_metrics 6 | 7 | 8 | def test_compute_char_length(): 9 | image = cv2.cvtColor(cv2.imread("test_data/test.png"), cv2.COLOR_BGR2RGB) 10 | thresh = threshold_dark_areas(img=image, char_length=11) 11 | 12 | char_length, thresh_chars, chars_array = compute_char_length(thresh=thresh) 13 | assert round(char_length, 2) == 9.0 14 | assert thresh_chars.shape == 
(417, 1365) 15 | 16 | image = 255 - cv2.cvtColor(cv2.imread("test_data/blank.png"), cv2.COLOR_BGR2GRAY) 17 | assert compute_char_length(thresh=image) == (None, None, None) 18 | 19 | 20 | def test_compute_median_line_sep(): 21 | image = cv2.cvtColor(cv2.imread("test_data/test.png"), cv2.COLOR_BGR2RGB) 22 | thresh = threshold_dark_areas(img=image, char_length=11) 23 | char_length, thresh_chars, chars_array = compute_char_length(thresh=thresh) 24 | 25 | median_line_sep, contours = compute_median_line_sep(thresh_chars=thresh_chars, 26 | chars_array=chars_array, 27 | char_length=char_length) 28 | 29 | assert round(median_line_sep, 2) == 51 30 | assert len(contours) == 71 31 | 32 | 33 | def test_compute_img_metrics(): 34 | image = cv2.cvtColor(cv2.imread("test_data/test.png"), cv2.COLOR_BGR2RGB) 35 | thresh = threshold_dark_areas(img=image, char_length=11) 36 | char_length, median_line_sep, contours = compute_img_metrics(thresh=thresh) 37 | 38 | assert round(char_length, 2) == 9.0 39 | assert round(median_line_sep, 2) == 51 40 | assert len(contours) == 71 41 | 42 | image = 255 - cv2.cvtColor(cv2.imread("test_data/blank.png"), cv2.COLOR_BGR2GRAY) 43 | assert compute_img_metrics(thresh=image) == (None, None, None) 44 | -------------------------------------------------------------------------------- /tests/tables/objects/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/objects/__init__.py -------------------------------------------------------------------------------- /tests/tables/objects/test_data/expected_tables.json: -------------------------------------------------------------------------------- 1 | [[[{"x1": 35, "y1": 20, "x2": 770, "y2": 71, "content": "Title"}, {"x1": 35, "y1": 20, "x2": 770, "y2": 71, "content": "Title"}, {"x1": 35, "y1": 20, "x2": 770, "y2": 71, "content": "Title"}], [{"x1": 35, "y1": 71, "x2": 276, 
"y2": 123, "content": "Line 1-Col 1"}, {"x1": 276, "y1": 71, "x2": 494, "y2": 123, "content": "Line Col 2"}, {"x1": 494, "y1": 71, "x2": 770, "y2": 123, "content": "Line 1-Col 3"}], [{"x1": 35, "y1": 123, "x2": 276, "y2": 173, "content": "Line Col 1"}, {"x1": 276, "y1": 123, "x2": 494, "y2": 275, "content": "Merged Cells"}, {"x1": 494, "y1": 123, "x2": 770, "y2": 173, "content": "Line 2-Col3"}], [{"x1": 35, "y1": 173, "x2": 276, "y2": 225, "content": "Line 3-Col1"}, {"x1": 276, "y1": 123, "x2": 494, "y2": 275, "content": "Merged Cells"}, {"x1": 494, "y1": 173, "x2": 770, "y2": 225, "content": "Line 3-Col3"}], [{"x1": 35, "y1": 225, "x2": 276, "y2": 275, "content": "Line 4-Col1"}, {"x1": 276, "y1": 123, "x2": 494, "y2": 275, "content": "Merged Cells"}, {"x1": 494, "y1": 225, "x2": 770, "y2": 275, "content": "Line 4-Col3"}], [{"x1": 35, "y1": 275, "x2": 276, "y2": 326, "content": "Line"}, {"x1": 276, "y1": 275, "x2": 494, "y2": 326, "content": "Line 5-Col2"}, {"x1": 494, "y1": 275, "x2": 770, "y2": 326, "content": "Line 3"}]], [[{"x1": 961, "y1": 21, "x2": 1058, "y2": 71, "content": "Test 1"}, {"x1": 1058, "y1": 21, "x2": 1154, "y2": 71, "content": "Test 2"}], [{"x1": 961, "y1": 71, "x2": 1058, "y2": 123, "content": "Test 3"}, {"x1": 1058, "y1": 71, "x2": 1154, "y2": 123, "content": "Test 4"}]]] -------------------------------------------------------------------------------- /tests/tables/objects/test_data/table.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 9 | 10 | 11 | 14 | 17 | 18 |
4 | Test 1 5 | 7 | Test 2 8 |
12 | Test 3 13 | 15 | Test 4 16 |
-------------------------------------------------------------------------------- /tests/tables/objects/test_data/tables.json: -------------------------------------------------------------------------------- 1 | [[[{"x1": 35, "y1": 20, "x2": 770, "y2": 71}, {"x1": 35, "y1": 20, "x2": 770, "y2": 71}, {"x1": 35, "y1": 20, "x2": 770, "y2": 71}], [{"x1": 35, "y1": 71, "x2": 276, "y2": 123}, {"x1": 276, "y1": 71, "x2": 494, "y2": 123}, {"x1": 494, "y1": 71, "x2": 770, "y2": 123}], [{"x1": 35, "y1": 123, "x2": 276, "y2": 173}, {"x1": 276, "y1": 123, "x2": 494, "y2": 275}, {"x1": 494, "y1": 123, "x2": 770, "y2": 173}], [{"x1": 35, "y1": 173, "x2": 276, "y2": 225}, {"x1": 276, "y1": 123, "x2": 494, "y2": 275}, {"x1": 494, "y1": 173, "x2": 770, "y2": 225}], [{"x1": 35, "y1": 225, "x2": 276, "y2": 275}, {"x1": 276, "y1": 123, "x2": 494, "y2": 275}, {"x1": 494, "y1": 225, "x2": 770, "y2": 275}], [{"x1": 35, "y1": 275, "x2": 276, "y2": 326}, {"x1": 276, "y1": 275, "x2": 494, "y2": 326}, {"x1": 494, "y1": 275, "x2": 770, "y2": 326}]], [[{"x1": 961, "y1": 21, "x2": 1058, "y2": 71}, {"x1": 1058, "y1": 21, "x2": 1154, "y2": 71}], [{"x1": 961, "y1": 71, "x2": 1058, "y2": 123}, {"x1": 1058, "y1": 71, "x2": 1154, "y2": 123}]]] -------------------------------------------------------------------------------- /tests/tables/objects/test_extraction.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import json 3 | from io import BytesIO 4 | 5 | from xlsxwriter import Workbook 6 | 7 | from img2table.tables.objects.cell import Cell 8 | from img2table.tables.objects.extraction import create_all_rectangles, CellPosition, TableCell, BBox, CellSpan 9 | from img2table.tables.objects.row import Row 10 | from img2table.tables.objects.table import Table 11 | 12 | 13 | def test_create_all_rectangles(): 14 | c = TableCell(bbox=BBox(x1=0, y1=0, x2=0, y2=0), value="Test") 15 | cell_positions = [CellPosition(cell=c, row=0, col=0), 
CellPosition(cell=c, row=1, col=0), 16 | CellPosition(cell=c, row=2, col=0), CellPosition(cell=c, row=3, col=0), 17 | CellPosition(cell=c, row=0, col=1), CellPosition(cell=c, row=1, col=1), 18 | CellPosition(cell=c, row=2, col=1), CellPosition(cell=c, row=3, col=1), 19 | CellPosition(cell=c, row=2, col=2), CellPosition(cell=c, row=3, col=2), 20 | CellPosition(cell=c, row=2, col=3), CellPosition(cell=c, row=3, col=3), 21 | ] 22 | 23 | result = create_all_rectangles(cell_positions=cell_positions) 24 | 25 | assert result == [CellSpan(top_row=0, bottom_row=3, col_left=0, col_right=1, value='Test'), 26 | CellSpan(top_row=2, bottom_row=3, col_left=2, col_right=3, value='Test')] 27 | 28 | 29 | def test_table_html(): 30 | with open("test_data/expected_tables.json", "r") as f: 31 | table = [Table(rows=[Row(cells=[Cell(**el) for el in row]) for row in tb]) 32 | for tb in json.load(f)].pop() 33 | 34 | with open("test_data/table.html", "r") as f: 35 | expected = f.read() 36 | 37 | assert table.extracted_table.html == expected 38 | 39 | 40 | def test_extracted_table_worksheet(): 41 | with open("test_data/expected_tables.json", "r") as f: 42 | tables = [Table(rows=[Row(cells=[Cell(**el) for el in row]) for row in tb]) 43 | for tb in json.load(f)] 44 | 45 | wb = Workbook(BytesIO()) 46 | for table in tables: 47 | ws = wb.add_worksheet() 48 | extracted_table = table.extracted_table 49 | extracted_table._to_worksheet(sheet=ws) 50 | 51 | assert ws.dim_colmax + 1 == table.nb_columns 52 | assert ws.dim_rowmax + 1 == table.nb_rows 53 | 54 | str_map = {v: k for k, v in ws.str_table.string_table.items()} 55 | ws_values = sorted([str_map.get(c.string) for row in ws.table.values() for c in row.values()]) 56 | table_values = sorted(set([c.value for row in extracted_table.content.values() 57 | for c in row])) 58 | assert ws_values == table_values 59 | 60 | wb.close() 61 | -------------------------------------------------------------------------------- /tests/tables/objects/test_line.py: 
-------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from img2table.tables.objects.line import Line 4 | 5 | 6 | def test_line(): 7 | line = Line(x1=0, y1=20, x2=46, y2=73) 8 | 9 | assert round(line.angle) == 49 10 | assert line.width == 46 11 | assert line.height == 53 12 | assert round(line.length) == 70 13 | assert not line.vertical 14 | assert not line.horizontal 15 | 16 | 17 | def test_reprocess_line(): 18 | line = Line(x1=20, y1=73, x2=19, y2=20, thickness=18) 19 | 20 | reprocessed_line = line.reprocess() 21 | assert reprocessed_line == Line(x1=20, x2=20, y1=20, y2=73, thickness=18) 22 | assert reprocessed_line.vertical 23 | -------------------------------------------------------------------------------- /tests/tables/objects/test_row.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from img2table.tables.objects.cell import Cell 3 | from img2table.tables.objects.row import Row 4 | 5 | 6 | def test_row(): 7 | row = Row(cells=[Cell(x1=0, x2=20, y1=0, y2=20), Cell(x1=20, x2=40, y1=0, y2=20)]) 8 | 9 | assert row.x1 == 0 10 | assert row.y1 == 0 11 | assert row.x2 == 40 12 | assert row.y2 == 20 13 | assert row.nb_columns == 2 14 | assert row.v_consistent 15 | 16 | 17 | def test_add_cells(): 18 | row = Row(cells=[Cell(x1=0, x2=20, y1=0, y2=20), Cell(x1=20, x2=40, y1=0, y2=20)]) 19 | 20 | row.add_cells(cells=Cell(x1=40, x2=60, y1=0, y2=20)) 21 | 22 | assert row.nb_columns == 3 23 | assert row.x2 == 60 24 | 25 | 26 | def test_split_in_rows(): 27 | row = Row(cells=[Cell(x1=0, x2=20, y1=0, y2=20), Cell(x1=20, x2=40, y1=0, y2=20)]) 28 | 29 | rows_splitted = row.split_in_rows(vertical_delimiters=[10, 15]) 30 | 31 | expected = [Row(cells=[Cell(x1=0, x2=20, y1=0, y2=10), Cell(x1=20, x2=40, y1=0, y2=10)]), 32 | Row(cells=[Cell(x1=0, x2=20, y1=10, y2=15), Cell(x1=20, x2=40, y1=10, y2=15)]), 33 | Row(cells=[Cell(x1=0, x2=20, y1=15, y2=20), Cell(x1=20, x2=40, 
y1=15, y2=20)]) 34 | ] 35 | 36 | assert rows_splitted == expected 37 | -------------------------------------------------------------------------------- /tests/tables/objects/test_table.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import json 3 | 4 | import polars as pl 5 | 6 | from img2table.ocr.data import OCRDataframe 7 | from img2table.tables.objects.cell import Cell 8 | from img2table.tables.objects.row import Row 9 | from img2table.tables.objects.table import Table 10 | 11 | 12 | def test_remove_rows(): 13 | table = Table(rows=[Row(cells=[Cell(x1=0, x2=100, y1=0, y2=10)]), 14 | Row(cells=[Cell(x1=0, x2=100, y1=10, y2=20)]), 15 | Row(cells=[Cell(x1=0, x2=100, y1=20, y2=30)])]) 16 | table.remove_rows(row_ids=[1]) 17 | 18 | expected = Table(rows=[Row(cells=[Cell(x1=0, x2=100, y1=0, y2=15)]), 19 | Row(cells=[Cell(x1=0, x2=100, y1=15, y2=30)])]) 20 | 21 | assert table == expected 22 | 23 | 24 | def test_remove_columns(): 25 | table = Table(rows=[Row(cells=[Cell(x1=0, x2=100, y1=0, y2=10), 26 | Cell(x1=100, x2=200, y1=0, y2=10), 27 | Cell(x1=200, x2=300, y1=0, y2=10)]), 28 | Row(cells=[Cell(x1=0, x2=100, y1=10, y2=20), 29 | Cell(x1=100, x2=200, y1=10, y2=20), 30 | Cell(x1=200, x2=300, y1=10, y2=20)]), 31 | ]) 32 | 33 | table.remove_columns(col_ids=[1]) 34 | 35 | expected = Table(rows=[Row(cells=[Cell(x1=0, x2=150, y1=0, y2=10), 36 | Cell(x1=150, x2=300, y1=0, y2=10)]), 37 | Row(cells=[Cell(x1=0, x2=150, y1=10, y2=20), 38 | Cell(x1=150, x2=300, y1=10, y2=20)]) 39 | ]) 40 | 41 | assert table == expected 42 | 43 | 44 | def test_table(): 45 | with open("test_data/tables.json", "r") as f: 46 | tables = [Table(rows=[Row(cells=[Cell(**el) for el in row]) for row in tb]) 47 | for tb in json.load(f)] 48 | 49 | assert tables[0].nb_columns == 3 50 | assert tables[0].nb_rows == 6 51 | assert tables[0].bbox() == (35, 20, 770, 326) 52 | 53 | assert tables[1].nb_columns == 2 54 | assert tables[1].nb_rows == 2 55 | 
assert tables[1].bbox() == (961, 21, 1154, 123) 56 | 57 | 58 | def test_get_table_content(): 59 | with open("test_data/tables.json", "r") as f: 60 | tables = [Table(rows=[Row(cells=[Cell(**el) for el in row]) for row in tb]) 61 | for tb in json.load(f)] 62 | 63 | # Load OCR 64 | ocr_df = OCRDataframe(pl.read_csv("test_data/ocr.csv", separator=";", encoding="utf-8")) 65 | 66 | result = [table.get_content(ocr_df=ocr_df, min_confidence=50) for table in tables] 67 | 68 | with open("test_data/expected_tables.json", "r") as f: 69 | expected = [Table(rows=[Row(cells=[Cell(**el) for el in row]) for row in tb]) 70 | for tb in json.load(f)] 71 | 72 | assert result == expected 73 | -------------------------------------------------------------------------------- /tests/tables/processing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/processing/__init__.py -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/processing/bordered_tables/__init__.py -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/cells/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/processing/bordered_tables/cells/__init__.py -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/cells/test_cells.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import json 
3 | 4 | import polars as pl 5 | 6 | from img2table.tables.objects.cell import Cell 7 | from img2table.tables.objects.line import Line 8 | from img2table.tables.processing.bordered_tables.cells import get_cells 9 | 10 | 11 | def test_get_cells(): 12 | with open("test_data/lines.json", 'r') as f: 13 | data = json.load(f) 14 | h_lines = [Line(**el) for el in data.get('h_lines')] 15 | v_lines = [Line(**el) for el in data.get('v_lines')] 16 | 17 | result = get_cells(horizontal_lines=h_lines, 18 | vertical_lines=v_lines) 19 | 20 | df_expected = pl.read_csv("test_data/expected.csv", separator=";", encoding="utf-8") 21 | expected = [Cell(x1=row["x1"], x2=row["x2"], y1=row["y1"], y2=row["y2"]) 22 | for row in df_expected.to_dicts()] 23 | 24 | assert sorted(result, key=lambda c: (c.x1, c.y1, c.x2, c.y2)) == sorted(expected, key=lambda c: (c.x1, c.y1, c.x2, c.y2)) 25 | -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/cells/test_data/expected.csv: -------------------------------------------------------------------------------- 1 | x1;y1;x2;y2 2 | 1058;21;1154;71 3 | 961;21;1058;71 4 | 1058;71;1154;123 5 | 961;71;1058;123 6 | 276;275;494;326 7 | 276;71;494;123 8 | 35;123;276;173 9 | 35;225;276;275 10 | 35;275;276;326 11 | 35;71;276;123 12 | 35;173;276;225 13 | 494;174;770;224 14 | 494;123;770;174 15 | 494;224;770;275 16 | 494;275;770;326 17 | 494;71;770;123 18 | 276;123;494;275 19 | 35;20;770;71 20 | -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/cells/test_data/expected_ident_cells.csv: -------------------------------------------------------------------------------- 1 | index;x1;y1;x2;y2 2 | 0;35;20;770;71 3 | 1;35;20;770;275 4 | 2;35;20;770;326 5 | 3;35;71;276;123 6 | 4;35;123;276;173 7 | 5;35;123;276;275 8 | 6;35;173;276;225 9 | 7;35;225;276;275 10 | 8;35;275;276;326 11 | 9;276;71;494;123 12 | 10;276;123;494;275 13 | 
11;276;275;494;326 14 | 12;494;71;770;123 15 | 13;494;123;770;174 16 | 14;494;123;770;275 17 | 15;494;174;770;224 18 | 16;494;224;770;275 19 | 17;494;275;770;326 20 | 18;961;21;1058;71 21 | 19;961;71;1058;123 22 | 20;1058;21;1154;71 23 | 21;1058;71;1154;123 24 | -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/cells/test_data/expected_potential_cells.csv: -------------------------------------------------------------------------------- 1 | idx;x1_bbox;x2_bbox;y1_bbox;y2_bbox 2 | 7;35;772;20;326 3 | 18;36;277;123;173 4 | 24;36;277;173;225 5 | 30;36;277;225;275 6 | 0;36;771;20;71 7 | 10;36;771;71;123 8 | 22;36;771;123;275 9 | 6;36;772;20;275 10 | 34;36;772;275;326 11 | 19;495;771;123;174 12 | 27;495;771;174;224 13 | 32;495;771;224;275 14 | 8;962;1156;21;71 15 | 17;962;1156;71;123 16 | -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/cells/test_data/expected_vertical_dedup.csv: -------------------------------------------------------------------------------- 1 | index;x1;y1;x2;y2 2 | 4;35;71;276;123 3 | 9;35;123;276;173 4 | 13;35;173;276;225 5 | 16;35;225;276;275 6 | 18;35;275;276;326 7 | 1;35;20;770;71 8 | 20;276;71;494;123 9 | 23;276;123;494;275 10 | 25;276;275;494;326 11 | 27;494;71;770;123 12 | 32;494;123;770;174 13 | 36;494;174;770;224 14 | 39;494;224;770;275 15 | 41;494;275;770;326 16 | 42;961;21;1058;71 17 | 44;961;71;1058;123 18 | 45;1058;21;1154;71 19 | 47;1058;71;1154;123 20 | -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/cells/test_data/lines.json: -------------------------------------------------------------------------------- 1 | {"h_lines": [{"x1": 35, "x2": 772, "y1": 20, "y2": 20}, {"x1": 961, "x2": 1156, "y1": 21, "y2": 21}, {"x1": 36, "x2": 771, "y1": 71, "y2": 71}, {"x1": 962, "x2": 1156, "y1": 71, "y2": 71}, {"x1": 36, "x2": 771, 
"y1": 123, "y2": 123}, {"x1": 962, "x2": 1156, "y1": 123, "y2": 123}, {"x1": 36, "x2": 277, "y1": 173, "y2": 173}, {"x1": 495, "x2": 771, "y1": 174, "y2": 174}, {"x1": 36, "x2": 277, "y1": 225, "y2": 225}, {"x1": 495, "x2": 771, "y1": 224, "y2": 224}, {"x1": 36, "x2": 772, "y1": 275, "y2": 275}, {"x1": 35, "x2": 772, "y1": 326, "y2": 326}], "v_lines": [{"x1": 35, "x2": 35, "y1": 20, "y2": 329}, {"x1": 276, "x2": 276, "y1": 72, "y2": 328}, {"x1": 494, "x2": 494, "y1": 72, "y2": 328}, {"x1": 770, "x2": 770, "y1": 20, "y2": 329}, {"x1": 961, "x2": 961, "y1": 20, "y2": 125}, {"x1": 1058, "x2": 1058, "y1": 21, "y2": 124}, {"x1": 1154, "x2": 1154, "y1": 20, "y2": 125}]} -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/cells/test_deduplication_cells.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import polars as pl 3 | 4 | from img2table.tables.objects.cell import Cell 5 | from img2table.tables.processing.bordered_tables.cells.deduplication import deduplicate_cells 6 | 7 | 8 | def test_deduplicate_cells(): 9 | df_cells = pl.read_csv("test_data/expected_ident_cells.csv", separator=";", encoding="utf-8") 10 | cells = [Cell(x1=row["x1"], x2=row["x2"], y1=row["y1"], y2=row["y2"]) 11 | for row in df_cells.to_dicts()] 12 | 13 | result = deduplicate_cells(cells=cells) 14 | 15 | df_expected = pl.read_csv("test_data/expected.csv", separator=";", encoding="utf-8") 16 | expected = [Cell(x1=row["x1"], x2=row["x2"], y1=row["y1"], y2=row["y2"]) 17 | for row in df_expected.to_dicts()] 18 | 19 | assert sorted(result, key=lambda c: (c.x1, c.y1, c.x2, c.y2)) == sorted(expected, key=lambda c: (c.x1, c.y1, c.x2, c.y2)) 20 | -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/cells/test_identification_cells.py: -------------------------------------------------------------------------------- 
1 | # coding: utf-8 2 | import json 3 | 4 | import polars as pl 5 | 6 | from img2table.tables.objects.cell import Cell 7 | from img2table.tables.objects.line import Line 8 | from img2table.tables.processing.bordered_tables.cells.identification import get_cells_dataframe 9 | 10 | 11 | def test_get_cells_dataframe(): 12 | with open("test_data/lines.json", 'r') as f: 13 | data = json.load(f) 14 | h_lines = [Line(**el) for el in data.get('h_lines')] 15 | v_lines = [Line(**el) for el in data.get('v_lines')] 16 | 17 | result = get_cells_dataframe(horizontal_lines=h_lines, 18 | vertical_lines=v_lines) 19 | 20 | df_expected = pl.read_csv("test_data/expected_ident_cells.csv", separator=";", encoding="utf-8") 21 | expected = [Cell(x1=row["x1"], x2=row["x2"], y1=row["y1"], y2=row["y2"]) 22 | for row in df_expected.to_dicts()] 23 | 24 | assert sorted(result, key=lambda c: (c.x1, c.y1, c.x2, c.y2)) == sorted(expected, key=lambda c: (c.x1, c.y1, c.x2, c.y2)) 25 | -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/lines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/processing/bordered_tables/lines/__init__.py -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/lines/test_data/contours.json: -------------------------------------------------------------------------------- 1 | [{"x1": 603, "y1": 296, "x2": 612, "y2": 310}, {"x1": 570, "y1": 296, "x2": 591, "y2": 310}, {"x1": 501, "y1": 296, "x2": 551, "y2": 310}, {"x1": 384, "y1": 296, "x2": 393, "y2": 310}, {"x1": 351, "y1": 296, "x2": 372, "y2": 310}, {"x1": 282, "y1": 296, "x2": 332, "y2": 310}, {"x1": 145, "y1": 296, "x2": 153, "y2": 310}, {"x1": 111, "y1": 296, "x2": 132, "y2": 310}, {"x1": 42, "y1": 296, "x2": 92, "y2": 310}, {"x1": 603, 
"y1": 245, "x2": 612, "y2": 259}, {"x1": 570, "y1": 245, "x2": 591, "y2": 259}, {"x1": 501, "y1": 245, "x2": 551, "y2": 259}, {"x1": 145, "y1": 245, "x2": 153, "y2": 259}, {"x1": 111, "y1": 245, "x2": 132, "y2": 259}, {"x1": 42, "y1": 245, "x2": 92, "y2": 259}, {"x1": 437, "y1": 198, "x2": 444, "y2": 208}, {"x1": 603, "y1": 194, "x2": 612, "y2": 208}, {"x1": 570, "y1": 194, "x2": 591, "y2": 208}, {"x1": 501, "y1": 194, "x2": 551, "y2": 208}, {"x1": 145, "y1": 194, "x2": 153, "y2": 208}, {"x1": 111, "y1": 194, "x2": 132, "y2": 208}, {"x1": 42, "y1": 194, "x2": 92, "y2": 208}, {"x1": 331, "y1": 193, "x2": 425, "y2": 212}, {"x1": 603, "y1": 143, "x2": 612, "y2": 157}, {"x1": 570, "y1": 143, "x2": 591, "y2": 157}, {"x1": 501, "y1": 143, "x2": 551, "y2": 157}, {"x1": 145, "y1": 143, "x2": 153, "y2": 157}, {"x1": 111, "y1": 143, "x2": 132, "y2": 157}, {"x1": 42, "y1": 143, "x2": 92, "y2": 157}, {"x1": 1062, "y1": 92, "x2": 1112, "y2": 106}, {"x1": 966, "y1": 92, "x2": 1016, "y2": 106}, {"x1": 603, "y1": 92, "x2": 612, "y2": 106}, {"x1": 570, "y1": 92, "x2": 591, "y2": 106}, {"x1": 501, "y1": 92, "x2": 551, "y2": 106}, {"x1": 384, "y1": 92, "x2": 393, "y2": 106}, {"x1": 351, "y1": 92, "x2": 372, "y2": 106}, {"x1": 282, "y1": 92, "x2": 332, "y2": 106}, {"x1": 145, "y1": 92, "x2": 153, "y2": 106}, {"x1": 111, "y1": 92, "x2": 132, "y2": 106}, {"x1": 42, "y1": 92, "x2": 92, "y2": 106}, {"x1": 1062, "y1": 41, "x2": 1112, "y2": 55}, {"x1": 966, "y1": 41, "x2": 1016, "y2": 55}, {"x1": 385, "y1": 41, "x2": 422, "y2": 55}] -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/lines/test_data/expected.json: -------------------------------------------------------------------------------- 1 | {"h_lines": [{"x1": 36, "y1": 21, "x2": 771, "y2": 21, "thickness": 4}, {"x1": 962, "y1": 21, "x2": 1156, "y2": 21, "thickness": 4}, {"x1": 36, "y1": 72, "x2": 771, "y2": 72, "thickness": 4}, {"x1": 962, "y1": 72, "x2": 1156, 
"y2": 72, "thickness": 4}, {"x1": 36, "y1": 123, "x2": 771, "y2": 123, "thickness": 4}, {"x1": 962, "y1": 123, "x2": 1156, "y2": 123, "thickness": 4}, {"x1": 36, "y1": 174, "x2": 277, "y2": 174, "thickness": 4}, {"x1": 495, "y1": 174, "x2": 771, "y2": 174, "thickness": 4}, {"x1": 428, "y1": 192, "x2": 435, "y2": 192, "thickness": 2}, {"x1": 428, "y1": 207, "x2": 435, "y2": 207, "thickness": 2}, {"x1": 36, "y1": 225, "x2": 277, "y2": 225, "thickness": 4}, {"x1": 495, "y1": 225, "x2": 771, "y2": 225, "thickness": 4}, {"x1": 36, "y1": 276, "x2": 772, "y2": 276, "thickness": 4}, {"x1": 36, "y1": 327, "x2": 772, "y2": 327, "thickness": 4}], "v_lines": [{"x1": 36, "y1": 21, "x2": 36, "y2": 328, "thickness": 4}, {"x1": 770, "y1": 21, "x2": 770, "y2": 328, "thickness": 4}, {"x1": 962, "y1": 21, "x2": 962, "y2": 124, "thickness": 4}, {"x1": 1058, "y1": 21, "x2": 1058, "y2": 124, "thickness": 4}, {"x1": 1154, "y1": 21, "x2": 1154, "y2": 124, "thickness": 4}, {"x1": 276, "y1": 72, "x2": 276, "y2": 328, "thickness": 4}, {"x1": 495, "y1": 72, "x2": 495, "y2": 328, "thickness": 4}, {"x1": 135, "y1": 91, "x2": 135, "y2": 107, "thickness": 5}, {"x1": 375, "y1": 91, "x2": 375, "y2": 107, "thickness": 5}, {"x1": 594, "y1": 91, "x2": 594, "y2": 107, "thickness": 5}, {"x1": 135, "y1": 142, "x2": 135, "y2": 158, "thickness": 5}, {"x1": 594, "y1": 142, "x2": 594, "y2": 158, "thickness": 5}, {"x1": 135, "y1": 193, "x2": 135, "y2": 209, "thickness": 5}, {"x1": 594, "y1": 193, "x2": 594, "y2": 209, "thickness": 5}, {"x1": 329, "y1": 195, "x2": 329, "y2": 208, "thickness": 1}, {"x1": 135, "y1": 244, "x2": 135, "y2": 260, "thickness": 5}, {"x1": 594, "y1": 244, "x2": 594, "y2": 260, "thickness": 5}, {"x1": 135, "y1": 295, "x2": 135, "y2": 311, "thickness": 5}, {"x1": 375, "y1": 295, "x2": 375, "y2": 311, "thickness": 5}, {"x1": 594, "y1": 295, "x2": 594, "y2": 311, "thickness": 5}]} -------------------------------------------------------------------------------- 
/tests/tables/processing/bordered_tables/lines/test_data/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/processing/bordered_tables/lines/test_data/test.png -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/lines/test_lines.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import json 3 | 4 | import cv2 5 | 6 | from img2table.tables.objects.cell import Cell 7 | from img2table.tables.objects.line import Line 8 | from img2table.tables.processing.bordered_tables.lines import detect_lines 9 | 10 | 11 | def test_detect_lines(): 12 | img = cv2.cvtColor(cv2.imread("test_data/test.png"), cv2.COLOR_BGR2RGB)  # cv2.imread returns BGR; converted to RGB here 13 | with open("test_data/contours.json", "r") as f: 14 | contours = [Cell(**el) for el in json.load(f)]  # contour fixture: bounding boxes of text elements 15 | 16 | h_lines, v_lines = detect_lines(img=img, 17 | contours=contours, 18 | char_length=8.85, 19 | min_line_length=10)  # parameters match the values used to generate the expected fixture — TODO confirm 20 | 21 | with open("test_data/expected.json", 'r') as f: 22 | data = json.load(f) 23 | h_lines_expected = [Line(**el) for el in data.get('h_lines')] 24 | v_lines_expected = [Line(**el) for el in data.get('v_lines')] 25 | 26 | h_lines = sorted(h_lines, key=lambda l: (l.x1, l.y1, l.x2, l.y2)) 27 | v_lines = sorted(v_lines, key=lambda l: (l.x1, l.y1, l.x2, l.y2)) 28 | h_lines_expected = sorted(h_lines_expected, key=lambda l: (l.x1, l.y1, l.x2, l.y2)) 29 | v_lines_expected = sorted(v_lines_expected, key=lambda l: (l.x1, l.y1, l.x2, l.y2))  # sort both actual and expected so the comparison is order-independent 30 | 31 | assert (h_lines, v_lines) == (h_lines_expected, v_lines_expected) 32 | -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/tables/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/processing/bordered_tables/tables/__init__.py -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/tables/test_cell_clustering.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import json 3 | 4 | from img2table.tables.objects.cell import Cell 5 | from img2table.tables.processing.bordered_tables.tables.cell_clustering import cluster_cells_in_tables 6 | 7 | 8 | def test_cluster_cells_in_tables(): 9 | with open("test_data/cells.json", 'r') as f: 10 | cells = [Cell(**el) for el in json.load(f)] 11 | 12 | result = cluster_cells_in_tables(cells=cells) 13 | 14 | with open("test_data/cells_clustered.json", 'r') as f: 15 | expected = [[Cell(**el) for el in cluster] for cluster in json.load(f)] 16 | 17 | assert all([cl in result for cl in expected]) 18 | assert all([cl in expected for cl in result]) 19 | -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/tables/test_data/cell_clusters_normalized.json: -------------------------------------------------------------------------------- 1 | [[{"x1": 35, "y1": 71, "x2": 276, "y2": 123}, {"x1": 35, "y1": 123, "x2": 276, "y2": 174}, {"x1": 35, "y1": 20, "x2": 770, "y2": 71}, {"x1": 276, "y1": 71, "x2": 494, "y2": 123}, {"x1": 35, "y1": 174, "x2": 276, "y2": 224}, {"x1": 276, "y1": 123, "x2": 494, "y2": 275}, {"x1": 35, "y1": 224, "x2": 276, "y2": 275}, {"x1": 35, "y1": 275, "x2": 276, "y2": 326}, {"x1": 276, "y1": 275, "x2": 494, "y2": 326}, {"x1": 494, "y1": 71, "x2": 770, "y2": 123}, {"x1": 494, "y1": 123, "x2": 770, "y2": 174}, {"x1": 494, "y1": 174, "x2": 770, "y2": 224}, {"x1": 494, "y1": 224, "x2": 770, "y2": 275}, {"x1": 494, "y1": 275, "x2": 770, "y2": 326}], [{"x1": 961, "y1": 21, "x2": 1058, "y2": 71}, {"x1": 961, "y1": 71, 
"x2": 1058, "y2": 123}, {"x1": 1058, "y1": 21, "x2": 1154, "y2": 71}, {"x1": 1058, "y1": 71, "x2": 1154, "y2": 123}]] -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/tables/test_data/cells.json: -------------------------------------------------------------------------------- 1 | [{"x1": 35, "y1": 71, "x2": 276, "y2": 123}, {"x1": 35, "y1": 123, "x2": 276, "y2": 173}, {"x1": 35, "y1": 173, "x2": 276, "y2": 225}, {"x1": 35, "y1": 225, "x2": 276, "y2": 275}, {"x1": 35, "y1": 275, "x2": 276, "y2": 326}, {"x1": 35, "y1": 20, "x2": 770, "y2": 71}, {"x1": 276, "y1": 71, "x2": 494, "y2": 123}, {"x1": 276, "y1": 123, "x2": 494, "y2": 275}, {"x1": 276, "y1": 275, "x2": 494, "y2": 326}, {"x1": 494, "y1": 71, "x2": 770, "y2": 123}, {"x1": 494, "y1": 123, "x2": 770, "y2": 174}, {"x1": 494, "y1": 174, "x2": 770, "y2": 224}, {"x1": 494, "y1": 224, "x2": 770, "y2": 275}, {"x1": 494, "y1": 275, "x2": 770, "y2": 326}, {"x1": 961, "y1": 21, "x2": 1058, "y2": 71}, {"x1": 961, "y1": 71, "x2": 1058, "y2": 123}, {"x1": 1058, "y1": 21, "x2": 1154, "y2": 71}, {"x1": 1058, "y1": 71, "x2": 1154, "y2": 123}] -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/tables/test_data/cells_clustered.json: -------------------------------------------------------------------------------- 1 | [[{"x1": 35, "y1": 71, "x2": 276, "y2": 123}, {"x1": 35, "y1": 123, "x2": 276, "y2": 173}, {"x1": 35, "y1": 20, "x2": 770, "y2": 71}, {"x1": 276, "y1": 71, "x2": 494, "y2": 123}, {"x1": 35, "y1": 173, "x2": 276, "y2": 225}, {"x1": 276, "y1": 123, "x2": 494, "y2": 275}, {"x1": 35, "y1": 225, "x2": 276, "y2": 275}, {"x1": 35, "y1": 275, "x2": 276, "y2": 326}, {"x1": 276, "y1": 275, "x2": 494, "y2": 326}, {"x1": 494, "y1": 71, "x2": 770, "y2": 123}, {"x1": 494, "y1": 123, "x2": 770, "y2": 174}, {"x1": 494, "y1": 174, "x2": 770, "y2": 224}, {"x1": 494, "y1": 224, "x2": 770, 
"y2": 275}, {"x1": 494, "y1": 275, "x2": 770, "y2": 326}], [{"x1": 961, "y1": 21, "x2": 1058, "y2": 71}, {"x1": 961, "y1": 71, "x2": 1058, "y2": 123}, {"x1": 1058, "y1": 21, "x2": 1154, "y2": 71}, {"x1": 1058, "y1": 71, "x2": 1154, "y2": 123}]] -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/tables/test_data/contours.json: -------------------------------------------------------------------------------- 1 | [{"x1": 603, "y1": 296, "x2": 612, "y2": 310}, {"x1": 570, "y1": 296, "x2": 591, "y2": 310}, {"x1": 501, "y1": 296, "x2": 551, "y2": 310}, {"x1": 384, "y1": 296, "x2": 393, "y2": 310}, {"x1": 351, "y1": 296, "x2": 372, "y2": 310}, {"x1": 282, "y1": 296, "x2": 332, "y2": 310}, {"x1": 145, "y1": 296, "x2": 153, "y2": 310}, {"x1": 111, "y1": 296, "x2": 132, "y2": 310}, {"x1": 42, "y1": 296, "x2": 92, "y2": 310}, {"x1": 603, "y1": 245, "x2": 612, "y2": 259}, {"x1": 570, "y1": 245, "x2": 591, "y2": 259}, {"x1": 501, "y1": 245, "x2": 551, "y2": 259}, {"x1": 145, "y1": 245, "x2": 153, "y2": 259}, {"x1": 111, "y1": 245, "x2": 132, "y2": 259}, {"x1": 42, "y1": 245, "x2": 92, "y2": 259}, {"x1": 437, "y1": 198, "x2": 444, "y2": 208}, {"x1": 603, "y1": 194, "x2": 612, "y2": 208}, {"x1": 570, "y1": 194, "x2": 591, "y2": 208}, {"x1": 501, "y1": 194, "x2": 551, "y2": 208}, {"x1": 145, "y1": 194, "x2": 153, "y2": 208}, {"x1": 111, "y1": 194, "x2": 132, "y2": 208}, {"x1": 42, "y1": 194, "x2": 92, "y2": 208}, {"x1": 331, "y1": 193, "x2": 425, "y2": 212}, {"x1": 603, "y1": 143, "x2": 612, "y2": 157}, {"x1": 570, "y1": 143, "x2": 591, "y2": 157}, {"x1": 501, "y1": 143, "x2": 551, "y2": 157}, {"x1": 145, "y1": 143, "x2": 153, "y2": 157}, {"x1": 111, "y1": 143, "x2": 132, "y2": 157}, {"x1": 42, "y1": 143, "x2": 92, "y2": 157}, {"x1": 1062, "y1": 92, "x2": 1112, "y2": 106}, {"x1": 966, "y1": 92, "x2": 1016, "y2": 106}, {"x1": 603, "y1": 92, "x2": 612, "y2": 106}, {"x1": 570, "y1": 92, "x2": 591, "y2": 106}, 
{"x1": 501, "y1": 92, "x2": 551, "y2": 106}, {"x1": 384, "y1": 92, "x2": 393, "y2": 106}, {"x1": 351, "y1": 92, "x2": 372, "y2": 106}, {"x1": 282, "y1": 92, "x2": 332, "y2": 106}, {"x1": 145, "y1": 92, "x2": 153, "y2": 106}, {"x1": 111, "y1": 92, "x2": 132, "y2": 106}, {"x1": 42, "y1": 92, "x2": 92, "y2": 106}, {"x1": 1062, "y1": 41, "x2": 1112, "y2": 55}, {"x1": 966, "y1": 41, "x2": 1016, "y2": 55}, {"x1": 385, "y1": 41, "x2": 422, "y2": 55}] -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/tables/test_data/expected.json: -------------------------------------------------------------------------------- 1 | [[[{"x1": 35, "y1": 20, "x2": 770, "y2": 71}, {"x1": 35, "y1": 20, "x2": 770, "y2": 71}, {"x1": 35, "y1": 20, "x2": 770, "y2": 71}], [{"x1": 35, "y1": 71, "x2": 276, "y2": 123}, {"x1": 276, "y1": 71, "x2": 494, "y2": 123}, {"x1": 494, "y1": 71, "x2": 770, "y2": 123}], [{"x1": 35, "y1": 123, "x2": 276, "y2": 174}, {"x1": 276, "y1": 123, "x2": 494, "y2": 275}, {"x1": 494, "y1": 123, "x2": 770, "y2": 174}], [{"x1": 35, "y1": 174, "x2": 276, "y2": 224}, {"x1": 276, "y1": 123, "x2": 494, "y2": 275}, {"x1": 494, "y1": 174, "x2": 770, "y2": 224}], [{"x1": 35, "y1": 224, "x2": 276, "y2": 275}, {"x1": 276, "y1": 123, "x2": 494, "y2": 275}, {"x1": 494, "y1": 224, "x2": 770, "y2": 275}], [{"x1": 35, "y1": 275, "x2": 276, "y2": 326}, {"x1": 276, "y1": 275, "x2": 494, "y2": 326}, {"x1": 494, "y1": 275, "x2": 770, "y2": 326}]], [[{"x1": 961, "y1": 20, "x2": 1058, "y2": 71}, {"x1": 1058, "y1": 20, "x2": 1154, "y2": 71}], [{"x1": 961, "y1": 71, "x2": 1058, "y2": 123}, {"x1": 1058, "y1": 71, "x2": 1154, "y2": 123}]]] -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/tables/test_data/lines.json: -------------------------------------------------------------------------------- 1 | {"h_lines": [{"x1": 492, "y1": 173, "x2": 775, "y2": 173, 
"thickness": 1}, {"x1": 492, "y1": 225, "x2": 775, "y2": 225, "thickness": 1}, {"x1": 333, "y1": 308, "x2": 350, "y2": 308, "thickness": 1}, {"x1": 373, "y1": 308, "x2": 383, "y2": 308, "thickness": 1}, {"x1": 959, "y1": 20, "x2": 1160, "y2": 20, "thickness": 1}, {"x1": 959, "y1": 71, "x2": 1160, "y2": 71, "thickness": 1}, {"x1": 33, "y1": 224, "x2": 281, "y2": 224, "thickness": 1}, {"x1": 33, "y1": 20, "x2": 775, "y2": 20, "thickness": 1}, {"x1": 33, "y1": 122, "x2": 775, "y2": 122, "thickness": 1}, {"x1": 33, "y1": 72, "x2": 775, "y2": 72, "thickness": 1}, {"x1": 959, "y1": 122, "x2": 1160, "y2": 122, "thickness": 1}, {"x1": 33, "y1": 174, "x2": 281, "y2": 174, "thickness": 1}, {"x1": 33, "y1": 276, "x2": 776, "y2": 276, "thickness": 1}, {"x1": 33, "y1": 326, "x2": 776, "y2": 326, "thickness": 1}, {"x1": 93, "y1": 308, "x2": 110, "y2": 308, "thickness": 1}, {"x1": 552, "y1": 308, "x2": 569, "y2": 308, "thickness": 1}, {"x1": 592, "y1": 308, "x2": 602, "y2": 308, "thickness": 1}], "v_lines": [{"x1": 1154, "y1": 21, "x2": 1154, "y2": 124, "thickness": 1}, {"x1": 276, "y1": 69, "x2": 276, "y2": 332, "thickness": 1}, {"x1": 1058, "y1": 21, "x2": 1058, "y2": 124, "thickness": 1}, {"x1": 36, "y1": 18, "x2": 36, "y2": 332, "thickness": 1}, {"x1": 494, "y1": 69, "x2": 494, "y2": 332, "thickness": 1}, {"x1": 770, "y1": 18, "x2": 770, "y2": 332, "thickness": 1}, {"x1": 962, "y1": 21, "x2": 962, "y2": 124, "thickness": 1}]} -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/tables/test_data/table_implicit.json: -------------------------------------------------------------------------------- 1 | [[{"x1": 224, "y1": 644, "x2": 1001, "y2": 746}, {"x1": 1001, "y1": 644, "x2": 1106, "y2": 746}, {"x1": 1106, "y1": 644, "x2": 1501, "y2": 746}], [{"x1": 224, "y1": 746, "x2": 1001, "y2": 848}, {"x1": 1001, "y1": 746, "x2": 1106, "y2": 848}, {"x1": 1106, "y1": 746, "x2": 1501, "y2": 848}], [{"x1": 224, "y1": 848, 
"x2": 1001, "y2": 950}, {"x1": 1001, "y1": 848, "x2": 1106, "y2": 950}, {"x1": 1106, "y1": 848, "x2": 1501, "y2": 950}], [{"x1": 224, "y1": 950, "x2": 1001, "y2": 1051}, {"x1": 1001, "y1": 950, "x2": 1106, "y2": 1051}, {"x1": 1106, "y1": 950, "x2": 1501, "y2": 1051}], [{"x1": 224, "y1": 1051, "x2": 1001, "y2": 1153}, {"x1": 1001, "y1": 1051, "x2": 1106, "y2": 1153}, {"x1": 1106, "y1": 1051, "x2": 1501, "y2": 1153}], [{"x1": 224, "y1": 1153, "x2": 1001, "y2": 1255}, {"x1": 1001, "y1": 1153, "x2": 1106, "y2": 1255}, {"x1": 1106, "y1": 1153, "x2": 1501, "y2": 1255}], [{"x1": 224, "y1": 1255, "x2": 1001, "y2": 1356}, {"x1": 1001, "y1": 1255, "x2": 1106, "y2": 1356}, {"x1": 1106, "y1": 1255, "x2": 1501, "y2": 1356}], [{"x1": 224, "y1": 1356, "x2": 1001, "y2": 1458}, {"x1": 1001, "y1": 1356, "x2": 1106, "y2": 1458}, {"x1": 1106, "y1": 1356, "x2": 1501, "y2": 1458}], [{"x1": 224, "y1": 1458, "x2": 1001, "y2": 1560}, {"x1": 1001, "y1": 1458, "x2": 1106, "y2": 1560}, {"x1": 1106, "y1": 1458, "x2": 1501, "y2": 1560}], [{"x1": 224, "y1": 1560, "x2": 1001, "y2": 1661}, {"x1": 1001, "y1": 1560, "x2": 1106, "y2": 1661}, {"x1": 1106, "y1": 1560, "x2": 1501, "y2": 1661}]] -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/tables/test_data/tables_from_cells.json: -------------------------------------------------------------------------------- 1 | [[[{"x1": 35, "y1": 20, "x2": 770, "y2": 71}, {"x1": 35, "y1": 20, "x2": 770, "y2": 71}, {"x1": 35, "y1": 20, "x2": 770, "y2": 71}], [{"x1": 35, "y1": 71, "x2": 276, "y2": 123}, {"x1": 276, "y1": 71, "x2": 494, "y2": 123}, {"x1": 494, "y1": 71, "x2": 770, "y2": 123}], [{"x1": 35, "y1": 123, "x2": 276, "y2": 174}, {"x1": 276, "y1": 123, "x2": 494, "y2": 275}, {"x1": 494, "y1": 123, "x2": 770, "y2": 174}], [{"x1": 35, "y1": 174, "x2": 276, "y2": 224}, {"x1": 276, "y1": 123, "x2": 494, "y2": 275}, {"x1": 494, "y1": 174, "x2": 770, "y2": 224}], [{"x1": 35, "y1": 224, "x2": 
276, "y2": 275}, {"x1": 276, "y1": 123, "x2": 494, "y2": 275}, {"x1": 494, "y1": 224, "x2": 770, "y2": 275}], [{"x1": 35, "y1": 275, "x2": 276, "y2": 326}, {"x1": 276, "y1": 275, "x2": 494, "y2": 326}, {"x1": 494, "y1": 275, "x2": 770, "y2": 326}]], [[{"x1": 961, "y1": 21, "x2": 1058, "y2": 71}, {"x1": 1058, "y1": 21, "x2": 1154, "y2": 71}], [{"x1": 961, "y1": 71, "x2": 1058, "y2": 123}, {"x1": 1058, "y1": 71, "x2": 1154, "y2": 123}]]] -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/tables/test_data/word_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/processing/bordered_tables/tables/test_data/word_image.png -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/tables/test_implicit.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import json 3 | 4 | from img2table.tables.objects.cell import Cell 5 | from img2table.tables.objects.row import Row 6 | from img2table.tables.objects.table import Table 7 | from img2table.tables.processing.bordered_tables.tables.implicit import implicit_content, implicit_rows_lines, \ 8 | implicit_columns_lines 9 | from img2table.tables.processing.borderless_tables.model import ImageSegment 10 | 11 | 12 | def test_implicit_rows_lines(): 13 | with open("test_data/table_implicit.json", 'r') as f: 14 | table = Table(rows=[Row(cells=[Cell(**el) for el in row]) for row in json.load(f)]) 15 | 16 | with open("test_data/contours_implicit.json", "r") as f: 17 | contours = [Cell(**el) for el in json.load(f)] 18 | 19 | segment = ImageSegment(x1=table.x1, y1=table.y1, x2=table.x2, y2=table.y2, 20 | elements=contours) 21 | 22 | result = implicit_rows_lines(table=table, 23 | segment=segment) 24 | 25 | # 
Check that all created lines have right width 26 | assert all([line.width == table.width for line in result]) 27 | 28 | # Check positions 29 | assert sorted([line.y1 for line in result]) == [682, 716, 784, 817, 884, 919, 986, 1020, 30 | 1089, 1121, 1189, 1223, 1292, 1325, 1394, 31 | 1427, 1494, 1529, 1597, 1630] 32 | 33 | 34 | def test_implicit_columns_lines(): 35 | with open("test_data/table_implicit.json", 'r') as f: 36 | table = Table(rows=[Row(cells=[Cell(**el) for el in row]) for row in json.load(f)]) 37 | 38 | with open("test_data/contours_implicit.json", "r") as f: 39 | contours = [Cell(**el) for el in json.load(f)] 40 | 41 | segment = ImageSegment(x1=table.x1, y1=table.y1, x2=table.x2, y2=table.y2, 42 | elements=contours) 43 | 44 | result = implicit_columns_lines(table=table, 45 | segment=segment, 46 | char_length=11) 47 | 48 | # Check that all created lines have right height 49 | assert all([line.height == table.height for line in result]) 50 | 51 | # Check positions 52 | assert sorted([line.x1 for line in result]) == [395, 605, 725, 809, 886, 1212, 1285, 1396] 53 | 54 | 55 | def test_implicit_content(): 56 | with open("test_data/table_implicit.json", 'r') as f: 57 | table = Table(rows=[Row(cells=[Cell(**el) for el in row]) for row in json.load(f)]) 58 | 59 | with open("test_data/contours_implicit.json", "r") as f: 60 | contours = [Cell(**el) for el in json.load(f)] 61 | 62 | result = implicit_content(table=table, 63 | contours=contours, 64 | char_length=11, 65 | implicit_rows=True, 66 | implicit_columns=True) 67 | 68 | # Check that 20 more rows have been created 69 | assert result.nb_rows == table.nb_rows + 20 70 | 71 | # Check that 8 more columns have been created 72 | assert result.nb_columns == table.nb_columns + 8 73 | -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/tables/test_table_creation.py: -------------------------------------------------------------------------------- 1 
| # coding: utf-8 2 | import json 3 | 4 | from img2table.tables.objects.cell import Cell 5 | from img2table.tables.objects.row import Row 6 | from img2table.tables.objects.table import Table 7 | from img2table.tables.processing.bordered_tables.tables.table_creation import normalize_table_cells, cluster_to_table, \ 8 | remove_unwanted_elements 9 | 10 | 11 | def test_normalize_table_cells(): 12 | with open("test_data/cells_clustered.json", 'r') as f: 13 | cell_clusters = [[Cell(**el) for el in cluster] for cluster in json.load(f)] 14 | 15 | result = [normalize_table_cells(cluster_cells=cell_cluster) for cell_cluster in cell_clusters]  # normalize each raw cluster independently 16 | 17 | with open("test_data/cell_clusters_normalized.json", "r") as f: 18 | expected = [[Cell(**el) for el in cluster] for cluster in json.load(f)] 19 | 20 | assert result == expected 21 | 22 | 23 | def test_remove_unwanted_elements(): 24 | table = Table(rows=[Row(cells=[Cell(x1=0, y1=0, x2=20, y2=20), 25 | Cell(x1=20, y1=0, x2=40, y2=20), 26 | Cell(x1=40, y1=0, x2=60, y2=20)]), 27 | Row(cells=[Cell(x1=0, y1=20, x2=20, y2=40), 28 | Cell(x1=20, y1=20, x2=40, y2=40), 29 | Cell(x1=40, y1=20, x2=60, y2=40)]), 30 | Row(cells=[Cell(x1=0, y1=40, x2=20, y2=60), 31 | Cell(x1=20, y1=40, x2=40, y2=60), 32 | Cell(x1=40, y1=40, x2=60, y2=60)]) 33 | ] 34 | )  # 3x3 grid of 20px cells 35 | elements = [Cell(x1=25, y1=5, x2=35, y2=15), Cell(x1=45, y1=5, x2=55, y2=15), 36 | Cell(x1=25, y1=25, x2=35, y2=35), Cell(x1=45, y1=25, x2=55, y2=35)]  # elements overlap only the 2nd/3rd columns and the first two rows 37 | 38 | result = remove_unwanted_elements(table=table, elements=elements)  # expected to drop the empty first column and last row 39 | 40 | expected = Table(rows=[Row(cells=[Cell(x1=20, y1=0, x2=40, y2=20), 41 | Cell(x1=40, y1=0, x2=60, y2=20)]), 42 | Row(cells=[Cell(x1=20, y1=20, x2=40, y2=40), 43 | Cell(x1=40, y1=20, x2=60, y2=40)]), 44 | ] 45 | ) 46 | 47 | assert result == expected 48 | 49 | 50 | def test_cluster_to_table(): 51 | with open("test_data/cell_clusters_normalized.json", "r") as f: 52 | cell_clusters = [[Cell(**el) for el in cluster] for cluster in json.load(f)] 53 | with
open("test_data/contours.json", "r") as f: 54 | contours = [Cell(**el) for el in json.load(f)] 55 | 56 | result = [cluster_to_table(cluster, contours) for cluster in cell_clusters]  # build one Table per normalized cell cluster 57 | 58 | with open("test_data/tables_from_cells.json", "r") as f: 59 | expected = [Table(rows=[Row(cells=[Cell(**el) for el in row]) for row in tb]) 60 | for tb in json.load(f)] 61 | 62 | assert result == expected 63 | -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/tables/test_tables.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import json 3 | 4 | from img2table.tables.objects.cell import Cell 5 | from img2table.tables.objects.line import Line 6 | from img2table.tables.objects.row import Row 7 | from img2table.tables.objects.table import Table 8 | from img2table.tables.processing.bordered_tables.tables import get_tables 9 | 10 | 11 | def test_get_tables(): 12 | with open("test_data/cells.json", 'r') as f: 13 | cells = [Cell(**el) for el in json.load(f)] 14 | with open("test_data/contours.json", "r") as f: 15 | contours = [Cell(**el) for el in json.load(f)] 16 | with open("test_data/lines.json", 'r') as f: 17 | data = json.load(f) 18 | lines = [Line(**el) for el in data.get('h_lines') + data.get('v_lines')]  # horizontal and vertical lines passed together 19 | 20 | result = get_tables(cells=cells, elements=contours, lines=lines, char_length=8.44)  # end-to-end bordered table creation on fixtures 21 | 22 | with open("test_data/expected.json", "r") as f: 23 | expected = [Table(rows=[Row(cells=[Cell(**el) for el in row]) for row in tb]) 24 | for tb in json.load(f)] 25 | 26 | assert result == expected 27 | 28 | -------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/processing/borderless_tables/__init__.py -------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/borderless_tables/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/processing/borderless_tables/borderless_tables/__init__.py -------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/borderless_tables/test_borderless_tables.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import json 3 | 4 | import cv2 5 | 6 | from img2table.tables import threshold_dark_areas 7 | from img2table.tables.objects.cell import Cell 8 | from img2table.tables.objects.line import Line 9 | from img2table.tables.processing.borderless_tables import identify_borderless_tables 10 | 11 | 12 | def test_identify_borderless_tables(): 13 | img = cv2.cvtColor(cv2.imread("test_data/test.png"), cv2.COLOR_BGR2RGB)  # cv2.imread returns BGR; converted to RGB here 14 | thresh = threshold_dark_areas(img=img, char_length=11)  # thresholded image consumed by the detection step below 15 | 16 | with open("test_data/lines.json", 'r') as f: 17 | data = json.load(f) 18 | lines = [Line(**el) for el in data.get('h_lines') + data.get('v_lines')] 19 | 20 | with open("test_data/contours.json", 'r') as f: 21 | contours = [Cell(**el) for el in json.load(f)] 22 | 23 | result = identify_borderless_tables(thresh=thresh, 24 | char_length=7.0, 25 | median_line_sep=66, 26 | lines=lines, 27 | contours=contours, 28 | existing_tables=[])  # no previously identified tables to exclude 29 | 30 | assert len(result) == 1 31 | assert result[0].nb_rows == 16 32 | assert result[0].nb_columns == 7 33 | assert (result[0].x1, result[0].y1, result[0].x2, result[0].y2) == (135, 52, 1155, 1054)  # single expected table with known geometry 34 |
-------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/borderless_tables/test_data/lines.json: -------------------------------------------------------------------------------- 1 | {"h_lines": [{"x1": 98, "y1": 1085, "x2": 1227, "y2": 1085, "thickness": 1}, {"x1": 146, "y1": 1109, "x2": 224, "y2": 1109, "thickness": 1}, {"x1": 911, "y1": 1110, "x2": 1228, "y2": 1110, "thickness": 1}, {"x1": 143, "y1": 1144, "x2": 227, "y2": 1144, "thickness": 1}, {"x1": 908, "y1": 1144, "x2": 1231, "y2": 1144, "thickness": 1}], "v_lines": []} -------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/borderless_tables/test_data/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/processing/borderless_tables/borderless_tables/test_data/test.png -------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/borderless_tables/test_whitespaces.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import json 4 | 5 | from img2table.tables.objects.cell import Cell 6 | from img2table.tables.processing.borderless_tables.model import ImageSegment 7 | from img2table.tables.processing.borderless_tables.whitespaces import get_whitespaces, adjacent_whitespaces, \ 8 | identify_coherent_v_whitespaces, get_relevant_vertical_whitespaces 9 | 10 | 11 | def test_get_whitespaces(): 12 | with open("test_data/image_segment.json", "r") as f: 13 | data = json.load(f) 14 | img_segment = ImageSegment(x1=data.get('x1'), 15 | y1=data.get('y1'), 16 | x2=data.get('x2'), 17 | y2=data.get('y2'), 18 | elements=[Cell(**c) for c in data.get('elements')]) 19 | 20 | result = get_whitespaces(segment=img_segment, vertical=True) 21 
| 22 | assert len(result) == 38 23 | 24 | 25 | def test_adjacent_whitespaces(): 26 | c_1 = Cell(x1=0, x2=10, y1=0, y2=10) 27 | c_2 = Cell(x1=10, x2=20, y1=0, y2=10) 28 | c_3 = Cell(x1=10, x2=20, y1=0, y2=20) 29 | c_4 = Cell(x1=20, x2=30, y1=0, y2=10) 30 | 31 | assert adjacent_whitespaces(c_1, c_2) 32 | assert adjacent_whitespaces(c_1, c_3) 33 | assert not adjacent_whitespaces(c_1, c_4) 34 | 35 | 36 | def test_identify_coherent_v_whitespaces(): 37 | v_whitespaces = [Cell(x1=0, x2=10, y1=0, y2=10), 38 | Cell(x1=10, x2=20, y1=0, y2=20), 39 | Cell(x1=20, x2=30, y1=0, y2=10), 40 | Cell(x1=50, x2=60, y1=0, y2=20), 41 | Cell(x1=60, x2=70, y1=0, y2=18), 42 | Cell(x1=70, x2=80, y1=0, y2=10), 43 | Cell(x1=80, x2=90, y1=0, y2=20), 44 | Cell(x1=100, x2=110, y1=0, y2=10)] 45 | 46 | result = identify_coherent_v_whitespaces(v_whitespaces=v_whitespaces) 47 | 48 | expected = [Cell(x1=10, x2=20, y1=0, y2=20), 49 | Cell(x1=50, x2=60, y1=0, y2=20), 50 | Cell(x1=80, x2=90, y1=0, y2=20), 51 | Cell(x1=100, x2=110, y1=0, y2=10)] 52 | 53 | assert set(result) == set(expected) 54 | 55 | 56 | def test_get_relevant_vertical_whitespaces(): 57 | with open("test_data/image_segment.json", "r") as f: 58 | data = json.load(f) 59 | img_segment = ImageSegment(x1=data.get('x1'), 60 | y1=data.get('y1'), 61 | x2=data.get('x2'), 62 | y2=data.get('y2'), 63 | elements=[Cell(**c) for c in data.get('elements')]) 64 | 65 | result = get_relevant_vertical_whitespaces(segment=img_segment, 66 | char_length=7.0, 67 | median_line_sep=14) 68 | 69 | assert len(result) == 12 70 | -------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/columns/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/processing/borderless_tables/columns/__init__.py 
-------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/columns/test_columns.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import json 4 | 5 | from img2table.tables.objects.cell import Cell 6 | from img2table.tables.processing.borderless_tables.columns import get_columns_delimiters, identify_columns 7 | from img2table.tables.processing.borderless_tables.model import ImageSegment, TableSegment, ColumnGroup, Whitespace, \ 8 | Column, VerticalWS 9 | 10 | 11 | def test_get_columns_delimiters(): 12 | with open("test_data/table_segment.json", "r") as f: 13 | data = json.load(f) 14 | 15 | table_segment = TableSegment(table_areas=[ 16 | ImageSegment(x1=tb.get('x1'), y1=tb.get('y1'), x2=tb.get('x2'), y2=tb.get('y2'), 17 | elements=[Cell(**el) for el in tb.get('elements')], 18 | whitespaces=[Whitespace(cells=[Cell(**el)]) for el in tb.get('whitespaces')], 19 | position=tb.get('position')) 20 | for tb in data.get("table_areas") 21 | ]) 22 | 23 | result = get_columns_delimiters(table_segment=table_segment, 24 | char_length=14) 25 | 26 | assert result == [Column(whitespaces=[VerticalWS(ws=Whitespace(cells=[Cell(x1=7, y1=0, x2=21, y2=544)])), 27 | VerticalWS(ws=Whitespace(cells=[Cell(x1=7, y1=496, x2=21, y2=660)]))]), 28 | Column(whitespaces=[VerticalWS(ws=Whitespace(cells=[Cell(x1=270, y1=69, x2=372, y2=544)])), 29 | VerticalWS(ws=Whitespace(cells=[Cell(x1=270, y1=496, x2=372, y2=626)]))]), 30 | Column(whitespaces=[VerticalWS(ws=Whitespace(cells=[Cell(x1=1659, y1=69, x2=1758, y2=544)])), 31 | VerticalWS(ws=Whitespace(cells=[Cell(x1=1659, y1=496, x2=1758, y2=626)]))]), 32 | Column(whitespaces=[VerticalWS(ws=Whitespace(cells=[Cell(x1=1845, y1=0, x2=1859, y2=544)])), 33 | VerticalWS(ws=Whitespace(cells=[Cell(x1=1845, y1=496, x2=1859, y2=660)]))])] 34 | 35 | 36 | def test_identify_columns(): 37 | with open("test_data/table_segment.json", "r") as f: 
38 | data = json.load(f) 39 | 40 | table_segment = TableSegment(table_areas=[ 41 | ImageSegment(x1=tb.get('x1'), y1=tb.get('y1'), x2=tb.get('x2'), y2=tb.get('y2'), 42 | elements=[Cell(**el) for el in tb.get('elements')], 43 | whitespaces=[Whitespace(cells=[Cell(**el)]) for el in tb.get('whitespaces')], 44 | position=tb.get('position')) 45 | for tb in data.get("table_areas") 46 | ]) 47 | 48 | result = identify_columns(table_segment=table_segment, 49 | char_length=14, 50 | median_line_sep=16) 51 | 52 | with open("test_data/delimiter_group.json", "r") as f: 53 | data = json.load(f) 54 | expected = ColumnGroup(columns=[Column(whitespaces=[VerticalWS(ws=Whitespace(cells=[Cell(**d)])) for d in col]) 55 | for col in data.get('columns')], 56 | elements=[Cell(**el) for el in data.get('elements')], 57 | char_length=14) 58 | 59 | assert result == expected 60 | -------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/columns/test_data/delimiter_group.json: -------------------------------------------------------------------------------- 1 | {"columns": [[{"x1": 14, "y1": 0, "x2": 14, "y2": 544}, {"x1": 14, "y1": 496, "x2": 14, "y2": 660}], [{"x1": 270, "y1": 69, "x2": 372, "y2": 544}, {"x1": 270, "y1": 496, "x2": 372, "y2": 626}], [{"x1": 1659, "y1": 69, "x2": 1758, "y2": 544}, {"x1": 1659, "y1": 496, "x2": 1758, "y2": 626}], [{"x1": 1852, "y1": 0, "x2": 1852, "y2": 544}, {"x1": 1852, "y1": 496, "x2": 1852, "y2": 660}]], "elements": [{"x1": 21, "y1": 458, "x2": 202, "y2": 496}, {"x1": 1760, "y1": 437, "x2": 1845, "y2": 474}, {"x1": 372, "y1": 413, "x2": 1659, "y2": 496}, {"x1": 23, "y1": 412, "x2": 240, "y2": 452}, {"x1": 1760, "y1": 329, "x2": 1844, "y2": 366}, {"x1": 372, "y1": 326, "x2": 1585, "y2": 373}, {"x1": 23, "y1": 326, "x2": 216, "y2": 373}, {"x1": 375, "y1": 242, "x2": 500, "y2": 286}, {"x1": 23, "y1": 241, "x2": 243, "y2": 287}, {"x1": 1760, "y1": 221, "x2": 1845, "y2": 258}, {"x1": 373, "y1": 196, 
"x2": 1648, "y2": 243}, {"x1": 23, "y1": 196, "x2": 240, "y2": 236}, {"x1": 1758, "y1": 113, "x2": 1843, "y2": 150}, {"x1": 374, "y1": 111, "x2": 563, "y2": 156}, {"x1": 22, "y1": 111, "x2": 270, "y2": 150}, {"x1": 21, "y1": 0, "x2": 1792, "y2": 69}, {"x1": 23, "y1": 587, "x2": 230, "y2": 626}, {"x1": 1760, "y1": 567, "x2": 1845, "y2": 604}, {"x1": 372, "y1": 564, "x2": 1468, "y2": 611}, {"x1": 23, "y1": 544, "x2": 236, "y2": 588}]} -------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/columns/test_data/table_segment.json: -------------------------------------------------------------------------------- 1 | {"elements": [{"x1": 21, "y1": 458, "x2": 202, "y2": 496}, {"x1": 1760, "y1": 437, "x2": 1845, "y2": 474}, {"x1": 372, "y1": 413, "x2": 1659, "y2": 496}, {"x1": 23, "y1": 412, "x2": 240, "y2": 452}, {"x1": 1760, "y1": 329, "x2": 1844, "y2": 366}, {"x1": 372, "y1": 326, "x2": 1585, "y2": 373}, {"x1": 23, "y1": 326, "x2": 216, "y2": 373}, {"x1": 375, "y1": 242, "x2": 500, "y2": 286}, {"x1": 23, "y1": 241, "x2": 243, "y2": 287}, {"x1": 1760, "y1": 221, "x2": 1845, "y2": 258}, {"x1": 373, "y1": 196, "x2": 1648, "y2": 243}, {"x1": 23, "y1": 196, "x2": 240, "y2": 236}, {"x1": 1758, "y1": 113, "x2": 1843, "y2": 150}, {"x1": 374, "y1": 111, "x2": 563, "y2": 156}, {"x1": 22, "y1": 111, "x2": 270, "y2": 150}, {"x1": 21, "y1": 0, "x2": 1792, "y2": 69}, {"x1": 23, "y1": 587, "x2": 230, "y2": 626}, {"x1": 1760, "y1": 567, "x2": 1845, "y2": 604}, {"x1": 372, "y1": 564, "x2": 1468, "y2": 611}, {"x1": 23, "y1": 544, "x2": 236, "y2": 588}], "whitespaces": [{"x1": 7, "y1": 0, "x2": 21, "y2": 496}, {"x1": 270, "y1": 69, "x2": 372, "y2": 496}, {"x1": 1659, "y1": 69, "x2": 1758, "y2": 496}, {"x1": 1845, "y1": 0, "x2": 1859, "y2": 496}, {"x1": 7, "y1": 544, "x2": 23, "y2": 660}, {"x1": 236, "y1": 544, "x2": 372, "y2": 626}, {"x1": 1468, "y1": 544, "x2": 1760, "y2": 626}, {"x1": 1845, "y1": 544, "x2": 1859, "y2": 
660}], "table_areas": [{"x1": 7, "y1": 0, "x2": 1859, "y2": 496, "elements": [{"x1": 21, "y1": 458, "x2": 202, "y2": 496}, {"x1": 1760, "y1": 437, "x2": 1845, "y2": 474}, {"x1": 372, "y1": 413, "x2": 1659, "y2": 496}, {"x1": 23, "y1": 412, "x2": 240, "y2": 452}, {"x1": 1760, "y1": 329, "x2": 1844, "y2": 366}, {"x1": 372, "y1": 326, "x2": 1585, "y2": 373}, {"x1": 23, "y1": 326, "x2": 216, "y2": 373}, {"x1": 375, "y1": 242, "x2": 500, "y2": 286}, {"x1": 23, "y1": 241, "x2": 243, "y2": 287}, {"x1": 1760, "y1": 221, "x2": 1845, "y2": 258}, {"x1": 373, "y1": 196, "x2": 1648, "y2": 243}, {"x1": 23, "y1": 196, "x2": 240, "y2": 236}, {"x1": 1758, "y1": 113, "x2": 1843, "y2": 150}, {"x1": 374, "y1": 111, "x2": 563, "y2": 156}, {"x1": 22, "y1": 111, "x2": 270, "y2": 150}, {"x1": 21, "y1": 0, "x2": 1792, "y2": 69}], "whitespaces": [{"x1": 7, "y1": 0, "x2": 21, "y2": 496}, {"x1": 270, "y1": 69, "x2": 372, "y2": 496}, {"x1": 1659, "y1": 69, "x2": 1758, "y2": 496}, {"x1": 1845, "y1": 0, "x2": 1859, "y2": 496}], "position": 1}, {"x1": 7, "y1": 544, "x2": 1859, "y2": 660, "elements": [{"x1": 23, "y1": 587, "x2": 230, "y2": 626}, {"x1": 1760, "y1": 567, "x2": 1845, "y2": 604}, {"x1": 372, "y1": 564, "x2": 1468, "y2": 611}, {"x1": 23, "y1": 544, "x2": 236, "y2": 588}], "whitespaces": [{"x1": 7, "y1": 544, "x2": 23, "y2": 660}, {"x1": 236, "y1": 544, "x2": 372, "y2": 626}, {"x1": 1468, "y1": 544, "x2": 1760, "y2": 626}, {"x1": 1845, "y1": 544, "x2": 1859, "y2": 660}], "position": 2}]} -------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/layout/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/processing/borderless_tables/layout/__init__.py -------------------------------------------------------------------------------- 
/tests/tables/processing/borderless_tables/layout/test_data/lines.json: -------------------------------------------------------------------------------- 1 | {"h_lines": [{"x1": 405, "y1": 158, "x2": 735, "y2": 158, "thickness": 1}, {"x1": 405, "y1": 196, "x2": 735, "y2": 196, "thickness": 1}, {"x1": 405, "y1": 271, "x2": 735, "y2": 271, "thickness": 1}, {"x1": 356, "y1": 356, "x2": 368, "y2": 356, "thickness": 1}, {"x1": 165, "y1": 372, "x2": 182, "y2": 372, "thickness": 1}, {"x1": 104, "y1": 587, "x2": 115, "y2": 587, "thickness": 1}, {"x1": 600, "y1": 592, "x2": 615, "y2": 592, "thickness": 1}, {"x1": 668, "y1": 600, "x2": 684, "y2": 600, "thickness": 1}, {"x1": 231, "y1": 603, "x2": 241, "y2": 603, "thickness": 1}, {"x1": 178, "y1": 643, "x2": 193, "y2": 643, "thickness": 1}, {"x1": 244, "y1": 755, "x2": 259, "y2": 755, "thickness": 1}, {"x1": 278, "y1": 755, "x2": 288, "y2": 755, "thickness": 1}, {"x1": 410, "y1": 791, "x2": 421, "y2": 791, "thickness": 1}, {"x1": 410, "y1": 807, "x2": 421, "y2": 807, "thickness": 1}, {"x1": 121, "y1": 842, "x2": 131, "y2": 842, "thickness": 1}, {"x1": 89, "y1": 866, "x2": 104, "y2": 866, "thickness": 1}], "v_lines": [{"x1": 87, "y1": 643, "x2": 87, "y2": 653, "thickness": 1}, {"x1": 326, "y1": 627, "x2": 326, "y2": 637, "thickness": 1}, {"x1": 405, "y1": 431, "x2": 405, "y2": 441, "thickness": 1}]} -------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/layout/test_data/test.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/processing/borderless_tables/layout/test_data/test.bmp -------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/layout/test_data/text_thresh.bmp: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/processing/borderless_tables/layout/test_data/text_thresh.bmp -------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/layout/test_image_elements.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import json 3 | 4 | import cv2 5 | 6 | from img2table.tables.objects.cell import Cell 7 | from img2table.tables.processing.borderless_tables.layout import get_image_elements 8 | 9 | 10 | def test_get_image_elements(): 11 | thresh = cv2.imread("test_data/text_thresh.bmp", cv2.IMREAD_GRAYSCALE) 12 | 13 | result = get_image_elements(thresh=thresh, 14 | char_length=6.0, 15 | median_line_sep=16) 16 | 17 | with open("test_data/elements.json", "r") as f: 18 | expected = [Cell(**el) for el in json.load(f)] 19 | 20 | assert result == expected 21 | -------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/layout/test_layout.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import json 3 | 4 | import cv2 5 | 6 | from img2table.tables import threshold_dark_areas 7 | from img2table.tables.objects.line import Line 8 | from img2table.tables.processing.borderless_tables import segment_image 9 | 10 | 11 | def test_segment_image(): 12 | img = cv2.cvtColor(cv2.imread("test_data/test.bmp"), cv2.COLOR_BGR2RGB) 13 | thresh = threshold_dark_areas(img=img, char_length=6) 14 | 15 | with open("test_data/lines.json", 'r') as f: 16 | data = json.load(f) 17 | lines = [Line(**el) for el in data.get('h_lines') + data.get('v_lines')] 18 | 19 | result = segment_image(thresh=thresh, 20 | lines=lines, 21 | char_length=6.0, 22 | median_line_sep=16) 23 | 24 | assert len(result) == 2 25 | 26 | assert len(result[0].elements) == 30 27 | assert 
len(result[0].table_areas) == 5 28 | assert len(result[0].whitespaces) == 21 29 | 30 | assert len(result[1].elements) == 4 31 | assert len(result[1].table_areas) == 1 32 | assert len(result[1].whitespaces) == 4 33 | -------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/layout/test_rlsa.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import json 3 | 4 | import cv2 5 | import numpy as np 6 | from numba import config 7 | 8 | from img2table.tables import threshold_dark_areas 9 | from img2table.tables.objects.line import Line 10 | from img2table.tables.processing.borderless_tables.layout.rlsa import identify_text_mask 11 | 12 | 13 | def test_identify_text_mask(): 14 | config.DISABLE_JIT = True 15 | 16 | img = cv2.cvtColor(cv2.imread("test_data/test.bmp"), cv2.COLOR_BGR2RGB) 17 | thresh = threshold_dark_areas(img=img, char_length=6) 18 | 19 | with open("test_data/lines.json", 'r') as f: 20 | data = json.load(f) 21 | lines = [Line(**el) for el in data.get('h_lines') + data.get('v_lines')] 22 | 23 | result = identify_text_mask(thresh=thresh, 24 | lines=lines, 25 | char_length=6.0) 26 | 27 | expected = cv2.imread("test_data/text_thresh.bmp", cv2.IMREAD_GRAYSCALE) 28 | 29 | assert np.array_equal(result, expected) 30 | -------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/rows/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/processing/borderless_tables/rows/__init__.py -------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/rows/test_data/h_whitespaces.json: -------------------------------------------------------------------------------- 1 | [{"x1": 93, "y1": 45, 
"x2": 1233, "y2": 45}, {"x1": 93, "y1": 78, "x2": 1233, "y2": 78}, {"x1": 93, "y1": 146, "x2": 1233, "y2": 146}, {"x1": 93, "y1": 212, "x2": 1233, "y2": 212}, {"x1": 93, "y1": 278, "x2": 1233, "y2": 278}, {"x1": 93, "y1": 344, "x2": 1233, "y2": 344}, {"x1": 93, "y1": 410, "x2": 1233, "y2": 410}, {"x1": 93, "y1": 476, "x2": 1233, "y2": 476}, {"x1": 93, "y1": 542, "x2": 1233, "y2": 542}, {"x1": 93, "y1": 608, "x2": 1233, "y2": 608}, {"x1": 93, "y1": 674, "x2": 1233, "y2": 674}, {"x1": 93, "y1": 740, "x2": 1233, "y2": 740}, {"x1": 93, "y1": 806, "x2": 1233, "y2": 806}, {"x1": 93, "y1": 872, "x2": 1233, "y2": 872}, {"x1": 93, "y1": 938, "x2": 1233, "y2": 938}, {"x1": 93, "y1": 1004, "x2": 1233, "y2": 1004}, {"x1": 93, "y1": 1084, "x2": 1233, "y2": 1084}, {"x1": 93, "y1": 1147, "x2": 1233, "y2": 1147}] -------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/rows/test_data/rows.json: -------------------------------------------------------------------------------- 1 | [{"x1": 53, "y1": 45, "x2": 1277, "y2": 45}, {"x1": 53, "y1": 78, "x2": 1277, "y2": 78}, {"x1": 53, "y1": 146, "x2": 1277, "y2": 146}, {"x1": 53, "y1": 212, "x2": 1277, "y2": 212}, {"x1": 53, "y1": 278, "x2": 1277, "y2": 278}, {"x1": 53, "y1": 344, "x2": 1277, "y2": 344}, {"x1": 53, "y1": 410, "x2": 1277, "y2": 410}, {"x1": 53, "y1": 476, "x2": 1277, "y2": 476}, {"x1": 53, "y1": 542, "x2": 1277, "y2": 542}, {"x1": 53, "y1": 608, "x2": 1277, "y2": 608}, {"x1": 53, "y1": 674, "x2": 1277, "y2": 674}, {"x1": 53, "y1": 740, "x2": 1277, "y2": 740}, {"x1": 53, "y1": 806, "x2": 1277, "y2": 806}, {"x1": 53, "y1": 872, "x2": 1277, "y2": 872}, {"x1": 53, "y1": 938, "x2": 1277, "y2": 938}, {"x1": 53, "y1": 1004, "x2": 1277, "y2": 1004}, {"x1": 53, "y1": 1084, "x2": 1277, "y2": 1084}, {"x1": 53, "y1": 1147, "x2": 1277, "y2": 1147}] -------------------------------------------------------------------------------- 
/tests/tables/processing/borderless_tables/rows/test_rows.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import json 4 | 5 | from img2table.tables.objects.cell import Cell 6 | from img2table.tables.processing.borderless_tables.model import ColumnGroup, Column, VerticalWS, Whitespace 7 | from img2table.tables.processing.borderless_tables.rows import \ 8 | identify_delimiter_group_rows, identify_row_delimiters, filter_coherent_row_delimiters, correct_delimiter_width 9 | 10 | 11 | def test_identify_row_delimiters(): 12 | with open("test_data/delimiter_group.json", "r") as f: 13 | data = json.load(f) 14 | column_group = ColumnGroup(columns=[Column(whitespaces=[VerticalWS(ws=Whitespace(cells=[Cell(**col)]))]) 15 | for col in data.get('delimiters')], 16 | elements=[Cell(**el) for el in data.get('elements')], 17 | char_length=14) 18 | 19 | result = identify_row_delimiters(column_group=column_group) 20 | 21 | with open("test_data/h_whitespaces.json", "r") as f: 22 | expected = [Cell(**c) for c in json.load(f)] 23 | 24 | assert result == expected 25 | 26 | 27 | def test_filter_coherent_row_delimiters(): 28 | row_delimiters = [Cell(x1=0, x2=100, y1=0, y2=0), 29 | Cell(x1=0, x2=80, y1=10, y2=10), 30 | Cell(x1=0, x2=100, y1=20, y2=20)] 31 | 32 | column_group = ColumnGroup(columns=[Column([VerticalWS(Whitespace(cells=[Cell(x1=0, x2=0, y1=0, y2=20)]))]), 33 | Column([VerticalWS(Whitespace(cells=[Cell(x1=30, x2=30, y1=0, y2=20)]))]), 34 | Column([VerticalWS(Whitespace(cells=[Cell(x1=60, x2=60, y1=0, y2=20)]))]), 35 | Column([VerticalWS(Whitespace(cells=[Cell(x1=100, x2=100, y1=0, y2=20)]))])], 36 | elements=[Cell(x1=85, x2=95, y1=2, y2=7)], 37 | char_length=14) 38 | 39 | result = filter_coherent_row_delimiters(row_delimiters=row_delimiters, 40 | column_group=column_group) 41 | 42 | expected = [Cell(x1=0, x2=100, y1=0, y2=0), 43 | Cell(x1=0, x2=100, y1=20, y2=20)] 44 | 45 | assert result == expected 46 | 47 | 48 | 
def test_correct_delimiter_width(): 49 | row_delimiters = [Cell(x1=0, x2=100, y1=0, y2=0), 50 | Cell(x1=0, x2=80, y1=10, y2=10), 51 | Cell(x1=30, x2=100, y1=20, y2=20), 52 | Cell(x1=0, x2=100, y1=30, y2=30)] 53 | 54 | contours = [Cell(x1=23, x2=34, y1=12, y2=18), 55 | Cell(x1=86, x2=93, y1=2, y2=9), 56 | Cell(x1=3, x2=17, y1=18, y2=24)] 57 | 58 | result = correct_delimiter_width(row_delimiters=row_delimiters, 59 | contours=contours) 60 | 61 | expected = [Cell(x1=0, x2=100, y1=0, y2=0), 62 | Cell(x1=0, x2=100, y1=10, y2=10), 63 | Cell(x1=17, x2=100, y1=20, y2=20), 64 | Cell(x1=0, x2=100, y1=30, y2=30)] 65 | 66 | assert result == expected 67 | 68 | 69 | def test_identify_delimiter_group_rows(): 70 | with open("test_data/delimiter_group.json", "r") as f: 71 | data = json.load(f) 72 | column_group = ColumnGroup(columns=[Column(whitespaces=[VerticalWS(ws=Whitespace(cells=[Cell(**col)]))]) 73 | for col in data.get('delimiters')], 74 | elements=[Cell(**el) for el in data.get('elements')], 75 | char_length=14) 76 | 77 | with open("test_data/contours.json", 'r') as f: 78 | contours = [Cell(**el) for el in json.load(f)] 79 | 80 | result = identify_delimiter_group_rows(column_group=column_group, 81 | contours=contours) 82 | 83 | assert len(result) == 18 84 | assert min([d.y1 for d in result]) == 45 85 | assert max([d.y2 for d in result]) == 1147 86 | assert min([d.x1 for d in result]) == 93 87 | assert max([d.x2 for d in result]) == 1233 88 | -------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/table/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/processing/borderless_tables/table/__init__.py -------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/table/test_data/rows.json: 
-------------------------------------------------------------------------------- 1 | [{"x1": 53, "y1": 45, "x2": 1277, "y2": 45}, {"x1": 53, "y1": 78, "x2": 1277, "y2": 78}, {"x1": 53, "y1": 146, "x2": 1277, "y2": 146}, {"x1": 53, "y1": 212, "x2": 1277, "y2": 212}, {"x1": 53, "y1": 278, "x2": 1277, "y2": 278}, {"x1": 53, "y1": 344, "x2": 1277, "y2": 344}, {"x1": 53, "y1": 410, "x2": 1277, "y2": 410}, {"x1": 53, "y1": 476, "x2": 1277, "y2": 476}, {"x1": 53, "y1": 542, "x2": 1277, "y2": 542}, {"x1": 53, "y1": 608, "x2": 1277, "y2": 608}, {"x1": 53, "y1": 674, "x2": 1277, "y2": 674}, {"x1": 53, "y1": 740, "x2": 1277, "y2": 740}, {"x1": 53, "y1": 806, "x2": 1277, "y2": 806}, {"x1": 53, "y1": 872, "x2": 1277, "y2": 872}, {"x1": 53, "y1": 938, "x2": 1277, "y2": 938}, {"x1": 53, "y1": 1004, "x2": 1277, "y2": 1004}, {"x1": 53, "y1": 1084, "x2": 1277, "y2": 1084}, {"x1": 53, "y1": 1147, "x2": 1277, "y2": 1147}] -------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/table/test_table.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import json 4 | 5 | from img2table.tables.objects.cell import Cell 6 | from img2table.tables.processing.borderless_tables import identify_table 7 | from img2table.tables.processing.borderless_tables.model import ColumnGroup, Column, VerticalWS, Whitespace 8 | 9 | 10 | def test_identify_table(): 11 | with open("test_data/delimiter_group.json", "r") as f: 12 | data = json.load(f) 13 | column_group = ColumnGroup(columns=[Column(whitespaces=[VerticalWS(ws=Whitespace(cells=[Cell(**col)]))]) 14 | for col in data.get('delimiters')], 15 | elements=[Cell(**c) for c in data.get('elements')], 16 | char_length=4.66) 17 | 18 | with open("test_data/contours.json", 'r') as f: 19 | contours = [Cell(**el) for el in json.load(f)] 20 | 21 | with open("test_data/rows.json", "r") as f: 22 | row_delimiters = [Cell(**c) for c in 
json.load(f)] 23 | 24 | result = identify_table(columns=column_group, 25 | row_delimiters=row_delimiters, 26 | contours=contours, 27 | median_line_sep=16, 28 | char_length=4.66) 29 | 30 | assert result.nb_rows == 17 31 | assert result.nb_columns == 8 32 | assert (result.x1, result.y1, result.x2, result.y2) == (91, 45, 1235, 1147) 33 | -------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/table/test_table_creation.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import json 4 | 5 | from img2table.tables.objects.cell import Cell 6 | from img2table.tables.processing.borderless_tables.model import ColumnGroup, VerticalWS, Column, Whitespace 7 | from img2table.tables.processing.borderless_tables.table.table_creation import get_table 8 | 9 | 10 | def test_get_table(): 11 | with open("test_data/delimiter_group.json", "r") as f: 12 | data = json.load(f) 13 | column_group = ColumnGroup(columns=[Column(whitespaces=[VerticalWS(ws=Whitespace(cells=[Cell(**col)]))]) 14 | for col in data.get('delimiters')], 15 | elements=[Cell(**c) for c in data.get('elements')], 16 | char_length=4.66) 17 | 18 | with open("test_data/contours.json", 'r') as f: 19 | contours = [Cell(**el) for el in json.load(f)] 20 | 21 | with open("test_data/rows.json", "r") as f: 22 | row_delimiters = [Cell(**c) for c in json.load(f)] 23 | 24 | result = get_table(columns=column_group, 25 | row_delimiters=row_delimiters, 26 | contours=contours) 27 | 28 | assert result.nb_rows == 17 29 | assert result.nb_columns == 8 30 | assert (result.x1, result.y1, result.x2, result.y2) == (91, 45, 1235, 1147) 31 | -------------------------------------------------------------------------------- /tests/tables/processing/common/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/processing/common/__init__.py -------------------------------------------------------------------------------- /tests/tables/processing/common/test_common.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import cv2 3 | 4 | from img2table.tables.objects.cell import Cell 5 | from img2table.tables.processing.common import is_contained_cell, merge_contours, get_contours_cell 6 | 7 | 8 | def test_is_contained_cell(): 9 | cell_1 = Cell(x1=0, x2=20, y1=0, y2=20) 10 | cell_2 = Cell(x1=0, x2=40, y1=0, y2=25) 11 | cell_3 = Cell(x1=50, x2=70, y1=123, y2=256) 12 | 13 | assert is_contained_cell(inner_cell=cell_1, outer_cell=cell_2) 14 | assert not is_contained_cell(inner_cell=cell_2, outer_cell=cell_1) 15 | assert not is_contained_cell(inner_cell=cell_1, outer_cell=cell_3) 16 | assert not is_contained_cell(inner_cell=cell_2, outer_cell=cell_3) 17 | 18 | 19 | def test_merge_contours(): 20 | contours = [Cell(x1=0, x2=20, y1=0, y2=20), 21 | Cell(x1=0, x2=20, y1=10, y2=20), 22 | Cell(x1=60, x2=80, y1=0, y2=20), 23 | Cell(x1=10, x2=20, y1=100, y2=200)] 24 | 25 | # Do not merge by axis 26 | expected = [Cell(x1=0, x2=20, y1=0, y2=20), 27 | Cell(x1=60, x2=80, y1=0, y2=20), 28 | Cell(x1=10, x2=20, y1=100, y2=200)] 29 | assert set(merge_contours(contours=contours, vertically=None)) == set(expected) 30 | 31 | # Merge vertically 32 | expected_vertical = [Cell(x1=0, x2=80, y1=0, y2=20), Cell(x1=10, x2=20, y1=100, y2=200)] 33 | assert merge_contours(contours=contours, vertically=True) == expected_vertical 34 | 35 | # Merge horizontally 36 | expected_horizontal = [Cell(x1=0, x2=20, y1=0, y2=200), Cell(x1=60, x2=80, y1=0, y2=20)] 37 | assert merge_contours(contours=contours, vertically=False) == expected_horizontal 38 | 39 | 40 | def test_get_contours_cell(): 41 | img = cv2.cvtColor(cv2.imread("test_data/test.jpg"), 
cv2.COLOR_BGR2RGB) 42 | cell = Cell(x1=0, x2=img.shape[1], y1=0, y2=img.shape[0]) 43 | 44 | result = get_contours_cell(img=img, 45 | cell=cell, 46 | margin=5, 47 | blur_size=5, 48 | kernel_size=9, 49 | merge_vertically=True) 50 | 51 | expected = [Cell(x1=51, y1=19, x2=518, y2=146), 52 | Cell(x1=60, y1=156, x2=534, y2=691), 53 | Cell(x1=65, y1=765, x2=543, y2=811)] 54 | 55 | assert result == expected 56 | -------------------------------------------------------------------------------- /tests/tables/processing/common/test_data/test.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/processing/common/test_data/test.jpg -------------------------------------------------------------------------------- /tests/tables/processing/text/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/processing/text/__init__.py -------------------------------------------------------------------------------- /tests/tables/processing/text/test_data/table.json: -------------------------------------------------------------------------------- 1 | [[{"x1": 12, "y1": 67, "x2": 104, "y2": 177}, {"x1": 104, "y1": 67, "x2": 288, "y2": 177}, {"x1": 288, "y1": 67, "x2": 440, "y2": 177}, {"x1": 440, "y1": 67, "x2": 596, "y2": 177}, {"x1": 596, "y1": 67, "x2": 734, "y2": 177}, {"x1": 734, "y1": 67, "x2": 1043, "y2": 177}], [{"x1": 12, "y1": 177, "x2": 104, "y2": 220}, {"x1": 104, "y1": 177, "x2": 288, "y2": 220}, {"x1": 288, "y1": 177, "x2": 440, "y2": 220}, {"x1": 440, "y1": 177, "x2": 596, "y2": 220}, {"x1": 596, "y1": 177, "x2": 734, "y2": 220}, {"x1": 734, "y1": 177, "x2": 1043, "y2": 220}], [{"x1": 12, "y1": 220, "x2": 104, "y2": 264}, {"x1": 104, "y1": 220, "x2": 288, "y2": 264}, {"x1": 288, "y1": 220, "x2": 440, 
"y2": 264}, {"x1": 440, "y1": 220, "x2": 596, "y2": 264}, {"x1": 596, "y1": 220, "x2": 734, "y2": 264}, {"x1": 734, "y1": 220, "x2": 1043, "y2": 264}], [{"x1": 12, "y1": 264, "x2": 104, "y2": 341}, {"x1": 104, "y1": 264, "x2": 288, "y2": 341}, {"x1": 288, "y1": 264, "x2": 440, "y2": 341}, {"x1": 440, "y1": 264, "x2": 596, "y2": 341}, {"x1": 596, "y1": 264, "x2": 734, "y2": 341}, {"x1": 734, "y1": 264, "x2": 1043, "y2": 341}], [{"x1": 12, "y1": 341, "x2": 104, "y2": 384}, {"x1": 104, "y1": 341, "x2": 288, "y2": 384}, {"x1": 288, "y1": 341, "x2": 440, "y2": 384}, {"x1": 440, "y1": 341, "x2": 596, "y2": 384}, {"x1": 596, "y1": 341, "x2": 734, "y2": 384}, {"x1": 734, "y1": 341, "x2": 1043, "y2": 384}], [{"x1": 12, "y1": 384, "x2": 104, "y2": 428}, {"x1": 104, "y1": 384, "x2": 288, "y2": 428}, {"x1": 288, "y1": 384, "x2": 440, "y2": 428}, {"x1": 440, "y1": 384, "x2": 596, "y2": 428}, {"x1": 596, "y1": 384, "x2": 734, "y2": 428}, {"x1": 734, "y1": 384, "x2": 1043, "y2": 428}], [{"x1": 12, "y1": 428, "x2": 104, "y2": 471}, {"x1": 104, "y1": 428, "x2": 288, "y2": 471}, {"x1": 288, "y1": 428, "x2": 440, "y2": 471}, {"x1": 440, "y1": 428, "x2": 596, "y2": 471}, {"x1": 596, "y1": 428, "x2": 734, "y2": 471}, {"x1": 734, "y1": 428, "x2": 1043, "y2": 471}], [{"x1": 12, "y1": 471, "x2": 104, "y2": 514}, {"x1": 104, "y1": 471, "x2": 288, "y2": 514}, {"x1": 288, "y1": 471, "x2": 440, "y2": 514}, {"x1": 440, "y1": 471, "x2": 596, "y2": 514}, {"x1": 596, "y1": 471, "x2": 734, "y2": 514}, {"x1": 734, "y1": 471, "x2": 1043, "y2": 514}], [{"x1": 12, "y1": 514, "x2": 104, "y2": 558}, {"x1": 104, "y1": 514, "x2": 288, "y2": 558}, {"x1": 288, "y1": 514, "x2": 440, "y2": 558}, {"x1": 440, "y1": 514, "x2": 596, "y2": 558}, {"x1": 596, "y1": 514, "x2": 734, "y2": 558}, {"x1": 734, "y1": 514, "x2": 1043, "y2": 558}], [{"x1": 12, "y1": 558, "x2": 104, "y2": 635}, {"x1": 104, "y1": 558, "x2": 288, "y2": 635}, {"x1": 288, "y1": 558, "x2": 440, "y2": 635}, {"x1": 440, "y1": 558, "x2": 596, "y2": 
635}, {"x1": 596, "y1": 558, "x2": 734, "y2": 635}, {"x1": 734, "y1": 558, "x2": 1043, "y2": 635}], [{"x1": 12, "y1": 635, "x2": 104, "y2": 678}, {"x1": 104, "y1": 635, "x2": 288, "y2": 678}, {"x1": 288, "y1": 635, "x2": 440, "y2": 678}, {"x1": 440, "y1": 635, "x2": 596, "y2": 678}, {"x1": 596, "y1": 635, "x2": 734, "y2": 678}, {"x1": 734, "y1": 635, "x2": 1043, "y2": 678}]] -------------------------------------------------------------------------------- /tests/tables/processing/text/test_data/test.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/processing/text/test_data/test.jpg -------------------------------------------------------------------------------- /tests/tables/processing/text/test_titles.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import json 3 | 4 | import cv2 5 | import polars as pl 6 | 7 | from img2table.ocr.data import OCRDataframe 8 | from img2table.tables.objects.cell import Cell 9 | from img2table.tables.objects.row import Row 10 | from img2table.tables.objects.table import Table 11 | from img2table.tables.processing.text.titles import get_title_tables 12 | 13 | 14 | def test_get_title_tables(): 15 | img = cv2.cvtColor(cv2.imread("test_data/test.jpg"), cv2.COLOR_BGR2RGB) 16 | with open("test_data/table.json", "r") as f: 17 | table = Table(rows=[Row(cells=[Cell(**el) for el in row]) for row in json.load(f)]) 18 | ocr_df = OCRDataframe(df=pl.read_csv("test_data/ocr.csv", separator=";")) 19 | 20 | result = get_title_tables(img=img, tables=[table], ocr_df=ocr_df) 21 | 22 | assert result[0].title == "10 most populous countries" 23 | assert get_title_tables(img=img, tables=[], ocr_df=ocr_df) == [] 24 | --------------------------------------------------------------------------------