├── .github ├── dependabot.yml └── workflows │ ├── publish.yml │ └── test_workflow.yml ├── .gitignore ├── LICENSE.txt ├── Makefile ├── README.md ├── activate_venv ├── examples ├── Basic_usage.ipynb ├── Implicit.ipynb ├── borderless.ipynb ├── data │ ├── borderless.jpg │ ├── borderless │ │ ├── 1.png │ │ ├── 2.png │ │ ├── 3.png │ │ └── 4.png │ ├── implicit.png │ ├── tables.pdf │ ├── tables.png │ └── tables.xlsx └── utils.py ├── pyproject.toml ├── pytest.ini ├── requirements-dev.txt ├── requirements.txt ├── setup.cfg ├── setup.py ├── src └── img2table │ ├── __init__.py │ ├── document │ ├── __init__.py │ ├── base │ │ ├── __init__.py │ │ └── rotation.py │ ├── image.py │ └── pdf.py │ ├── ocr │ ├── __init__.py │ ├── aws_textract.py │ ├── azure.py │ ├── base.py │ ├── data.py │ ├── doctr.py │ ├── easyocr.py │ ├── google_vision.py │ ├── paddle.py │ ├── pdf.py │ ├── surya.py │ └── tesseract.py │ └── tables │ ├── __init__.py │ ├── image.py │ ├── metrics.py │ ├── objects │ ├── __init__.py │ ├── cell.py │ ├── extraction.py │ ├── line.py │ ├── row.py │ └── table.py │ └── processing │ ├── __init__.py │ ├── bordered_tables │ ├── __init__.py │ ├── cells │ │ ├── __init__.py │ │ ├── deduplication.py │ │ └── identification.py │ ├── lines.py │ └── tables │ │ ├── __init__.py │ │ ├── cell_clustering.py │ │ ├── consecutive.py │ │ ├── implicit.py │ │ ├── semi_bordered.py │ │ └── table_creation.py │ ├── borderless_tables │ ├── __init__.py │ ├── columns.py │ ├── layout │ │ ├── __init__.py │ │ ├── column_segments.py │ │ ├── image_elements.py │ │ ├── rlsa.py │ │ └── table_segments.py │ ├── model.py │ ├── rows.py │ ├── table │ │ ├── __init__.py │ │ ├── coherency.py │ │ └── table_creation.py │ └── whitespaces.py │ ├── common.py │ └── text │ ├── __init__.py │ └── titles.py └── tests ├── __init__.py ├── _mock_data ├── azure.pkl ├── surya.pkl ├── tesseract_hocr.html ├── textract.json ├── vision.json └── vision.pkl ├── conftest.py ├── document ├── __init__.py ├── base │ ├── __init__.py │ ├── test_data 
│ │ └── test.png │ └── test_rotation.py ├── image │ ├── __init__.py │ ├── test_data │ │ ├── blank.png │ │ ├── dark.png │ │ ├── expected.xlsx │ │ └── test.png │ └── test_image.py └── pdf │ ├── __init__.py │ ├── test_data │ └── test.pdf │ └── test_pdf.py ├── ocr ├── __init__.py ├── aws_textract │ ├── __init__.py │ ├── test_aws_textract.py │ └── test_data │ │ ├── content.json │ │ ├── ocr_df.csv │ │ └── test.png ├── azure │ ├── __init__.py │ ├── test_azure.py │ └── test_data │ │ ├── ocr_df.csv │ │ └── test.png ├── data │ ├── __init__.py │ ├── test_data │ │ ├── expected_table.json │ │ ├── ocr_df.csv │ │ └── table.json │ └── test_ocr_data.py ├── doctr │ ├── __init__.py │ ├── test_data │ │ ├── ocr.pkl │ │ ├── ocr_df.csv │ │ └── test.png │ └── test_doctr.py ├── easyocr │ ├── __init__.py │ ├── test_data │ │ ├── ocr.json │ │ ├── ocr_df.csv │ │ └── test.png │ └── test_easyocr.py ├── google_vision │ ├── __init__.py │ ├── test_data │ │ ├── expected_content.json │ │ ├── ocr_df.csv │ │ └── test.png │ └── test_google_vision.py ├── paddle │ ├── __init__.py │ ├── test_data │ │ ├── hocr.json │ │ ├── ocr_df.csv │ │ └── test.png │ └── test_paddle.py ├── pdf │ ├── __init__.py │ ├── test_data │ │ ├── content.json │ │ ├── ocr_df.csv │ │ └── test.pdf │ └── test_pdf_ocr.py ├── surya │ ├── __init__.py │ ├── test_data │ │ ├── ocr_df.csv │ │ └── test.png │ └── test_surya.py └── tesseract │ ├── __init__.py │ ├── test_data │ ├── ocr_df.csv │ └── test.png │ └── test_tesseract.py └── tables ├── __init__.py ├── image ├── __init__.py ├── test_data │ ├── blank.png │ ├── ocr.csv │ └── test.png ├── test_image.py └── test_metrics.py ├── objects ├── __init__.py ├── test_data │ ├── expected_tables.json │ ├── ocr.csv │ ├── table.html │ └── tables.json ├── test_extraction.py ├── test_line.py ├── test_row.py └── test_table.py └── processing ├── __init__.py ├── bordered_tables ├── __init__.py ├── cells │ ├── __init__.py │ ├── test_cells.py │ ├── test_data │ │ ├── expected.csv │ │ ├── expected_ident_cells.csv 
│ │ ├── expected_potential_cells.csv │ │ ├── expected_vertical_dedup.csv │ │ └── lines.json │ ├── test_deduplication_cells.py │ └── test_identification_cells.py ├── lines │ ├── __init__.py │ ├── test_data │ │ ├── contours.json │ │ ├── expected.json │ │ └── test.png │ └── test_lines.py └── tables │ ├── __init__.py │ ├── test_cell_clustering.py │ ├── test_data │ ├── cell_clusters_normalized.json │ ├── cells.json │ ├── cells_clustered.json │ ├── contours.json │ ├── contours_implicit.json │ ├── expected.json │ ├── lines.json │ ├── table_implicit.json │ ├── tables_from_cells.json │ └── word_image.png │ ├── test_implicit.py │ ├── test_semi_bordered.py │ ├── test_table_creation.py │ └── test_tables.py ├── borderless_tables ├── __init__.py ├── borderless_tables │ ├── __init__.py │ ├── test_borderless_tables.py │ ├── test_data │ │ ├── contours.json │ │ ├── image_segment.json │ │ ├── lines.json │ │ └── test.png │ └── test_whitespaces.py ├── columns │ ├── __init__.py │ ├── test_columns.py │ └── test_data │ │ ├── delimiter_group.json │ │ └── table_segment.json ├── layout │ ├── __init__.py │ ├── test_column_segments.py │ ├── test_data │ │ ├── elements.json │ │ ├── lines.json │ │ ├── test.bmp │ │ └── text_thresh.bmp │ ├── test_image_elements.py │ ├── test_layout.py │ ├── test_rlsa.py │ └── test_table_segments.py ├── rows │ ├── __init__.py │ ├── test_data │ │ ├── contours.json │ │ ├── delimiter_group.json │ │ ├── h_whitespaces.json │ │ └── rows.json │ └── test_rows.py └── table │ ├── __init__.py │ ├── test_data │ ├── contours.json │ ├── delimiter_group.json │ └── rows.json │ ├── test_table.py │ └── test_table_creation.py ├── common ├── __init__.py ├── test_common.py └── test_data │ └── test.jpg └── text ├── __init__.py ├── test_data ├── ocr.csv ├── table.json └── test.jpg └── test_titles.py /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "pip" 4 | directory: "/" 5 | 
schedule: 6 | interval: "daily" 7 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish to Pypi 2 | 3 | on: 4 | release: 5 | types: [released] 6 | 7 | jobs: 8 | publish: 9 | runs-on: ubuntu-latest 10 | 11 | steps: 12 | - uses: actions/checkout@v3 13 | - name: Set up Python 14 | uses: actions/setup-python@v3 15 | with: 16 | python-version: 3.8 17 | - name: Build package 18 | run: make build 19 | - name: Publish to Pypi 20 | uses: pypa/gh-action-pypi-publish@release/v1 21 | with: 22 | password: ${{ secrets.PYPI_TOKEN }} 23 | 24 | -------------------------------------------------------------------------------- /.github/workflows/test_workflow.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | strategy: 10 | fail-fast: false 11 | matrix: 12 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] 13 | 14 | steps: 15 | - uses: actions/checkout@v3 16 | - name: Set up Python ${{ matrix.python-version }} 17 | uses: actions/setup-python@v3 18 | with: 19 | python-version: ${{ matrix.python-version }} 20 | cache: 'pip' 21 | - name: Install dependencies 22 | run: make venv 23 | - name: Perform tests 24 | run: make test 25 | env: 26 | NUMBA_DISABLE_JIT: 1 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .ipynb_checkpoints 3 | __pycache__ 4 | .pytest_cache 5 | .coverage 6 | *.egg-info 7 | dist 8 | build 9 | 10 | certs 11 | venv 12 | profiling* 13 | examples/testing -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | 
MIT License 2 | 3 | Copyright (c) 2023 Xavier Canton 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 6 | copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 7 | 8 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 9 | 10 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 11 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | VENV = ./activate_venv 4 | DIR := $(shell pwd) 5 | export PYTHONPATH := $(DIR)/src 6 | 7 | # Virtual environment commands 8 | venv: 9 | python -m venv ./venv || true 10 | . $(VENV); python -m pip install pip wheel --upgrade; 11 | . $(VENV); python -m pip install -r requirements-dev.txt --extra-index-url https://download.pytorch.org/whl/cpu 12 | 13 | update: 14 | . $(VENV); python -m pip install -U -r requirements-dev.txt --extra-index-url https://download.pytorch.org/whl/cpu 15 | 16 | # Test commands 17 | test: 18 | . $(VENV); pytest --cov-report term --cov=src 19 | 20 | # Examples commands 21 | jupyter-examples: 22 | . 
$(VENV); cd examples && jupyter notebook 23 | 24 | update-examples: 25 | . $(VENV); 26 | for f in $(PWD)/examples/*.ipynb; do \ 27 | jupyter nbconvert --to notebook --execute $$f --inplace; \ 28 | done 29 | 30 | # Build commands 31 | build: venv 32 | . $(VENV); python setup.py sdist bdist_wheel 33 | 34 | 35 | .PHONY: venv -------------------------------------------------------------------------------- /activate_venv: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | if [ "$(uname)" == "Darwin" ]; then 4 | source .venv/bin/activate 5 | elif [ "$(expr substr $(uname -s) 1 5)" == "Linux" ]; then 6 | source .venv/bin/activate 7 | elif [ "$(expr substr $(uname -s) 1 10)" == "MINGW32_NT" ]; then 8 | source venv/Scripts/activate 9 | elif [ "$(expr substr $(uname -s) 1 10)" == "MINGW64_NT" ]; then 10 | source venv/Scripts/activate 11 | fi -------------------------------------------------------------------------------- /examples/data/borderless.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/examples/data/borderless.jpg -------------------------------------------------------------------------------- /examples/data/borderless/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/examples/data/borderless/1.png -------------------------------------------------------------------------------- /examples/data/borderless/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/examples/data/borderless/2.png -------------------------------------------------------------------------------- /examples/data/borderless/3.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/examples/data/borderless/3.png -------------------------------------------------------------------------------- /examples/data/borderless/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/examples/data/borderless/4.png -------------------------------------------------------------------------------- /examples/data/implicit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/examples/data/implicit.png -------------------------------------------------------------------------------- /examples/data/tables.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/examples/data/tables.pdf -------------------------------------------------------------------------------- /examples/data/tables.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/examples/data/tables.png -------------------------------------------------------------------------------- /examples/data/tables.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/examples/data/tables.xlsx -------------------------------------------------------------------------------- /examples/utils.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import cv2 4 | import numpy as np 5 | 
6 | from img2table.document import Image 7 | from img2table.ocr.base import OCRInstance 8 | 9 | 10 | def display_borderless_tables(img: Image, ocr: OCRInstance) -> np.ndarray: 11 | """ 12 | Create display of borderless table extraction 13 | :param img: Image object 14 | :param ocr: OCRInstance object 15 | :return: display image 16 | """ 17 | # Extract tables 18 | extracted_tables = img.extract_tables(ocr=ocr, 19 | borderless_tables=True) 20 | 21 | # Create image displaying extracted tables 22 | display_image = list(img.images)[0].copy() 23 | for tb in extracted_tables: 24 | for row in tb.content.values(): 25 | for cell in row: 26 | cv2.rectangle(display_image, (cell.bbox.x1, cell.bbox.y1), (cell.bbox.x2, cell.bbox.y2), 27 | (255, 0, 0), 2) 28 | 29 | # Create white separator image 30 | width = min(display_image.shape[1] // 10, 100) 31 | white_img = cv2.cvtColor(255 * np.ones((display_image.shape[0], width), dtype=np.uint8), cv2.COLOR_GRAY2RGB) 32 | 33 | # Stack images 34 | final_image = np.hstack([list(img.images)[0].copy(), 35 | white_img, 36 | display_image]) 37 | 38 | return final_image 39 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["pbr>=5.7.0", "setuptools>=42"] 3 | build-backend = "pbr.build" 4 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | minversion = 6.0 3 | pythonpath = . 
src 4 | testpaths = tests 5 | log_level = INFO 6 | python_files = test_*.py 7 | filterwarnings = ignore::UserWarning -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | 3 | # GCP 4 | google-cloud-vision 5 | 6 | # AWS 7 | boto3 8 | 9 | # Azure 10 | azure-cognitiveservices-vision-computervision 11 | 12 | # Paddle 13 | paddlepaddle; python_version < '3.13' 14 | paddleocr>=2.0.6; python_version < '3.13' 15 | 16 | # EasyOCR 17 | easyocr >= 1.7.0 18 | pillow>=10.0.1 19 | 20 | # docTR 21 | python-doctr>=0.8; python_version < '3.12' 22 | 23 | # Surya 24 | surya-ocr>=0.9; python_version >= '3.10' 25 | 26 | # Test dependencies 27 | pytest >= 6 28 | pytest-cov 29 | pytest-xdist 30 | openpyxl 31 | sewar 32 | pipdeptree 33 | pyinstrument 34 | 35 | # Examples dependencies 36 | jupyter 37 | ipython-autotime 38 | Pillow 39 | 40 | # Build tools 41 | wheel 42 | setuptools 43 | pbr 44 | twine -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | polars[pandas]>=1.2 2 | pyarrow>=7 3 | numpy 4 | pypdfium2==4.30.0 5 | opencv-contrib-python>=4 6 | numba 7 | beautifulsoup4 8 | xlsxwriter>=3.0.6 9 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = img2table 3 | author = Xavier Canton 4 | summary = img2table is a table identification and extraction Python Library for PDF and images, based on OpenCV image processing 5 | license = MIT 6 | description_file = README.md 7 | description_content_type = text/markdown 8 | home_page = https://github.com/xavctn/img2table 9 | python_requires = >=3.8, <3.14 10 | classifiers= 11 | Programming Language :: Python :: 3 :: 
Only 12 | License :: OSI Approved :: MIT License 13 | Operating System :: OS Independent 14 | 15 | [options] 16 | package_dir = src/ 17 | packages = img2table 18 | 19 | 20 | [extras] 21 | gcp = 22 | google-cloud-vision 23 | requests 24 | aws = 25 | boto3 26 | azure = 27 | azure-cognitiveservices-vision-computervision 28 | paddle = 29 | paddlepaddle 30 | paddleocr>=2.0.6 31 | easyocr = 32 | easyocr>=1.7.0 33 | pillow>=10.0.1 34 | surya = 35 | surya-ocr>=0.9:python_version>='3.10' 36 | 37 | 38 | [pbr] 39 | skip_authors = True 40 | skip_changelog = True 41 | skip_reno = True 42 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | setup_requires=['pbr'], 5 | package_dir={'': 'src'}, 6 | pbr=True, 7 | ) 8 | -------------------------------------------------------------------------------- /src/img2table/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | 4 | class Validations: 5 | def __post_init__(self): 6 | """Run validation methods if declared. 7 | The validation method can be a simple check 8 | that raises ValueError or a transformation to 9 | the field value. 
10 | The validation is performed by calling a function named: 11 | `validate_(self, value, field) -> field.type` 12 | """ 13 | for name, field in self.__dataclass_fields__.items(): 14 | method = getattr(self, f"validate_{name}", None) 15 | setattr(self, name, method(getattr(self, name), field=field)) 16 | -------------------------------------------------------------------------------- /src/img2table/document/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from img2table.document.image import Image 4 | from img2table.document.pdf import PDF 5 | -------------------------------------------------------------------------------- /src/img2table/document/image.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import typing 3 | from dataclasses import dataclass 4 | from functools import cached_property 5 | from typing import List 6 | 7 | import cv2 8 | import numpy as np 9 | 10 | from img2table.document.base import Document 11 | from img2table.document.base.rotation import fix_rotation_image 12 | from img2table.tables.objects.extraction import ExtractedTable 13 | 14 | if typing.TYPE_CHECKING: 15 | from img2table.ocr.base import OCRInstance 16 | 17 | 18 | @dataclass 19 | class Image(Document): 20 | detect_rotation: bool = False 21 | 22 | def __post_init__(self): 23 | self.pages = None 24 | 25 | super(Image, self).__post_init__() 26 | 27 | @cached_property 28 | def images(self) -> List[np.ndarray]: 29 | img = cv2.imdecode(np.frombuffer(self.bytes, np.uint8), cv2.IMREAD_COLOR) 30 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 31 | if self.detect_rotation: 32 | rotated_img, _ = fix_rotation_image(img=img) 33 | return [rotated_img] 34 | else: 35 | return [img] 36 | 37 | def extract_tables(self, ocr: "OCRInstance" = None, implicit_rows: bool = False, implicit_columns: bool = False, 38 | borderless_tables: bool = False, min_confidence: int = 50) -> 
List[ExtractedTable]: 39 | """ 40 | Extract tables from document 41 | :param ocr: OCRInstance object used to extract table content 42 | :param implicit_rows: boolean indicating if implicit rows are splitted 43 | :param implicit_columns: boolean indicating if implicit columns are splitted 44 | :param borderless_tables: boolean indicating if borderless tables should be detected 45 | :param min_confidence: minimum confidence level from OCR in order to process text, from 0 (worst) to 99 (best) 46 | :return: list of extracted tables 47 | """ 48 | extracted_tables = super(Image, self).extract_tables(ocr=ocr, 49 | implicit_rows=implicit_rows, 50 | implicit_columns=implicit_columns, 51 | borderless_tables=borderless_tables, 52 | min_confidence=min_confidence) 53 | return extracted_tables.get(0) 54 | -------------------------------------------------------------------------------- /src/img2table/document/pdf.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import typing 3 | from dataclasses import dataclass 4 | from typing import Dict, List, Optional 5 | 6 | import cv2 7 | import numpy as np 8 | import pypdfium2 9 | 10 | from img2table.document.base import Document 11 | from img2table.document.base.rotation import fix_rotation_image 12 | from img2table.ocr.pdf import PdfOCR 13 | 14 | if typing.TYPE_CHECKING: 15 | from img2table.ocr.base import OCRInstance 16 | from img2table.tables.objects.extraction import ExtractedTable 17 | from img2table.tables.objects.table import Table 18 | 19 | 20 | @dataclass 21 | class PDF(Document): 22 | pages: List[int] = None 23 | detect_rotation: bool = False 24 | pdf_text_extraction: bool = True 25 | _rotated: bool = False 26 | _images: List[np.ndarray] = None 27 | 28 | def validate_pages(self, value, **_) -> Optional[List[int]]: 29 | if value is not None: 30 | if not isinstance(value, list): 31 | raise TypeError(f"Invalid type {type(value)} for pages argument") 32 | if not 
all(isinstance(x, int) for x in value): 33 | raise TypeError("All values in pages argument should be integers") 34 | return value 35 | 36 | def validate_pdf_text_extraction(self, value, **_) -> int: 37 | if not isinstance(value, bool): 38 | raise TypeError(f"Invalid type {type(value)} for pdf_text_extraction argument") 39 | return value 40 | 41 | def validate__rotated(self, value, **_) -> int: 42 | return value 43 | 44 | def validate__images(self, value, **_) -> int: 45 | return value 46 | 47 | @property 48 | def images(self) -> List[np.ndarray]: 49 | if self._images is not None: 50 | return self._images 51 | 52 | doc = pypdfium2.PdfDocument(input=self.bytes) 53 | 54 | # Get all images 55 | images = list() 56 | for page_number in self.pages or range(len(doc)): 57 | page = doc[page_number] 58 | img = cv2.cvtColor(page.render(scale=200 / 72).to_numpy(), cv2.COLOR_BGR2RGB) 59 | # Handle rotation if needed 60 | if self.detect_rotation: 61 | final, self._rotated = fix_rotation_image(img=img) 62 | else: 63 | final, self._rotated = img, False 64 | images.append(final) 65 | 66 | self._images = images 67 | doc.close() 68 | return images 69 | 70 | def get_table_content(self, tables: Dict[int, List["Table"]], ocr: "OCRInstance", 71 | min_confidence: int) -> Dict[int, List["ExtractedTable"]]: 72 | if not self._rotated and self.pdf_text_extraction: 73 | # Get pages where tables have been detected 74 | table_pages = [self.pages[k] if self.pages else k for k, v in tables.items() if len(v) > 0] 75 | images = [self.images[k] for k, v in tables.items() if len(v) > 0] 76 | 77 | if table_pages: 78 | # Create PDF object for OCR 79 | pdf_ocr = PDF(src=self.bytes, 80 | pages=table_pages, 81 | _images=images, 82 | _rotated=self._rotated) 83 | 84 | # Try to get OCRDataframe from PDF 85 | self.ocr_df = PdfOCR().of(document=pdf_ocr) 86 | 87 | return super(PDF, self).get_table_content(tables=tables, ocr=ocr, min_confidence=min_confidence) 88 | 
-------------------------------------------------------------------------------- /src/img2table/ocr/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from img2table.ocr.aws_textract import TextractOCR 4 | from img2table.ocr.azure import AzureOCR 5 | from img2table.ocr.doctr import DocTR 6 | from img2table.ocr.easyocr import EasyOCR 7 | from img2table.ocr.google_vision import VisionOCR 8 | from img2table.ocr.paddle import PaddleOCR 9 | from img2table.ocr.surya import SuryaOCR 10 | from img2table.ocr.tesseract import TesseractOCR 11 | -------------------------------------------------------------------------------- /src/img2table/ocr/base.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from typing import Any 3 | 4 | import polars as pl 5 | 6 | from img2table.document.base import Document 7 | from img2table.ocr.data import OCRDataframe 8 | 9 | 10 | class OCRInstance: 11 | @property 12 | def pl_schema(self): 13 | schema = { 14 | "page": pl.Int64, 15 | "class": str, 16 | "id": str, 17 | "parent": str, 18 | "value": str, 19 | "confidence": pl.Int64, 20 | "x1": pl.Int64, 21 | "y1": pl.Int64, 22 | "x2": pl.Int64, 23 | "y2": pl.Int64 24 | } 25 | return schema 26 | 27 | def content(self, document: Document) -> Any: 28 | raise NotImplementedError 29 | 30 | def to_ocr_dataframe(self, content: Any) -> OCRDataframe: 31 | raise NotImplementedError 32 | 33 | def of(self, document: Document) -> OCRDataframe: 34 | """ 35 | Extract text from Document to OCRDataframe object 36 | :param document: Document object 37 | :return: OCRDataframe object 38 | """ 39 | # Extract content from document 40 | content = self.content(document=document) 41 | 42 | # Create OCRDataframe from content 43 | return self.to_ocr_dataframe(content=content) 44 | -------------------------------------------------------------------------------- /src/img2table/ocr/doctr.py: 
-------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import typing 4 | 5 | import polars as pl 6 | 7 | from img2table.document.base import Document 8 | from img2table.ocr.base import OCRInstance 9 | from img2table.ocr.data import OCRDataframe 10 | 11 | if typing.TYPE_CHECKING: 12 | import doctr 13 | 14 | 15 | class DocTR(OCRInstance): 16 | """ 17 | DocTR instance 18 | """ 19 | def __init__(self, detect_language: bool = False, kw: typing.Dict = None): 20 | """ 21 | Initialization of EasyOCR instance 22 | """ 23 | try: 24 | from doctr.models import ocr_predictor 25 | except ModuleNotFoundError: 26 | raise ModuleNotFoundError("Missing dependencies, please install doctr to use this class.") 27 | 28 | # Create kwargs dict for constructor 29 | kw = kw or {} 30 | kw["detect_language"] = detect_language 31 | kw["pretrained"] = kw.get("pretrained") if kw.get("pretrained") is not None else True 32 | 33 | self.model = ocr_predictor(**kw) 34 | 35 | def content(self, document: Document) -> "doctr.io.elements.Document": 36 | # Get OCR of all images 37 | ocrs = self.model(document.images) 38 | 39 | return ocrs 40 | 41 | def to_ocr_dataframe(self, content: "doctr.io.elements.Document") -> OCRDataframe: 42 | """ 43 | Convert docTR Document object to OCRDataframe object 44 | :param content: docTR Document object 45 | :return: OCRDataframe object corresponding to content 46 | """ 47 | # Create list of elements 48 | list_elements = list() 49 | 50 | for page_id, page in enumerate(content.pages): 51 | dimensions = page.dimensions 52 | word_id = 0 53 | for block in page.blocks: 54 | for line_id, line in enumerate(block.lines): 55 | for word in line.words: 56 | word_id += 1 57 | dict_word = { 58 | "page": page_id, 59 | "class": "ocrx_word", 60 | "id": f"word_{page_id + 1}_{line_id}_{word_id}", 61 | "parent": f"word_{page_id + 1}_{line_id}", 62 | "value": word.value, 63 | "confidence": round(100 * word.confidence), 64 | "x1": 
round(word.geometry[0][0] * dimensions[1]), 65 | "y1": round(word.geometry[0][1] * dimensions[0]), 66 | "x2": round(word.geometry[1][0] * dimensions[1]), 67 | "y2": round(word.geometry[1][1] * dimensions[0]) 68 | } 69 | 70 | list_elements.append(dict_word) 71 | 72 | return OCRDataframe(df=pl.DataFrame(list_elements)) if list_elements else None 73 | -------------------------------------------------------------------------------- /src/img2table/ocr/easyocr.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from typing import List, Tuple, Dict 4 | 5 | import polars as pl 6 | 7 | from img2table.document.base import Document 8 | from img2table.ocr.base import OCRInstance 9 | from img2table.ocr.data import OCRDataframe 10 | 11 | 12 | class EasyOCR(OCRInstance): 13 | """ 14 | EAsyOCR instance 15 | """ 16 | def __init__(self, lang: List[str] = ['en'], kw: Dict = None): 17 | """ 18 | Initialization of EasyOCR instance 19 | :param lang: lang parameter used in EasyOCR 20 | :param kw: dictionary containing kwargs for EasyOCR constructor 21 | """ 22 | try: 23 | from easyocr import Reader 24 | except ModuleNotFoundError: 25 | raise ModuleNotFoundError("Missing dependencies, please install 'img2table[easyocr]' to use this class.") 26 | 27 | if isinstance(lang, list): 28 | if all([isinstance(lng, str) for lng in lang]): 29 | self.lang = lang 30 | else: 31 | raise TypeError(f"Invalid type {type(lang)} for lang argument") 32 | 33 | # Create kwargs dict for constructor 34 | kw = kw or {} 35 | kw["lang_list"] = self.lang 36 | kw["verbose"] = kw.get("verbose") or False 37 | 38 | self.reader = Reader(**kw) 39 | 40 | def content(self, document: Document) -> List[List[Tuple]]: 41 | # Get OCR of all images 42 | ocrs = [self.reader.readtext(image) for image in document.images] 43 | 44 | return ocrs 45 | 46 | def to_ocr_dataframe(self, content: List[List]) -> OCRDataframe: 47 | """ 48 | Convert hOCR HTML to OCRDataframe object 49 
class PaddleOCR(OCRInstance):
    """
    Paddle-OCR instance
    """
    def __init__(self, lang: str = 'en', kw: Dict = None):
        """
        Initialization of Paddle OCR instance
        :param lang: lang parameter used in Paddle
        :param kw: dictionary containing kwargs for PaddleOCR constructor
        """
        try:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                from paddleocr import PaddleOCR as OCR
        except ModuleNotFoundError:
            raise ModuleNotFoundError("Missing dependencies, please install 'img2table[paddle]' to use this class.")

        if isinstance(lang, str):
            self.lang = lang
        else:
            raise TypeError(f"Invalid type {type(lang)} for lang argument")

        # Create kwargs dict for constructor
        # Copy the provided dict so the caller's argument is not mutated
        kw = dict(kw) if kw else {}
        kw["lang"] = self.lang
        kw["use_angle_cls"] = kw.get("use_angle_cls") or False
        kw["show_log"] = kw.get("show_log") or False

        self.ocr = OCR(**kw)

    def hocr(self, image: np.ndarray) -> List:
        """
        Get OCR of an image using Paddle
        :param image: numpy array representing the image
        :return: Paddle OCR result as a list of [bbox, (text, confidence)] entries
        """
        with NamedTemporaryFile(suffix='.jpg', delete=False) as tmp_f:
            tmp_file = tmp_f.name
            # Write image to temporary file
            cv2.imwrite(tmp_file, image)

        # Get OCR
        ocr_result = self.ocr.ocr(img=tmp_file, cls=False)

        # Remove temporary file with a bounded number of retries: on Windows the
        # file can still be locked right after use (PermissionError). The previous
        # unbounded loop could spin forever; worst case here is a leftover tmp file.
        for _ in range(100):
            try:
                os.remove(tmp_file)
                break
            except FileNotFoundError:
                break
            except PermissionError:
                continue

        # Get result
        ocr_result = ocr_result.pop()
        return [[bbox, (word[0], round(word[1], 2))] for bbox, word in ocr_result] if ocr_result else []

    def content(self, document: Document) -> List[List]:
        """
        Run OCR on every image of the document
        :param document: Document object
        :return: list of per-page Paddle OCR results
        """
        # Get OCR of all images
        ocrs = [self.hocr(image=image) for image in document.images]

        return ocrs

    def to_ocr_dataframe(self, content: List[List]) -> OCRDataframe:
        """
        Convert Paddle OCR results to OCRDataframe object
        :param content: list of per-page Paddle OCR results
        :return: OCRDataframe object corresponding to content, or None if no text was detected
        """
        # Create list of elements
        list_elements = list()

        for page, ocr_result in enumerate(content):
            word_id = 0
            for bbox, word in ocr_result:
                word_id += 1
                # Bounding box is a polygon: take the enclosing axis-aligned rectangle
                dict_word = {
                    "page": page,
                    "class": "ocrx_word",
                    "id": f"word_{page + 1}_{word_id}",
                    "parent": f"word_{page + 1}_{word_id}",
                    "value": word[0],
                    "confidence": 100 * word[1],
                    "x1": round(min([edge[0] for edge in bbox])),
                    "y1": round(min([edge[1] for edge in bbox])),
                    "x2": round(max([edge[0] for edge in bbox])),
                    "y2": round(max([edge[1] for edge in bbox]))
                }

                list_elements.append(dict_word)

        return OCRDataframe(df=pl.DataFrame(list_elements, schema=self.pl_schema)) if list_elements else None
round(min([edge[1] for edge in bbox])), 100 | "x2": round(max([edge[0] for edge in bbox])), 101 | "y2": round(max([edge[1] for edge in bbox])) 102 | } 103 | 104 | list_elements.append(dict_word) 105 | 106 | return OCRDataframe(df=pl.DataFrame(list_elements, schema=self.pl_schema)) if list_elements else None 107 | -------------------------------------------------------------------------------- /src/img2table/ocr/surya.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import typing 4 | 5 | import polars as pl 6 | from PIL import Image 7 | 8 | from img2table.document.base import Document 9 | from img2table.ocr.base import OCRInstance 10 | from img2table.ocr.data import OCRDataframe 11 | 12 | if typing.TYPE_CHECKING: 13 | import surya 14 | 15 | 16 | class SuryaOCR(OCRInstance): 17 | """ 18 | DocTR instance 19 | """ 20 | def __init__(self, langs: typing.List[str] = None): 21 | """ 22 | Initialization of EasyOCR instance 23 | """ 24 | try: 25 | from surya.recognition import RecognitionPredictor 26 | from surya.detection import DetectionPredictor 27 | 28 | except ModuleNotFoundError: 29 | raise ModuleNotFoundError("Missing dependencies, please install 'img2table[surya]' to use this class.") 30 | 31 | if isinstance(langs, list): 32 | if all([isinstance(lng, str) for lng in langs]): 33 | self.langs = langs or ["en"] 34 | else: 35 | raise TypeError(f"All values should be strings for langs argument") 36 | else: 37 | raise TypeError(f"Invalid type {type(langs)} for langs argument") 38 | 39 | # Initialize model 40 | self.det_predictor = DetectionPredictor() 41 | self.rec_predictor = RecognitionPredictor() 42 | 43 | def content(self, document: Document) -> typing.List["surya.recognition.schema.OCRResult"]: 44 | # Get OCR of all images 45 | ocrs = self.rec_predictor(images=[Image.fromarray(img) for img in document.images], 46 | langs=[self.langs], 47 | det_predictor=self.det_predictor) 48 | 49 | return ocrs 50 | 51 | 
    def to_ocr_dataframe(self, content: typing.List["surya.recognition.schema.OCRResult"]) -> OCRDataframe:
        """
        Convert list of surya OCRResult objects to OCRDataframe object
        :param content: list of surya OCRResult objects, one per page
        :return: OCRDataframe object corresponding to content, or None if no text line was detected
        """
        # Create list of elements
        list_elements = list()

        for page_id, ocr_result in enumerate(content):
            line_id = 0
            for text_line in ocr_result.text_lines:
                line_id += 1
                # Each surya text line is mapped to a single word-level entry
                dict_word = {
                    "page": page_id,
                    "class": "ocrx_word",
                    "id": f"word_{page_id + 1}_{line_id}_0",
                    "parent": f"word_{page_id + 1}_{line_id}",
                    "value": text_line.text,
                    "confidence": round(100 * text_line.confidence),
                    "x1": int(text_line.bbox[0]),
                    "y1": int(text_line.bbox[1]),
                    "x2": int(text_line.bbox[2]),
                    "y2": int(text_line.bbox[3])
                }

                list_elements.append(dict_word)

        return OCRDataframe(df=pl.DataFrame(list_elements)) if list_elements else None
@dataclass
class Cell(TableObject):
    """Rectangular table cell defined by top-left / bottom-right coordinates and optional content."""
    x1: int
    y1: int
    x2: int
    y2: int
    content: str = None

    @property
    def table_cell(self) -> TableCell:
        """Export the cell as an extraction-level TableCell object."""
        return TableCell(bbox=BBox(x1=self.x1, x2=self.x2, y1=self.y1, y2=self.y2),
                         value=self.content)

    def __hash__(self):
        return hash(repr(self))
class Row(TableObject):
    """Table row, i.e. a horizontal group of cells."""

    def __init__(self, cells: Union[Cell, List[Cell]]):
        """
        :param cells: Cell object or list of Cell objects composing the row
        :raises ValueError: if cells is None
        """
        if cells is None:
            raise ValueError("cells parameter is null")
        elif isinstance(cells, Cell):
            self._items = [cells]
        else:
            self._items = cells
        self._contours = []

    @property
    def items(self) -> List[Cell]:
        """Cells composing the row"""
        return self._items

    @property
    def nb_columns(self) -> int:
        """Number of cells (columns) in the row"""
        return len(self.items)

    @property
    def x1(self) -> int:
        return min(map(lambda x: x.x1, self.items))

    @property
    def x2(self) -> int:
        return max(map(lambda x: x.x2, self.items))

    @property
    def y1(self) -> int:
        return min(map(lambda x: x.y1, self.items))

    @property
    def y2(self) -> int:
        return max(map(lambda x: x.y2, self.items))

    @property
    def v_consistent(self) -> bool:
        """
        Indicate if the row is vertically consistent (i.e all cells in row have the same vertical position)
        :return: boolean indicating if the row is vertically consistent
        """
        return all(map(lambda x: (x.y1 == self.y1) and (x.y2 == self.y2), self.items))

    def add_cells(self, cells: Union[Cell, List[Cell]]) -> "Row":
        """
        Add cells to existing row items
        :param cells: Cell object or list
        :return: Row object with cells added
        """
        if isinstance(cells, Cell):
            self._items += [cells]
        else:
            self._items += cells

        return self

    def split_in_rows(self, vertical_delimiters: List[int]) -> List["Row"]:
        """
        Split Row object into multiple objects based on vertical delimiters values
        :param vertical_delimiters: list of vertical delimiters values
        :return: list of splitted Row objects according to delimiters
        """
        # Create list of tuples for vertical boundaries
        row_delimiters = [self.y1] + vertical_delimiters + [self.y2]
        row_boundaries = [(i, j) for i, j in zip(row_delimiters, row_delimiters[1:])]

        # Create new list of rows, deep-copying cells so originals are untouched
        l_new_rows = list()
        for boundary in row_boundaries:
            cells = list()
            for cell in self.items:
                _cell = copy.deepcopy(cell)
                _cell.y1, _cell.y2 = boundary
                cells.append(_cell)
            l_new_rows.append(Row(cells=cells))

        return l_new_rows

    def __eq__(self, other) -> bool:
        # NOTE: the previous implementation compared items via `assert`, which is
        # stripped under `python -O` and would make all same-class rows compare equal
        if isinstance(other, self.__class__):
            return self.items == other.items
        return False
def get_cells(horizontal_lines: List[Line], vertical_lines: List[Line]) -> List[Cell]:
    """
    Identify cells from horizontal and vertical rows
    :param horizontal_lines: list of horizontal rows
    :param vertical_lines: list of vertical rows
    :return: list of all cells in image
    """
    # Detect all candidate cells from the line grid
    candidate_cells = get_cells_dataframe(horizontal_lines=horizontal_lines,
                                          vertical_lines=vertical_lines)

    # Remove nested duplicates, keeping the smallest cells
    return deduplicate_cells(cells=candidate_cells)
def deduplicate_cells(cells: List[Cell]) -> List[Cell]:
    """
    Deduplicate nested cells in order to keep the smallest ones
    :param cells: list of cells
    :return: cells after deduplication of the nested ones
    """
    # Coverage map of the area spanned by all cells (1 = not yet covered)
    width = max([c.x2 for c in cells] + [0])
    height = max([c.y2 for c in cells] + [0])
    remaining = np.ones((height, width), dtype=np.uint8)

    kept_cells = list()
    # Process smallest cells first so nested cells win over their containers
    for cell in sorted(cells, key=lambda c: c.area):
        area_view = remaining[cell.y1:cell.y2, cell.x1:cell.x2]
        # Keep the cell if at least 25% of its area is not yet covered
        if np.sum(area_view) >= 0.25 * cell.area:
            kept_cells.append(cell)
            remaining[cell.y1:cell.y2, cell.x1:cell.x2] = 0

    return kept_cells
prange(h_lines_arr.shape[0]): 22 | x1i, y1i, x2i, y2i = h_lines_arr[i][:] 23 | for j in prange(h_lines_arr.shape[0]): 24 | x1j, y1j, x2j, y2j = h_lines_arr[j][:] 25 | 26 | if y1i >= y1j: 27 | continue 28 | 29 | # Check correspondence between lines 30 | l_corresponds = -0.02 <= (x1i - x1j) / ((x2i - x1i) or 1) <= 0.02 31 | r_corresponds = -0.02 <= (x2i - x2j) / ((x2i - x1i) or 1) <= 0.02 32 | l_contained = (x1i <= x1j <= x2i) or (x1j <= x1i <= x2j) 33 | r_contained = (x1i <= x2j <= x2i) or (x1j <= x2i <= x2j) 34 | 35 | if (l_corresponds or l_contained) and (r_corresponds or r_contained): 36 | potential_cells.append([max(x1i, x1j), min(x2i, x2j), y1i, y2j]) 37 | 38 | if len(potential_cells) == 0: 39 | return np.empty((0, 4), dtype=np.int64) 40 | 41 | # Deduplicate on upper bound 42 | potential_cells = sorted(potential_cells) 43 | dedup_upper = list() 44 | prev_x1, prev_x2, prev_y1 = 0, 0, 0 45 | for idx in range(len(potential_cells)): 46 | x1, x2, y1, y2 = potential_cells[idx] 47 | 48 | if not (x1 == prev_x1 and x2 == prev_x2 and y1 == prev_y1): 49 | dedup_upper.append([x1, x2, y2, -y1]) 50 | prev_x1, prev_x2, prev_y1 = x1, x2, y1 51 | 52 | # Deduplicate on lower bound 53 | dedup_upper = sorted(dedup_upper) 54 | dedup_lower = list() 55 | prev_x1, prev_x2, prev_y2 = 0, 0, 0 56 | for idx in range(len(dedup_upper)): 57 | x1, x2, y2, _y1 = dedup_upper[idx] 58 | y1 = -_y1 59 | 60 | if not (x1 == prev_x1 and x2 == prev_x2 and y2 == prev_y2): 61 | dedup_lower.append([x1, x2, y1, y2]) 62 | prev_x1, prev_x2, prev_y2 = x1, x2, y2 63 | 64 | # Create array of potential cells 65 | cells_array = np.array(dedup_lower) 66 | cells = list() 67 | 68 | for i in prange(cells_array.shape[0]): 69 | x1, x2, y1, y2 = cells_array[i][:] 70 | 71 | # Compute horizontal margin 72 | margin = max(5, (x2 - x1) * 0.025) 73 | 74 | delimiters = list() 75 | for j in range(v_lines_arr.shape[0]): 76 | x1v, y1v, x2v, y2v = v_lines_arr[j][:] 77 | 78 | if x1 - margin <= x1v <= x2 + margin: 79 | # Check 
def get_cells_dataframe(horizontal_lines: List[Line], vertical_lines: List[Line]) -> List[Cell]:
    """
    Create dataframe of all possible cells from horizontal and vertical rows
    :param horizontal_lines: list of horizontal rows
    :param vertical_lines: list of vertical rows
    :return: list of detected cells
    """
    # No cell can exist without both horizontal and vertical rows
    if not horizontal_lines or not vertical_lines:
        return []

    # Build coordinate arrays expected by the numba-compiled kernel
    h_lines_array = np.array([[line.x1, line.y1, line.x2, line.y2] for line in horizontal_lines],
                             dtype=np.int64)
    v_lines_array = np.array([[line.x1, line.y1, line.x2, line.y2] for line in vertical_lines],
                             dtype=np.int64)

    # Compute cell coordinates
    cells_array = identify_cells(h_lines_arr=h_lines_array,
                                 v_lines_arr=v_lines_array)

    # Wrap coordinates into Cell objects
    return [Cell(x1=coords[0], y1=coords[1], x2=coords[2], y2=coords[3]) for coords in cells_array]
def get_tables(cells: List[Cell], elements: List[Cell], lines: List[Line], char_length: float) -> List[Table]:
    """
    Identify and create Table object from list of image cells
    :param cells: list of cells found in image
    :param elements: list of image elements
    :param lines: list of image lines
    :param char_length: average character length
    :return: list of Table objects inferred from cells
    """
    # Group cells into clusters, each cluster corresponding to a potential table
    cell_clusters = cluster_cells_in_tables(cells=cells)

    tables = list()
    for cluster in cell_clusters:
        # Normalize cell coordinates within the cluster
        normalized_cluster = normalize_table_cells(cluster_cells=cluster)
        if not normalized_cluster:
            continue
        # Complete the cluster with semi-bordered cells before building the table
        completed_cluster = add_semi_bordered_cells(cluster=normalized_cluster,
                                                    lines=lines,
                                                    char_length=char_length)
        tables.append(cluster_to_table(cluster_cells=completed_cluster, elements=elements))

    # Discard degenerate tables (fewer than 2 cells overall)
    return [tb for tb in tables if tb.nb_rows * tb.nb_columns >= 2]
sets of adjacent cells indexes 15 | """ 16 | if len(cells) == 0: 17 | return [] 18 | 19 | df_cells = pl.DataFrame([{"idx": idx, "x1": c.x1, "y1": c.y1, "x2": c.x2, "y2": c.y2, "height": c.height, 20 | "width": c.width} 21 | for idx, c in enumerate(cells)]) 22 | 23 | # Crossjoin and identify adjacent cells 24 | df_adjacent_cells = ( 25 | df_cells.join(df_cells, how='cross') 26 | # Compute horizontal and vertical overlap 27 | .with_columns((pl.min_horizontal(['x2', 'x2_right']) - pl.max_horizontal(['x1', 'x1_right'])).alias("x_overlap"), 28 | (pl.min_horizontal(['y2', 'y2_right']) - pl.max_horizontal(['y1', 'y1_right'])).alias("y_overlap") 29 | ) 30 | # Compute horizontal and vertical differences 31 | .with_columns( 32 | pl.min_horizontal((pl.col('x1') - pl.col('x1_right')).abs(), 33 | (pl.col('x1') - pl.col('x2_right')).abs(), 34 | (pl.col('x2') - pl.col('x1_right')).abs(), 35 | (pl.col('x2') - pl.col('x2_right')).abs() 36 | ).alias('diff_x'), 37 | pl.min_horizontal((pl.col('y1') - pl.col('y1_right')).abs(), 38 | (pl.col('y1') - pl.col('y2_right')).abs(), 39 | (pl.col('y2') - pl.col('y1_right')).abs(), 40 | (pl.col('y2') - pl.col('y2_right')).abs() 41 | ).alias('diff_y') 42 | ) 43 | # Compute thresholds for horizontal and vertical differences 44 | .with_columns( 45 | pl.min_horizontal(pl.lit(5), 0.05 * pl.min_horizontal(pl.col('width'), pl.col('width_right'))).alias('thresh_x'), 46 | pl.min_horizontal(pl.lit(5), 0.05 * pl.min_horizontal(pl.col('height'), pl.col('height_right'))).alias('thresh_y') 47 | ) 48 | # Filter adjacent cells 49 | .filter( 50 | ((pl.col('y_overlap') > 5) & (pl.col('diff_x') <= pl.col('thresh_x'))) 51 | | ((pl.col('x_overlap') > 5) & (pl.col('diff_y') <= pl.col('thresh_y'))) 52 | ) 53 | .select("idx", "idx_right") 54 | .unique() 55 | .sort(by=['idx', 'idx_right']) 56 | ) 57 | 58 | # Get sets of adjacent cells indexes 59 | adjacent_cells = [{row.get('idx'), row.get('idx_right')} for row in df_adjacent_cells.to_dicts()] 60 | 61 | return 
def cluster_cells_in_tables(cells: List[Cell]) -> List[List[Cell]]:
    """
    Based on adjacent cells, create clusters of cells that corresponds to tables
    :param cells: list cells in image
    :return: list of list of cells, representing several clusters of cells that form a table
    """
    # Build adjacency relationships between cells
    edges = get_adjacent_cells(cells=cells)

    # Extract connected components, each one being a table candidate
    components = find_components(edges=edges)

    # Map index clusters back to Cell objects
    return [[cells[i] for i in component] for component in components]
def segment_image(thresh: np.ndarray, lines: List[Line], char_length: float,
                  median_line_sep: float, existing_tables: Optional[List[Table]] = None) -> List[TableSegment]:
    """
    Segment image and its elements
    :param thresh: threshold image array
    :param lines: list of Line objects of the image
    :param char_length: average character length
    :param median_line_sep: median line separation
    :param existing_tables: list of detected bordered tables
    :return: list of TableSegment objects with corresponding elements
    """
    # Build a mask keeping only textual content
    text_thresh = identify_text_mask(thresh=thresh,
                                     lines=lines,
                                     char_length=char_length,
                                     existing_tables=existing_tables)

    # Extract elementary components from the text mask
    elements = get_image_elements(thresh=text_thresh,
                                  char_length=char_length,
                                  median_line_sep=median_line_sep)

    if not elements:
        return []

    # Segment spanning the full image width and the vertical extent of all elements
    top = min(el.y1 for el in elements)
    bottom = max(el.y2 for el in elements)
    full_segment = ImageSegment(x1=0, y1=top, x2=thresh.shape[1], y2=bottom, elements=elements)

    # Split the image into column segments
    col_segments = segment_image_columns(image_segment=full_segment,
                                         char_length=char_length,
                                         lines=lines)

    # Within each column, identify segments that can correspond to tables
    table_segments = list()
    for col_segment in col_segments:
        table_segments += get_table_segments(segment=col_segment,
                                             char_length=char_length,
                                             median_line_sep=median_line_sep)

    return table_segments
def identify_table(columns: ColumnGroup, row_delimiters: List[Cell], contours: List[Cell], median_line_sep: float,
                   char_length: float) -> Optional[Table]:
    """
    Identify table from column delimiters and rows
    :param columns: column delimiters group
    :param row_delimiters: list of table row delimiters corresponding to columns
    :param contours: list of image contours
    :param median_line_sep: median line separation
    :param char_length: average character length
    :return: Table object, or None if no coherent table could be built
    """
    # Build a candidate table from the row and column delimiters
    candidate = get_table(columns=columns,
                          row_delimiters=row_delimiters,
                          contours=contours)

    if not candidate:
        return None

    # Keep the table only if its dimensions are coherent
    is_coherent = check_table_coherency(table=candidate,
                                        median_line_sep=median_line_sep,
                                        char_length=char_length)
    return candidate if is_coherent else None
def check_row_coherency(table: Table, median_line_sep: float) -> bool:
    """
    Check row coherency of table
    :param table: Table object
    :param median_line_sep: median line separation
    :return: boolean indicating if table row heights are coherent
    """
    # A single row cannot be validated
    if table.nb_rows < 2:
        return False

    # Median vertical distance between centers of consecutive rows
    row_separations = [(lower_row.y1 + lower_row.y2 - upper_row.y1 - upper_row.y2) / 2
                       for upper_row, lower_row in zip(table.items, table.items[1:])]

    return np.median(row_separations) >= median_line_sep / 3


def check_column_coherency(table: Table, char_length: float) -> bool:
    """
    Check column coherency of table
    :param table: Table object
    :param char_length: average character length
    :return: boolean indicating if table column widths are coherent
    """
    # A single column cannot be validated
    if table.nb_columns < 2:
        return False

    # Width of each column = intersection of its cells' horizontal extents
    col_widths = list()
    for col_idx in range(table.nb_columns):
        col_cells = [row.items[col_idx] for row in table.items]
        right_bound = min(cell.x2 for cell in col_cells)
        left_bound = max(cell.x1 for cell in col_cells)
        col_widths.append(right_bound - left_bound)

    return np.median(col_widths) >= 3 * char_length
coherency of table 53 | row_coherency = check_row_coherency(table=table, 54 | median_line_sep=median_line_sep) 55 | 56 | # Check column coherency of table 57 | column_coherency = check_column_coherency(table=table, 58 | char_length=char_length) 59 | 60 | return row_coherency and column_coherency 61 | -------------------------------------------------------------------------------- /src/img2table/tables/processing/borderless_tables/table/table_creation.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from typing import List 3 | 4 | from img2table.tables.objects.cell import Cell 5 | from img2table.tables.objects.line import Line 6 | from img2table.tables.objects.table import Table 7 | from img2table.tables.processing.bordered_tables.cells import get_cells 8 | from img2table.tables.processing.bordered_tables.tables import cluster_to_table 9 | from img2table.tables.processing.borderless_tables.model import ColumnGroup 10 | 11 | 12 | def get_table(columns: ColumnGroup, row_delimiters: List[Cell], contours: List[Cell]) -> Table: 13 | """ 14 | Create table object from column delimiters and rows 15 | :param columns: column delimiters group 16 | :param row_delimiters: list of table row delimiters 17 | :param contours: list of image contours 18 | :return: Table object 19 | """ 20 | # Convert delimiters to lines 21 | v_lines = list() 22 | for col in columns.columns: 23 | seq = iter(sorted([c for v_ws in col.whitespaces for c in v_ws.ws.cells], 24 | key=lambda c: c.y1 + c.y2)) 25 | line_groups = [[next(seq)]] 26 | for c in seq: 27 | if c.y1 > line_groups[-1][-1].y2: 28 | line_groups.append([]) 29 | line_groups[-1].append(c) 30 | 31 | v_lines += [Line(x1=(gp[0].x1 + gp[0].x2) // 2, 32 | y1=gp[0].y1, 33 | x2=(gp[0].x1 + gp[0].x2) // 2, 34 | y2=gp[-1].y2) for gp in line_groups] 35 | 36 | h_lines = [Line(x1=d.x1, x2=d.x2, y1=d.y1, y2=d.y2) for d in row_delimiters] 37 | 38 | # Identify cells 39 | cells = 
get_cells(horizontal_lines=h_lines, vertical_lines=v_lines) 40 | 41 | # Create table object 42 | table = cluster_to_table(cluster_cells=cells, elements=contours, borderless=True) 43 | 44 | return table if table.nb_columns >= 3 and table.nb_rows >= 2 else None 45 | -------------------------------------------------------------------------------- /src/img2table/tables/processing/text/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/src/img2table/tables/processing/text/__init__.py -------------------------------------------------------------------------------- /src/img2table/tables/processing/text/titles.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import copy 3 | from typing import List 4 | 5 | import numpy as np 6 | 7 | from img2table.ocr.data import OCRDataframe 8 | from img2table.tables.objects.cell import Cell 9 | from img2table.tables.objects.table import Table 10 | from img2table.tables.processing.common import get_contours_cell 11 | 12 | 13 | def get_title_tables(img: np.ndarray, tables: List[Table], ocr_df: OCRDataframe, margin: int = 5) -> List[Table]: 14 | """ 15 | Retrieve titles of cell areas 16 | :param img: image array 17 | :param tables: list of Table objects 18 | :param ocr_df: OCRDataframe object 19 | :param margin: margin used 20 | :return: list of tables with title extracted 21 | """ 22 | height, width = img.shape[:2] 23 | 24 | if len(tables) == 0: 25 | return [] 26 | 27 | # Sort tables 28 | sorted_tables = sorted(tables, key=lambda tb: (tb.y1, tb.x1, tb.x2)) 29 | 30 | # Cluster table vertically 31 | seq = iter(sorted_tables) 32 | tb_cl = [[next(seq)]] 33 | for tb in seq: 34 | if tb.y1 > tb_cl[-1][-1].y2: 35 | tb_cl.append([]) 36 | tb_cl[-1].append(tb) 37 | 38 | # Identify relative zones for each title corresponding to each cluster 39 | final_tables = 
list() 40 | for id_cl, cluster in enumerate(tb_cl): 41 | # Compute horizontal boundaries of title 42 | x_delimiters = [int(round((tb_1.x2 + tb_2.x1) / 2)) for tb_1, tb_2 in zip(cluster, cluster[1:])] 43 | x_delimiters = [max(10, int(round(cluster[0].x1 - 0.2 * cluster[0].width)))] + x_delimiters + [width - 10] 44 | x_delimiters = x_delimiters + [min(width - 10, int(round(cluster[-1].x2 + 0.2 * cluster[-1].width)))] 45 | x_bounds = [(del_1, del_2) for del_1, del_2 in zip(x_delimiters, x_delimiters[1:])] 46 | 47 | # Compute vertical boundaries of title 48 | y_bounds = (max([tb.y2 for tb in tb_cl[id_cl - 1]]) if id_cl > 0 else 0, min([tb.y1 for tb in cluster])) 49 | 50 | # Fetch title for each table 51 | for id_tb, table in enumerate(cluster): 52 | # Get contours in title area 53 | cell_title = Cell(x1=x_bounds[id_tb][0], x2=x_bounds[id_tb][1], y1=y_bounds[0], y2=y_bounds[1]) 54 | contours = get_contours_cell(img=copy.deepcopy(img), 55 | cell=cell_title, 56 | margin=0, 57 | blur_size=5, 58 | kernel_size=9) 59 | 60 | # Get text from OCR 61 | title = ocr_df.get_text_cell(cell=contours[-1], margin=margin) if contours else None 62 | 63 | table.set_title(title=title) 64 | final_tables.append(table) 65 | 66 | return final_tables 67 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import os 4 | import subprocess 5 | 6 | CWD = os.path.dirname(__file__) 7 | MOCK_DIR = os.path.join(CWD, "_mock_data") 8 | 9 | TESSERACT_INSTALL = subprocess.run("tesseract --version", shell=True).returncode == 0 10 | -------------------------------------------------------------------------------- /tests/_mock_data/azure.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/_mock_data/azure.pkl 
-------------------------------------------------------------------------------- /tests/_mock_data/surya.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/_mock_data/surya.pkl -------------------------------------------------------------------------------- /tests/_mock_data/vision.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/_mock_data/vision.pkl -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import json 3 | import os 4 | import pickle 5 | import subprocess 6 | import sys 7 | from typing import NamedTuple, Dict 8 | 9 | import azure.cognitiveservices.vision.computervision 10 | import boto3 11 | import pytest 12 | import requests 13 | from google.cloud import vision 14 | 15 | from tests import MOCK_DIR 16 | 17 | 18 | @pytest.fixture(autouse=True) 19 | def change_test_dir(request, monkeypatch): 20 | monkeypatch.chdir(request.fspath.dirname) 21 | 22 | 23 | @pytest.fixture 24 | def mock_tesseract(monkeypatch): 25 | def mock_check_output(*args, **kwargs): 26 | if "tesseract --list-langs" in args: 27 | return "Langs\neng".encode("utf-8") 28 | else: 29 | with open(os.path.join(MOCK_DIR, "tesseract_hocr.html"), "r") as f: 30 | return f.read().encode("utf-8") 31 | 32 | def mock_run(*args, **kwargs): 33 | class MResp: 34 | @property 35 | def returncode(self): 36 | return 0 37 | return MResp() 38 | 39 | monkeypatch.setattr(subprocess, "check_output", mock_check_output) 40 | monkeypatch.setattr(subprocess, "run", mock_run) 41 | 42 | 43 | @pytest.fixture 44 | def mock_vision(monkeypatch): 45 | class MockPost: 46 | def json(self, *args, **kwargs): 47 | 
with open(os.path.join(MOCK_DIR, "vision.json"), "r") as f: 48 | return json.load(f) 49 | 50 | def mock_post(*args, **kwargs): 51 | return MockPost() 52 | 53 | # Mock post to API 54 | monkeypatch.setattr(requests, "post", mock_post) 55 | 56 | def mock_init(*args, **kwargs): 57 | pass 58 | 59 | def mock_annotate(*args, **kwargs): 60 | with open(os.path.join(MOCK_DIR, "vision.pkl"), "rb") as f: 61 | resp = pickle.load(f) 62 | 63 | return resp 64 | 65 | # Mock Vision API annotate 66 | monkeypatch.setattr(vision.ImageAnnotatorClient, "__init__", mock_init) 67 | monkeypatch.setattr(vision.ImageAnnotatorClient, "batch_annotate_images", mock_annotate) 68 | 69 | 70 | @pytest.fixture 71 | def mock_textract(monkeypatch): 72 | class MockClient: 73 | def __init__(self, *args, **kwargs): 74 | pass 75 | 76 | def detect_document_text(*args, **kwargs): 77 | with open(os.path.join(MOCK_DIR, "textract.json"), "r") as f: 78 | resp = json.load(f) 79 | 80 | return resp 81 | 82 | # Mock boto3 client 83 | monkeypatch.setattr(boto3, "client", MockClient) 84 | 85 | 86 | @pytest.fixture 87 | def mock_azure(monkeypatch): 88 | class MockRead(NamedTuple): 89 | headers: Dict 90 | 91 | def mock_read_in_stream(*args, **kwargs): 92 | return MockRead(headers={"Operation-Location": "zz/zz"}) 93 | 94 | def mock_get_read_result(*args, **kwargs): 95 | with open(os.path.join(MOCK_DIR, "azure.pkl"), "rb") as f: 96 | resp = pickle.load(f) 97 | return resp 98 | 99 | # Mock azure client 100 | monkeypatch.setattr(azure.cognitiveservices.vision.computervision.ComputerVisionClient, 101 | "read_in_stream", 102 | mock_read_in_stream) 103 | monkeypatch.setattr(azure.cognitiveservices.vision.computervision.ComputerVisionClient, 104 | "get_read_result", 105 | mock_get_read_result) 106 | 107 | 108 | @pytest.fixture 109 | def mock_surya(monkeypatch): 110 | def mock_run_ocr(*args, **kwargs): 111 | with open(os.path.join(MOCK_DIR, "surya.pkl"), "rb") as f: 112 | resp = pickle.load(f) 113 | return resp 114 | 115 | if 
sys.version_info >= (3, 10): 116 | import surya.recognition 117 | # Mock surya 118 | monkeypatch.setattr(surya.recognition.RecognitionPredictor, 119 | "__call__", 120 | mock_run_ocr) 121 | 122 | -------------------------------------------------------------------------------- /tests/document/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/document/__init__.py -------------------------------------------------------------------------------- /tests/document/base/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/document/base/__init__.py -------------------------------------------------------------------------------- /tests/document/base/test_data/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/document/base/test_data/test.png -------------------------------------------------------------------------------- /tests/document/base/test_rotation.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import cv2 3 | import numpy as np 4 | from sewar import ssim 5 | 6 | from img2table.document.base.rotation import rotate_img_with_border, fix_rotation_image, get_connected_components, \ 7 | get_relevant_angles, angle_dixon_q_test 8 | 9 | 10 | def test_get_connected_components(): 11 | img = cv2.imread("test_data/test.png", cv2.IMREAD_GRAYSCALE) 12 | 13 | cc, ref_height, thresh = get_connected_components(img=img) 14 | 15 | assert len(cc) == 98 16 | 17 | 18 | def test_get_relevant_angles(): 19 | centroids = [[35.8676, 5473.6768], 20 | [45.4648, 8734.32], 21 | [476.386, 98.437], 22 | [9834.4648, 468.47], 23 | 
[746.746, 7348.43], 24 | [846.462, 8474.48], 25 | [2983.846, 94483.46], 26 | [1093.46, 8473.46], 27 | [3676.77, 84783.64]] 28 | 29 | result = get_relevant_angles(centroids=np.array(centroids), ref_height=1000, n_max=5) 30 | 31 | assert len(result) == 5 32 | 33 | 34 | def test_angle_dixon_q_test(): 35 | result = angle_dixon_q_test(angles=[12.23, 12.78, 12.79, 12.82], confidence=0.9) 36 | 37 | assert round(result, 3) == 12.797 38 | 39 | 40 | def test_fix_rotation_image(): 41 | def crop_to_orig_img(img, orig_img): 42 | # Get original dimensions 43 | orig_height, orig_width = orig_img.shape[:2] 44 | 45 | # Get center of img 46 | center = (img.shape[0] // 2, img.shape[1] // 2) 47 | # Crop img around centre 48 | cropped = img[center[0] - orig_height // 2: center[0] + orig_height // 2 + 1, 49 | center[1] - orig_width // 2: center[1] + orig_width // 2 + 1] 50 | 51 | return cropped 52 | 53 | img = cv2.imread("test_data/test.png") 54 | 55 | similarities = list() 56 | for angle in range(-30, 30, 3): 57 | # Create test image by rotating it 58 | test_img = rotate_img_with_border(img=img.copy(), angle=angle) 59 | result = crop_to_orig_img(img=fix_rotation_image(img=test_img)[0], 60 | orig_img=img) 61 | 62 | # Compute similarity between original image and result 63 | similarities.append(ssim(GT=img, P=result)[0]) 64 | 65 | assert np.mean(similarities) >= 0.85 66 | -------------------------------------------------------------------------------- /tests/document/image/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/document/image/__init__.py -------------------------------------------------------------------------------- /tests/document/image/test_data/blank.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/document/image/test_data/blank.png -------------------------------------------------------------------------------- /tests/document/image/test_data/dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/document/image/test_data/dark.png -------------------------------------------------------------------------------- /tests/document/image/test_data/expected.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/document/image/test_data/expected.xlsx -------------------------------------------------------------------------------- /tests/document/image/test_data/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/document/image/test_data/test.png -------------------------------------------------------------------------------- /tests/document/image/test_image.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from io import BytesIO 3 | 4 | import pytest 5 | from openpyxl import load_workbook 6 | 7 | from img2table.document.image import Image 8 | from img2table.ocr import TesseractOCR 9 | from img2table.tables.objects.extraction import BBox 10 | 11 | 12 | def test_validators(): 13 | with pytest.raises(TypeError) as e_info: 14 | img = Image(src=1) 15 | 16 | with pytest.raises(TypeError) as e_info: 17 | img = Image(src="img", detect_rotation=3) 18 | 19 | 20 | def test_load_image(): 21 | # Load from path 22 | img_from_path = Image(src="test_data/test.png") 23 | 24 | # Load from bytes 25 | with open("test_data/test.png", "rb") 
as f: 26 | img_from_bytes = Image(src=f.read()) 27 | 28 | # Load from BytesIO 29 | with open("test_data/test.png", "rb") as f: 30 | img_from_bytesio = Image(src=BytesIO(f.read())) 31 | 32 | assert img_from_path.bytes == img_from_bytes.bytes == img_from_bytesio.bytes 33 | 34 | assert list(img_from_path.images)[0].shape == (417, 1365, 3) 35 | 36 | 37 | def test_blank_image(mock_tesseract): 38 | ocr = TesseractOCR() 39 | img = Image(src="test_data/blank.png", 40 | detect_rotation=True) 41 | 42 | result = img.extract_tables(ocr=ocr, 43 | implicit_rows=True, 44 | borderless_tables=True, 45 | min_confidence=50) 46 | 47 | assert result == [] 48 | 49 | 50 | def test_blank_no_ocr(): 51 | img = Image(src="test_data/blank.png", 52 | detect_rotation=True) 53 | 54 | result = img.extract_tables(implicit_rows=True, 55 | borderless_tables=True, 56 | min_confidence=50) 57 | 58 | assert result == [] 59 | 60 | 61 | def test_image_tables(mock_tesseract): 62 | ocr = TesseractOCR() 63 | img = Image(src="test_data/test.png", 64 | detect_rotation=True) 65 | 66 | result = img.extract_tables(ocr=ocr, implicit_rows=True, min_confidence=50) 67 | 68 | assert len(result) == 2 69 | 70 | assert result[0].title is None 71 | assert result[0].bbox == BBox(x1=36, y1=21, x2=770, y2=327) 72 | assert len(result[0].content) == 6 73 | assert len(result[0].content[0]) == 3 74 | 75 | assert result[1].title is None 76 | assert result[1].bbox == BBox(x1=962, y1=21, x2=1154, y2=123) 77 | assert len(result[1].content) == 2 78 | assert len(result[1].content[0]) == 2 79 | 80 | 81 | def test_no_ocr(): 82 | img = Image(src="test_data/dark.png", 83 | detect_rotation=True) 84 | 85 | result = img.extract_tables(implicit_rows=True, min_confidence=50) 86 | 87 | assert len(result) == 1 88 | 89 | assert result[0].title is None 90 | assert result[0].bbox == BBox(x1=46, y1=37, x2=836, y2=529) 91 | assert len(result[0].content) == 19 92 | assert len(result[0].content[0]) == 5 93 | 94 | 95 | def 
test_image_excel(mock_tesseract): 96 | ocr = TesseractOCR() 97 | img = Image(src="test_data/test.png", 98 | detect_rotation=True) 99 | 100 | result = img.to_xlsx(dest=BytesIO(), ocr=ocr, implicit_rows=True, min_confidence=50) 101 | 102 | expected = load_workbook(filename="test_data/expected.xlsx") 103 | result_wb = load_workbook(filename=result) 104 | 105 | for idx, ws in enumerate(result_wb.worksheets): 106 | assert ws.title == expected.worksheets[idx].title 107 | assert list(ws.values) == list(expected.worksheets[idx].values) 108 | -------------------------------------------------------------------------------- /tests/document/pdf/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/document/pdf/__init__.py -------------------------------------------------------------------------------- /tests/document/pdf/test_data/test.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/document/pdf/test_data/test.pdf -------------------------------------------------------------------------------- /tests/document/pdf/test_pdf.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import sys 3 | from io import BytesIO 4 | 5 | import pytest 6 | 7 | from img2table.document.pdf import PDF 8 | from img2table.ocr import TesseractOCR 9 | from img2table.tables.objects.extraction import BBox 10 | 11 | 12 | def test_validators(): 13 | with pytest.raises(TypeError) as e_info: 14 | pdf = PDF(src=1) 15 | 16 | with pytest.raises(TypeError) as e_info: 17 | pdf = PDF(src="img", pages=12) 18 | 19 | with pytest.raises(TypeError) as e_info: 20 | pdf = PDF(src="img", pages=["12"]) 21 | 22 | with pytest.raises(TypeError) as e_info: 23 | pdf = PDF(src="img", pages=[1], 
detect_rotation="a") 24 | 25 | 26 | def test_load_pdf(): 27 | # Load from path 28 | pdf_from_path = PDF(src="test_data/test.pdf") 29 | 30 | # Load from bytes 31 | with open("test_data/test.pdf", "rb") as f: 32 | pdf_from_bytes = PDF(src=f.read()) 33 | 34 | # Load from BytesIO 35 | with open("test_data/test.pdf", "rb") as f: 36 | pdf_from_bytesio = PDF(src=BytesIO(f.read())) 37 | 38 | assert pdf_from_path.bytes == pdf_from_bytes.bytes == pdf_from_bytesio.bytes 39 | 40 | assert list(pdf_from_path.images)[0].shape == (2200, 1700, 3) 41 | 42 | 43 | def test_pdf_pages(): 44 | assert len(list(PDF(src="test_data/test.pdf").images)) == 2 45 | assert len(list(PDF(src="test_data/test.pdf", pages=[0]).images)) == 1 46 | 47 | 48 | def test_pdf_tables(mock_tesseract): 49 | ocr = TesseractOCR() 50 | pdf = PDF(src="test_data/test.pdf") 51 | 52 | result = pdf.extract_tables(ocr=ocr, implicit_rows=True, min_confidence=50) 53 | 54 | assert result[0][0].title == "Example of Data Table 1" 55 | if sys.version_info.minor < 11: 56 | assert result[0][0].bbox == BBox(x1=235, y1=249, x2=1442, y2=543) 57 | assert (len(result[0][0].content), len(result[0][0].content[0])) == (5, 4) 58 | 59 | assert result[0][1].title == "Example of Data Table 2" 60 | if sys.version_info.minor < 11: 61 | assert result[0][1].bbox == BBox(x1=236, y1=672, x2=1452, y2=972) 62 | assert (len(result[0][1].content), len(result[0][1].content[0])) == (5, 4) 63 | 64 | assert result[1][0].title == "Example of Data Table 3" 65 | if sys.version_info.minor < 11: 66 | assert result[1][0].bbox == BBox(x1=235, y1=249, x2=1442, y2=543) 67 | assert (len(result[1][0].content), len(result[1][0].content[0])) == (5, 4) 68 | 69 | assert result[1][1].title == "Example of Data Table 4" 70 | if sys.version_info.minor < 11: 71 | assert result[1][1].bbox == BBox(x1=236, y1=672, x2=1452, y2=972) 72 | assert (len(result[1][1].content), len(result[1][1].content[0])) == (5, 4) 73 | 
-------------------------------------------------------------------------------- /tests/ocr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/ocr/__init__.py -------------------------------------------------------------------------------- /tests/ocr/aws_textract/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/ocr/aws_textract/__init__.py -------------------------------------------------------------------------------- /tests/ocr/aws_textract/test_aws_textract.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import json 3 | import os 4 | 5 | import polars as pl 6 | 7 | from img2table.document import Image 8 | from img2table.ocr import TextractOCR 9 | from img2table.ocr.data import OCRDataframe 10 | from tests import MOCK_DIR 11 | 12 | 13 | def test_map_response(mock_textract): 14 | img = Image("test_data/test.png") 15 | 16 | with open(os.path.join(MOCK_DIR, "textract.json"), "r") as f: 17 | resp = json.load(f) 18 | 19 | result = TextractOCR().map_response(response=resp, 20 | image=list(img.images)[0], 21 | page=0) 22 | 23 | with open("test_data/content.json", "r") as f: 24 | expected = json.load(f) 25 | 26 | assert result == expected 27 | 28 | 29 | def test_content(mock_textract): 30 | img = Image("test_data/test.png") 31 | ocr = TextractOCR() 32 | 33 | result = ocr.content(document=img) 34 | 35 | with open("test_data/content.json", "r") as f: 36 | expected = json.load(f) 37 | 38 | assert list(result) == [expected] 39 | 40 | 41 | def test_to_ocr_df(mock_textract): 42 | ocr = TextractOCR() 43 | with open("test_data/content.json", "r") as f: 44 | content = json.load(f) 45 | 46 | result = ocr.to_ocr_dataframe(content=[content]) 47 
| 48 | expected = OCRDataframe(df=pl.read_csv("test_data/ocr_df.csv", separator=";")) 49 | 50 | assert result == expected 51 | 52 | 53 | def test_textract_ocr(mock_textract): 54 | img = Image("test_data/test.png") 55 | ocr = TextractOCR(aws_access_key_id="aws_access_key_id", 56 | aws_secret_access_key="aws_secret_access_key", 57 | aws_session_token="aws_session_token", 58 | region="eu-west-1") 59 | 60 | result = ocr.of(document=img) 61 | 62 | expected = OCRDataframe(df=pl.read_csv("test_data/ocr_df.csv", separator=";")) 63 | 64 | assert result == expected 65 | -------------------------------------------------------------------------------- /tests/ocr/aws_textract/test_data/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/ocr/aws_textract/test_data/test.png -------------------------------------------------------------------------------- /tests/ocr/azure/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/ocr/azure/__init__.py -------------------------------------------------------------------------------- /tests/ocr/azure/test_azure.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import os 3 | import pickle 4 | 5 | import polars as pl 6 | import pytest 7 | 8 | from img2table.document import Image 9 | from img2table.ocr import AzureOCR 10 | from img2table.ocr.data import OCRDataframe 11 | from tests import MOCK_DIR 12 | 13 | 14 | def test_content(mock_azure): 15 | img = Image("test_data/test.png") 16 | ocr = AzureOCR(endpoint="aa", subscription_key="bb") 17 | 18 | result = ocr.content(document=img) 19 | 20 | with open(os.path.join(MOCK_DIR, "azure.pkl"), "rb") as f: 21 | expected = pickle.load(f) 22 | 23 | assert list(result) == 
[expected] 24 | 25 | 26 | def test_to_ocr_df(mock_azure): 27 | ocr = AzureOCR(endpoint="aa", subscription_key="bb") 28 | with open(os.path.join(MOCK_DIR, "azure.pkl"), "rb") as f: 29 | content = pickle.load(f) 30 | 31 | result = ocr.to_ocr_dataframe(content=[content]) 32 | 33 | expected = OCRDataframe(df=pl.read_csv("test_data/ocr_df.csv", separator=";")) 34 | 35 | assert result == expected 36 | 37 | 38 | def test_azure_ocr(mock_azure): 39 | # Test init error 40 | with pytest.raises(TypeError) as e_info: 41 | AzureOCR(subscription_key=8, endpoint="a") 42 | 43 | with pytest.raises(TypeError) as e_info: 44 | AzureOCR(subscription_key="a", endpoint=0) 45 | 46 | with pytest.raises(ValueError) as e_info: 47 | AzureOCR(subscription_key="a") 48 | 49 | with pytest.raises(ValueError) as e_info: 50 | AzureOCR(subscription_key="a") 51 | 52 | img = Image("test_data/test.png") 53 | ocr = AzureOCR(endpoint="aa", subscription_key="bb") 54 | 55 | result = ocr.of(document=img) 56 | 57 | expected = OCRDataframe(df=pl.read_csv("test_data/ocr_df.csv", separator=";")) 58 | 59 | assert result == expected 60 | -------------------------------------------------------------------------------- /tests/ocr/azure/test_data/ocr_df.csv: -------------------------------------------------------------------------------- 1 | page;class;id;parent;value;confidence;x1;y1;x2;y2 2 | 0;ocrx_word;word_1_1;word_1_1;Title;100;383;38;422;56 3 | 0;ocrx_word;word_1_2;word_1_2;Test;99;965;39;1001;57 4 | 0;ocrx_word;word_1_3;word_1_2;1;100;1004;39;1015;57 5 | 0;ocrx_word;word_1_4;word_1_3;Test;99;1061;38;1096;57 6 | 0;ocrx_word;word_1_5;word_1_3;2;100;1100;38;1111;57 7 | 0;ocrx_word;word_1_6;word_1_4;Line;98;38;89;75;108 8 | 0;ocrx_word;word_1_7;word_1_4;1;100;80;89;91;108 9 | 0;ocrx_word;word_1_8;word_1_4;-;99;94;89;104;109 10 | 0;ocrx_word;word_1_9;word_1_4;Col;100;107;89;137;109 11 | 0;ocrx_word;word_1_10;word_1_4;1;99;141;88;153;109 12 | 0;ocrx_word;word_1_11;word_1_5;Line;98;278;89;315;109 13 | 
0;ocrx_word;word_1_12;word_1_5;1;100;319;89;331;109 14 | 0;ocrx_word;word_1_13;word_1_5;-;100;335;89;343;109 15 | 0;ocrx_word;word_1_14;word_1_5;Col;100;347;88;377;109 16 | 0;ocrx_word;word_1_15;word_1_5;2;100;381;88;393;109 17 | 0;ocrx_word;word_1_16;word_1_6;Line;99;499;89;533;108 18 | 0;ocrx_word;word_1_17;word_1_6;1;99;538;89;549;108 19 | 0;ocrx_word;word_1_18;word_1_6;-;100;553;89;562;108 20 | 0;ocrx_word;word_1_19;word_1_6;Col;100;566;89;596;108 21 | 0;ocrx_word;word_1_20;word_1_6;3;100;599;88;611;108 22 | 0;ocrx_word;word_1_21;word_1_7;Test;99;964;89;1000;108 23 | 0;ocrx_word;word_1_22;word_1_7;3;100;1003;89;1014;108 24 | 0;ocrx_word;word_1_23;word_1_8;Test;99;1060;89;1096;108 25 | 0;ocrx_word;word_1_24;word_1_8;4;100;1099;89;1111;109 26 | 0;ocrx_word;word_1_25;word_1_9;Line;99;39;140;74;159 27 | 0;ocrx_word;word_1_26;word_1_9;2;99;79;141;91;159 28 | 0;ocrx_word;word_1_27;word_1_9;-;100;94;141;103;159 29 | 0;ocrx_word;word_1_28;word_1_9;Col;100;107;141;138;160 30 | 0;ocrx_word;word_1_29;word_1_9;1;100;142;140;152;160 31 | 0;ocrx_word;word_1_30;word_1_10;Line;99;497;140;533;159 32 | 0;ocrx_word;word_1_31;word_1_10;2;99;537;141;550;159 33 | 0;ocrx_word;word_1_32;word_1_10;-;100;554;141;563;159 34 | 0;ocrx_word;word_1_33;word_1_10;Col;100;566;140;596;159 35 | 0;ocrx_word;word_1_34;word_1_10;3;100;599;140;610;159 36 | 0;ocrx_word;word_1_35;word_1_11;Line;98;38;191;74;210 37 | 0;ocrx_word;word_1_36;word_1_11;3;100;80;191;91;210 38 | 0;ocrx_word;word_1_37;word_1_11;-;100;94;191;103;210 39 | 0;ocrx_word;word_1_38;word_1_11;Col;100;107;191;139;211 40 | 0;ocrx_word;word_1_39;word_1_11;1;100;142;190;153;211 41 | 0;ocrx_word;word_1_40;word_1_12;Merged;99;327;191;396;213 42 | 0;ocrx_word;word_1_41;word_1_12;Cells;100;400;190;444;213 43 | 0;ocrx_word;word_1_42;word_1_13;Line;99;498;191;533;210 44 | 0;ocrx_word;word_1_43;word_1_13;3;100;537;191;548;210 45 | 0;ocrx_word;word_1_44;word_1_13;-;100;553;191;562;210 46 | 0;ocrx_word;word_1_45;word_1_13;Col;100;566;191;595;210 
47 | 0;ocrx_word;word_1_46;word_1_13;3;100;598;191;610;210 48 | 0;ocrx_word;word_1_47;word_1_14;Line;98;38;242;75;261 49 | 0;ocrx_word;word_1_48;word_1_14;4;100;79;242;91;262 50 | 0;ocrx_word;word_1_49;word_1_14;-;99;94;242;104;262 51 | 0;ocrx_word;word_1_50;word_1_14;Col;100;107;242;138;262 52 | 0;ocrx_word;word_1_51;word_1_14;1;100;142;242;153;262 53 | 0;ocrx_word;word_1_52;word_1_15;Line;99;497;242;533;262 54 | 0;ocrx_word;word_1_53;word_1_15;4;100;538;242;549;261 55 | 0;ocrx_word;word_1_54;word_1_15;-;100;554;242;562;261 56 | 0;ocrx_word;word_1_55;word_1_15;Col;100;566;242;596;261 57 | 0;ocrx_word;word_1_56;word_1_15;3;100;600;242;611;261 58 | 0;ocrx_word;word_1_57;word_1_16;Line;99;38;293;74;313 59 | 0;ocrx_word;word_1_58;word_1_16;5;98;78;293;91;313 60 | 0;ocrx_word;word_1_59;word_1_16;-;100;95;293;103;313 61 | 0;ocrx_word;word_1_60;word_1_16;Col;100;106;293;138;313 62 | 0;ocrx_word;word_1_61;word_1_16;1;100;141;293;153;313 63 | 0;ocrx_word;word_1_62;word_1_17;Line;98;278;293;314;313 64 | 0;ocrx_word;word_1_63;word_1_17;5;100;319;293;330;313 65 | 0;ocrx_word;word_1_64;word_1_17;-;100;335;293;343;313 66 | 0;ocrx_word;word_1_65;word_1_17;Col;100;347;292;376;313 67 | 0;ocrx_word;word_1_66;word_1_17;2;100;380;292;392;314 68 | 0;ocrx_word;word_1_67;word_1_18;Line;98;497;293;533;313 69 | 0;ocrx_word;word_1_68;word_1_18;5;98;537;293;550;313 70 | 0;ocrx_word;word_1_69;word_1_18;-;100;554;293;562;313 71 | 0;ocrx_word;word_1_70;word_1_18;Col;100;566;293;596;313 72 | 0;ocrx_word;word_1_71;word_1_18;3;100;599;293;611;313 73 | -------------------------------------------------------------------------------- /tests/ocr/azure/test_data/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/ocr/azure/test_data/test.png -------------------------------------------------------------------------------- /tests/ocr/data/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/ocr/data/__init__.py -------------------------------------------------------------------------------- /tests/ocr/data/test_data/expected_table.json: -------------------------------------------------------------------------------- 1 | [[{"x1": 439, "y1": 1581, "x2": 950, "y2": 1658, "content": "Number of Coils"}, {"x1": 950, "y1": 1581, "x2": 1580, "y2": 1658, "content": "Number of Paperclips"}], [{"x1": 439, "y1": 1658, "x2": 950, "y2": 1733, "content": "Craig\n5"}, {"x1": 950, "y1": 1658, "x2": 1580, "y2": 1733, "content": "Spirit of America,\n3, 5, 4"}], [{"x1": 439, "y1": 1733, "x2": 950, "y2": 1808, "content": "Gary Gabelich\n10"}, {"x1": 950, "y1": 1733, "x2": 1580, "y2": 1808, "content": "Blue Flame\n7, 8, 6"}], [{"x1": 439, "y1": 1808, "x2": 950, "y2": 1883, "content": "Richard Noble\n15"}, {"x1": 950, "y1": 1808, "x2": 1580, "y2": 1883, "content": "Thrust 2\n11, 10, 12"}], [{"x1": 439, "y1": 1883, "x2": 950, "y2": 1956, "content": "20\nAndy Green"}, {"x1": 950, "y1": 1883, "x2": 1580, "y2": 1956, "content": "15, 13, 14\nThrust SSC"}]] -------------------------------------------------------------------------------- /tests/ocr/data/test_data/table.json: -------------------------------------------------------------------------------- 1 | [[{"x1": 439, "y1": 1581, "x2": 950, "y2": 1658}, {"x1": 950, "y1": 1581, "x2": 1580, "y2": 1658}], [{"x1": 439, "y1": 1658, "x2": 950, "y2": 1733}, {"x1": 950, "y1": 1658, "x2": 1580, "y2": 1733}], [{"x1": 439, "y1": 1733, "x2": 950, "y2": 1808}, {"x1": 950, "y1": 1733, "x2": 1580, "y2": 1808}], [{"x1": 439, "y1": 1808, "x2": 950, "y2": 1883}, {"x1": 950, "y1": 1808, "x2": 1580, "y2": 1883}], [{"x1": 439, "y1": 1883, "x2": 950, "y2": 1956}, {"x1": 950, "y1": 1883, "x2": 1580, "y2": 1956}]] 
-------------------------------------------------------------------------------- /tests/ocr/data/test_ocr_data.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import json 3 | 4 | import polars as pl 5 | 6 | from img2table.ocr.data import OCRDataframe 7 | from img2table.tables.objects.cell import Cell 8 | from img2table.tables.objects.row import Row 9 | from img2table.tables.objects.table import Table 10 | 11 | 12 | def test_pages(): 13 | ocr_df = OCRDataframe(df=pl.read_csv("test_data/ocr_df.csv", separator=";")) 14 | 15 | ocr_df_page_0 = ocr_df.page(page_number=0) 16 | ocr_df_page_1 = ocr_df.page(page_number=1) 17 | 18 | assert isinstance(ocr_df_page_0, OCRDataframe) 19 | assert isinstance(ocr_df_page_1, OCRDataframe) 20 | 21 | assert not ocr_df_page_0 == ocr_df_page_1 22 | assert len(ocr_df_page_0.df) + len(ocr_df_page_1.df) == len(ocr_df.df) 23 | 24 | 25 | def test_get_text_cell(): 26 | ocr_df = OCRDataframe(df=pl.read_csv("test_data/ocr_df.csv", separator=";")) 27 | cell = Cell(x1=200, x2=800, y1=700, y2=850) 28 | 29 | result = ocr_df.get_text_cell(cell=cell, 30 | min_confidence=50, 31 | page_number=0) 32 | 33 | assert result == "http://www.landspeed.com/lsrinfo.asp.)\nUse these data to create\nChecklist for a Data Table." 
34 | 35 | 36 | def test_get_text_table(): 37 | ocr_df = OCRDataframe(df=pl.read_csv("test_data/ocr_df.csv", separator=";")) 38 | 39 | with open("test_data/table.json", "r") as f: 40 | table = Table(rows=[Row(cells=[Cell(**el) for el in row]) for row in json.load(f)]) 41 | 42 | result = ocr_df.get_text_table(table=table, 43 | page_number=0, 44 | min_confidence=50) 45 | 46 | with open("test_data/expected_table.json", "r") as f: 47 | expected = Table(rows=[Row(cells=[Cell(**el) for el in row]) for row in json.load(f)]) 48 | 49 | assert result == expected 50 | -------------------------------------------------------------------------------- /tests/ocr/doctr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/ocr/doctr/__init__.py -------------------------------------------------------------------------------- /tests/ocr/doctr/test_data/ocr.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/ocr/doctr/test_data/ocr.pkl -------------------------------------------------------------------------------- /tests/ocr/doctr/test_data/ocr_df.csv: -------------------------------------------------------------------------------- 1 | page;class;id;parent;value;confidence;x1;y1;x2;y2 2 | 0;ocrx_word;word_1_0_1;word_1_0;Title;100;383;38;425;59 3 | 0;ocrx_word;word_1_1_2;word_1_1;Test;100;962;38;1004;59 4 | 0;ocrx_word;word_1_1_3;word_1_1;1;100;1005;38;1020;59 5 | 0;ocrx_word;word_1_1_4;word_1_1;Test;100;1058;38;1100;59 6 | 0;ocrx_word;word_1_1_5;word_1_1;2;100;1100;38;1116;59 7 | 0;ocrx_word;word_1_2_6;word_1_2;Line;100;37;87;81;113 8 | 0;ocrx_word;word_1_2_7;word_1_2;1;100;81;90;97;110 9 | 0;ocrx_word;word_1_2_8;word_1_2;-;99;95;95;108;109 10 | 0;ocrx_word;word_1_2_9;word_1_2;Col;100;108;89;141;110 11 | 
0;ocrx_word;word_1_2_10;word_1_2;1;100;141;89;156;110 12 | 0;ocrx_word;word_1_3_11;word_1_3;Line;100;277;87;321;113 13 | 0;ocrx_word;word_1_3_12;word_1_3;1;100;320;90;337;110 14 | 0;ocrx_word;word_1_3_13;word_1_3;-;98;335;95;347;106 15 | 0;ocrx_word;word_1_3_14;word_1_3;Col;100;348;89;380;110 16 | 0;ocrx_word;word_1_3_15;word_1_3;2;100;381;90;396;110 17 | 0;ocrx_word;word_1_4_16;word_1_4;Line;100;496;87;540;113 18 | 0;ocrx_word;word_1_4_17;word_1_4;1;100;540;90;556;110 19 | 0;ocrx_word;word_1_4_18;word_1_4;-;99;553;95;567;106 20 | 0;ocrx_word;word_1_4_19;word_1_4;Col;100;567;89;600;110 21 | 0;ocrx_word;word_1_4_20;word_1_4;3;100;600;89;616;110 22 | 0;ocrx_word;word_1_5_21;word_1_5;Test;100;962;89;1004;110 23 | 0;ocrx_word;word_1_5_22;word_1_5;3;100;1004;89;1020;110 24 | 0;ocrx_word;word_1_5_23;word_1_5;Test;100;1058;89;1100;110 25 | 0;ocrx_word;word_1_5_24;word_1_5;4;100;1100;90;1116;110 26 | 0;ocrx_word;word_1_6_25;word_1_6;Line;100;39;139;79;162 27 | 0;ocrx_word;word_1_6_26;word_1_6;2;100;80;141;96;161 28 | 0;ocrx_word;word_1_6_27;word_1_6;-;94;95;147;107;158 29 | 0;ocrx_word;word_1_6_28;word_1_6;Col;97;108;141;141;161 30 | 0;ocrx_word;word_1_6_29;word_1_6;1;100;141;139;156;162 31 | 0;ocrx_word;word_1_7_30;word_1_7;Line;100;496;138;540;163 32 | 0;ocrx_word;word_1_7_31;word_1_7;2;100;540;141;556;162 33 | 0;ocrx_word;word_1_7_32;word_1_7;-;97;553;147;567;158 34 | 0;ocrx_word;word_1_7_33;word_1_7;Col;97;567;141;600;161 35 | 0;ocrx_word;word_1_7_34;word_1_7;3;100;600;139;616;161 36 | 0;ocrx_word;word_1_8_35;word_1_8;Line;100;39;191;79;212 37 | 0;ocrx_word;word_1_8_36;word_1_8;3;100;80;191;96;212 38 | 0;ocrx_word;word_1_8_37;word_1_8;-;100;96;197;109;210 39 | 0;ocrx_word;word_1_8_38;word_1_8;Col;100;108;191;140;212 40 | 0;ocrx_word;word_1_8_39;word_1_8;1;100;141;191;157;212 41 | 0;ocrx_word;word_1_9_40;word_1_9;Merged;100;328;191;400;215 42 | 0;ocrx_word;word_1_9_41;word_1_9;Cells;99;403;191;447;211 43 | 0;ocrx_word;word_1_10_42;word_1_10;Line;100;497;191;539;212 44 | 
0;ocrx_word;word_1_10_43;word_1_10;3;100;539;191;555;212 45 | 0;ocrx_word;word_1_10_44;word_1_10;-;100;555;197;568;210 46 | 0;ocrx_word;word_1_10_45;word_1_10;Col;100;567;191;600;212 47 | 0;ocrx_word;word_1_10_46;word_1_10;3;100;600;191;616;211 48 | 0;ocrx_word;word_1_11_47;word_1_11;Line;100;39;242;79;263 49 | 0;ocrx_word;word_1_11_48;word_1_11;4;100;80;242;96;262 50 | 0;ocrx_word;word_1_11_49;word_1_11;-;51;96;250;105;258 51 | 0;ocrx_word;word_1_11_50;word_1_11;Col;100;108;242;140;262 52 | 0;ocrx_word;word_1_11_51;word_1_11;1;100;141;242;156;263 53 | 0;ocrx_word;word_1_12_52;word_1_12;Line;100;497;242;539;263 54 | 0;ocrx_word;word_1_12_53;word_1_12;4;100;539;242;556;262 55 | 0;ocrx_word;word_1_12_54;word_1_12;=;53;555;250;563;258 56 | 0;ocrx_word;word_1_12_55;word_1_12;Col;100;567;242;600;262 57 | 0;ocrx_word;word_1_12_56;word_1_12;3;100;600;242;616;262 58 | 0;ocrx_word;word_1_13_57;word_1_13;Line;89;39;292;79;314 59 | 0;ocrx_word;word_1_13_58;word_1_13;5;100;80;292;96;312 60 | 0;ocrx_word;word_1_13_59;word_1_13;-;100;96;298;108;308 61 | 0;ocrx_word;word_1_13_60;word_1_13;Col;100;108;292;140;314 62 | 0;ocrx_word;word_1_13_61;word_1_13;1;100;141;292;157;314 63 | 0;ocrx_word;word_1_14_62;word_1_14;Line;89;279;292;319;314 64 | 0;ocrx_word;word_1_14_63;word_1_14;5;100;320;292;336;312 65 | 0;ocrx_word;word_1_14_64;word_1_14;-;99;336;299;344;307 66 | 0;ocrx_word;word_1_14_65;word_1_14;Col;100;348;292;380;314 67 | 0;ocrx_word;word_1_14_66;word_1_14;2;100;381;292;396;314 68 | 0;ocrx_word;word_1_15_67;word_1_15;Line;100;497;292;539;314 69 | 0;ocrx_word;word_1_15_68;word_1_15;5;100;540;292;555;312 70 | 0;ocrx_word;word_1_15_69;word_1_15;Col;100;567;292;600;314 71 | 0;ocrx_word;word_1_15_70;word_1_15;3;100;600;292;616;312 72 | -------------------------------------------------------------------------------- /tests/ocr/doctr/test_data/test.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/ocr/doctr/test_data/test.png -------------------------------------------------------------------------------- /tests/ocr/doctr/test_doctr.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import pickle 4 | import sys 5 | 6 | import polars as pl 7 | import pytest 8 | 9 | from img2table.document.image import Image 10 | from img2table.ocr import DocTR 11 | from img2table.ocr.data import OCRDataframe 12 | 13 | 14 | def format_content(content): 15 | output = { 16 | id_page: {id_line: [{"value": word.value, 17 | "confidence": round(word.confidence, 2), 18 | "geometry": word.geometry, 19 | } 20 | for word in line.words] 21 | for block in page.blocks for id_line, line in enumerate(block.lines) 22 | } 23 | for id_page, page in enumerate(content.pages) 24 | } 25 | 26 | return output 27 | 28 | 29 | @pytest.mark.skipif(sys.version_info >= (3, 12), reason="Error building with 3.12") 30 | @pytest.mark.skipif(sys.version_info < (3, 9), reason="Not supported anymore") 31 | def test_doctr_content(): 32 | instance = DocTR() 33 | doc = Image(src="test_data/test.png") 34 | 35 | result = instance.content(document=doc) 36 | 37 | with open("test_data/ocr.pkl", "rb") as f: 38 | expected = pickle.load(f) 39 | 40 | assert format_content(result) == format_content(expected) 41 | 42 | 43 | @pytest.mark.skipif(sys.version_info >= (3, 12), reason="Error building with 3.12") 44 | @pytest.mark.skipif(sys.version_info < (3, 9), reason="Not supported anymore") 45 | def test_doctr_ocr_df(): 46 | instance = DocTR() 47 | 48 | with open("test_data/ocr.pkl", "rb") as f: 49 | content = pickle.load(f) 50 | 51 | result = instance.to_ocr_dataframe(content=content) 52 | 53 | expected = OCRDataframe(df=pl.read_csv("test_data/ocr_df.csv", separator=";")) 54 | 55 | assert result == expected 56 | 57 | 58 | @pytest.mark.skipif(sys.version_info >= (3, 12), 
reason="Error building with 3.12") 59 | @pytest.mark.skipif(sys.version_info < (3, 9), reason="Not supported anymore") 60 | def test_doctr_document(): 61 | instance = DocTR() 62 | doc = Image(src="test_data/test.png") 63 | 64 | result = instance.of(document=doc) 65 | 66 | expected = OCRDataframe(df=pl.read_csv("test_data/ocr_df.csv", separator=";")) 67 | 68 | assert result.df.drop("confidence").equals(expected.df.drop("confidence")) 69 | -------------------------------------------------------------------------------- /tests/ocr/easyocr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/ocr/easyocr/__init__.py -------------------------------------------------------------------------------- /tests/ocr/easyocr/test_data/ocr.json: -------------------------------------------------------------------------------- 1 | [[[[[383, 37], [425, 37], [425, 57], [383, 57]], "Title", 0.9999929628110977], [[[962, 36], [1020, 36], [1020, 60], [962, 60]], "Test 1", 0.9936909608809552], [[[1058, 36], [1116, 36], [1116, 60], [1058, 60]], "Test 2", 0.8442916046434888], [[[39, 89], [97, 89], [97, 109], [39, 109]], "Line 1", 0.6476242997727373], [[[107, 89], [155, 89], [155, 109], [107, 109]], "Col 1", 0.9720920853320717], [[[279, 89], [395, 89], [395, 109], [279, 109]], "Line 1 - Col 2", 0.936260010219446], [[[497, 89], [615, 89], [615, 109], [497, 109]], "Line 1 - Col 3", 0.8419314287078425], [[[962, 86], [1020, 86], [1020, 110], [962, 110]], "Test 3", 0.5705159231431433], [[[1058, 86], [1116, 86], [1116, 110], [1058, 110]], "Test 4", 0.6396247875277982], [[[38, 138], [156, 138], [156, 162], [38, 162]], "Line 2 - Col 1", 0.8294489200341787], [[[496, 138], [616, 138], [616, 162], [496, 162]], "Line 2 - Col 3", 0.9406858946476558], [[[39, 191], [97, 191], [97, 211], [39, 211]], "Line 3", 0.9953379669670536], [[[107, 191], [155, 191], [155, 211], [107, 
211]], "Col 1", 0.9720920853320717], [[[327, 187], [448, 187], [448, 217], [327, 217]], "Merged Cells", 0.7461553269995271], [[[497, 191], [557, 191], [557, 211], [497, 211]], "Line 3", 0.9993883375538367], [[[567, 191], [615, 191], [615, 211], [567, 211]], "Col 3", 0.9479543226130566], [[[38, 240], [156, 240], [156, 264], [38, 264]], "Line 4 - Col 1", 0.7398924631614736], [[[496, 240], [556, 240], [556, 264], [496, 264]], "Line 4", 0.956354950430173], [[[566, 240], [616, 240], [616, 264], [566, 264]], "Col 3", 0.9885079303307873], [[[39, 293], [155, 293], [155, 313], [39, 313]], "Line 5 - Col 1", 0.9406334482299059], [[[279, 293], [395, 293], [395, 313], [279, 313]], "Line 5 - Col 2", 0.893512125952306], [[[497, 293], [615, 293], [615, 313], [497, 313]], "Line 5 - Col 3", 0.8414449973587579]]] -------------------------------------------------------------------------------- /tests/ocr/easyocr/test_data/ocr_df.csv: -------------------------------------------------------------------------------- 1 | page;class;id;parent;value;confidence;x1;y1;x2;y2 2 | 0;ocrx_word;word_1_1;word_1_1;Title;100;383;37;425;57 3 | 0;ocrx_word;word_1_2;word_1_2;Test 1;99;962;36;1020;60 4 | 0;ocrx_word;word_1_3;word_1_3;Test 2;84;1058;36;1116;60 5 | 0;ocrx_word;word_1_4;word_1_4;Line 1;65;39;89;97;109 6 | 0;ocrx_word;word_1_5;word_1_5;Col 1;97;107;89;155;109 7 | 0;ocrx_word;word_1_6;word_1_6;Line 1 - Col 2;94;279;89;395;109 8 | 0;ocrx_word;word_1_7;word_1_7;Line 1 - Col 3;84;497;89;615;109 9 | 0;ocrx_word;word_1_8;word_1_8;Test 3;57;962;86;1020;110 10 | 0;ocrx_word;word_1_9;word_1_9;Test 4;64;1058;86;1116;110 11 | 0;ocrx_word;word_1_10;word_1_10;Line 2 - Col 1;83;38;138;156;162 12 | 0;ocrx_word;word_1_11;word_1_11;Line 2 - Col 3;94;496;138;616;162 13 | 0;ocrx_word;word_1_12;word_1_12;Line 3;100;39;191;97;211 14 | 0;ocrx_word;word_1_13;word_1_13;Col 1;97;107;191;155;211 15 | 0;ocrx_word;word_1_14;word_1_14;Merged Cells;75;327;187;448;217 16 | 0;ocrx_word;word_1_15;word_1_15;Line 
3;100;497;191;557;211 17 | 0;ocrx_word;word_1_16;word_1_16;Col 3;95;567;191;615;211 18 | 0;ocrx_word;word_1_17;word_1_17;Line 4 - Col 1;74;38;240;156;264 19 | 0;ocrx_word;word_1_18;word_1_18;Line 4;96;496;240;556;264 20 | 0;ocrx_word;word_1_19;word_1_19;Col 3;99;566;240;616;264 21 | 0;ocrx_word;word_1_20;word_1_20;Line 5 - Col 1;94;39;293;155;313 22 | 0;ocrx_word;word_1_21;word_1_21;Line 5 - Col 2;89;279;293;395;313 23 | 0;ocrx_word;word_1_22;word_1_22;Line 5 - Col 3;84;497;293;615;313 24 | -------------------------------------------------------------------------------- /tests/ocr/easyocr/test_data/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/ocr/easyocr/test_data/test.png -------------------------------------------------------------------------------- /tests/ocr/easyocr/test_easyocr.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import json 4 | import sys 5 | from typing import Any 6 | 7 | import numpy as np 8 | import polars as pl 9 | import pytest 10 | 11 | from img2table.document.image import Image 12 | from img2table.ocr import EasyOCR 13 | from img2table.ocr.data import OCRDataframe 14 | 15 | 16 | def convert_np_types(obj: Any): 17 | if isinstance(obj, list): 18 | return [convert_np_types(element) for element in obj] 19 | elif isinstance(obj, dict): 20 | return {convert_np_types(k): convert_np_types(v) for k, v in obj.values()} 21 | elif isinstance(obj, tuple): 22 | return list(convert_np_types(element) for element in obj) 23 | elif isinstance(obj, np.int32): 24 | return int(obj) 25 | elif isinstance(obj, (np.float64, float)): 26 | return None 27 | else: 28 | return obj 29 | 30 | 31 | @pytest.mark.skipif(sys.version_info >= (3, 14), reason="Error building with 3.12") 32 | def test_validators(): 33 | with pytest.raises(TypeError) as e_info: 34 | ocr = 
EasyOCR(lang=12) 35 | 36 | 37 | @pytest.mark.skipif(sys.version_info >= (3, 14), reason="Error building with 3.12") 38 | def test_easyocr_content(): 39 | instance = EasyOCR() 40 | doc = Image(src="test_data/test.png") 41 | 42 | result = instance.content(document=doc) 43 | 44 | with open("test_data/ocr.json", "r") as f: 45 | expected = json.load(f) 46 | 47 | assert convert_np_types(result) == convert_np_types(expected) 48 | 49 | 50 | @pytest.mark.skipif(sys.version_info >= (3, 14), reason="Error building with 3.12") 51 | def test_easyocr_ocr_df(): 52 | instance = EasyOCR() 53 | 54 | with open("test_data/ocr.json", "r") as f: 55 | content = json.load(f) 56 | 57 | result = instance.to_ocr_dataframe(content=content) 58 | 59 | expected = OCRDataframe(df=pl.read_csv("test_data/ocr_df.csv", separator=";")) 60 | 61 | assert result == expected 62 | 63 | 64 | @pytest.mark.skipif(sys.version_info >= (3, 14), reason="Error building with 3.12") 65 | def test_easyocr_document(): 66 | instance = EasyOCR() 67 | doc = Image(src="test_data/test.png") 68 | 69 | result = instance.of(document=doc) 70 | 71 | expected = OCRDataframe(df=pl.read_csv("test_data/ocr_df.csv", separator=";")) 72 | 73 | assert result.df.drop("confidence").equals(expected.df.drop("confidence")) 74 | -------------------------------------------------------------------------------- /tests/ocr/google_vision/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/ocr/google_vision/__init__.py -------------------------------------------------------------------------------- /tests/ocr/google_vision/test_data/ocr_df.csv: -------------------------------------------------------------------------------- 1 | page;class;id;parent;value;confidence;x1;y1;x2;y2 2 | 0;ocrx_word;word_0_0_0_0;line_0_0_0;Line;99;41;90;74;105 3 | 0;ocrx_word;word_0_0_0_1;line_0_0_0;1;98;83;90;90;104 4 | 
0;ocrx_word;word_0_0_0_2;line_0_0_0;-;76;97;90;103;104 5 | 0;ocrx_word;word_0_0_0_3;line_0_0_0;Col;98;110;90;136;104 6 | 0;ocrx_word;word_0_0_0_4;line_0_0_0;1;98;142;90;151;104 7 | 0;ocrx_word;word_1_0_0_0;line_1_0_0;Line;99;40;142;75;157 8 | 0;ocrx_word;word_1_0_0_1;line_1_0_0;2;98;83;142;91;157 9 | 0;ocrx_word;word_1_0_0_2;line_1_0_0;-;85;97;142;104;157 10 | 0;ocrx_word;word_1_0_0_3;line_1_0_0;Col;97;110;142;137;157 11 | 0;ocrx_word;word_1_0_0_4;line_1_0_0;1;99;144;142;152;157 12 | 0;ocrx_word;word_2_0_0_0;line_2_0_0;Line;99;41;193;75;207 13 | 0;ocrx_word;word_2_0_0_1;line_2_0_0;3;97;82;193;91;207 14 | 0;ocrx_word;word_2_0_0_2;line_2_0_0;-;79;97;193;105;207 15 | 0;ocrx_word;word_2_0_0_3;line_2_0_0;Col;98;110;193;137;207 16 | 0;ocrx_word;word_2_0_0_4;line_2_0_0;1;99;144;193;152;207 17 | 0;ocrx_word;word_3_0_0_0;line_3_0_0;Line;99;41;243;75;258 18 | 0;ocrx_word;word_3_0_0_1;line_3_0_0;4;98;82;243;91;258 19 | 0;ocrx_word;word_3_0_0_2;line_3_0_0;Col;99;110;243;137;258 20 | 0;ocrx_word;word_3_0_0_3;line_3_0_0;1;98;143;243;152;258 21 | 0;ocrx_word;word_4_0_0_0;line_4_0_0;Line;99;40;295;75;309 22 | 0;ocrx_word;word_4_0_0_1;line_4_0_0;5;98;82;295;90;309 23 | 0;ocrx_word;word_4_0_0_2;line_4_0_0;-;78;97;295;104;309 24 | 0;ocrx_word;word_4_0_0_3;line_4_0_0;Col;97;109;295;136;309 25 | 0;ocrx_word;word_4_0_0_4;line_4_0_0;1;98;142;295;152;309 26 | 0;ocrx_word;word_5_0_0_0;line_5_0_0;Title;99;383;39;423;55 27 | 0;ocrx_word;word_6_0_0_0;line_6_0_0;Line;99;281;91;316;105 28 | 0;ocrx_word;word_6_0_0_1;line_6_0_0;1;98;323;91;330;105 29 | 0;ocrx_word;word_6_0_0_2;line_6_0_0;-;76;337;91;344;105 30 | 0;ocrx_word;word_6_0_0_3;line_6_0_0;Col;91;350;91;378;105 31 | 0;ocrx_word;word_6_0_0_4;line_6_0_0;2;98;383;91;392;105 32 | 0;ocrx_word;word_7_0_0_0;line_7_0_0;Merged;99;329;191;395;211 33 | 0;ocrx_word;word_7_0_0_1;line_7_0_0;Cells;97;402;191;442;209 34 | 0;ocrx_word;word_8_0_0_0;line_8_0_0;Line;98;280;295;316;309 35 | 0;ocrx_word;word_8_0_0_1;line_8_0_0;5;97;323;295;332;309 36 | 
0;ocrx_word;word_8_0_0_2;line_8_0_0;-;79;338;295;345;309 37 | 0;ocrx_word;word_8_0_0_3;line_8_0_0;Col;89;349;295;378;309 38 | 0;ocrx_word;word_8_0_0_4;line_8_0_0;2;98;383;295;392;309 39 | 0;ocrx_word;word_9_0_0_0;line_9_0_0;Line;99;499;91;534;105 40 | 0;ocrx_word;word_9_0_0_1;line_9_0_0;1;98;542;91;549;105 41 | 0;ocrx_word;word_9_0_0_2;line_9_0_0;-;75;556;91;563;105 42 | 0;ocrx_word;word_9_0_0_3;line_9_0_0;Col;94;569;91;596;105 43 | 0;ocrx_word;word_9_0_0_4;line_9_0_0;3;98;601;91;613;105 44 | 0;ocrx_word;word_10_0_0_0;line_10_0_0;Line;99;499;142;534;156 45 | 0;ocrx_word;word_10_0_0_1;line_10_0_0;2;98;541;142;550;156 46 | 0;ocrx_word;word_10_0_0_2;line_10_0_0;-;79;556;142;563;156 47 | 0;ocrx_word;word_10_0_0_3;line_10_0_0;Col;90;569;142;596;156 48 | 0;ocrx_word;word_10_0_0_4;line_10_0_0;3;98;603;142;612;156 49 | 0;ocrx_word;word_11_0_0_0;line_11_0_0;Line;99;500;193;534;207 50 | 0;ocrx_word;word_11_0_0_1;line_11_0_0;3;98;541;193;550;207 51 | 0;ocrx_word;word_11_0_0_2;line_11_0_0;-;79;556;193;564;207 52 | 0;ocrx_word;word_11_0_0_3;line_11_0_0;Col;92;569;193;597;207 53 | 0;ocrx_word;word_11_0_0_4;line_11_0_0;3;98;602;193;612;207 54 | 0;ocrx_word;word_12_0_0_0;line_12_0_0;Line;99;499;243;534;258 55 | 0;ocrx_word;word_12_0_0_1;line_12_0_0;4;97;541;243;550;258 56 | 0;ocrx_word;word_12_0_0_2;line_12_0_0;-;70;556;243;562;258 57 | 0;ocrx_word;word_12_0_0_3;line_12_0_0;Col;98;569;243;595;258 58 | 0;ocrx_word;word_12_0_0_4;line_12_0_0;3;98;600;244;611;259 59 | 0;ocrx_word;word_12_1_0_0;line_12_1_0;Line;99;500;294;534;309 60 | 0;ocrx_word;word_12_1_0_1;line_12_1_0;5;97;542;294;551;309 61 | 0;ocrx_word;word_12_1_0_2;line_12_1_0;-;72;556;294;563;309 62 | 0;ocrx_word;word_12_1_0_3;line_12_1_0;Col;97;569;294;597;309 63 | 0;ocrx_word;word_12_1_0_4;line_12_1_0;3;98;601;294;613;309 64 | 0;ocrx_word;word_13_0_0_0;line_13_0_0;Test;98;965;41;1001;54 65 | 0;ocrx_word;word_13_0_0_1;line_13_0_0;1;98;1006;41;1015;54 66 | 0;ocrx_word;word_14_0_0_0;line_14_0_0;Test;99;965;91;1001;106 67 | 
0;ocrx_word;word_14_0_0_1;line_14_0_0;3;99;1006;91;1016;106 68 | 0;ocrx_word;word_15_0_0_0;line_15_0_0;Test;98;1061;40;1095;54 69 | 0;ocrx_word;word_15_0_0_1;line_15_0_0;2;98;1101;40;1110;53 70 | 0;ocrx_word;word_16_0_0_0;line_16_0_0;Test;95;1061;91;1096;104 71 | 0;ocrx_word;word_16_0_0_1;line_16_0_0;4;98;1102;91;1112;104 72 | -------------------------------------------------------------------------------- /tests/ocr/google_vision/test_data/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/ocr/google_vision/test_data/test.png -------------------------------------------------------------------------------- /tests/ocr/google_vision/test_google_vision.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import json 3 | import os 4 | import pickle 5 | 6 | import polars as pl 7 | import pytest 8 | 9 | from img2table.document import Image 10 | from img2table.ocr.data import OCRDataframe 11 | from img2table.ocr.google_vision import VisionEndpointContent, VisionAPIContent, VisionOCR 12 | from tests import MOCK_DIR 13 | 14 | 15 | def test_vision_endpoint_content(mock_vision): 16 | image = Image("test_data/test.png") 17 | content = VisionEndpointContent(api_key="api_key", timeout=10) 18 | 19 | with open("test_data/expected_content.json", "r") as f: 20 | expected = json.load(f) 21 | 22 | # Test for map_response method 23 | with open(os.path.join(MOCK_DIR, "vision.json"), "r") as f: 24 | response = json.load(f) 25 | 26 | result_map_response = content.map_response(response=response, page=0, height=417, width=1365) 27 | assert result_map_response == expected[0] 28 | 29 | # Test for get_content method 30 | result_get_content = content.get_content(document=image) 31 | assert result_get_content == expected 32 | 33 | 34 | def test_vision_api_content(mock_vision): 35 | image = 
Image("test_data/test.png") 36 | content = VisionAPIContent(timeout=10) 37 | 38 | with open("test_data/expected_content.json", "r") as f: 39 | expected = json.load(f) 40 | 41 | # Test for map_response method 42 | with open(os.path.join(MOCK_DIR, "vision.pkl"), "rb") as f: 43 | response = pickle.load(f) 44 | 45 | result_map_response = content.map_response(response=response, shapes=[(417, 1365)]) 46 | assert result_map_response == expected 47 | 48 | # Test for get_content method 49 | result_get_content = content.get_content(document=image) 50 | assert result_get_content == expected 51 | 52 | 53 | def test_vision_ocr(mock_vision): 54 | image = Image("test_data/test.png") 55 | 56 | with open("test_data/expected_content.json", "r") as f: 57 | content = json.load(f) 58 | 59 | expected_ocr_df = OCRDataframe(df=pl.read_csv("test_data/ocr_df.csv", separator=";")) 60 | 61 | # Test init error 62 | with pytest.raises(TypeError) as e_info: 63 | VisionOCR(api_key=8) 64 | 65 | with pytest.raises(ValueError) as e_info: 66 | VisionOCR() 67 | 68 | # Test with api_key 69 | ocr_key = VisionOCR(timeout=10, api_key="api_key") 70 | 71 | result_to_ocr_df = ocr_key.to_ocr_dataframe(content=content) 72 | assert result_to_ocr_df == expected_ocr_df 73 | 74 | result_ocr_df = ocr_key.of(document=image) 75 | assert result_ocr_df == expected_ocr_df 76 | 77 | # Test with credentials 78 | os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "creds" 79 | ocr_creds = VisionOCR(timeout=10) 80 | 81 | result_to_ocr_df = ocr_creds.to_ocr_dataframe(content=content) 82 | assert result_to_ocr_df == expected_ocr_df 83 | 84 | result_ocr_df = ocr_creds.of(document=image) 85 | assert result_ocr_df == expected_ocr_df 86 | -------------------------------------------------------------------------------- /tests/ocr/paddle/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/ocr/paddle/__init__.py -------------------------------------------------------------------------------- /tests/ocr/paddle/test_data/hocr.json: -------------------------------------------------------------------------------- 1 | [[[[381.0, 33.0], [426.0, 37.0], [424.0, 60.0], [379.0, 56.0]], ["Title", 1.0]], [[[964.0, 38.0], [1019.0, 38.0], [1019.0, 58.0], [964.0, 58.0]], ["Test 1", 0.96]], [[[1061.0, 36.0], [1115.0, 36.0], [1115.0, 58.0], [1061.0, 58.0]], ["Test 2", 1.0]], [[[40.0, 88.0], [155.0, 88.0], [155.0, 109.0], [40.0, 109.0]], ["Line 1 - Col 1", 0.93]], [[[282.0, 88.0], [395.0, 88.0], [395.0, 109.0], [282.0, 109.0]], ["Line 1 -Col 2", 0.94]], [[[499.0, 88.0], [614.0, 88.0], [614.0, 109.0], [499.0, 109.0]], ["Line 1-Col 3", 0.94]], [[[962.0, 90.0], [1019.0, 86.0], [1020.0, 108.0], [964.0, 112.0]], ["Test 3", 1.0]], [[[1061.0, 88.0], [1116.0, 88.0], [1116.0, 110.0], [1061.0, 110.0]], ["Test 4", 1.0]], [[[41.0, 140.0], [155.0, 140.0], [155.0, 161.0], [41.0, 161.0]], ["Line 2-Col 1", 0.92]], [[[499.0, 139.0], [614.0, 139.0], [614.0, 159.0], [499.0, 159.0]], ["Line2-Col3", 0.92]], [[[40.0, 191.0], [155.0, 191.0], [155.0, 211.0], [40.0, 211.0]], ["Line 3-Col 1", 0.93]], [[[327.0, 188.0], [446.0, 188.0], [446.0, 213.0], [327.0, 213.0]], ["Merged Cells", 0.99]], [[[499.0, 191.0], [616.0, 191.0], [616.0, 211.0], [499.0, 211.0]], ["Line 3-Col 3", 0.88]], [[[40.0, 242.0], [155.0, 242.0], [155.0, 262.0], [40.0, 262.0]], ["Line4-Col 1", 0.91]], [[[498.0, 242.0], [616.0, 242.0], [616.0, 262.0], [498.0, 262.0]], ["Line 4-Col 3", 0.88]], [[[40.0, 293.0], [155.0, 289.0], [155.0, 311.0], [40.0, 314.0]], ["Line 5 - Col 1", 0.98]], [[[282.0, 292.0], [395.0, 292.0], [395.0, 313.0], [282.0, 313.0]], ["Line 5-Col 2", 0.89]], [[[498.0, 292.0], [614.0, 292.0], [614.0, 313.0], [498.0, 313.0]], ["Line 5-Col 3", 0.91]]] 
-------------------------------------------------------------------------------- /tests/ocr/paddle/test_data/ocr_df.csv: -------------------------------------------------------------------------------- 1 | page;class;id;parent;value;confidence;x1;y1;x2;y2 2 | 0;ocrx_word;word_1_1;word_1_1;Title;100;379;33;426;60 3 | 0;ocrx_word;word_1_2;word_1_2;Test 1;96;964;38;1019;58 4 | 0;ocrx_word;word_1_3;word_1_3;Test 2;100;1061;36;1115;58 5 | 0;ocrx_word;word_1_4;word_1_4;Line 1 - Col 1;93;40;88;155;109 6 | 0;ocrx_word;word_1_5;word_1_5;Line 1 -Col 2;94;282;88;395;109 7 | 0;ocrx_word;word_1_6;word_1_6;Line 1-Col 3;94;499;88;614;109 8 | 0;ocrx_word;word_1_7;word_1_7;Test 3;100;962;86;1020;112 9 | 0;ocrx_word;word_1_8;word_1_8;Test 4;100;1061;88;1116;110 10 | 0;ocrx_word;word_1_9;word_1_9;Line 2-Col 1;92;41;140;155;161 11 | 0;ocrx_word;word_1_10;word_1_10;Line2-Col3;92;499;139;614;159 12 | 0;ocrx_word;word_1_11;word_1_11;Line 3-Col 1;93;40;191;155;211 13 | 0;ocrx_word;word_1_12;word_1_12;Merged Cells;99;327;188;446;213 14 | 0;ocrx_word;word_1_13;word_1_13;Line 3-Col 3;88;499;191;616;211 15 | 0;ocrx_word;word_1_14;word_1_14;Line4-Col 1;91;40;242;155;262 16 | 0;ocrx_word;word_1_15;word_1_15;Line 4-Col 3;88;498;242;616;262 17 | 0;ocrx_word;word_1_16;word_1_16;Line 5 - Col 1;98;40;289;155;314 18 | 0;ocrx_word;word_1_17;word_1_17;Line 5-Col 2;89;282;292;395;313 19 | 0;ocrx_word;word_1_18;word_1_18;Line 5-Col 3;91;498;292;614;313 20 | -------------------------------------------------------------------------------- /tests/ocr/paddle/test_data/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/ocr/paddle/test_data/test.png -------------------------------------------------------------------------------- /tests/ocr/paddle/test_paddle.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | 
import json 4 | import sys 5 | 6 | import cv2 7 | import polars as pl 8 | import pytest 9 | 10 | from img2table.document.image import Image 11 | from img2table.ocr.data import OCRDataframe 12 | 13 | 14 | @pytest.mark.skipif(sys.version_info >= (3, 13), reason="Error building with 3.12") 15 | def test_validators(): 16 | from img2table.ocr import PaddleOCR 17 | 18 | with pytest.raises(TypeError) as e_info: 19 | ocr = PaddleOCR(lang=12) 20 | 21 | 22 | @pytest.mark.skipif(sys.version_info >= (3, 13), reason="Error building with 3.12") 23 | def test_paddle_hocr(): 24 | from img2table.ocr import PaddleOCR 25 | 26 | instance = PaddleOCR() 27 | img = cv2.cvtColor(cv2.imread("test_data/test.png"), cv2.COLOR_BGR2RGB) 28 | 29 | result = instance.hocr(image=img) 30 | 31 | with open("test_data/hocr.json", "r") as f: 32 | expected = [[element[0], tuple(element[1])] for element in json.load(f)] 33 | 34 | assert result == expected 35 | 36 | 37 | @pytest.mark.skipif(sys.version_info >= (3, 13), reason="Error building with 3.12") 38 | def test_paddle_content(): 39 | from img2table.ocr import PaddleOCR 40 | 41 | instance = PaddleOCR() 42 | doc = Image(src="test_data/test.png") 43 | 44 | result = instance.content(document=doc) 45 | 46 | with open("test_data/hocr.json", "r") as f: 47 | expected = [[[element[0], tuple(element[1])] for element in json.load(f)]] 48 | 49 | assert result == expected 50 | 51 | 52 | @pytest.mark.skipif(sys.version_info >= (3, 13), reason="Error building with 3.12") 53 | def test_paddle_ocr_df(): 54 | from img2table.ocr import PaddleOCR 55 | 56 | instance = PaddleOCR() 57 | 58 | with open("test_data/hocr.json", "r") as f: 59 | content = [[[element[0], tuple(element[1])] for element in json.load(f)]] 60 | 61 | result = instance.to_ocr_dataframe(content=content) 62 | 63 | expected = OCRDataframe(df=pl.read_csv("test_data/ocr_df.csv", separator=";")) 64 | 65 | assert result == expected 66 | 67 | 68 | @pytest.mark.skipif(sys.version_info >= (3, 13), reason="Error 
building with 3.12") 69 | def test_paddle_document(): 70 | from img2table.ocr import PaddleOCR 71 | 72 | instance = PaddleOCR() 73 | doc = Image(src="test_data/test.png") 74 | 75 | result = instance.of(document=doc) 76 | 77 | expected = OCRDataframe(df=pl.read_csv("test_data/ocr_df.csv", separator=";")) 78 | 79 | assert result == expected 80 | -------------------------------------------------------------------------------- /tests/ocr/pdf/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/ocr/pdf/__init__.py -------------------------------------------------------------------------------- /tests/ocr/pdf/test_data/test.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/ocr/pdf/test_data/test.pdf -------------------------------------------------------------------------------- /tests/ocr/pdf/test_pdf_ocr.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import json 3 | 4 | import polars as pl 5 | 6 | from img2table.document.pdf import PDF 7 | from img2table.ocr.data import OCRDataframe 8 | from img2table.ocr.pdf import PdfOCR 9 | 10 | 11 | def test_pdf_content(mock_tesseract): 12 | instance = PdfOCR() 13 | doc = PDF(src="test_data/test.pdf", pages=[0, 1]) 14 | 15 | result = instance.content(document=doc) 16 | 17 | with open("test_data/content.json", "r") as f: 18 | expected = json.load(f) 19 | 20 | assert result == expected 21 | 22 | 23 | def test_pdf_ocr_df(): 24 | instance = PdfOCR() 25 | 26 | with open("test_data/content.json", "r") as f: 27 | content = json.load(f) 28 | 29 | result = instance.to_ocr_dataframe(content=content) 30 | 31 | df_expected = pl.read_csv("test_data/ocr_df.csv", separator=";") 32 | expected = 
OCRDataframe(df=df_expected) 33 | 34 | assert result == expected 35 | 36 | 37 | def test_pdf_document(): 38 | instance = PdfOCR() 39 | doc = PDF(src="test_data/test.pdf", pages=[0, 1]) 40 | 41 | result = instance.of(document=doc) 42 | 43 | df_expected = pl.read_csv("test_data/ocr_df.csv", separator=";") 44 | expected = OCRDataframe(df=df_expected) 45 | 46 | assert result == expected 47 | -------------------------------------------------------------------------------- /tests/ocr/surya/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/ocr/surya/__init__.py -------------------------------------------------------------------------------- /tests/ocr/surya/test_data/ocr_df.csv: -------------------------------------------------------------------------------- 1 | page;class;id;parent;value;confidence;x1;y1;x2;y2 2 | 0;ocrx_word;word_1_1_0;word_1_1;20 PAYS LES PLUS PEUPLÉS (EN TEMPS RÉEL);99;206;33;862;64 3 | 0;ocrx_word;word_1_2_0;word_1_2;k;57;94;132;118;149 4 | 0;ocrx_word;word_1_3_0;word_1_3;Chine;93;163;132;229;153 5 | 0;ocrx_word;word_1_4_0;word_1_4;Japon;97;675;132;741;153 6 | 0;ocrx_word;word_1_5_0;word_1_5;1 448 949 124;98;360;133;504;153 7 | 0;ocrx_word;word_1_6_0;word_1_6;11;93;563;133;587;151 8 | 0;ocrx_word;word_1_7_0;word_1_7;125 815 814;94;856;133;979;153 9 | 0;ocrx_word;word_1_8_0;word_1_8;1;93;60;134;74;150 10 | 0;ocrx_word;word_1_9_0;word_1_9;Ethiopie;98;675;193;768;216 11 | 0;ocrx_word;word_1_10_0;word_1_10;Inde;97;161;194;214;215 12 | 0;ocrx_word;word_1_11_0;word_1_11;1 403 805 173;98;361;194;505;216 13 | 0;ocrx_word;word_1_12_0;word_1_12;120 125 587;93;856;194;979;215 14 | 0;ocrx_word;word_1_13_0;word_1_13;2;81;57;195;76;215 15 | 0;ocrx_word;word_1_14_0;word_1_14;12 ;88;563;195;595;215 16 | 0;ocrx_word;word_1_15_0;word_1_15;138;23;599;196;651;214 17 | 0;ocrx_word;word_1_16_0;word_1_16;(a);45;102;197;126;211 18 | 
0;ocrx_word;word_1_17_0;word_1_17;États-Unis;98;162;255;275;278 19 | 0;ocrx_word;word_1_18_0;word_1_18;Philippines;99;675;257;796;279 20 | 0;ocrx_word;word_1_19_0;word_1_19;3;87;59;258;75;278 21 | 0;ocrx_word;word_1_20_0;word_1_20;334 378 540;94;379;258;504;278 22 | 0;ocrx_word;word_1_21_0;word_1_21;112 159 001;94;857;258;979;278 23 | 0;ocrx_word;word_1_22_0;word_1_22;13;93;562;259;591;278 24 | 0;ocrx_word;word_1_23_0;word_1_23;A;40;603;259;629;275 25 | 0;ocrx_word;word_1_24_0;word_1_24;Égypte;95;672;315;755;343 26 | 0;ocrx_word;word_1_25_0;word_1_25;Indonésie;98;163;319;268;340 27 | 0;ocrx_word;word_1_26_0;word_1_26;105 787 594;91;856;319;980;341 28 | 0;ocrx_word;word_1_27_0;word_1_27;278 605 610;96;379;320;505;340 29 | 0;ocrx_word;word_1_28_0;word_1_28;14;96;564;320;588;340 30 | 0;ocrx_word;word_1_29_0;word_1_29;4;93;59;322;74;340 31 | 0;ocrx_word;word_1_30_0;word_1_30;Brésil;94;161;382;227;404 32 | 0;ocrx_word;word_1_31_0;word_1_31;5;83;60;383;75;402 33 | 0;ocrx_word;word_1_32_0;word_1_32;215 194 443;98;379;383;504;404 34 | 0;ocrx_word;word_1_33_0;word_1_33;l ર;73;562;383;592;402 35 | 0;ocrx_word;word_1_34_0;word_1_34;★;84;596;383;651;401 36 | 0;ocrx_word;word_1_35_0;word_1_35;Vietnam;98;676;383;768;403 37 | 0;ocrx_word;word_1_36_0;word_1_36;98 872 186;89;867;383;980;404 38 | 0;ocrx_word;word_1_37_0;word_1_37;94 560 352;91;867;443;981;467 39 | 0;ocrx_word;word_1_38_0;word_1_38;l C;61;91;444;142;463 40 | 0;ocrx_word;word_1_39_0;word_1_39;Pakistan;97;161;444;255;465 41 | 0;ocrx_word;word_1_40_0;word_1_40;Congo;97;674;444;750;466 42 | 0;ocrx_word;word_1_41_0;word_1_41;6;87;58;445;77;464 43 | 0;ocrx_word;word_1_42_0;word_1_42;228 575 462;96;379;445;504;465 44 | 0;ocrx_word;word_1_43_0;word_1_43;16;91;563;445;593;465 45 | 0;ocrx_word;word_1_44_0;word_1_44;"";26;599;445;652;462 46 | 0;ocrx_word;word_1_45_0;word_1_45;Allemagne;98;677;506;794;529 47 | 0;ocrx_word;word_1_46_0;word_1_46;7 ;91;57;507;141;528 48 | 0;ocrx_word;word_1_47_0;word_1_47;Nigeria;97;162;507;244;529 
49 | 0;ocrx_word;word_1_48_0;word_1_48;215 401 411;92;380;508;503;528 50 | 0;ocrx_word;word_1_49_0;word_1_49;17;88;563;508;592;527 51 | 0;ocrx_word;word_1_50_0;word_1_50;84 247 226;90;866;508;980;528 52 | 0;ocrx_word;word_1_51_0;word_1_51;Bangladesh;99;162;569;286;591 53 | 0;ocrx_word;word_1_52_0;word_1_52;167 568 471;94;379;570;503;590 54 | 0;ocrx_word;word_1_53_0;word_1_53;personal program;47;604;570;653;590 55 | 0;ocrx_word;word_1_54_0;word_1_54;Iran;97;676;570;722;591 56 | 0;ocrx_word;word_1_55_0;word_1_55;85 891 049;92;867;570;979;591 57 | 0;ocrx_word;word_1_56_0;word_1_56;8;79;58;571;76;589 58 | 0;ocrx_word;word_1_57_0;word_1_57;18;93;563;571;591;590 59 | 0;ocrx_word;word_1_58_0;word_1_58;o;72;100;572;137;587 60 | 0;ocrx_word;word_1_59_0;word_1_59;Russie;96;163;632;237;653 61 | 0;ocrx_word;word_1_60_0;word_1_60;(+;59;604;632;647;652 62 | 0;ocrx_word;word_1_61_0;word_1_61;146 042 034;98;380;633;505;654 63 | 0;ocrx_word;word_1_62_0;word_1_62;Turquie;97;677;633;762;654 64 | 0;ocrx_word;word_1_63_0;word_1_63;85 934 290;93;866;633;980;654 65 | 0;ocrx_word;word_1_64_0;word_1_64;9;89;59;634;75;651 66 | 0;ocrx_word;word_1_65_0;word_1_65;19;96;562;634;590;653 67 | 0;ocrx_word;word_1_66_0;word_1_66;Thaïlande;95;678;693;783;716 68 | 0;ocrx_word;word_1_67_0;word_1_67;Mexique;97;161;694;258;716 69 | 0;ocrx_word;word_1_68_0;word_1_68;70 102 414;90;867;694;980;718 70 | 0;ocrx_word;word_1_69_0;word_1_69;131 312 546;95;379;695;505;716 71 | 0;ocrx_word;word_1_70_0;word_1_70;10;93;48;696;75;714 72 | 0;ocrx_word;word_1_71_0;word_1_71;19;19;101;697;125;713 73 | 0;ocrx_word;word_1_72_0;word_1_72;20;91;562;697;589;715 74 | -------------------------------------------------------------------------------- /tests/ocr/surya/test_data/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/ocr/surya/test_data/test.png 
-------------------------------------------------------------------------------- /tests/ocr/surya/test_surya.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import os 3 | import pickle 4 | import sys 5 | 6 | import polars as pl 7 | import pytest 8 | 9 | from img2table.document import Image 10 | from img2table.ocr import SuryaOCR 11 | from img2table.ocr.data import OCRDataframe 12 | from tests import MOCK_DIR 13 | 14 | 15 | @pytest.mark.skipif(sys.version_info < (3, 10), reason="Library not available") 16 | def test_content(mock_surya): 17 | img = Image("test_data/test.png") 18 | ocr = SuryaOCR(langs=["en"]) 19 | 20 | result = ocr.content(document=img) 21 | 22 | with open(os.path.join(MOCK_DIR, "surya.pkl"), "rb") as f: 23 | expected = pickle.load(f) 24 | 25 | assert result == expected 26 | 27 | 28 | @pytest.mark.skipif(sys.version_info < (3, 10), reason="Library not available") 29 | def test_to_ocr_df(): 30 | ocr = SuryaOCR(langs=["en"]) 31 | with open(os.path.join(MOCK_DIR, "surya.pkl"), "rb") as f: 32 | content = pickle.load(f) 33 | 34 | result = ocr.to_ocr_dataframe(content=content) 35 | 36 | expected = OCRDataframe(df=pl.read_csv("test_data/ocr_df.csv", separator=";")) 37 | 38 | assert result == expected 39 | 40 | 41 | @pytest.mark.skipif(sys.version_info < (3, 10), reason="Library not available") 42 | def test_surya_ocr(mock_surya): 43 | # Test init error 44 | with pytest.raises(TypeError) as e_info: 45 | SuryaOCR(langs=1) 46 | 47 | with pytest.raises(TypeError) as e_info: 48 | SuryaOCR(langs=[1, 2]) 49 | 50 | img = Image("test_data/test.png") 51 | ocr = SuryaOCR(langs=["en"]) 52 | 53 | result = ocr.of(document=img) 54 | 55 | expected = OCRDataframe(df=pl.read_csv("test_data/ocr_df.csv", separator=";")) 56 | 57 | assert result == expected 58 | -------------------------------------------------------------------------------- /tests/ocr/tesseract/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/ocr/tesseract/__init__.py -------------------------------------------------------------------------------- /tests/ocr/tesseract/test_data/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/ocr/tesseract/test_data/test.png -------------------------------------------------------------------------------- /tests/ocr/tesseract/test_tesseract.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import os 3 | 4 | import cv2 5 | import polars as pl 6 | import pytest 7 | 8 | from img2table.document.image import Image 9 | from img2table.ocr import TesseractOCR 10 | from img2table.ocr.data import OCRDataframe 11 | from tests import MOCK_DIR, TESSERACT_INSTALL 12 | 13 | 14 | def test_validators(): 15 | with pytest.raises(TypeError) as e_info: 16 | ocr = TesseractOCR(n_threads=[8]) 17 | 18 | with pytest.raises(TypeError) as e_info: 19 | ocr = TesseractOCR(lang=12) 20 | 21 | with pytest.raises(TypeError) as e_info: 22 | ocr = TesseractOCR(psm="r") 23 | 24 | 25 | @pytest.mark.skipif(TESSERACT_INSTALL, reason="Tesseract installed locally") 26 | def test_installed(): 27 | with pytest.raises(EnvironmentError) as e_info: 28 | ocr = TesseractOCR() 29 | 30 | 31 | def test_lang_validators(mock_tesseract): 32 | with pytest.raises(EnvironmentError) as e_info: 33 | ocr = TesseractOCR(lang="zzz") 34 | 35 | 36 | def test_tesseract_hocr(mock_tesseract): 37 | instance = TesseractOCR() 38 | img = cv2.imread("test_data/test.png", cv2.IMREAD_GRAYSCALE) 39 | 40 | result = instance.hocr(image=img) 41 | 42 | with open(os.path.join(MOCK_DIR, "tesseract_hocr.html"), 'r') as f: 43 | assert result == f.read() 44 | 45 | 46 | def 
test_tesseract_content(mock_tesseract): 47 | instance = TesseractOCR() 48 | doc = Image(src="test_data/test.png") 49 | 50 | result = instance.content(document=doc) 51 | 52 | with open(os.path.join(MOCK_DIR, "tesseract_hocr.html"), 'r') as f: 53 | assert list(result) == [f.read()] 54 | 55 | 56 | def test_tesseract_ocr_df(mock_tesseract): 57 | instance = TesseractOCR() 58 | 59 | with open(os.path.join(MOCK_DIR, "tesseract_hocr.html"), 'r') as f: 60 | content = [f.read()] 61 | 62 | result = instance.to_ocr_dataframe(content=content) 63 | 64 | expected = OCRDataframe(df=pl.read_csv("test_data/ocr_df.csv", separator=";")) 65 | 66 | assert result == expected 67 | 68 | 69 | def test_tesseract_document(mock_tesseract): 70 | instance = TesseractOCR() 71 | doc = Image(src="test_data/test.png") 72 | 73 | result = instance.of(document=doc) 74 | 75 | expected = OCRDataframe(df=pl.read_csv("test_data/ocr_df.csv", separator=";")) 76 | 77 | assert result == expected 78 | -------------------------------------------------------------------------------- /tests/tables/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/__init__.py -------------------------------------------------------------------------------- /tests/tables/image/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/image/__init__.py -------------------------------------------------------------------------------- /tests/tables/image/test_data/blank.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/image/test_data/blank.png 
-------------------------------------------------------------------------------- /tests/tables/image/test_data/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/image/test_data/test.png -------------------------------------------------------------------------------- /tests/tables/image/test_image.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import cv2 4 | 5 | from img2table.tables.image import TableImage 6 | 7 | 8 | def test_table_image(): 9 | image = cv2.cvtColor(cv2.imread("test_data/test.png"), cv2.COLOR_BGR2RGB) 10 | 11 | tb_image = TableImage(img=image, 12 | min_confidence=50) 13 | 14 | result = tb_image.extract_tables(implicit_rows=True) 15 | result = sorted(result, key=lambda tb: tb.x1 + tb.x2) 16 | 17 | assert (result[0].x1, result[0].y1, result[0].x2, result[0].y2) == (36, 21, 770, 327) 18 | assert (result[0].nb_rows, result[0].nb_columns) == (6, 3) 19 | 20 | assert (result[1].x1, result[1].y1, result[1].x2, result[1].y2) == (962, 21, 1154, 123) 21 | assert (result[1].nb_rows, result[1].nb_columns) == (2, 2) 22 | -------------------------------------------------------------------------------- /tests/tables/image/test_metrics.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import cv2 3 | 4 | from img2table.tables import threshold_dark_areas 5 | from img2table.tables.metrics import compute_char_length, compute_median_line_sep, compute_img_metrics 6 | 7 | 8 | def test_compute_char_length(): 9 | image = cv2.cvtColor(cv2.imread("test_data/test.png"), cv2.COLOR_BGR2RGB) 10 | thresh = threshold_dark_areas(img=image, char_length=11) 11 | 12 | char_length, thresh_chars, chars_array = compute_char_length(thresh=thresh) 13 | assert round(char_length, 2) == 9.0 14 | assert thresh_chars.shape == 
(417, 1365) 15 | 16 | image = 255 - cv2.cvtColor(cv2.imread("test_data/blank.png"), cv2.COLOR_BGR2GRAY) 17 | assert compute_char_length(thresh=image) == (None, None, None) 18 | 19 | 20 | def test_compute_median_line_sep(): 21 | image = cv2.cvtColor(cv2.imread("test_data/test.png"), cv2.COLOR_BGR2RGB) 22 | thresh = threshold_dark_areas(img=image, char_length=11) 23 | char_length, thresh_chars, chars_array = compute_char_length(thresh=thresh) 24 | 25 | median_line_sep, contours = compute_median_line_sep(thresh_chars=thresh_chars, 26 | chars_array=chars_array, 27 | char_length=char_length) 28 | 29 | assert round(median_line_sep, 2) == 51 30 | assert len(contours) == 71 31 | 32 | 33 | def test_compute_img_metrics(): 34 | image = cv2.cvtColor(cv2.imread("test_data/test.png"), cv2.COLOR_BGR2RGB) 35 | thresh = threshold_dark_areas(img=image, char_length=11) 36 | char_length, median_line_sep, contours = compute_img_metrics(thresh=thresh) 37 | 38 | assert round(char_length, 2) == 9.0 39 | assert round(median_line_sep, 2) == 51 40 | assert len(contours) == 71 41 | 42 | image = 255 - cv2.cvtColor(cv2.imread("test_data/blank.png"), cv2.COLOR_BGR2GRAY) 43 | assert compute_img_metrics(thresh=image) == (None, None, None) 44 | -------------------------------------------------------------------------------- /tests/tables/objects/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/objects/__init__.py -------------------------------------------------------------------------------- /tests/tables/objects/test_data/expected_tables.json: -------------------------------------------------------------------------------- 1 | [[[{"x1": 35, "y1": 20, "x2": 770, "y2": 71, "content": "Title"}, {"x1": 35, "y1": 20, "x2": 770, "y2": 71, "content": "Title"}, {"x1": 35, "y1": 20, "x2": 770, "y2": 71, "content": "Title"}], [{"x1": 35, "y1": 71, "x2": 276, 
"y2": 123, "content": "Line 1-Col 1"}, {"x1": 276, "y1": 71, "x2": 494, "y2": 123, "content": "Line Col 2"}, {"x1": 494, "y1": 71, "x2": 770, "y2": 123, "content": "Line 1-Col 3"}], [{"x1": 35, "y1": 123, "x2": 276, "y2": 173, "content": "Line Col 1"}, {"x1": 276, "y1": 123, "x2": 494, "y2": 275, "content": "Merged Cells"}, {"x1": 494, "y1": 123, "x2": 770, "y2": 173, "content": "Line 2-Col3"}], [{"x1": 35, "y1": 173, "x2": 276, "y2": 225, "content": "Line 3-Col1"}, {"x1": 276, "y1": 123, "x2": 494, "y2": 275, "content": "Merged Cells"}, {"x1": 494, "y1": 173, "x2": 770, "y2": 225, "content": "Line 3-Col3"}], [{"x1": 35, "y1": 225, "x2": 276, "y2": 275, "content": "Line 4-Col1"}, {"x1": 276, "y1": 123, "x2": 494, "y2": 275, "content": "Merged Cells"}, {"x1": 494, "y1": 225, "x2": 770, "y2": 275, "content": "Line 4-Col3"}], [{"x1": 35, "y1": 275, "x2": 276, "y2": 326, "content": "Line"}, {"x1": 276, "y1": 275, "x2": 494, "y2": 326, "content": "Line 5-Col2"}, {"x1": 494, "y1": 275, "x2": 770, "y2": 326, "content": "Line 3"}]], [[{"x1": 961, "y1": 21, "x2": 1058, "y2": 71, "content": "Test 1"}, {"x1": 1058, "y1": 21, "x2": 1154, "y2": 71, "content": "Test 2"}], [{"x1": 961, "y1": 71, "x2": 1058, "y2": 123, "content": "Test 3"}, {"x1": 1058, "y1": 71, "x2": 1154, "y2": 123, "content": "Test 4"}]]] -------------------------------------------------------------------------------- /tests/tables/objects/test_data/table.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | 9 | 10 | 11 | 14 | 17 | 18 |
4 | Test 1 5 | 7 | Test 2 8 |
12 | Test 3 13 | 15 | Test 4 16 |
-------------------------------------------------------------------------------- /tests/tables/objects/test_data/tables.json: -------------------------------------------------------------------------------- 1 | [[[{"x1": 35, "y1": 20, "x2": 770, "y2": 71}, {"x1": 35, "y1": 20, "x2": 770, "y2": 71}, {"x1": 35, "y1": 20, "x2": 770, "y2": 71}], [{"x1": 35, "y1": 71, "x2": 276, "y2": 123}, {"x1": 276, "y1": 71, "x2": 494, "y2": 123}, {"x1": 494, "y1": 71, "x2": 770, "y2": 123}], [{"x1": 35, "y1": 123, "x2": 276, "y2": 173}, {"x1": 276, "y1": 123, "x2": 494, "y2": 275}, {"x1": 494, "y1": 123, "x2": 770, "y2": 173}], [{"x1": 35, "y1": 173, "x2": 276, "y2": 225}, {"x1": 276, "y1": 123, "x2": 494, "y2": 275}, {"x1": 494, "y1": 173, "x2": 770, "y2": 225}], [{"x1": 35, "y1": 225, "x2": 276, "y2": 275}, {"x1": 276, "y1": 123, "x2": 494, "y2": 275}, {"x1": 494, "y1": 225, "x2": 770, "y2": 275}], [{"x1": 35, "y1": 275, "x2": 276, "y2": 326}, {"x1": 276, "y1": 275, "x2": 494, "y2": 326}, {"x1": 494, "y1": 275, "x2": 770, "y2": 326}]], [[{"x1": 961, "y1": 21, "x2": 1058, "y2": 71}, {"x1": 1058, "y1": 21, "x2": 1154, "y2": 71}], [{"x1": 961, "y1": 71, "x2": 1058, "y2": 123}, {"x1": 1058, "y1": 71, "x2": 1154, "y2": 123}]]] -------------------------------------------------------------------------------- /tests/tables/objects/test_extraction.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import json 3 | from io import BytesIO 4 | 5 | from xlsxwriter import Workbook 6 | 7 | from img2table.tables.objects.cell import Cell 8 | from img2table.tables.objects.extraction import create_all_rectangles, CellPosition, TableCell, BBox, CellSpan 9 | from img2table.tables.objects.row import Row 10 | from img2table.tables.objects.table import Table 11 | 12 | 13 | def test_create_all_rectangles(): 14 | c = TableCell(bbox=BBox(x1=0, y1=0, x2=0, y2=0), value="Test") 15 | cell_positions = [CellPosition(cell=c, row=0, col=0), 
CellPosition(cell=c, row=1, col=0), 16 | CellPosition(cell=c, row=2, col=0), CellPosition(cell=c, row=3, col=0), 17 | CellPosition(cell=c, row=0, col=1), CellPosition(cell=c, row=1, col=1), 18 | CellPosition(cell=c, row=2, col=1), CellPosition(cell=c, row=3, col=1), 19 | CellPosition(cell=c, row=2, col=2), CellPosition(cell=c, row=3, col=2), 20 | CellPosition(cell=c, row=2, col=3), CellPosition(cell=c, row=3, col=3), 21 | ] 22 | 23 | result = create_all_rectangles(cell_positions=cell_positions) 24 | 25 | assert result == [CellSpan(top_row=0, bottom_row=3, col_left=0, col_right=1, value='Test'), 26 | CellSpan(top_row=2, bottom_row=3, col_left=2, col_right=3, value='Test')] 27 | 28 | 29 | def test_table_html(): 30 | with open("test_data/expected_tables.json", "r") as f: 31 | table = [Table(rows=[Row(cells=[Cell(**el) for el in row]) for row in tb]) 32 | for tb in json.load(f)].pop() 33 | 34 | with open("test_data/table.html", "r") as f: 35 | expected = f.read() 36 | 37 | assert table.extracted_table.html == expected 38 | 39 | 40 | def test_extracted_table_worksheet(): 41 | with open("test_data/expected_tables.json", "r") as f: 42 | tables = [Table(rows=[Row(cells=[Cell(**el) for el in row]) for row in tb]) 43 | for tb in json.load(f)] 44 | 45 | wb = Workbook(BytesIO()) 46 | for table in tables: 47 | ws = wb.add_worksheet() 48 | extracted_table = table.extracted_table 49 | extracted_table._to_worksheet(sheet=ws) 50 | 51 | assert ws.dim_colmax + 1 == table.nb_columns 52 | assert ws.dim_rowmax + 1 == table.nb_rows 53 | 54 | str_map = {v: k for k, v in ws.str_table.string_table.items()} 55 | ws_values = sorted([str_map.get(c.string) for row in ws.table.values() for c in row.values()]) 56 | table_values = sorted(set([c.value for row in extracted_table.content.values() 57 | for c in row])) 58 | assert ws_values == table_values 59 | 60 | wb.close() 61 | -------------------------------------------------------------------------------- /tests/tables/objects/test_line.py: 
-------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | from img2table.tables.objects.line import Line 4 | 5 | 6 | def test_line(): 7 | line = Line(x1=0, y1=20, x2=46, y2=73) 8 | 9 | assert round(line.angle) == 49 10 | assert line.width == 46 11 | assert line.height == 53 12 | assert round(line.length) == 70 13 | assert not line.vertical 14 | assert not line.horizontal 15 | 16 | 17 | def test_reprocess_line(): 18 | line = Line(x1=20, y1=73, x2=19, y2=20, thickness=18) 19 | 20 | reprocessed_line = line.reprocess() 21 | assert reprocessed_line == Line(x1=20, x2=20, y1=20, y2=73, thickness=18) 22 | assert reprocessed_line.vertical 23 | -------------------------------------------------------------------------------- /tests/tables/objects/test_row.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from img2table.tables.objects.cell import Cell 3 | from img2table.tables.objects.row import Row 4 | 5 | 6 | def test_row(): 7 | row = Row(cells=[Cell(x1=0, x2=20, y1=0, y2=20), Cell(x1=20, x2=40, y1=0, y2=20)]) 8 | 9 | assert row.x1 == 0 10 | assert row.y1 == 0 11 | assert row.x2 == 40 12 | assert row.y2 == 20 13 | assert row.nb_columns == 2 14 | assert row.v_consistent 15 | 16 | 17 | def test_add_cells(): 18 | row = Row(cells=[Cell(x1=0, x2=20, y1=0, y2=20), Cell(x1=20, x2=40, y1=0, y2=20)]) 19 | 20 | row.add_cells(cells=Cell(x1=40, x2=60, y1=0, y2=20)) 21 | 22 | assert row.nb_columns == 3 23 | assert row.x2 == 60 24 | 25 | 26 | def test_split_in_rows(): 27 | row = Row(cells=[Cell(x1=0, x2=20, y1=0, y2=20), Cell(x1=20, x2=40, y1=0, y2=20)]) 28 | 29 | rows_splitted = row.split_in_rows(vertical_delimiters=[10, 15]) 30 | 31 | expected = [Row(cells=[Cell(x1=0, x2=20, y1=0, y2=10), Cell(x1=20, x2=40, y1=0, y2=10)]), 32 | Row(cells=[Cell(x1=0, x2=20, y1=10, y2=15), Cell(x1=20, x2=40, y1=10, y2=15)]), 33 | Row(cells=[Cell(x1=0, x2=20, y1=15, y2=20), Cell(x1=20, x2=40, 
y1=15, y2=20)]) 34 | ] 35 | 36 | assert rows_splitted == expected 37 | -------------------------------------------------------------------------------- /tests/tables/objects/test_table.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import json 3 | 4 | import polars as pl 5 | 6 | from img2table.ocr.data import OCRDataframe 7 | from img2table.tables.objects.cell import Cell 8 | from img2table.tables.objects.row import Row 9 | from img2table.tables.objects.table import Table 10 | 11 | 12 | def test_remove_rows(): 13 | table = Table(rows=[Row(cells=[Cell(x1=0, x2=100, y1=0, y2=10)]), 14 | Row(cells=[Cell(x1=0, x2=100, y1=10, y2=20)]), 15 | Row(cells=[Cell(x1=0, x2=100, y1=20, y2=30)])]) 16 | table.remove_rows(row_ids=[1]) 17 | 18 | expected = Table(rows=[Row(cells=[Cell(x1=0, x2=100, y1=0, y2=15)]), 19 | Row(cells=[Cell(x1=0, x2=100, y1=15, y2=30)])]) 20 | 21 | assert table == expected 22 | 23 | 24 | def test_remove_columns(): 25 | table = Table(rows=[Row(cells=[Cell(x1=0, x2=100, y1=0, y2=10), 26 | Cell(x1=100, x2=200, y1=0, y2=10), 27 | Cell(x1=200, x2=300, y1=0, y2=10)]), 28 | Row(cells=[Cell(x1=0, x2=100, y1=10, y2=20), 29 | Cell(x1=100, x2=200, y1=10, y2=20), 30 | Cell(x1=200, x2=300, y1=10, y2=20)]), 31 | ]) 32 | 33 | table.remove_columns(col_ids=[1]) 34 | 35 | expected = Table(rows=[Row(cells=[Cell(x1=0, x2=150, y1=0, y2=10), 36 | Cell(x1=150, x2=300, y1=0, y2=10)]), 37 | Row(cells=[Cell(x1=0, x2=150, y1=10, y2=20), 38 | Cell(x1=150, x2=300, y1=10, y2=20)]) 39 | ]) 40 | 41 | assert table == expected 42 | 43 | 44 | def test_table(): 45 | with open("test_data/tables.json", "r") as f: 46 | tables = [Table(rows=[Row(cells=[Cell(**el) for el in row]) for row in tb]) 47 | for tb in json.load(f)] 48 | 49 | assert tables[0].nb_columns == 3 50 | assert tables[0].nb_rows == 6 51 | assert tables[0].bbox() == (35, 20, 770, 326) 52 | 53 | assert tables[1].nb_columns == 2 54 | assert tables[1].nb_rows == 2 55 | 
assert tables[1].bbox() == (961, 21, 1154, 123) 56 | 57 | 58 | def test_get_table_content(): 59 | with open("test_data/tables.json", "r") as f: 60 | tables = [Table(rows=[Row(cells=[Cell(**el) for el in row]) for row in tb]) 61 | for tb in json.load(f)] 62 | 63 | # Load OCR 64 | ocr_df = OCRDataframe(pl.read_csv("test_data/ocr.csv", separator=";", encoding="utf-8")) 65 | 66 | result = [table.get_content(ocr_df=ocr_df, min_confidence=50) for table in tables] 67 | 68 | with open("test_data/expected_tables.json", "r") as f: 69 | expected = [Table(rows=[Row(cells=[Cell(**el) for el in row]) for row in tb]) 70 | for tb in json.load(f)] 71 | 72 | assert result == expected 73 | -------------------------------------------------------------------------------- /tests/tables/processing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/processing/__init__.py -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/processing/bordered_tables/__init__.py -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/cells/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/processing/bordered_tables/cells/__init__.py -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/cells/test_cells.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import json 
3 | 4 | import polars as pl 5 | 6 | from img2table.tables.objects.cell import Cell 7 | from img2table.tables.objects.line import Line 8 | from img2table.tables.processing.bordered_tables.cells import get_cells 9 | 10 | 11 | def test_get_cells(): 12 | with open("test_data/lines.json", 'r') as f: 13 | data = json.load(f) 14 | h_lines = [Line(**el) for el in data.get('h_lines')] 15 | v_lines = [Line(**el) for el in data.get('v_lines')] 16 | 17 | result = get_cells(horizontal_lines=h_lines, 18 | vertical_lines=v_lines) 19 | 20 | df_expected = pl.read_csv("test_data/expected.csv", separator=";", encoding="utf-8") 21 | expected = [Cell(x1=row["x1"], x2=row["x2"], y1=row["y1"], y2=row["y2"]) 22 | for row in df_expected.to_dicts()] 23 | 24 | assert sorted(result, key=lambda c: (c.x1, c.y1, c.x2, c.y2)) == sorted(expected, key=lambda c: (c.x1, c.y1, c.x2, c.y2)) 25 | -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/cells/test_data/expected.csv: -------------------------------------------------------------------------------- 1 | x1;y1;x2;y2 2 | 1058;21;1154;71 3 | 961;21;1058;71 4 | 1058;71;1154;123 5 | 961;71;1058;123 6 | 276;275;494;326 7 | 276;71;494;123 8 | 35;123;276;173 9 | 35;225;276;275 10 | 35;275;276;326 11 | 35;71;276;123 12 | 35;173;276;225 13 | 494;174;770;224 14 | 494;123;770;174 15 | 494;224;770;275 16 | 494;275;770;326 17 | 494;71;770;123 18 | 276;123;494;275 19 | 35;20;770;71 20 | -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/cells/test_data/expected_ident_cells.csv: -------------------------------------------------------------------------------- 1 | index;x1;y1;x2;y2 2 | 0;35;20;770;71 3 | 1;35;20;770;275 4 | 2;35;20;770;326 5 | 3;35;71;276;123 6 | 4;35;123;276;173 7 | 5;35;123;276;275 8 | 6;35;173;276;225 9 | 7;35;225;276;275 10 | 8;35;275;276;326 11 | 9;276;71;494;123 12 | 10;276;123;494;275 13 | 
11;276;275;494;326 14 | 12;494;71;770;123 15 | 13;494;123;770;174 16 | 14;494;123;770;275 17 | 15;494;174;770;224 18 | 16;494;224;770;275 19 | 17;494;275;770;326 20 | 18;961;21;1058;71 21 | 19;961;71;1058;123 22 | 20;1058;21;1154;71 23 | 21;1058;71;1154;123 24 | -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/cells/test_data/expected_potential_cells.csv: -------------------------------------------------------------------------------- 1 | idx;x1_bbox;x2_bbox;y1_bbox;y2_bbox 2 | 7;35;772;20;326 3 | 18;36;277;123;173 4 | 24;36;277;173;225 5 | 30;36;277;225;275 6 | 0;36;771;20;71 7 | 10;36;771;71;123 8 | 22;36;771;123;275 9 | 6;36;772;20;275 10 | 34;36;772;275;326 11 | 19;495;771;123;174 12 | 27;495;771;174;224 13 | 32;495;771;224;275 14 | 8;962;1156;21;71 15 | 17;962;1156;71;123 16 | -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/cells/test_data/expected_vertical_dedup.csv: -------------------------------------------------------------------------------- 1 | index;x1;y1;x2;y2 2 | 4;35;71;276;123 3 | 9;35;123;276;173 4 | 13;35;173;276;225 5 | 16;35;225;276;275 6 | 18;35;275;276;326 7 | 1;35;20;770;71 8 | 20;276;71;494;123 9 | 23;276;123;494;275 10 | 25;276;275;494;326 11 | 27;494;71;770;123 12 | 32;494;123;770;174 13 | 36;494;174;770;224 14 | 39;494;224;770;275 15 | 41;494;275;770;326 16 | 42;961;21;1058;71 17 | 44;961;71;1058;123 18 | 45;1058;21;1154;71 19 | 47;1058;71;1154;123 20 | -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/cells/test_data/lines.json: -------------------------------------------------------------------------------- 1 | {"h_lines": [{"x1": 35, "x2": 772, "y1": 20, "y2": 20}, {"x1": 961, "x2": 1156, "y1": 21, "y2": 21}, {"x1": 36, "x2": 771, "y1": 71, "y2": 71}, {"x1": 962, "x2": 1156, "y1": 71, "y2": 71}, {"x1": 36, "x2": 771, 
"y1": 123, "y2": 123}, {"x1": 962, "x2": 1156, "y1": 123, "y2": 123}, {"x1": 36, "x2": 277, "y1": 173, "y2": 173}, {"x1": 495, "x2": 771, "y1": 174, "y2": 174}, {"x1": 36, "x2": 277, "y1": 225, "y2": 225}, {"x1": 495, "x2": 771, "y1": 224, "y2": 224}, {"x1": 36, "x2": 772, "y1": 275, "y2": 275}, {"x1": 35, "x2": 772, "y1": 326, "y2": 326}], "v_lines": [{"x1": 35, "x2": 35, "y1": 20, "y2": 329}, {"x1": 276, "x2": 276, "y1": 72, "y2": 328}, {"x1": 494, "x2": 494, "y1": 72, "y2": 328}, {"x1": 770, "x2": 770, "y1": 20, "y2": 329}, {"x1": 961, "x2": 961, "y1": 20, "y2": 125}, {"x1": 1058, "x2": 1058, "y1": 21, "y2": 124}, {"x1": 1154, "x2": 1154, "y1": 20, "y2": 125}]} -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/cells/test_deduplication_cells.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import polars as pl 3 | 4 | from img2table.tables.objects.cell import Cell 5 | from img2table.tables.processing.bordered_tables.cells.deduplication import deduplicate_cells 6 | 7 | 8 | def test_deduplicate_cells(): 9 | df_cells = pl.read_csv("test_data/expected_ident_cells.csv", separator=";", encoding="utf-8") 10 | cells = [Cell(x1=row["x1"], x2=row["x2"], y1=row["y1"], y2=row["y2"]) 11 | for row in df_cells.to_dicts()] 12 | 13 | result = deduplicate_cells(cells=cells) 14 | 15 | df_expected = pl.read_csv("test_data/expected.csv", separator=";", encoding="utf-8") 16 | expected = [Cell(x1=row["x1"], x2=row["x2"], y1=row["y1"], y2=row["y2"]) 17 | for row in df_expected.to_dicts()] 18 | 19 | assert sorted(result, key=lambda c: (c.x1, c.y1, c.x2, c.y2)) == sorted(expected, key=lambda c: (c.x1, c.y1, c.x2, c.y2)) 20 | -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/cells/test_identification_cells.py: -------------------------------------------------------------------------------- 
1 | # coding: utf-8 2 | import json 3 | 4 | import polars as pl 5 | 6 | from img2table.tables.objects.cell import Cell 7 | from img2table.tables.objects.line import Line 8 | from img2table.tables.processing.bordered_tables.cells.identification import get_cells_dataframe 9 | 10 | 11 | def test_get_cells_dataframe(): 12 | with open("test_data/lines.json", 'r') as f: 13 | data = json.load(f) 14 | h_lines = [Line(**el) for el in data.get('h_lines')] 15 | v_lines = [Line(**el) for el in data.get('v_lines')] 16 | 17 | result = get_cells_dataframe(horizontal_lines=h_lines, 18 | vertical_lines=v_lines) 19 | 20 | df_expected = pl.read_csv("test_data/expected_ident_cells.csv", separator=";", encoding="utf-8") 21 | expected = [Cell(x1=row["x1"], x2=row["x2"], y1=row["y1"], y2=row["y2"]) 22 | for row in df_expected.to_dicts()] 23 | 24 | assert sorted(result, key=lambda c: (c.x1, c.y1, c.x2, c.y2)) == sorted(expected, key=lambda c: (c.x1, c.y1, c.x2, c.y2)) 25 | -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/lines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/processing/bordered_tables/lines/__init__.py -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/lines/test_data/contours.json: -------------------------------------------------------------------------------- 1 | [{"x1": 603, "y1": 296, "x2": 612, "y2": 310}, {"x1": 570, "y1": 296, "x2": 591, "y2": 310}, {"x1": 501, "y1": 296, "x2": 551, "y2": 310}, {"x1": 384, "y1": 296, "x2": 393, "y2": 310}, {"x1": 351, "y1": 296, "x2": 372, "y2": 310}, {"x1": 282, "y1": 296, "x2": 332, "y2": 310}, {"x1": 145, "y1": 296, "x2": 153, "y2": 310}, {"x1": 111, "y1": 296, "x2": 132, "y2": 310}, {"x1": 42, "y1": 296, "x2": 92, "y2": 310}, {"x1": 603, 
"y1": 245, "x2": 612, "y2": 259}, {"x1": 570, "y1": 245, "x2": 591, "y2": 259}, {"x1": 501, "y1": 245, "x2": 551, "y2": 259}, {"x1": 145, "y1": 245, "x2": 153, "y2": 259}, {"x1": 111, "y1": 245, "x2": 132, "y2": 259}, {"x1": 42, "y1": 245, "x2": 92, "y2": 259}, {"x1": 437, "y1": 198, "x2": 444, "y2": 208}, {"x1": 603, "y1": 194, "x2": 612, "y2": 208}, {"x1": 570, "y1": 194, "x2": 591, "y2": 208}, {"x1": 501, "y1": 194, "x2": 551, "y2": 208}, {"x1": 145, "y1": 194, "x2": 153, "y2": 208}, {"x1": 111, "y1": 194, "x2": 132, "y2": 208}, {"x1": 42, "y1": 194, "x2": 92, "y2": 208}, {"x1": 331, "y1": 193, "x2": 425, "y2": 212}, {"x1": 603, "y1": 143, "x2": 612, "y2": 157}, {"x1": 570, "y1": 143, "x2": 591, "y2": 157}, {"x1": 501, "y1": 143, "x2": 551, "y2": 157}, {"x1": 145, "y1": 143, "x2": 153, "y2": 157}, {"x1": 111, "y1": 143, "x2": 132, "y2": 157}, {"x1": 42, "y1": 143, "x2": 92, "y2": 157}, {"x1": 1062, "y1": 92, "x2": 1112, "y2": 106}, {"x1": 966, "y1": 92, "x2": 1016, "y2": 106}, {"x1": 603, "y1": 92, "x2": 612, "y2": 106}, {"x1": 570, "y1": 92, "x2": 591, "y2": 106}, {"x1": 501, "y1": 92, "x2": 551, "y2": 106}, {"x1": 384, "y1": 92, "x2": 393, "y2": 106}, {"x1": 351, "y1": 92, "x2": 372, "y2": 106}, {"x1": 282, "y1": 92, "x2": 332, "y2": 106}, {"x1": 145, "y1": 92, "x2": 153, "y2": 106}, {"x1": 111, "y1": 92, "x2": 132, "y2": 106}, {"x1": 42, "y1": 92, "x2": 92, "y2": 106}, {"x1": 1062, "y1": 41, "x2": 1112, "y2": 55}, {"x1": 966, "y1": 41, "x2": 1016, "y2": 55}, {"x1": 385, "y1": 41, "x2": 422, "y2": 55}] -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/lines/test_data/expected.json: -------------------------------------------------------------------------------- 1 | {"h_lines": [{"x1": 36, "y1": 21, "x2": 771, "y2": 21, "thickness": 4}, {"x1": 962, "y1": 21, "x2": 1156, "y2": 21, "thickness": 4}, {"x1": 36, "y1": 72, "x2": 771, "y2": 72, "thickness": 4}, {"x1": 962, "y1": 72, "x2": 1156, 
"y2": 72, "thickness": 4}, {"x1": 36, "y1": 123, "x2": 771, "y2": 123, "thickness": 4}, {"x1": 962, "y1": 123, "x2": 1156, "y2": 123, "thickness": 4}, {"x1": 36, "y1": 174, "x2": 277, "y2": 174, "thickness": 4}, {"x1": 495, "y1": 174, "x2": 771, "y2": 174, "thickness": 4}, {"x1": 428, "y1": 192, "x2": 435, "y2": 192, "thickness": 2}, {"x1": 428, "y1": 207, "x2": 435, "y2": 207, "thickness": 2}, {"x1": 36, "y1": 225, "x2": 277, "y2": 225, "thickness": 4}, {"x1": 495, "y1": 225, "x2": 771, "y2": 225, "thickness": 4}, {"x1": 36, "y1": 276, "x2": 772, "y2": 276, "thickness": 4}, {"x1": 36, "y1": 327, "x2": 772, "y2": 327, "thickness": 4}], "v_lines": [{"x1": 36, "y1": 21, "x2": 36, "y2": 328, "thickness": 4}, {"x1": 770, "y1": 21, "x2": 770, "y2": 328, "thickness": 4}, {"x1": 962, "y1": 21, "x2": 962, "y2": 124, "thickness": 4}, {"x1": 1058, "y1": 21, "x2": 1058, "y2": 124, "thickness": 4}, {"x1": 1154, "y1": 21, "x2": 1154, "y2": 124, "thickness": 4}, {"x1": 276, "y1": 72, "x2": 276, "y2": 328, "thickness": 4}, {"x1": 495, "y1": 72, "x2": 495, "y2": 328, "thickness": 4}, {"x1": 135, "y1": 91, "x2": 135, "y2": 107, "thickness": 5}, {"x1": 375, "y1": 91, "x2": 375, "y2": 107, "thickness": 5}, {"x1": 594, "y1": 91, "x2": 594, "y2": 107, "thickness": 5}, {"x1": 135, "y1": 142, "x2": 135, "y2": 158, "thickness": 5}, {"x1": 594, "y1": 142, "x2": 594, "y2": 158, "thickness": 5}, {"x1": 135, "y1": 193, "x2": 135, "y2": 209, "thickness": 5}, {"x1": 594, "y1": 193, "x2": 594, "y2": 209, "thickness": 5}, {"x1": 329, "y1": 195, "x2": 329, "y2": 208, "thickness": 1}, {"x1": 135, "y1": 244, "x2": 135, "y2": 260, "thickness": 5}, {"x1": 594, "y1": 244, "x2": 594, "y2": 260, "thickness": 5}, {"x1": 135, "y1": 295, "x2": 135, "y2": 311, "thickness": 5}, {"x1": 375, "y1": 295, "x2": 375, "y2": 311, "thickness": 5}, {"x1": 594, "y1": 295, "x2": 594, "y2": 311, "thickness": 5}]} -------------------------------------------------------------------------------- 
/tests/tables/processing/bordered_tables/lines/test_data/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/processing/bordered_tables/lines/test_data/test.png -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/lines/test_lines.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import json 3 | 4 | import cv2 5 | 6 | from img2table.tables.objects.cell import Cell 7 | from img2table.tables.objects.line import Line 8 | from img2table.tables.processing.bordered_tables.lines import detect_lines 9 | 10 | 11 | def test_detect_lines(): 12 | img = cv2.cvtColor(cv2.imread("test_data/test.png"), cv2.COLOR_BGR2RGB)  # cv2.imread returns BGR; converted to RGB here 13 | with open("test_data/contours.json", "r") as f: 14 | contours = [Cell(**el) for el in json.load(f)]  # contour fixture: bounding boxes of text elements 15 | 16 | h_lines, v_lines = detect_lines(img=img, 17 | contours=contours, 18 | char_length=8.85, 19 | min_line_length=10)  # parameters match the values used to generate the expected fixture — TODO confirm 20 | 21 | with open("test_data/expected.json", 'r') as f: 22 | data = json.load(f) 23 | h_lines_expected = [Line(**el) for el in data.get('h_lines')] 24 | v_lines_expected = [Line(**el) for el in data.get('v_lines')] 25 | 26 | h_lines = sorted(h_lines, key=lambda l: (l.x1, l.y1, l.x2, l.y2)) 27 | v_lines = sorted(v_lines, key=lambda l: (l.x1, l.y1, l.x2, l.y2)) 28 | h_lines_expected = sorted(h_lines_expected, key=lambda l: (l.x1, l.y1, l.x2, l.y2)) 29 | v_lines_expected = sorted(v_lines_expected, key=lambda l: (l.x1, l.y1, l.x2, l.y2))  # sort both actual and expected so the comparison is order-independent 30 | 31 | assert (h_lines, v_lines) == (h_lines_expected, v_lines_expected) 32 | -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/tables/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/processing/bordered_tables/tables/__init__.py -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/tables/test_cell_clustering.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import json 3 | 4 | from img2table.tables.objects.cell import Cell 5 | from img2table.tables.processing.bordered_tables.tables.cell_clustering import cluster_cells_in_tables 6 | 7 | 8 | def test_cluster_cells_in_tables(): 9 | with open("test_data/cells.json", 'r') as f: 10 | cells = [Cell(**el) for el in json.load(f)] 11 | 12 | result = cluster_cells_in_tables(cells=cells) 13 | 14 | with open("test_data/cells_clustered.json", 'r') as f: 15 | expected = [[Cell(**el) for el in cluster] for cluster in json.load(f)] 16 | 17 | assert all([cl in result for cl in expected]) 18 | assert all([cl in expected for cl in result]) 19 | -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/tables/test_data/cell_clusters_normalized.json: -------------------------------------------------------------------------------- 1 | [[{"x1": 35, "y1": 71, "x2": 276, "y2": 123}, {"x1": 35, "y1": 123, "x2": 276, "y2": 174}, {"x1": 35, "y1": 20, "x2": 770, "y2": 71}, {"x1": 276, "y1": 71, "x2": 494, "y2": 123}, {"x1": 35, "y1": 174, "x2": 276, "y2": 224}, {"x1": 276, "y1": 123, "x2": 494, "y2": 275}, {"x1": 35, "y1": 224, "x2": 276, "y2": 275}, {"x1": 35, "y1": 275, "x2": 276, "y2": 326}, {"x1": 276, "y1": 275, "x2": 494, "y2": 326}, {"x1": 494, "y1": 71, "x2": 770, "y2": 123}, {"x1": 494, "y1": 123, "x2": 770, "y2": 174}, {"x1": 494, "y1": 174, "x2": 770, "y2": 224}, {"x1": 494, "y1": 224, "x2": 770, "y2": 275}, {"x1": 494, "y1": 275, "x2": 770, "y2": 326}], [{"x1": 961, "y1": 21, "x2": 1058, "y2": 71}, {"x1": 961, "y1": 71, 
"x2": 1058, "y2": 123}, {"x1": 1058, "y1": 21, "x2": 1154, "y2": 71}, {"x1": 1058, "y1": 71, "x2": 1154, "y2": 123}]] -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/tables/test_data/cells.json: -------------------------------------------------------------------------------- 1 | [{"x1": 35, "y1": 71, "x2": 276, "y2": 123}, {"x1": 35, "y1": 123, "x2": 276, "y2": 173}, {"x1": 35, "y1": 173, "x2": 276, "y2": 225}, {"x1": 35, "y1": 225, "x2": 276, "y2": 275}, {"x1": 35, "y1": 275, "x2": 276, "y2": 326}, {"x1": 35, "y1": 20, "x2": 770, "y2": 71}, {"x1": 276, "y1": 71, "x2": 494, "y2": 123}, {"x1": 276, "y1": 123, "x2": 494, "y2": 275}, {"x1": 276, "y1": 275, "x2": 494, "y2": 326}, {"x1": 494, "y1": 71, "x2": 770, "y2": 123}, {"x1": 494, "y1": 123, "x2": 770, "y2": 174}, {"x1": 494, "y1": 174, "x2": 770, "y2": 224}, {"x1": 494, "y1": 224, "x2": 770, "y2": 275}, {"x1": 494, "y1": 275, "x2": 770, "y2": 326}, {"x1": 961, "y1": 21, "x2": 1058, "y2": 71}, {"x1": 961, "y1": 71, "x2": 1058, "y2": 123}, {"x1": 1058, "y1": 21, "x2": 1154, "y2": 71}, {"x1": 1058, "y1": 71, "x2": 1154, "y2": 123}] -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/tables/test_data/cells_clustered.json: -------------------------------------------------------------------------------- 1 | [[{"x1": 35, "y1": 71, "x2": 276, "y2": 123}, {"x1": 35, "y1": 123, "x2": 276, "y2": 173}, {"x1": 35, "y1": 20, "x2": 770, "y2": 71}, {"x1": 276, "y1": 71, "x2": 494, "y2": 123}, {"x1": 35, "y1": 173, "x2": 276, "y2": 225}, {"x1": 276, "y1": 123, "x2": 494, "y2": 275}, {"x1": 35, "y1": 225, "x2": 276, "y2": 275}, {"x1": 35, "y1": 275, "x2": 276, "y2": 326}, {"x1": 276, "y1": 275, "x2": 494, "y2": 326}, {"x1": 494, "y1": 71, "x2": 770, "y2": 123}, {"x1": 494, "y1": 123, "x2": 770, "y2": 174}, {"x1": 494, "y1": 174, "x2": 770, "y2": 224}, {"x1": 494, "y1": 224, "x2": 770, 
"y2": 275}, {"x1": 494, "y1": 275, "x2": 770, "y2": 326}], [{"x1": 961, "y1": 21, "x2": 1058, "y2": 71}, {"x1": 961, "y1": 71, "x2": 1058, "y2": 123}, {"x1": 1058, "y1": 21, "x2": 1154, "y2": 71}, {"x1": 1058, "y1": 71, "x2": 1154, "y2": 123}]] -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/tables/test_data/contours.json: -------------------------------------------------------------------------------- 1 | [{"x1": 603, "y1": 296, "x2": 612, "y2": 310}, {"x1": 570, "y1": 296, "x2": 591, "y2": 310}, {"x1": 501, "y1": 296, "x2": 551, "y2": 310}, {"x1": 384, "y1": 296, "x2": 393, "y2": 310}, {"x1": 351, "y1": 296, "x2": 372, "y2": 310}, {"x1": 282, "y1": 296, "x2": 332, "y2": 310}, {"x1": 145, "y1": 296, "x2": 153, "y2": 310}, {"x1": 111, "y1": 296, "x2": 132, "y2": 310}, {"x1": 42, "y1": 296, "x2": 92, "y2": 310}, {"x1": 603, "y1": 245, "x2": 612, "y2": 259}, {"x1": 570, "y1": 245, "x2": 591, "y2": 259}, {"x1": 501, "y1": 245, "x2": 551, "y2": 259}, {"x1": 145, "y1": 245, "x2": 153, "y2": 259}, {"x1": 111, "y1": 245, "x2": 132, "y2": 259}, {"x1": 42, "y1": 245, "x2": 92, "y2": 259}, {"x1": 437, "y1": 198, "x2": 444, "y2": 208}, {"x1": 603, "y1": 194, "x2": 612, "y2": 208}, {"x1": 570, "y1": 194, "x2": 591, "y2": 208}, {"x1": 501, "y1": 194, "x2": 551, "y2": 208}, {"x1": 145, "y1": 194, "x2": 153, "y2": 208}, {"x1": 111, "y1": 194, "x2": 132, "y2": 208}, {"x1": 42, "y1": 194, "x2": 92, "y2": 208}, {"x1": 331, "y1": 193, "x2": 425, "y2": 212}, {"x1": 603, "y1": 143, "x2": 612, "y2": 157}, {"x1": 570, "y1": 143, "x2": 591, "y2": 157}, {"x1": 501, "y1": 143, "x2": 551, "y2": 157}, {"x1": 145, "y1": 143, "x2": 153, "y2": 157}, {"x1": 111, "y1": 143, "x2": 132, "y2": 157}, {"x1": 42, "y1": 143, "x2": 92, "y2": 157}, {"x1": 1062, "y1": 92, "x2": 1112, "y2": 106}, {"x1": 966, "y1": 92, "x2": 1016, "y2": 106}, {"x1": 603, "y1": 92, "x2": 612, "y2": 106}, {"x1": 570, "y1": 92, "x2": 591, "y2": 106}, 
{"x1": 501, "y1": 92, "x2": 551, "y2": 106}, {"x1": 384, "y1": 92, "x2": 393, "y2": 106}, {"x1": 351, "y1": 92, "x2": 372, "y2": 106}, {"x1": 282, "y1": 92, "x2": 332, "y2": 106}, {"x1": 145, "y1": 92, "x2": 153, "y2": 106}, {"x1": 111, "y1": 92, "x2": 132, "y2": 106}, {"x1": 42, "y1": 92, "x2": 92, "y2": 106}, {"x1": 1062, "y1": 41, "x2": 1112, "y2": 55}, {"x1": 966, "y1": 41, "x2": 1016, "y2": 55}, {"x1": 385, "y1": 41, "x2": 422, "y2": 55}] -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/tables/test_data/expected.json: -------------------------------------------------------------------------------- 1 | [[[{"x1": 35, "y1": 20, "x2": 770, "y2": 71}, {"x1": 35, "y1": 20, "x2": 770, "y2": 71}, {"x1": 35, "y1": 20, "x2": 770, "y2": 71}], [{"x1": 35, "y1": 71, "x2": 276, "y2": 123}, {"x1": 276, "y1": 71, "x2": 494, "y2": 123}, {"x1": 494, "y1": 71, "x2": 770, "y2": 123}], [{"x1": 35, "y1": 123, "x2": 276, "y2": 174}, {"x1": 276, "y1": 123, "x2": 494, "y2": 275}, {"x1": 494, "y1": 123, "x2": 770, "y2": 174}], [{"x1": 35, "y1": 174, "x2": 276, "y2": 224}, {"x1": 276, "y1": 123, "x2": 494, "y2": 275}, {"x1": 494, "y1": 174, "x2": 770, "y2": 224}], [{"x1": 35, "y1": 224, "x2": 276, "y2": 275}, {"x1": 276, "y1": 123, "x2": 494, "y2": 275}, {"x1": 494, "y1": 224, "x2": 770, "y2": 275}], [{"x1": 35, "y1": 275, "x2": 276, "y2": 326}, {"x1": 276, "y1": 275, "x2": 494, "y2": 326}, {"x1": 494, "y1": 275, "x2": 770, "y2": 326}]], [[{"x1": 961, "y1": 20, "x2": 1058, "y2": 71}, {"x1": 1058, "y1": 20, "x2": 1154, "y2": 71}], [{"x1": 961, "y1": 71, "x2": 1058, "y2": 123}, {"x1": 1058, "y1": 71, "x2": 1154, "y2": 123}]]] -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/tables/test_data/lines.json: -------------------------------------------------------------------------------- 1 | {"h_lines": [{"x1": 492, "y1": 173, "x2": 775, "y2": 173, 
"thickness": 1}, {"x1": 492, "y1": 225, "x2": 775, "y2": 225, "thickness": 1}, {"x1": 333, "y1": 308, "x2": 350, "y2": 308, "thickness": 1}, {"x1": 373, "y1": 308, "x2": 383, "y2": 308, "thickness": 1}, {"x1": 959, "y1": 20, "x2": 1160, "y2": 20, "thickness": 1}, {"x1": 959, "y1": 71, "x2": 1160, "y2": 71, "thickness": 1}, {"x1": 33, "y1": 224, "x2": 281, "y2": 224, "thickness": 1}, {"x1": 33, "y1": 20, "x2": 775, "y2": 20, "thickness": 1}, {"x1": 33, "y1": 122, "x2": 775, "y2": 122, "thickness": 1}, {"x1": 33, "y1": 72, "x2": 775, "y2": 72, "thickness": 1}, {"x1": 959, "y1": 122, "x2": 1160, "y2": 122, "thickness": 1}, {"x1": 33, "y1": 174, "x2": 281, "y2": 174, "thickness": 1}, {"x1": 33, "y1": 276, "x2": 776, "y2": 276, "thickness": 1}, {"x1": 33, "y1": 326, "x2": 776, "y2": 326, "thickness": 1}, {"x1": 93, "y1": 308, "x2": 110, "y2": 308, "thickness": 1}, {"x1": 552, "y1": 308, "x2": 569, "y2": 308, "thickness": 1}, {"x1": 592, "y1": 308, "x2": 602, "y2": 308, "thickness": 1}], "v_lines": [{"x1": 1154, "y1": 21, "x2": 1154, "y2": 124, "thickness": 1}, {"x1": 276, "y1": 69, "x2": 276, "y2": 332, "thickness": 1}, {"x1": 1058, "y1": 21, "x2": 1058, "y2": 124, "thickness": 1}, {"x1": 36, "y1": 18, "x2": 36, "y2": 332, "thickness": 1}, {"x1": 494, "y1": 69, "x2": 494, "y2": 332, "thickness": 1}, {"x1": 770, "y1": 18, "x2": 770, "y2": 332, "thickness": 1}, {"x1": 962, "y1": 21, "x2": 962, "y2": 124, "thickness": 1}]} -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/tables/test_data/table_implicit.json: -------------------------------------------------------------------------------- 1 | [[{"x1": 224, "y1": 644, "x2": 1001, "y2": 746}, {"x1": 1001, "y1": 644, "x2": 1106, "y2": 746}, {"x1": 1106, "y1": 644, "x2": 1501, "y2": 746}], [{"x1": 224, "y1": 746, "x2": 1001, "y2": 848}, {"x1": 1001, "y1": 746, "x2": 1106, "y2": 848}, {"x1": 1106, "y1": 746, "x2": 1501, "y2": 848}], [{"x1": 224, "y1": 848, 
"x2": 1001, "y2": 950}, {"x1": 1001, "y1": 848, "x2": 1106, "y2": 950}, {"x1": 1106, "y1": 848, "x2": 1501, "y2": 950}], [{"x1": 224, "y1": 950, "x2": 1001, "y2": 1051}, {"x1": 1001, "y1": 950, "x2": 1106, "y2": 1051}, {"x1": 1106, "y1": 950, "x2": 1501, "y2": 1051}], [{"x1": 224, "y1": 1051, "x2": 1001, "y2": 1153}, {"x1": 1001, "y1": 1051, "x2": 1106, "y2": 1153}, {"x1": 1106, "y1": 1051, "x2": 1501, "y2": 1153}], [{"x1": 224, "y1": 1153, "x2": 1001, "y2": 1255}, {"x1": 1001, "y1": 1153, "x2": 1106, "y2": 1255}, {"x1": 1106, "y1": 1153, "x2": 1501, "y2": 1255}], [{"x1": 224, "y1": 1255, "x2": 1001, "y2": 1356}, {"x1": 1001, "y1": 1255, "x2": 1106, "y2": 1356}, {"x1": 1106, "y1": 1255, "x2": 1501, "y2": 1356}], [{"x1": 224, "y1": 1356, "x2": 1001, "y2": 1458}, {"x1": 1001, "y1": 1356, "x2": 1106, "y2": 1458}, {"x1": 1106, "y1": 1356, "x2": 1501, "y2": 1458}], [{"x1": 224, "y1": 1458, "x2": 1001, "y2": 1560}, {"x1": 1001, "y1": 1458, "x2": 1106, "y2": 1560}, {"x1": 1106, "y1": 1458, "x2": 1501, "y2": 1560}], [{"x1": 224, "y1": 1560, "x2": 1001, "y2": 1661}, {"x1": 1001, "y1": 1560, "x2": 1106, "y2": 1661}, {"x1": 1106, "y1": 1560, "x2": 1501, "y2": 1661}]] -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/tables/test_data/tables_from_cells.json: -------------------------------------------------------------------------------- 1 | [[[{"x1": 35, "y1": 20, "x2": 770, "y2": 71}, {"x1": 35, "y1": 20, "x2": 770, "y2": 71}, {"x1": 35, "y1": 20, "x2": 770, "y2": 71}], [{"x1": 35, "y1": 71, "x2": 276, "y2": 123}, {"x1": 276, "y1": 71, "x2": 494, "y2": 123}, {"x1": 494, "y1": 71, "x2": 770, "y2": 123}], [{"x1": 35, "y1": 123, "x2": 276, "y2": 174}, {"x1": 276, "y1": 123, "x2": 494, "y2": 275}, {"x1": 494, "y1": 123, "x2": 770, "y2": 174}], [{"x1": 35, "y1": 174, "x2": 276, "y2": 224}, {"x1": 276, "y1": 123, "x2": 494, "y2": 275}, {"x1": 494, "y1": 174, "x2": 770, "y2": 224}], [{"x1": 35, "y1": 224, "x2": 
276, "y2": 275}, {"x1": 276, "y1": 123, "x2": 494, "y2": 275}, {"x1": 494, "y1": 224, "x2": 770, "y2": 275}], [{"x1": 35, "y1": 275, "x2": 276, "y2": 326}, {"x1": 276, "y1": 275, "x2": 494, "y2": 326}, {"x1": 494, "y1": 275, "x2": 770, "y2": 326}]], [[{"x1": 961, "y1": 21, "x2": 1058, "y2": 71}, {"x1": 1058, "y1": 21, "x2": 1154, "y2": 71}], [{"x1": 961, "y1": 71, "x2": 1058, "y2": 123}, {"x1": 1058, "y1": 71, "x2": 1154, "y2": 123}]]] -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/tables/test_data/word_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/processing/bordered_tables/tables/test_data/word_image.png -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/tables/test_implicit.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import json 3 | 4 | from img2table.tables.objects.cell import Cell 5 | from img2table.tables.objects.row import Row 6 | from img2table.tables.objects.table import Table 7 | from img2table.tables.processing.bordered_tables.tables.implicit import implicit_content, implicit_rows_lines, \ 8 | implicit_columns_lines 9 | from img2table.tables.processing.borderless_tables.model import ImageSegment 10 | 11 | 12 | def test_implicit_rows_lines(): 13 | with open("test_data/table_implicit.json", 'r') as f: 14 | table = Table(rows=[Row(cells=[Cell(**el) for el in row]) for row in json.load(f)]) 15 | 16 | with open("test_data/contours_implicit.json", "r") as f: 17 | contours = [Cell(**el) for el in json.load(f)] 18 | 19 | segment = ImageSegment(x1=table.x1, y1=table.y1, x2=table.x2, y2=table.y2, 20 | elements=contours) 21 | 22 | result = implicit_rows_lines(table=table, 23 | segment=segment) 24 | 25 | # 
Check that all created lines have right width 26 | assert all([line.width == table.width for line in result]) 27 | 28 | # Check positions 29 | assert sorted([line.y1 for line in result]) == [682, 716, 784, 817, 884, 919, 986, 1020, 30 | 1089, 1121, 1189, 1223, 1292, 1325, 1394, 31 | 1427, 1494, 1529, 1597, 1630] 32 | 33 | 34 | def test_implicit_columns_lines(): 35 | with open("test_data/table_implicit.json", 'r') as f: 36 | table = Table(rows=[Row(cells=[Cell(**el) for el in row]) for row in json.load(f)]) 37 | 38 | with open("test_data/contours_implicit.json", "r") as f: 39 | contours = [Cell(**el) for el in json.load(f)] 40 | 41 | segment = ImageSegment(x1=table.x1, y1=table.y1, x2=table.x2, y2=table.y2, 42 | elements=contours) 43 | 44 | result = implicit_columns_lines(table=table, 45 | segment=segment, 46 | char_length=11) 47 | 48 | # Check that all created lines have right height 49 | assert all([line.height == table.height for line in result]) 50 | 51 | # Check positions 52 | assert sorted([line.x1 for line in result]) == [395, 605, 725, 809, 886, 1212, 1285, 1396] 53 | 54 | 55 | def test_implicit_content(): 56 | with open("test_data/table_implicit.json", 'r') as f: 57 | table = Table(rows=[Row(cells=[Cell(**el) for el in row]) for row in json.load(f)]) 58 | 59 | with open("test_data/contours_implicit.json", "r") as f: 60 | contours = [Cell(**el) for el in json.load(f)] 61 | 62 | result = implicit_content(table=table, 63 | contours=contours, 64 | char_length=11, 65 | implicit_rows=True, 66 | implicit_columns=True) 67 | 68 | # Check that 20 more rows have been created 69 | assert result.nb_rows == table.nb_rows + 20 70 | 71 | # Check that 8 more columns have been created 72 | assert result.nb_columns == table.nb_columns + 8 73 | -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/tables/test_table_creation.py: -------------------------------------------------------------------------------- 1 
| # coding: utf-8 2 | import json 3 | 4 | from img2table.tables.objects.cell import Cell 5 | from img2table.tables.objects.row import Row 6 | from img2table.tables.objects.table import Table 7 | from img2table.tables.processing.bordered_tables.tables.table_creation import normalize_table_cells, cluster_to_table, \ 8 | remove_unwanted_elements 9 | 10 | 11 | def test_normalize_table_cells(): 12 | with open("test_data/cells_clustered.json", 'r') as f: 13 | cell_clusters = [[Cell(**el) for el in cluster] for cluster in json.load(f)] 14 | 15 | result = [normalize_table_cells(cluster_cells=cell_cluster) for cell_cluster in cell_clusters]  # normalize each raw cluster independently 16 | 17 | with open("test_data/cell_clusters_normalized.json", "r") as f: 18 | expected = [[Cell(**el) for el in cluster] for cluster in json.load(f)] 19 | 20 | assert result == expected 21 | 22 | 23 | def test_remove_unwanted_elements(): 24 | table = Table(rows=[Row(cells=[Cell(x1=0, y1=0, x2=20, y2=20), 25 | Cell(x1=20, y1=0, x2=40, y2=20), 26 | Cell(x1=40, y1=0, x2=60, y2=20)]), 27 | Row(cells=[Cell(x1=0, y1=20, x2=20, y2=40), 28 | Cell(x1=20, y1=20, x2=40, y2=40), 29 | Cell(x1=40, y1=20, x2=60, y2=40)]), 30 | Row(cells=[Cell(x1=0, y1=40, x2=20, y2=60), 31 | Cell(x1=20, y1=40, x2=40, y2=60), 32 | Cell(x1=40, y1=40, x2=60, y2=60)]) 33 | ] 34 | )  # 3x3 grid of 20px cells 35 | elements = [Cell(x1=25, y1=5, x2=35, y2=15), Cell(x1=45, y1=5, x2=55, y2=15), 36 | Cell(x1=25, y1=25, x2=35, y2=35), Cell(x1=45, y1=25, x2=55, y2=35)]  # elements overlap only the 2nd/3rd columns and the first two rows 37 | 38 | result = remove_unwanted_elements(table=table, elements=elements)  # expected to drop the empty first column and last row 39 | 40 | expected = Table(rows=[Row(cells=[Cell(x1=20, y1=0, x2=40, y2=20), 41 | Cell(x1=40, y1=0, x2=60, y2=20)]), 42 | Row(cells=[Cell(x1=20, y1=20, x2=40, y2=40), 43 | Cell(x1=40, y1=20, x2=60, y2=40)]), 44 | ] 45 | ) 46 | 47 | assert result == expected 48 | 49 | 50 | def test_cluster_to_table(): 51 | with open("test_data/cell_clusters_normalized.json", "r") as f: 52 | cell_clusters = [[Cell(**el) for el in cluster] for cluster in json.load(f)] 53 | with
open("test_data/contours.json", "r") as f: 54 | contours = [Cell(**el) for el in json.load(f)] 55 | 56 | result = [cluster_to_table(cluster, contours) for cluster in cell_clusters]  # build one Table per normalized cell cluster 57 | 58 | with open("test_data/tables_from_cells.json", "r") as f: 59 | expected = [Table(rows=[Row(cells=[Cell(**el) for el in row]) for row in tb]) 60 | for tb in json.load(f)] 61 | 62 | assert result == expected 63 | -------------------------------------------------------------------------------- /tests/tables/processing/bordered_tables/tables/test_tables.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import json 3 | 4 | from img2table.tables.objects.cell import Cell 5 | from img2table.tables.objects.line import Line 6 | from img2table.tables.objects.row import Row 7 | from img2table.tables.objects.table import Table 8 | from img2table.tables.processing.bordered_tables.tables import get_tables 9 | 10 | 11 | def test_get_tables(): 12 | with open("test_data/cells.json", 'r') as f: 13 | cells = [Cell(**el) for el in json.load(f)] 14 | with open("test_data/contours.json", "r") as f: 15 | contours = [Cell(**el) for el in json.load(f)] 16 | with open("test_data/lines.json", 'r') as f: 17 | data = json.load(f) 18 | lines = [Line(**el) for el in data.get('h_lines') + data.get('v_lines')]  # horizontal and vertical lines passed together 19 | 20 | result = get_tables(cells=cells, elements=contours, lines=lines, char_length=8.44)  # end-to-end bordered table creation on fixtures 21 | 22 | with open("test_data/expected.json", "r") as f: 23 | expected = [Table(rows=[Row(cells=[Cell(**el) for el in row]) for row in tb]) 24 | for tb in json.load(f)] 25 | 26 | assert result == expected 27 | 28 | -------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/__init__.py: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/processing/borderless_tables/__init__.py -------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/borderless_tables/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/processing/borderless_tables/borderless_tables/__init__.py -------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/borderless_tables/test_borderless_tables.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import json 3 | 4 | import cv2 5 | 6 | from img2table.tables import threshold_dark_areas 7 | from img2table.tables.objects.cell import Cell 8 | from img2table.tables.objects.line import Line 9 | from img2table.tables.processing.borderless_tables import identify_borderless_tables 10 | 11 | 12 | def test_identify_borderless_tables(): 13 | img = cv2.cvtColor(cv2.imread("test_data/test.png"), cv2.COLOR_BGR2RGB)  # cv2.imread returns BGR; converted to RGB here 14 | thresh = threshold_dark_areas(img=img, char_length=11)  # thresholded image consumed by the detection step below 15 | 16 | with open("test_data/lines.json", 'r') as f: 17 | data = json.load(f) 18 | lines = [Line(**el) for el in data.get('h_lines') + data.get('v_lines')] 19 | 20 | with open("test_data/contours.json", 'r') as f: 21 | contours = [Cell(**el) for el in json.load(f)] 22 | 23 | result = identify_borderless_tables(thresh=thresh, 24 | char_length=7.0, 25 | median_line_sep=66, 26 | lines=lines, 27 | contours=contours, 28 | existing_tables=[])  # no previously identified tables to exclude 29 | 30 | assert len(result) == 1 31 | assert result[0].nb_rows == 16 32 | assert result[0].nb_columns == 7 33 | assert (result[0].x1, result[0].y1, result[0].x2, result[0].y2) == (135, 52, 1155, 1054)  # single expected table with known geometry 34 |
-------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/borderless_tables/test_data/lines.json: -------------------------------------------------------------------------------- 1 | {"h_lines": [{"x1": 98, "y1": 1085, "x2": 1227, "y2": 1085, "thickness": 1}, {"x1": 146, "y1": 1109, "x2": 224, "y2": 1109, "thickness": 1}, {"x1": 911, "y1": 1110, "x2": 1228, "y2": 1110, "thickness": 1}, {"x1": 143, "y1": 1144, "x2": 227, "y2": 1144, "thickness": 1}, {"x1": 908, "y1": 1144, "x2": 1231, "y2": 1144, "thickness": 1}], "v_lines": []} -------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/borderless_tables/test_data/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/processing/borderless_tables/borderless_tables/test_data/test.png -------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/borderless_tables/test_whitespaces.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import json 4 | 5 | from img2table.tables.objects.cell import Cell 6 | from img2table.tables.processing.borderless_tables.model import ImageSegment 7 | from img2table.tables.processing.borderless_tables.whitespaces import get_whitespaces, adjacent_whitespaces, \ 8 | identify_coherent_v_whitespaces, get_relevant_vertical_whitespaces 9 | 10 | 11 | def test_get_whitespaces(): 12 | with open("test_data/image_segment.json", "r") as f: 13 | data = json.load(f) 14 | img_segment = ImageSegment(x1=data.get('x1'), 15 | y1=data.get('y1'), 16 | x2=data.get('x2'), 17 | y2=data.get('y2'), 18 | elements=[Cell(**c) for c in data.get('elements')]) 19 | 20 | result = get_whitespaces(segment=img_segment, vertical=True) 21 
| 22 | assert len(result) == 38 23 | 24 | 25 | def test_adjacent_whitespaces(): 26 | c_1 = Cell(x1=0, x2=10, y1=0, y2=10) 27 | c_2 = Cell(x1=10, x2=20, y1=0, y2=10) 28 | c_3 = Cell(x1=10, x2=20, y1=0, y2=20) 29 | c_4 = Cell(x1=20, x2=30, y1=0, y2=10) 30 | 31 | assert adjacent_whitespaces(c_1, c_2) 32 | assert adjacent_whitespaces(c_1, c_3) 33 | assert not adjacent_whitespaces(c_1, c_4) 34 | 35 | 36 | def test_identify_coherent_v_whitespaces(): 37 | v_whitespaces = [Cell(x1=0, x2=10, y1=0, y2=10), 38 | Cell(x1=10, x2=20, y1=0, y2=20), 39 | Cell(x1=20, x2=30, y1=0, y2=10), 40 | Cell(x1=50, x2=60, y1=0, y2=20), 41 | Cell(x1=60, x2=70, y1=0, y2=18), 42 | Cell(x1=70, x2=80, y1=0, y2=10), 43 | Cell(x1=80, x2=90, y1=0, y2=20), 44 | Cell(x1=100, x2=110, y1=0, y2=10)] 45 | 46 | result = identify_coherent_v_whitespaces(v_whitespaces=v_whitespaces) 47 | 48 | expected = [Cell(x1=10, x2=20, y1=0, y2=20), 49 | Cell(x1=50, x2=60, y1=0, y2=20), 50 | Cell(x1=80, x2=90, y1=0, y2=20), 51 | Cell(x1=100, x2=110, y1=0, y2=10)] 52 | 53 | assert set(result) == set(expected) 54 | 55 | 56 | def test_get_relevant_vertical_whitespaces(): 57 | with open("test_data/image_segment.json", "r") as f: 58 | data = json.load(f) 59 | img_segment = ImageSegment(x1=data.get('x1'), 60 | y1=data.get('y1'), 61 | x2=data.get('x2'), 62 | y2=data.get('y2'), 63 | elements=[Cell(**c) for c in data.get('elements')]) 64 | 65 | result = get_relevant_vertical_whitespaces(segment=img_segment, 66 | char_length=7.0, 67 | median_line_sep=14) 68 | 69 | assert len(result) == 12 70 | -------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/columns/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/processing/borderless_tables/columns/__init__.py 
-------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/columns/test_columns.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import json 4 | 5 | from img2table.tables.objects.cell import Cell 6 | from img2table.tables.processing.borderless_tables.columns import get_columns_delimiters, identify_columns 7 | from img2table.tables.processing.borderless_tables.model import ImageSegment, TableSegment, ColumnGroup, Whitespace, \ 8 | Column, VerticalWS 9 | 10 | 11 | def test_get_columns_delimiters(): 12 | with open("test_data/table_segment.json", "r") as f: 13 | data = json.load(f) 14 | 15 | table_segment = TableSegment(table_areas=[ 16 | ImageSegment(x1=tb.get('x1'), y1=tb.get('y1'), x2=tb.get('x2'), y2=tb.get('y2'), 17 | elements=[Cell(**el) for el in tb.get('elements')], 18 | whitespaces=[Whitespace(cells=[Cell(**el)]) for el in tb.get('whitespaces')], 19 | position=tb.get('position')) 20 | for tb in data.get("table_areas") 21 | ]) 22 | 23 | result = get_columns_delimiters(table_segment=table_segment, 24 | char_length=14) 25 | 26 | assert result == [Column(whitespaces=[VerticalWS(ws=Whitespace(cells=[Cell(x1=7, y1=0, x2=21, y2=544)])), 27 | VerticalWS(ws=Whitespace(cells=[Cell(x1=7, y1=496, x2=21, y2=660)]))]), 28 | Column(whitespaces=[VerticalWS(ws=Whitespace(cells=[Cell(x1=270, y1=69, x2=372, y2=544)])), 29 | VerticalWS(ws=Whitespace(cells=[Cell(x1=270, y1=496, x2=372, y2=626)]))]), 30 | Column(whitespaces=[VerticalWS(ws=Whitespace(cells=[Cell(x1=1659, y1=69, x2=1758, y2=544)])), 31 | VerticalWS(ws=Whitespace(cells=[Cell(x1=1659, y1=496, x2=1758, y2=626)]))]), 32 | Column(whitespaces=[VerticalWS(ws=Whitespace(cells=[Cell(x1=1845, y1=0, x2=1859, y2=544)])), 33 | VerticalWS(ws=Whitespace(cells=[Cell(x1=1845, y1=496, x2=1859, y2=660)]))])] 34 | 35 | 36 | def test_identify_columns(): 37 | with open("test_data/table_segment.json", "r") as f: 
38 | data = json.load(f) 39 | 40 | table_segment = TableSegment(table_areas=[ 41 | ImageSegment(x1=tb.get('x1'), y1=tb.get('y1'), x2=tb.get('x2'), y2=tb.get('y2'), 42 | elements=[Cell(**el) for el in tb.get('elements')], 43 | whitespaces=[Whitespace(cells=[Cell(**el)]) for el in tb.get('whitespaces')], 44 | position=tb.get('position')) 45 | for tb in data.get("table_areas") 46 | ]) 47 | 48 | result = identify_columns(table_segment=table_segment, 49 | char_length=14, 50 | median_line_sep=16) 51 | 52 | with open("test_data/delimiter_group.json", "r") as f: 53 | data = json.load(f) 54 | expected = ColumnGroup(columns=[Column(whitespaces=[VerticalWS(ws=Whitespace(cells=[Cell(**d)])) for d in col]) 55 | for col in data.get('columns')], 56 | elements=[Cell(**el) for el in data.get('elements')], 57 | char_length=14) 58 | 59 | assert result == expected 60 | -------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/columns/test_data/delimiter_group.json: -------------------------------------------------------------------------------- 1 | {"columns": [[{"x1": 14, "y1": 0, "x2": 14, "y2": 544}, {"x1": 14, "y1": 496, "x2": 14, "y2": 660}], [{"x1": 270, "y1": 69, "x2": 372, "y2": 544}, {"x1": 270, "y1": 496, "x2": 372, "y2": 626}], [{"x1": 1659, "y1": 69, "x2": 1758, "y2": 544}, {"x1": 1659, "y1": 496, "x2": 1758, "y2": 626}], [{"x1": 1852, "y1": 0, "x2": 1852, "y2": 544}, {"x1": 1852, "y1": 496, "x2": 1852, "y2": 660}]], "elements": [{"x1": 21, "y1": 458, "x2": 202, "y2": 496}, {"x1": 1760, "y1": 437, "x2": 1845, "y2": 474}, {"x1": 372, "y1": 413, "x2": 1659, "y2": 496}, {"x1": 23, "y1": 412, "x2": 240, "y2": 452}, {"x1": 1760, "y1": 329, "x2": 1844, "y2": 366}, {"x1": 372, "y1": 326, "x2": 1585, "y2": 373}, {"x1": 23, "y1": 326, "x2": 216, "y2": 373}, {"x1": 375, "y1": 242, "x2": 500, "y2": 286}, {"x1": 23, "y1": 241, "x2": 243, "y2": 287}, {"x1": 1760, "y1": 221, "x2": 1845, "y2": 258}, {"x1": 373, "y1": 196, 
"x2": 1648, "y2": 243}, {"x1": 23, "y1": 196, "x2": 240, "y2": 236}, {"x1": 1758, "y1": 113, "x2": 1843, "y2": 150}, {"x1": 374, "y1": 111, "x2": 563, "y2": 156}, {"x1": 22, "y1": 111, "x2": 270, "y2": 150}, {"x1": 21, "y1": 0, "x2": 1792, "y2": 69}, {"x1": 23, "y1": 587, "x2": 230, "y2": 626}, {"x1": 1760, "y1": 567, "x2": 1845, "y2": 604}, {"x1": 372, "y1": 564, "x2": 1468, "y2": 611}, {"x1": 23, "y1": 544, "x2": 236, "y2": 588}]} -------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/columns/test_data/table_segment.json: -------------------------------------------------------------------------------- 1 | {"elements": [{"x1": 21, "y1": 458, "x2": 202, "y2": 496}, {"x1": 1760, "y1": 437, "x2": 1845, "y2": 474}, {"x1": 372, "y1": 413, "x2": 1659, "y2": 496}, {"x1": 23, "y1": 412, "x2": 240, "y2": 452}, {"x1": 1760, "y1": 329, "x2": 1844, "y2": 366}, {"x1": 372, "y1": 326, "x2": 1585, "y2": 373}, {"x1": 23, "y1": 326, "x2": 216, "y2": 373}, {"x1": 375, "y1": 242, "x2": 500, "y2": 286}, {"x1": 23, "y1": 241, "x2": 243, "y2": 287}, {"x1": 1760, "y1": 221, "x2": 1845, "y2": 258}, {"x1": 373, "y1": 196, "x2": 1648, "y2": 243}, {"x1": 23, "y1": 196, "x2": 240, "y2": 236}, {"x1": 1758, "y1": 113, "x2": 1843, "y2": 150}, {"x1": 374, "y1": 111, "x2": 563, "y2": 156}, {"x1": 22, "y1": 111, "x2": 270, "y2": 150}, {"x1": 21, "y1": 0, "x2": 1792, "y2": 69}, {"x1": 23, "y1": 587, "x2": 230, "y2": 626}, {"x1": 1760, "y1": 567, "x2": 1845, "y2": 604}, {"x1": 372, "y1": 564, "x2": 1468, "y2": 611}, {"x1": 23, "y1": 544, "x2": 236, "y2": 588}], "whitespaces": [{"x1": 7, "y1": 0, "x2": 21, "y2": 496}, {"x1": 270, "y1": 69, "x2": 372, "y2": 496}, {"x1": 1659, "y1": 69, "x2": 1758, "y2": 496}, {"x1": 1845, "y1": 0, "x2": 1859, "y2": 496}, {"x1": 7, "y1": 544, "x2": 23, "y2": 660}, {"x1": 236, "y1": 544, "x2": 372, "y2": 626}, {"x1": 1468, "y1": 544, "x2": 1760, "y2": 626}, {"x1": 1845, "y1": 544, "x2": 1859, "y2": 
660}], "table_areas": [{"x1": 7, "y1": 0, "x2": 1859, "y2": 496, "elements": [{"x1": 21, "y1": 458, "x2": 202, "y2": 496}, {"x1": 1760, "y1": 437, "x2": 1845, "y2": 474}, {"x1": 372, "y1": 413, "x2": 1659, "y2": 496}, {"x1": 23, "y1": 412, "x2": 240, "y2": 452}, {"x1": 1760, "y1": 329, "x2": 1844, "y2": 366}, {"x1": 372, "y1": 326, "x2": 1585, "y2": 373}, {"x1": 23, "y1": 326, "x2": 216, "y2": 373}, {"x1": 375, "y1": 242, "x2": 500, "y2": 286}, {"x1": 23, "y1": 241, "x2": 243, "y2": 287}, {"x1": 1760, "y1": 221, "x2": 1845, "y2": 258}, {"x1": 373, "y1": 196, "x2": 1648, "y2": 243}, {"x1": 23, "y1": 196, "x2": 240, "y2": 236}, {"x1": 1758, "y1": 113, "x2": 1843, "y2": 150}, {"x1": 374, "y1": 111, "x2": 563, "y2": 156}, {"x1": 22, "y1": 111, "x2": 270, "y2": 150}, {"x1": 21, "y1": 0, "x2": 1792, "y2": 69}], "whitespaces": [{"x1": 7, "y1": 0, "x2": 21, "y2": 496}, {"x1": 270, "y1": 69, "x2": 372, "y2": 496}, {"x1": 1659, "y1": 69, "x2": 1758, "y2": 496}, {"x1": 1845, "y1": 0, "x2": 1859, "y2": 496}], "position": 1}, {"x1": 7, "y1": 544, "x2": 1859, "y2": 660, "elements": [{"x1": 23, "y1": 587, "x2": 230, "y2": 626}, {"x1": 1760, "y1": 567, "x2": 1845, "y2": 604}, {"x1": 372, "y1": 564, "x2": 1468, "y2": 611}, {"x1": 23, "y1": 544, "x2": 236, "y2": 588}], "whitespaces": [{"x1": 7, "y1": 544, "x2": 23, "y2": 660}, {"x1": 236, "y1": 544, "x2": 372, "y2": 626}, {"x1": 1468, "y1": 544, "x2": 1760, "y2": 626}, {"x1": 1845, "y1": 544, "x2": 1859, "y2": 660}], "position": 2}]} -------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/layout/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/processing/borderless_tables/layout/__init__.py -------------------------------------------------------------------------------- 
/tests/tables/processing/borderless_tables/layout/test_data/lines.json: -------------------------------------------------------------------------------- 1 | {"h_lines": [{"x1": 405, "y1": 158, "x2": 735, "y2": 158, "thickness": 1}, {"x1": 405, "y1": 196, "x2": 735, "y2": 196, "thickness": 1}, {"x1": 405, "y1": 271, "x2": 735, "y2": 271, "thickness": 1}, {"x1": 356, "y1": 356, "x2": 368, "y2": 356, "thickness": 1}, {"x1": 165, "y1": 372, "x2": 182, "y2": 372, "thickness": 1}, {"x1": 104, "y1": 587, "x2": 115, "y2": 587, "thickness": 1}, {"x1": 600, "y1": 592, "x2": 615, "y2": 592, "thickness": 1}, {"x1": 668, "y1": 600, "x2": 684, "y2": 600, "thickness": 1}, {"x1": 231, "y1": 603, "x2": 241, "y2": 603, "thickness": 1}, {"x1": 178, "y1": 643, "x2": 193, "y2": 643, "thickness": 1}, {"x1": 244, "y1": 755, "x2": 259, "y2": 755, "thickness": 1}, {"x1": 278, "y1": 755, "x2": 288, "y2": 755, "thickness": 1}, {"x1": 410, "y1": 791, "x2": 421, "y2": 791, "thickness": 1}, {"x1": 410, "y1": 807, "x2": 421, "y2": 807, "thickness": 1}, {"x1": 121, "y1": 842, "x2": 131, "y2": 842, "thickness": 1}, {"x1": 89, "y1": 866, "x2": 104, "y2": 866, "thickness": 1}], "v_lines": [{"x1": 87, "y1": 643, "x2": 87, "y2": 653, "thickness": 1}, {"x1": 326, "y1": 627, "x2": 326, "y2": 637, "thickness": 1}, {"x1": 405, "y1": 431, "x2": 405, "y2": 441, "thickness": 1}]} -------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/layout/test_data/test.bmp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/processing/borderless_tables/layout/test_data/test.bmp -------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/layout/test_data/text_thresh.bmp: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/processing/borderless_tables/layout/test_data/text_thresh.bmp -------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/layout/test_image_elements.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import json 3 | 4 | import cv2 5 | 6 | from img2table.tables.objects.cell import Cell 7 | from img2table.tables.processing.borderless_tables.layout import get_image_elements 8 | 9 | 10 | def test_get_image_elements(): 11 | thresh = cv2.imread("test_data/text_thresh.bmp", cv2.IMREAD_GRAYSCALE) 12 | 13 | result = get_image_elements(thresh=thresh, 14 | char_length=6.0, 15 | median_line_sep=16) 16 | 17 | with open("test_data/elements.json", "r") as f: 18 | expected = [Cell(**el) for el in json.load(f)] 19 | 20 | assert result == expected 21 | -------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/layout/test_layout.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import json 3 | 4 | import cv2 5 | 6 | from img2table.tables import threshold_dark_areas 7 | from img2table.tables.objects.line import Line 8 | from img2table.tables.processing.borderless_tables import segment_image 9 | 10 | 11 | def test_segment_image(): 12 | img = cv2.cvtColor(cv2.imread("test_data/test.bmp"), cv2.COLOR_BGR2RGB) 13 | thresh = threshold_dark_areas(img=img, char_length=6) 14 | 15 | with open("test_data/lines.json", 'r') as f: 16 | data = json.load(f) 17 | lines = [Line(**el) for el in data.get('h_lines') + data.get('v_lines')] 18 | 19 | result = segment_image(thresh=thresh, 20 | lines=lines, 21 | char_length=6.0, 22 | median_line_sep=16) 23 | 24 | assert len(result) == 2 25 | 26 | assert len(result[0].elements) == 30 27 | assert 
len(result[0].table_areas) == 5 28 | assert len(result[0].whitespaces) == 21 29 | 30 | assert len(result[1].elements) == 4 31 | assert len(result[1].table_areas) == 1 32 | assert len(result[1].whitespaces) == 4 33 | -------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/layout/test_rlsa.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import json 3 | 4 | import cv2 5 | import numpy as np 6 | from numba import config 7 | 8 | from img2table.tables import threshold_dark_areas 9 | from img2table.tables.objects.line import Line 10 | from img2table.tables.processing.borderless_tables.layout.rlsa import identify_text_mask 11 | 12 | 13 | def test_identify_text_mask(): 14 | config.DISABLE_JIT = True 15 | 16 | img = cv2.cvtColor(cv2.imread("test_data/test.bmp"), cv2.COLOR_BGR2RGB) 17 | thresh = threshold_dark_areas(img=img, char_length=6) 18 | 19 | with open("test_data/lines.json", 'r') as f: 20 | data = json.load(f) 21 | lines = [Line(**el) for el in data.get('h_lines') + data.get('v_lines')] 22 | 23 | result = identify_text_mask(thresh=thresh, 24 | lines=lines, 25 | char_length=6.0) 26 | 27 | expected = cv2.imread("test_data/text_thresh.bmp", cv2.IMREAD_GRAYSCALE) 28 | 29 | assert np.array_equal(result, expected) 30 | -------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/rows/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/processing/borderless_tables/rows/__init__.py -------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/rows/test_data/h_whitespaces.json: -------------------------------------------------------------------------------- 1 | [{"x1": 93, "y1": 45, 
"x2": 1233, "y2": 45}, {"x1": 93, "y1": 78, "x2": 1233, "y2": 78}, {"x1": 93, "y1": 146, "x2": 1233, "y2": 146}, {"x1": 93, "y1": 212, "x2": 1233, "y2": 212}, {"x1": 93, "y1": 278, "x2": 1233, "y2": 278}, {"x1": 93, "y1": 344, "x2": 1233, "y2": 344}, {"x1": 93, "y1": 410, "x2": 1233, "y2": 410}, {"x1": 93, "y1": 476, "x2": 1233, "y2": 476}, {"x1": 93, "y1": 542, "x2": 1233, "y2": 542}, {"x1": 93, "y1": 608, "x2": 1233, "y2": 608}, {"x1": 93, "y1": 674, "x2": 1233, "y2": 674}, {"x1": 93, "y1": 740, "x2": 1233, "y2": 740}, {"x1": 93, "y1": 806, "x2": 1233, "y2": 806}, {"x1": 93, "y1": 872, "x2": 1233, "y2": 872}, {"x1": 93, "y1": 938, "x2": 1233, "y2": 938}, {"x1": 93, "y1": 1004, "x2": 1233, "y2": 1004}, {"x1": 93, "y1": 1084, "x2": 1233, "y2": 1084}, {"x1": 93, "y1": 1147, "x2": 1233, "y2": 1147}] -------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/rows/test_data/rows.json: -------------------------------------------------------------------------------- 1 | [{"x1": 53, "y1": 45, "x2": 1277, "y2": 45}, {"x1": 53, "y1": 78, "x2": 1277, "y2": 78}, {"x1": 53, "y1": 146, "x2": 1277, "y2": 146}, {"x1": 53, "y1": 212, "x2": 1277, "y2": 212}, {"x1": 53, "y1": 278, "x2": 1277, "y2": 278}, {"x1": 53, "y1": 344, "x2": 1277, "y2": 344}, {"x1": 53, "y1": 410, "x2": 1277, "y2": 410}, {"x1": 53, "y1": 476, "x2": 1277, "y2": 476}, {"x1": 53, "y1": 542, "x2": 1277, "y2": 542}, {"x1": 53, "y1": 608, "x2": 1277, "y2": 608}, {"x1": 53, "y1": 674, "x2": 1277, "y2": 674}, {"x1": 53, "y1": 740, "x2": 1277, "y2": 740}, {"x1": 53, "y1": 806, "x2": 1277, "y2": 806}, {"x1": 53, "y1": 872, "x2": 1277, "y2": 872}, {"x1": 53, "y1": 938, "x2": 1277, "y2": 938}, {"x1": 53, "y1": 1004, "x2": 1277, "y2": 1004}, {"x1": 53, "y1": 1084, "x2": 1277, "y2": 1084}, {"x1": 53, "y1": 1147, "x2": 1277, "y2": 1147}] -------------------------------------------------------------------------------- 
/tests/tables/processing/borderless_tables/rows/test_rows.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import json 4 | 5 | from img2table.tables.objects.cell import Cell 6 | from img2table.tables.processing.borderless_tables.model import ColumnGroup, Column, VerticalWS, Whitespace 7 | from img2table.tables.processing.borderless_tables.rows import \ 8 | identify_delimiter_group_rows, identify_row_delimiters, filter_coherent_row_delimiters, correct_delimiter_width 9 | 10 | 11 | def test_identify_row_delimiters(): 12 | with open("test_data/delimiter_group.json", "r") as f: 13 | data = json.load(f) 14 | column_group = ColumnGroup(columns=[Column(whitespaces=[VerticalWS(ws=Whitespace(cells=[Cell(**col)]))]) 15 | for col in data.get('delimiters')], 16 | elements=[Cell(**el) for el in data.get('elements')], 17 | char_length=14) 18 | 19 | result = identify_row_delimiters(column_group=column_group) 20 | 21 | with open("test_data/h_whitespaces.json", "r") as f: 22 | expected = [Cell(**c) for c in json.load(f)] 23 | 24 | assert result == expected 25 | 26 | 27 | def test_filter_coherent_row_delimiters(): 28 | row_delimiters = [Cell(x1=0, x2=100, y1=0, y2=0), 29 | Cell(x1=0, x2=80, y1=10, y2=10), 30 | Cell(x1=0, x2=100, y1=20, y2=20)] 31 | 32 | column_group = ColumnGroup(columns=[Column([VerticalWS(Whitespace(cells=[Cell(x1=0, x2=0, y1=0, y2=20)]))]), 33 | Column([VerticalWS(Whitespace(cells=[Cell(x1=30, x2=30, y1=0, y2=20)]))]), 34 | Column([VerticalWS(Whitespace(cells=[Cell(x1=60, x2=60, y1=0, y2=20)]))]), 35 | Column([VerticalWS(Whitespace(cells=[Cell(x1=100, x2=100, y1=0, y2=20)]))])], 36 | elements=[Cell(x1=85, x2=95, y1=2, y2=7)], 37 | char_length=14) 38 | 39 | result = filter_coherent_row_delimiters(row_delimiters=row_delimiters, 40 | column_group=column_group) 41 | 42 | expected = [Cell(x1=0, x2=100, y1=0, y2=0), 43 | Cell(x1=0, x2=100, y1=20, y2=20)] 44 | 45 | assert result == expected 46 | 47 | 48 | 
def test_correct_delimiter_width(): 49 | row_delimiters = [Cell(x1=0, x2=100, y1=0, y2=0), 50 | Cell(x1=0, x2=80, y1=10, y2=10), 51 | Cell(x1=30, x2=100, y1=20, y2=20), 52 | Cell(x1=0, x2=100, y1=30, y2=30)] 53 | 54 | contours = [Cell(x1=23, x2=34, y1=12, y2=18), 55 | Cell(x1=86, x2=93, y1=2, y2=9), 56 | Cell(x1=3, x2=17, y1=18, y2=24)] 57 | 58 | result = correct_delimiter_width(row_delimiters=row_delimiters, 59 | contours=contours) 60 | 61 | expected = [Cell(x1=0, x2=100, y1=0, y2=0), 62 | Cell(x1=0, x2=100, y1=10, y2=10), 63 | Cell(x1=17, x2=100, y1=20, y2=20), 64 | Cell(x1=0, x2=100, y1=30, y2=30)] 65 | 66 | assert result == expected 67 | 68 | 69 | def test_identify_delimiter_group_rows(): 70 | with open("test_data/delimiter_group.json", "r") as f: 71 | data = json.load(f) 72 | column_group = ColumnGroup(columns=[Column(whitespaces=[VerticalWS(ws=Whitespace(cells=[Cell(**col)]))]) 73 | for col in data.get('delimiters')], 74 | elements=[Cell(**el) for el in data.get('elements')], 75 | char_length=14) 76 | 77 | with open("test_data/contours.json", 'r') as f: 78 | contours = [Cell(**el) for el in json.load(f)] 79 | 80 | result = identify_delimiter_group_rows(column_group=column_group, 81 | contours=contours) 82 | 83 | assert len(result) == 18 84 | assert min([d.y1 for d in result]) == 45 85 | assert max([d.y2 for d in result]) == 1147 86 | assert min([d.x1 for d in result]) == 93 87 | assert max([d.x2 for d in result]) == 1233 88 | -------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/table/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/processing/borderless_tables/table/__init__.py -------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/table/test_data/rows.json: 
-------------------------------------------------------------------------------- 1 | [{"x1": 53, "y1": 45, "x2": 1277, "y2": 45}, {"x1": 53, "y1": 78, "x2": 1277, "y2": 78}, {"x1": 53, "y1": 146, "x2": 1277, "y2": 146}, {"x1": 53, "y1": 212, "x2": 1277, "y2": 212}, {"x1": 53, "y1": 278, "x2": 1277, "y2": 278}, {"x1": 53, "y1": 344, "x2": 1277, "y2": 344}, {"x1": 53, "y1": 410, "x2": 1277, "y2": 410}, {"x1": 53, "y1": 476, "x2": 1277, "y2": 476}, {"x1": 53, "y1": 542, "x2": 1277, "y2": 542}, {"x1": 53, "y1": 608, "x2": 1277, "y2": 608}, {"x1": 53, "y1": 674, "x2": 1277, "y2": 674}, {"x1": 53, "y1": 740, "x2": 1277, "y2": 740}, {"x1": 53, "y1": 806, "x2": 1277, "y2": 806}, {"x1": 53, "y1": 872, "x2": 1277, "y2": 872}, {"x1": 53, "y1": 938, "x2": 1277, "y2": 938}, {"x1": 53, "y1": 1004, "x2": 1277, "y2": 1004}, {"x1": 53, "y1": 1084, "x2": 1277, "y2": 1084}, {"x1": 53, "y1": 1147, "x2": 1277, "y2": 1147}] -------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/table/test_table.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import json 4 | 5 | from img2table.tables.objects.cell import Cell 6 | from img2table.tables.processing.borderless_tables import identify_table 7 | from img2table.tables.processing.borderless_tables.model import ColumnGroup, Column, VerticalWS, Whitespace 8 | 9 | 10 | def test_identify_table(): 11 | with open("test_data/delimiter_group.json", "r") as f: 12 | data = json.load(f) 13 | column_group = ColumnGroup(columns=[Column(whitespaces=[VerticalWS(ws=Whitespace(cells=[Cell(**col)]))]) 14 | for col in data.get('delimiters')], 15 | elements=[Cell(**c) for c in data.get('elements')], 16 | char_length=4.66) 17 | 18 | with open("test_data/contours.json", 'r') as f: 19 | contours = [Cell(**el) for el in json.load(f)] 20 | 21 | with open("test_data/rows.json", "r") as f: 22 | row_delimiters = [Cell(**c) for c in 
json.load(f)] 23 | 24 | result = identify_table(columns=column_group, 25 | row_delimiters=row_delimiters, 26 | contours=contours, 27 | median_line_sep=16, 28 | char_length=4.66) 29 | 30 | assert result.nb_rows == 17 31 | assert result.nb_columns == 8 32 | assert (result.x1, result.y1, result.x2, result.y2) == (91, 45, 1235, 1147) 33 | -------------------------------------------------------------------------------- /tests/tables/processing/borderless_tables/table/test_table_creation.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import json 4 | 5 | from img2table.tables.objects.cell import Cell 6 | from img2table.tables.processing.borderless_tables.model import ColumnGroup, VerticalWS, Column, Whitespace 7 | from img2table.tables.processing.borderless_tables.table.table_creation import get_table 8 | 9 | 10 | def test_get_table(): 11 | with open("test_data/delimiter_group.json", "r") as f: 12 | data = json.load(f) 13 | column_group = ColumnGroup(columns=[Column(whitespaces=[VerticalWS(ws=Whitespace(cells=[Cell(**col)]))]) 14 | for col in data.get('delimiters')], 15 | elements=[Cell(**c) for c in data.get('elements')], 16 | char_length=4.66) 17 | 18 | with open("test_data/contours.json", 'r') as f: 19 | contours = [Cell(**el) for el in json.load(f)] 20 | 21 | with open("test_data/rows.json", "r") as f: 22 | row_delimiters = [Cell(**c) for c in json.load(f)] 23 | 24 | result = get_table(columns=column_group, 25 | row_delimiters=row_delimiters, 26 | contours=contours) 27 | 28 | assert result.nb_rows == 17 29 | assert result.nb_columns == 8 30 | assert (result.x1, result.y1, result.x2, result.y2) == (91, 45, 1235, 1147) 31 | -------------------------------------------------------------------------------- /tests/tables/processing/common/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/processing/common/__init__.py -------------------------------------------------------------------------------- /tests/tables/processing/common/test_common.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import cv2 3 | 4 | from img2table.tables.objects.cell import Cell 5 | from img2table.tables.processing.common import is_contained_cell, merge_contours, get_contours_cell 6 | 7 | 8 | def test_is_contained_cell(): 9 | cell_1 = Cell(x1=0, x2=20, y1=0, y2=20) 10 | cell_2 = Cell(x1=0, x2=40, y1=0, y2=25) 11 | cell_3 = Cell(x1=50, x2=70, y1=123, y2=256) 12 | 13 | assert is_contained_cell(inner_cell=cell_1, outer_cell=cell_2) 14 | assert not is_contained_cell(inner_cell=cell_2, outer_cell=cell_1) 15 | assert not is_contained_cell(inner_cell=cell_1, outer_cell=cell_3) 16 | assert not is_contained_cell(inner_cell=cell_2, outer_cell=cell_3) 17 | 18 | 19 | def test_merge_contours(): 20 | contours = [Cell(x1=0, x2=20, y1=0, y2=20), 21 | Cell(x1=0, x2=20, y1=10, y2=20), 22 | Cell(x1=60, x2=80, y1=0, y2=20), 23 | Cell(x1=10, x2=20, y1=100, y2=200)] 24 | 25 | # Do not merge by axis 26 | expected = [Cell(x1=0, x2=20, y1=0, y2=20), 27 | Cell(x1=60, x2=80, y1=0, y2=20), 28 | Cell(x1=10, x2=20, y1=100, y2=200)] 29 | assert set(merge_contours(contours=contours, vertically=None)) == set(expected) 30 | 31 | # Merge vertically 32 | expected_vertical = [Cell(x1=0, x2=80, y1=0, y2=20), Cell(x1=10, x2=20, y1=100, y2=200)] 33 | assert merge_contours(contours=contours, vertically=True) == expected_vertical 34 | 35 | # Merge horizontally 36 | expected_horizontal = [Cell(x1=0, x2=20, y1=0, y2=200), Cell(x1=60, x2=80, y1=0, y2=20)] 37 | assert merge_contours(contours=contours, vertically=False) == expected_horizontal 38 | 39 | 40 | def test_get_contours_cell(): 41 | img = cv2.cvtColor(cv2.imread("test_data/test.jpg"), 
cv2.COLOR_BGR2RGB) 42 | cell = Cell(x1=0, x2=img.shape[1], y1=0, y2=img.shape[0]) 43 | 44 | result = get_contours_cell(img=img, 45 | cell=cell, 46 | margin=5, 47 | blur_size=5, 48 | kernel_size=9, 49 | merge_vertically=True) 50 | 51 | expected = [Cell(x1=51, y1=19, x2=518, y2=146), 52 | Cell(x1=60, y1=156, x2=534, y2=691), 53 | Cell(x1=65, y1=765, x2=543, y2=811)] 54 | 55 | assert result == expected 56 | -------------------------------------------------------------------------------- /tests/tables/processing/common/test_data/test.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/processing/common/test_data/test.jpg -------------------------------------------------------------------------------- /tests/tables/processing/text/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/processing/text/__init__.py -------------------------------------------------------------------------------- /tests/tables/processing/text/test_data/table.json: -------------------------------------------------------------------------------- 1 | [[{"x1": 12, "y1": 67, "x2": 104, "y2": 177}, {"x1": 104, "y1": 67, "x2": 288, "y2": 177}, {"x1": 288, "y1": 67, "x2": 440, "y2": 177}, {"x1": 440, "y1": 67, "x2": 596, "y2": 177}, {"x1": 596, "y1": 67, "x2": 734, "y2": 177}, {"x1": 734, "y1": 67, "x2": 1043, "y2": 177}], [{"x1": 12, "y1": 177, "x2": 104, "y2": 220}, {"x1": 104, "y1": 177, "x2": 288, "y2": 220}, {"x1": 288, "y1": 177, "x2": 440, "y2": 220}, {"x1": 440, "y1": 177, "x2": 596, "y2": 220}, {"x1": 596, "y1": 177, "x2": 734, "y2": 220}, {"x1": 734, "y1": 177, "x2": 1043, "y2": 220}], [{"x1": 12, "y1": 220, "x2": 104, "y2": 264}, {"x1": 104, "y1": 220, "x2": 288, "y2": 264}, {"x1": 288, "y1": 220, "x2": 440, 
"y2": 264}, {"x1": 440, "y1": 220, "x2": 596, "y2": 264}, {"x1": 596, "y1": 220, "x2": 734, "y2": 264}, {"x1": 734, "y1": 220, "x2": 1043, "y2": 264}], [{"x1": 12, "y1": 264, "x2": 104, "y2": 341}, {"x1": 104, "y1": 264, "x2": 288, "y2": 341}, {"x1": 288, "y1": 264, "x2": 440, "y2": 341}, {"x1": 440, "y1": 264, "x2": 596, "y2": 341}, {"x1": 596, "y1": 264, "x2": 734, "y2": 341}, {"x1": 734, "y1": 264, "x2": 1043, "y2": 341}], [{"x1": 12, "y1": 341, "x2": 104, "y2": 384}, {"x1": 104, "y1": 341, "x2": 288, "y2": 384}, {"x1": 288, "y1": 341, "x2": 440, "y2": 384}, {"x1": 440, "y1": 341, "x2": 596, "y2": 384}, {"x1": 596, "y1": 341, "x2": 734, "y2": 384}, {"x1": 734, "y1": 341, "x2": 1043, "y2": 384}], [{"x1": 12, "y1": 384, "x2": 104, "y2": 428}, {"x1": 104, "y1": 384, "x2": 288, "y2": 428}, {"x1": 288, "y1": 384, "x2": 440, "y2": 428}, {"x1": 440, "y1": 384, "x2": 596, "y2": 428}, {"x1": 596, "y1": 384, "x2": 734, "y2": 428}, {"x1": 734, "y1": 384, "x2": 1043, "y2": 428}], [{"x1": 12, "y1": 428, "x2": 104, "y2": 471}, {"x1": 104, "y1": 428, "x2": 288, "y2": 471}, {"x1": 288, "y1": 428, "x2": 440, "y2": 471}, {"x1": 440, "y1": 428, "x2": 596, "y2": 471}, {"x1": 596, "y1": 428, "x2": 734, "y2": 471}, {"x1": 734, "y1": 428, "x2": 1043, "y2": 471}], [{"x1": 12, "y1": 471, "x2": 104, "y2": 514}, {"x1": 104, "y1": 471, "x2": 288, "y2": 514}, {"x1": 288, "y1": 471, "x2": 440, "y2": 514}, {"x1": 440, "y1": 471, "x2": 596, "y2": 514}, {"x1": 596, "y1": 471, "x2": 734, "y2": 514}, {"x1": 734, "y1": 471, "x2": 1043, "y2": 514}], [{"x1": 12, "y1": 514, "x2": 104, "y2": 558}, {"x1": 104, "y1": 514, "x2": 288, "y2": 558}, {"x1": 288, "y1": 514, "x2": 440, "y2": 558}, {"x1": 440, "y1": 514, "x2": 596, "y2": 558}, {"x1": 596, "y1": 514, "x2": 734, "y2": 558}, {"x1": 734, "y1": 514, "x2": 1043, "y2": 558}], [{"x1": 12, "y1": 558, "x2": 104, "y2": 635}, {"x1": 104, "y1": 558, "x2": 288, "y2": 635}, {"x1": 288, "y1": 558, "x2": 440, "y2": 635}, {"x1": 440, "y1": 558, "x2": 596, "y2": 
635}, {"x1": 596, "y1": 558, "x2": 734, "y2": 635}, {"x1": 734, "y1": 558, "x2": 1043, "y2": 635}], [{"x1": 12, "y1": 635, "x2": 104, "y2": 678}, {"x1": 104, "y1": 635, "x2": 288, "y2": 678}, {"x1": 288, "y1": 635, "x2": 440, "y2": 678}, {"x1": 440, "y1": 635, "x2": 596, "y2": 678}, {"x1": 596, "y1": 635, "x2": 734, "y2": 678}, {"x1": 734, "y1": 635, "x2": 1043, "y2": 678}]] -------------------------------------------------------------------------------- /tests/tables/processing/text/test_data/test.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xavctn/img2table/d3fc3721d18b22d515ae2f5d723f6474519e89eb/tests/tables/processing/text/test_data/test.jpg -------------------------------------------------------------------------------- /tests/tables/processing/text/test_titles.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import json 3 | 4 | import cv2 5 | import polars as pl 6 | 7 | from img2table.ocr.data import OCRDataframe 8 | from img2table.tables.objects.cell import Cell 9 | from img2table.tables.objects.row import Row 10 | from img2table.tables.objects.table import Table 11 | from img2table.tables.processing.text.titles import get_title_tables 12 | 13 | 14 | def test_get_title_tables(): 15 | img = cv2.cvtColor(cv2.imread("test_data/test.jpg"), cv2.COLOR_BGR2RGB) 16 | with open("test_data/table.json", "r") as f: 17 | table = Table(rows=[Row(cells=[Cell(**el) for el in row]) for row in json.load(f)]) 18 | ocr_df = OCRDataframe(df=pl.read_csv("test_data/ocr.csv", separator=";")) 19 | 20 | result = get_title_tables(img=img, tables=[table], ocr_df=ocr_df) 21 | 22 | assert result[0].title == "10 most populous countries" 23 | assert get_title_tables(img=img, tables=[], ocr_df=ocr_df) == [] 24 | --------------------------------------------------------------------------------