├── .gitignore
├── Dockerfile
├── README.md
├── TableExtractor
│   ├── __init__.py
│   └── modules
│       ├── LinesDetector.py
│       ├── PreProcessing.py
│       ├── RowsDetector.py
│       ├── TableBuilder.py
│       └── utils.py
├── app
│   ├── __init__.py
│   ├── static
│   │   ├── poster.jpg
│   │   ├── styles.css
│   │   └── table-icon.svg
│   └── templates
│       └── index.html
├── data
│   ├── digital_table_1.png
│   ├── digital_table_2.png
│   ├── digital_table_3.png
│   ├── sample_table.jpg
│   ├── sample_table_2.jpg
│   └── sample_table_3.jpg
├── heroku.yml
├── main.py
├── requirements-docker.txt
└── requirements.txt

/.gitignore:
--------------------------------------------------------------------------------
.vscode
.idea
tmp

# Created by https://www.toptal.com/developers/gitignore/api/python,windows,venv
# Edit at https://www.toptal.com/developers/gitignore?templates=python,windows,venv

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
pytestdebug.log

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/
doc/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
pythonenv*

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# profiling data
.prof

### venv ###
# Virtualenv
# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
[Bb]in
[Ii]nclude
[Ll]ib
[Ll]ib64
[Ll]ocal
[Ss]cripts
pyvenv.cfg
pip-selfcheck.json

### Windows ###
# Windows thumbnail cache files
Thumbs.db
Thumbs.db:encryptable
ehthumbs.db
ehthumbs_vista.db

# Dump file
*.stackdump

# Folder config file
[Dd]esktop.ini

# Recycle Bin used on file shares
$RECYCLE.BIN/

# Windows Installer files
*.cab
*.msi
*.msix
*.msm
*.msp

# Windows shortcuts
*.lnk

# End of https://www.toptal.com/developers/gitignore/api/python,windows,venv
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
FROM python:3.7-slim-buster

RUN apt-get -y update \
    && apt-get -y upgrade \
    # tesserocr requirements
    && apt-get -y install tesseract-ocr libtesseract-dev libleptonica-dev

# Required for tesserocr:
# https://github.com/sirfz/tesserocr/issues/165#issuecomment-445789709
ENV LC_ALL=C
# Use port 5000 by default; cloud providers (e.g. Heroku) may override it
ENV PORT=5000

EXPOSE ${PORT}

WORKDIR /app

COPY . .

# Build the tesserocr wheel and install dependencies
RUN apt-get -y install pkg-config build-essential \
    # Use piwheels for ARM builds
    && pip3 install -r requirements-docker.txt --extra-index-url https://www.piwheels.org/simple \
    # Remove build dependencies
    && apt-get -y purge --auto-remove pkg-config build-essential

# Run the Flask app
CMD gunicorn -b 0.0.0.0:$PORT -w 4 app:app --timeout 120
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Table Extraction





## Overview

This framework was developed as part of my undergraduate final year project at university and allows for the extraction of tabular data from raster images. It uses **line information** to locate cells, and an algorithm arranges the cells in memory to reconstruct the tabular structure. It then uses the Tesseract OCR engine to extract the text and returns the entire table as JSON data. It achieved 89% cell detection accuracy when extracting prayer times from timetables (see the `data` folder for some examples).

The main drawbacks are as follows:

- It relies heavily on ruling lines: the table must have all column and row separators, and blurry images can reduce line detection accuracy
- Table region detection is quite rudimentary: it looks for the largest quadrilateral in the image
- It can only detect one table
- Tesseract needs more fine-tuning for better OCR processing, as sometimes text is not recognized properly

Below is a summary of how the framework works. This structure is reflected in `TableExtractor/__init__.py`.


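
For example, the extractor can be called directly from Python. A minimal sketch (assuming it is run from the repository root, and skipping OCR so cells come back numbered):

```python
import TableExtractor

# Without an ocrFunction, each cell's content is its cell number instead of text
result = TableExtractor.extractTable("data/sample_table.jpg")

print(result["rows"])   # number of detected rows
print(result["cells"])  # number of detected cells
print(result["table"])  # nested dict mapping headings to sub-headings/values
```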
## Docker

This is the recommended way to run this project, as the environment is all set up and ready to use. For convenience, Docker images are automatically built and released on [Docker Hub](https://hub.docker.com/repository/docker/abdullahibneat/table-extraction).

To run the Docker container locally:

```
docker pull abdullahibneat/table-extraction
docker run -d -p 5000:5000 abdullahibneat/table-extraction
```

Then visit http://localhost:5000 and you're ready to go!

When using a cloud provider, you can change the port by setting the `PORT` environment variable. On Heroku, the port is set automatically, so this repository can simply be pushed to the Heroku remote.

## Manual setup

### OCR setup

An OCR engine is NOT required to run the project, though without one the returned table object will contain cell numbers instead of the cell contents.

If you wish to skip the OCR process, **remove the tesserocr requirement from `requirements.txt`** and continue reading the "Get started" section.

This project uses [tesserocr](https://github.com/sirfz/tesserocr) as the Tesseract wrapper out of the box. Follow the instructions [here](https://github.com/sirfz/tesserocr) to set up tesserocr on your system.

Alternatively, use your own OCR implementation by removing the tesserocr requirement from `requirements.txt` and updating the "Advanced usage" code in `main.py` with your own implementation, as sketched below.
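
An OCR hook is simply a callable that takes a single cell image (a grayscale NumPy array) and returns its text. As an illustration, a hypothetical implementation backed by `pytesseract` (not one of this project's dependencies) might look like:

```python
import TableExtractor
import pytesseract  # stand-in for any OCR library of your choice

def myOcrFunction(cell):
    # "cell" is a grayscale NumPy array cropped to a single table cell
    return pytesseract.image_to_string(cell).strip()

result = TableExtractor.extractTable("data/sample_table.jpg", myOcrFunction)
```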
### Get started

1. Make sure Python 3.7.x is installed. `❗❗❗THIS IS IMPORTANT❗❗❗`
2. `Recommended:` Set up a Python 3.7 [virtual environment](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/)
3. Install the requirements (make sure you have read the "OCR setup" section above before running this command): `pip install -r requirements.txt`
4. Run the `main.py` file: `python3 main.py`

### Flask API server

A simple Flask API was written to interact with the table extractor. Run the `app` module with Flask:

```
FLASK_APP=app flask run
```

and visit the address (default: `http://localhost:5000`). Alternatively, send the image as form data (the field can have any name) in a `POST` request to the root endpoint:

```
curl -F image=@myImage.jpg http://localhost:5000
```
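
The same request can be made from Python (a sketch using the `requests` package, which is not part of this project's requirements):

```python
import requests

# The form field name is arbitrary; the server reads the first uploaded file
with open("myImage.jpg", "rb") as f:
    response = requests.post("http://localhost:5000", files={"image": f})

print(response.json())  # {"rows": ..., "cells": ..., "table": {...}} or {"error": "..."}
```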
--------------------------------------------------------------------------------
/TableExtractor/__init__.py:
--------------------------------------------------------------------------------
from .modules import PreProcessing, utils, LinesDetector, RowsDetector, TableBuilder
import cv2
import numpy as np

getOCRFunction = utils.getOCRFunction

def extractTable(imgPath, ocrFunction = None):
    # Dictionary to store data to be returned
    ret = {}

    img = cv2.imread(imgPath, 0)

    # cv2.imread returns None when the file cannot be decoded as an image
    if img is None:
        raise ValueError("The file provided must be an image of type jpg, jpeg or png.")

    # PROCESS IMAGE
    laplacian = PreProcessing.process(img)

    # FIND CONTOURS
    contours, _ = cv2.findContours(laplacian, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)

    # FIND TABLE REGION
    # It is assumed the table takes up most of the image,
    # thus it can be identified by finding the largest contour with 4 sides
    table_contour, table_contour_approx = utils.findLargestQuadrilateralContour(contours)

    if table_contour[0] is None:
        raise ValueError("No table detected.")

    # Sort points in clockwise order, compute table width and height
    table_pts, table_width, table_height = utils.processContour(table_contour_approx[0])

    # EXTRACT TABLE REGION
    # Start with a full black image
    mask = np.zeros(img.shape).astype(img.dtype)
    # Create a mask for the table region
    cv2.fillPoly(mask, table_contour, (255, 255, 255))
    # Apply the mask to the grayscale image, blacking out the region
    # outside of the table
    table_img = cv2.bitwise_and(img, mask)

    # WARP TABLE
    # Use warp to extract the table region from the processed image
    # by mapping table points to a new image of size table_width x table_height
    target_points = np.float32([[0, 0], [table_width, 0], [table_width, table_height], [0, table_height]])
    matrix = cv2.getPerspectiveTransform(table_pts, target_points)
    # Apply warp to the image to extract the table region
    warped = cv2.warpPerspective(table_img, matrix, (table_width, table_height))
    # Apply warp to mask
    warped_mask = cv2.warpPerspective(mask, matrix, (table_width, table_height))
    # Resize warped image and mask to have width 1500px
    scale_factor = 1500 / table_width
    warped = cv2.resize(warped, (0, 0), fx=scale_factor, fy=scale_factor)
    warped_mask = cv2.resize(warped_mask, (0, 0), fx=scale_factor, fy=scale_factor)
    warped = cv2.GaussianBlur(warped, (5, 5), 2)
    # Apply threshold
    warped = cv2.adaptiveThreshold(warped, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 9, 2)

    # FIND HORIZONTAL & VERTICAL LINES
    lines = LinesDetector.findLines(warped)
    # Since the function above might get rid of the black area outside the table
    # region, apply the mask again
    lines = cv2.bitwise_and(lines, warped_mask)

    # EXTRACT CELLS
    # Get each cell's contour
    cell_contours, _ = cv2.findContours(lines, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Sometimes the contour of the table is detected again, so filter large contours out
    warpedArea = warped.shape[0] * warped.shape[1] * 0.9

    def validContour(cnt):
        _, _, w, h = cv2.boundingRect(cnt)
        return w * h < warpedArea

    cell_contours = [cnt for cnt in cell_contours if validContour(cnt)]

    # Group cells by row
    # findRows returns a Python dictionary with
    #   key = y value of the row
    #   value = array of cell contours in the row
    rows = RowsDetector.findRows(cell_contours)

    # Compute number of rows and cells
    ret["rows"] = len(rows.values())
    ret["cells"] = sum([len(c) for c in rows.values()])

    # CREATE TABLE IMAGE WITHOUT LINES
    # This will help the OCR engine perform better.
    # Start with a full white image, add the cells as black rectangles and use the OR operation
    # to remove all the lines.
    text_mask = np.full(warped.shape, 255).astype(warped.dtype)
    # Merge (sum()) all the cell contours from rows (key-value dictionary)
    text_mask = cv2.drawContours(text_mask, sum(rows.values(), []), -1, (0, 0, 0), -1)
    # Use close operation to dilate and erode image, reducing overall noise
    text_only = cv2.morphologyEx(warped, cv2.MORPH_CLOSE, np.ones((3,3)))
    text_only = cv2.bitwise_or(text_only, text_mask)

    # RECONSTRUCT TABLE STRUCTURE
    try:
        ret["table"] = TableBuilder.reconstructTable(rows, text_only, ocrFunction)
    except Exception:
        raise ValueError("Error while parsing table, try again with a clearer picture.")

    return ret
--------------------------------------------------------------------------------
/TableExtractor/modules/LinesDetector.py:
--------------------------------------------------------------------------------
import cv2
import numpy as np

def findLines(img):
    # Adapted from https://docs.opencv.org/4.4.0/dd/dd7/tutorial_morph_lines_detection.html

    # Get image height and width to dynamically change
    # horizontal and vertical kernel sizes
    height, width = img.shape

    # Erode image to thicken lines
    eroded = cv2.erode(img, np.ones((3, 3)))

    # Kernel length as a fraction (3%) of the image size
    kernel_length = 3 / 100

    # To find horizontal lines, run a horizontal kernel (e.g. [1 1 1 1])
    # Dilation finds lines but shrinks their lengths, so it is
    # followed by erosion to restore the lines' original size
    horizontal_kernel = np.ones((1, int(width * kernel_length)))
    horizontal = cv2.morphologyEx(eroded, cv2.MORPH_CLOSE, horizontal_kernel)

    # To find vertical lines, run a vertical kernel
    vertical_kernel = np.ones((int(height * kernel_length), 1))
    vertical = cv2.morphologyEx(eroded, cv2.MORPH_CLOSE, vertical_kernel)

    lines = cv2.bitwise_and(vertical, horizontal)
    lines = cv2.erode(lines, np.ones((3, 3)), iterations=3)

    return lines
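
# Quick way to eyeball the detected line mask when tuning kernel_length
# (a debugging sketch, not used by the pipeline; run from the repository root
# with "python TableExtractor/modules/LinesDetector.py path/to/image.png")
if __name__ == "__main__":
    import sys
    img = cv2.imread(sys.argv[1], 0)
    # findLines expects a binarized image, as produced in TableExtractor/__init__.py
    img = cv2.adaptiveThreshold(img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 9, 2)
    cv2.imwrite("lines.png", findLines(img))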
--------------------------------------------------------------------------------
/TableExtractor/modules/PreProcessing.py:
--------------------------------------------------------------------------------
import cv2
import numpy as np

def process(img):
    # Blur image to remove noise
    # Determine kernel size from the image height and width
    height, width = img.shape
    kernel_size = max(int(height * 0.005), int(width * 0.005))
    # Kernel must have odd values because of GaussianBlur
    if kernel_size % 2 == 0:
        kernel_size += 1
    kernel = (kernel_size, kernel_size)
    blur = cv2.GaussianBlur(img, kernel, 1)

    # Use adaptive thresholding to have only black and white pixels
    # Without adaptive thresholding, shadows might black out regions in the image
    # Gaussian produces less noise compared to ADAPTIVE_THRESH_MEAN_C
    # Block size must be odd: kernel_size is odd, so kernel_size * 2 - 1 is odd too
    block_size = kernel_size * 2 - 1
    threshold = cv2.adaptiveThreshold(blur, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, block_size, 2)

    # Use the Laplacian to detect gradients in the image (i.e. lines)
    # This helps to improve table region detection in later stages
    laplacian = cv2.Laplacian(threshold, cv2.CV_64F)
    # Convert data type from 64F to unsigned 8-bit integer
    laplacian = np.uint8(np.absolute(laplacian))

    return laplacian
--------------------------------------------------------------------------------
/TableExtractor/modules/RowsDetector.py:
--------------------------------------------------------------------------------
import cv2
import numpy as np

def findRows(cell_contours):
    rows = {}

    for cnt in cell_contours:
        # Approximate contour to a rectangle, get x, y, width and height
        x, y, width, height = cv2.boundingRect(cnt)

        # Ignore cell contours with width or height < 15px
        if width < 15 or height < 15:
            continue

        # The contour could have a strange shape, so replace the original contour
        # with the approximated bounding rectangle
        # (int32 so OpenCV functions can consume these points on any platform)
        cnt = np.array([
            (x, y),                  # Top left
            (x + width, y),          # Top right
            (x + width, y + height), # Bottom right
            (x, y + height)          # Bottom left
        ], dtype=np.int32).reshape((4, 2))

        # Keep track of whether the contour has been assigned to a row
        added = False

        # Iterate over existing rows where:
        #   row = y-coordinate of the row
        for row in rows.keys():
            # Add this cell to the row that is on the same line (i.e. y-axis ± 50px)
            # as this cell's contour
            if (row - 50) <= y <= (row + 50):
                rows[row].append(cnt)
                added = True
                break

        # If the cell wasn't added to any row, create a new row keyed
        # by this cell's y-coordinate
        if not added:
            rows[y] = [cnt]

    # Sort rows top to bottom
    rows = dict(sorted(rows.items()))

    # Sort cells left to right
    for key, value in rows.items():
        rows[key] = sorted(value, key=lambda cnt: cv2.boundingRect(cnt)[0])

    return rows
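
# Self-check (a sketch): two cells within 50px on the y-axis are grouped into
# one row, while a third cell further down starts a new row
# (run with "python TableExtractor/modules/RowsDetector.py")
if __name__ == "__main__":
    def box(x, y):
        return np.array([
            (x, y), (x + 100, y), (x + 100, y + 40), (x, y + 40)
        ], dtype=np.int32).reshape((4, 1, 2))

    rows = findRows([box(0, 0), box(110, 8), box(0, 100)])
    print({y: len(cells) for y, cells in rows.items()})  # {0: 2, 100: 1}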
--------------------------------------------------------------------------------
/TableExtractor/modules/TableBuilder.py:
--------------------------------------------------------------------------------
from .utils import leafListToDict

def reconstructTable(rows, warped = None, ocrFunction = None):
    # Reconstruct the table following a top-to-bottom approach. Iterate over each row
    # and check the number of cells. If there are more cells than in the previous row,
    # this indicates columns have been split, and the new row is treated as a new
    # set of headings in the table.
    # Otherwise, if the same number of cells appears, the row contains new values
    # for the existing columns, so add each cell's content to its column.

    # EXAMPLE TABLE:
    # +-------+-------+
    # |   A   |   B   |
    # +---+---+---+---+
    # | C | D | E | F |
    # +---+---+---+---+
    # | 1 | 2 | 3 | 4 |
    # +---+---+---+---+

    # Store the table as a dictionary, where:
    #   key = column name
    #   value = list of cell values
    # In the example above, the table will look like the following:
    #
    # table = {
    #     A: {
    #         C: [1],
    #         D: [2],
    #     },
    #     B: {
    #         E: [3],
    #         F: [4]
    #     }
    # }
    table = {}

    # columns holds references to the value lists of the current headings.
    # For example, in the table above:
    #   after the first iteration:  columns = [[], []]
    #   after the second iteration: columns = [[], [], [], []]
    #   after the third iteration:  columns = [[1], [2], [3], [4]]
    columns = None
    columns_sizes = [] # Keep track of column sizes

    # Keep track of the cell number for use in case OCR fails
    cell_number = 0

    for cells in rows.values():
        cell_sizes = [] # Keep track of cell sizes

        # Extract each cell's content (OCR when available, cell number otherwise)
        cell_contents = []
        for cnt in cells:
            # Extract cell region from image
            x1, y1 = cnt[0]
            x2, y2 = cnt[2]
            cell_sizes.append(x2 - x1) # Add cell width to the list of cell sizes

            # Perform OCR if an image and ocrFunction are passed in
            if warped is not None and callable(ocrFunction):
                cell = warped[y1:y2, x1:x2]
                text = ocrFunction(cell)
            # Otherwise use the cell number
            else:
                text = str(cell_number)

            if text == "":
                text = "(failed) cell #" + str(cell_number)
            cell_contents.append(text)
            cell_number += 1

        if columns is None:
            # FIRST ITERATION
            # Add first row to the table
            for cell in cell_contents:
                table[cell] = []
            columns = list(table.values())
            columns_sizes = cell_sizes

        elif len(cell_contents) > len(columns):
            # MORE CELLS THAN THE PREVIOUS ROW
            # Columns have been split, add this row as new headings

            # Replace the previous columns with new dictionaries.
            # At this line, columns contains the lists of the last headings. Because new
            # headings have been found, the last headings are converted from lists to
            # dictionaries.
            columns = leafListToDict(table)

            # Keep track of the previous headings
            previous_headings = list(columns)

            # Create new columns for each of the new headings
            columns = [[] for _ in cell_contents]

            # Split the new headings into lists of equal size.
            # For instance, in the example table above they are split in groups of 2:
            #   - [C, D] are children of A
            #   - [E, F] are children of B
            # This is done by comparing the current column size (+1%) to the cell sizes.
            # When the column size is exceeded, a new column has started.
            current_column_index = 0 # Keep track of current column
            current_width = 0 # Aggregate all cell sizes to check against column size

            for i, heading in enumerate(cell_contents):
                current_width += cell_sizes[i] # Add current cell width to current column

                # If the cell doesn't fit in the current column, move to the next one
                if current_width > columns_sizes[current_column_index] * 1.01:
                    current_column_index += 1
                    current_width = cell_sizes[i]

                # Add this new heading as a child of the previous heading
                previous_headings[current_column_index][heading] = columns[i]

            # Reset columns sizes
            columns_sizes = cell_sizes

        elif len(cell_contents) == len(columns):
            # SAME NUMBER OF CELLS
            # Add all cells one by one to each column
            for i in range(len(cells)):
                columns[i].append(cell_contents[i])

        elif len(cell_contents) < len(columns):
            # FEWER CELLS THAN THE PREVIOUS ROW
            # This might happen if a table has the following structure:
            # +-------+-------+
            # |   A   |   B   |
            # +---+---+---+---+
            # | C | D | E | F |
            # +---+---+---+---+
            # | G             |
            # +---+---+---+---+
            # Here the second row has 4 cells and is followed by a row with a single
            # cell. Unfortunately I couldn't find a nice way to store such rows
            # in a JSON file, so the table extraction is stopped at this stage.
            break
    return table
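
# Worked example (a sketch) rebuilding the table from the comments above without
# an image or OCR, so cell contents fall back to the cell numbers 0-9
# (run from the repository root with "python -m TableExtractor.modules.TableBuilder")
if __name__ == "__main__":
    import numpy as np

    def box(x1, y, x2):
        # Rectangle contour in the same format RowsDetector produces
        return np.array([
            (x1, y), (x2, y), (x2, y + 40), (x1, y + 40)
        ], dtype=np.int32).reshape((4, 2))

    rows = {
        0: [box(0, 0, 200), box(200, 0, 400)],                                          # A, B
        50: [box(0, 50, 100), box(100, 50, 200), box(200, 50, 300), box(300, 50, 400)], # C, D, E, F
        100: [box(0, 100, 100), box(100, 100, 200), box(200, 100, 300), box(300, 100, 400)],
    }

    # Expected output: {'0': {'2': ['6'], '3': ['7']}, '1': {'4': ['8'], '5': ['9']}}
    print(reconstructTable(rows))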
--------------------------------------------------------------------------------
/TableExtractor/modules/utils.py:
--------------------------------------------------------------------------------
import cv2
import numpy as np
import re

# Function to find the largest 4-sided contour in an array of contours
def findLargestQuadrilateralContour(contours):
    # Sort contours from biggest area to smallest
    sorted_contours = sorted(contours, key=cv2.contourArea, reverse=True)

    biggest_contour = None
    biggest_contour_approx = None

    for cnt in sorted_contours:
        # Get the length of the perimeter
        perimeter = cv2.arcLength(cnt, True)

        # Approximate a shape that resembles the contour
        # This is needed because the image might be warped, thus
        # edges are curved and not perfectly straight
        approx = cv2.approxPolyDP(cnt, 0.01 * perimeter, True)

        # Check if the approximation contains only 4 sides
        # (i.e. quadrilateral)
        if len(approx) == 4:
            biggest_contour = cnt
            biggest_contour_approx = approx
            break

    return [biggest_contour], [biggest_contour_approx]


# Function to sort the points of a contour in clockwise order, starting from the top left
def processContour(approx):
    # Reshape from (4, 1, 2), i.e. array([[x, y]], ...), to (4, 2), i.e. array([x, y], ...)
    approx = approx.reshape((4, 2))

    # Sort points in clockwise order, starting from top left
    pts = np.zeros((4, 2), dtype=np.float32)

    # Add up the coordinates of each point
    # Smallest sum = top left point
    # Largest sum = bottom right point
    s = approx.sum(axis=1)
    pts[0] = approx[np.argmin(s)]
    pts[2] = approx[np.argmax(s)]

    # For the other 2 points, compute the difference (y - x) of each point
    # Smallest difference = top right point
    # Largest difference = bottom left point
    diff = np.diff(approx, axis=1)
    pts[1] = approx[np.argmin(diff)]
    pts[3] = approx[np.argmax(diff)]

    # Calculate smallest width and height
    width = int(min(pts[1][0] - pts[0][0], pts[2][0] - pts[3][0]))
    height = int(min(pts[3][1] - pts[0][1], pts[2][1] - pts[1][1]))

    return pts, width, height


# This function is used to recursively find the leaf entries in a dictionary, and replace
# the last list values with dictionaries. This is done because a new table heading should
# be represented by a dictionary type, whereas column values are stored in a list.
def leafListToDict(column):
    # If the column is a list...
    # This could happen if a heading has multiple values followed by a column split
    # E.g.
    # +-----------------+------------------+-------------------+
    # |        A        |        B         |         C         |
    # +-----------------+------------------+-------------------+
    # | value1          | value2           | value3            |
    # +-----------------+------------------+-------------------+
    # | value4          | value5           | value6            |
    # +--------+--------+--------+---------+---------+---------+
    # |   D    |   E    |   F    |    G    |    H    |    I    |
    # +--------+--------+--------+---------+---------+---------+
    # | value7 | value8 | value9 | value10 | value11 | value12 |
    # +--------+--------+--------+---------+---------+---------+
    # Column A has 2 values (value1, value4) followed by a column split (D, E)
    if type(column) is list:
        # If the last item in the list is a dictionary, iterate over that dictionary to
        # find the leaf list
        if type(column[-1]) is dict:
            return leafListToDict(column[-1])

        # Otherwise create a new dictionary and return it
        new_value = {}
        column.append(new_value)
        return [new_value]

    # If the values are all empty lists...
    # any(values) returns True if the iterable contains non-empty lists
    # E.g. any([[1], [2], [3]]) = True, any([[], [], []]) = False
    if not any(column.values()):
        # ...replace them with dictionaries
        for key in column:
            column[key] = {}
        return column.values()
    # Otherwise recursively iterate all the dictionaries until the leaf key-value pair is
    # reached. The double for-loop flattens the returned array,
    # e.g. [[a], [b], [c]] => [a, b, c]
    return [leaf for child in column.values() for leaf in leafListToDict(child)]

def getOCRFunction(api):
    # Strip any character that is not alphanumeric or a colon
    pattern = "[^a-zA-Z0-9:]"
    def ocrFunction(cell):
        # Add a white border around the cell to improve OCR results
        cell = cv2.copyMakeBorder(cell, 5, 5, 5, 5, cv2.BORDER_CONSTANT, value=(255, 255, 255))
        # Pass the cell image to tesserocr
        # More info: https://github.com/sirfz/tesserocr/issues/198#issuecomment-652572304
        height, width = cell.shape
        api.SetImageBytes(
            imagedata=cell.tobytes(),
            width=width,
            height=height,
            bytes_per_pixel=1,
            bytes_per_line=width
        )
        text = api.GetUTF8Text().strip()
        text = re.sub(pattern, "", text)
        return text
    return ocrFunction
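
# Self-checks (a sketch; run from the repository root with
# "python TableExtractor/modules/utils.py")
if __name__ == "__main__":
    # processContour: an out-of-order quadrilateral comes back sorted clockwise
    # from the top left, together with its smallest width and height
    quad = np.array([[[90, 110]], [[10, 10]], [[100, 20]], [[0, 100]]])
    pts, width, height = processContour(quad)
    print(pts.tolist())   # [[10.0, 10.0], [100.0, 20.0], [90.0, 110.0], [0.0, 100.0]]
    print(width, height)  # 90 90

    # leafListToDict: empty leaf lists are swapped for dictionaries in place
    table = {"A": [], "B": []}
    print(list(leafListToDict(table)))  # [{}, {}]
    print(table)                        # {'A': {}, 'B': {}}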
--------------------------------------------------------------------------------
/app/__init__.py:
--------------------------------------------------------------------------------
from flask import Flask, request, jsonify, render_template
from werkzeug.utils import secure_filename
from time import time
import TableExtractor
from os.path import exists
from os import remove, makedirs

#
# OCR setup
#
from tesserocr import PyTessBaseAPI, PSM, OEM

api = PyTessBaseAPI(lang="eng", psm=PSM.SINGLE_BLOCK, oem=OEM.LSTM_ONLY)
ocrFunction = TableExtractor.getOCRFunction(api)

#
# FLASK API
#
app = Flask(__name__)

# Restrict file upload to these extensions
ALLOWED_EXTENSIONS = {"jpg", "jpeg", "png"}
# Limit upload size to 16MB
app.config["MAX_CONTENT_LENGTH"] = 16 * 1024 * 1024
# "jsonify" sorts JSON keys alphabetically by default
# This turns off that behaviour to preserve the table columns' order
app.config['JSON_SORT_KEYS'] = False

# Function to check that uploaded files are of image type
def allowed_file(filename):
    return "." in filename and \
        filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS


@app.route("/", methods=["GET", "POST"])
def index():
    """
    This POST endpoint expects an image to be sent within form data. The upload field can have any name.

    E.g.
    <form action="/" method="POST" enctype="multipart/form-data">
        <input type="file" name="image" />
    </form>

    Note: the "enctype" is important if using an HTML form!
    """
    if(request.method == "GET"):
        return render_template("index.html")

    if(len(request.files) == 0):
        return jsonify(error="You must submit an image as form data.")

    # Get the first file from the form data
    file = request.files[next(iter(request.files))]
    filename = secure_filename(file.filename)

    # Check that a file was actually sent. Empty forms send an empty file.
    if(not file or filename == ""):
        return jsonify(error="You must submit an image as form data.")

    # Check file is an image
    if(not allowed_file(filename)):
        return jsonify(error="The image must be any of the following formats: " + ", ".join(ALLOWED_EXTENSIONS))

    tmp_path = None

    try:
        if(not exists("./tmp")):
            makedirs("tmp")

        # Save file in ./tmp directory
        tmp_filename = str(time()) + filename
        tmp_path = "./tmp/" + tmp_filename
        file.save(tmp_path)

        # Perform table extraction
        tableData = TableExtractor.extractTable(tmp_path, ocrFunction)
    except Exception:
        tableData = jsonify(error="An error occurred while parsing the image. Try again with a clearer picture.")
    finally:
        # Delete temporary file
        if(tmp_path is not None and exists(tmp_path)):
            remove(tmp_path)

    return tableData
--------------------------------------------------------------------------------
/app/static/poster.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/abdullahibneat/TableExtraction/c952eaa5e33178b38f7f6a6c33c5571569aa1bc7/app/static/poster.jpg
--------------------------------------------------------------------------------
/app/static/styles.css:
--------------------------------------------------------------------------------
@import url("https://fonts.googleapis.com/css2?family=Raleway:wght@400;700;900&display=swap");

* {
    margin: 0;
    padding: 0;
}

body {
    font-family: "Raleway", sans-serif;
}

.wrapper {
    padding: 3rem;
    max-width: 1200px;
    margin: 0 auto;
}

.table-icon {
    display: inline-block;
    width: 1em;
    height: 1em;
    background-image: url(/static/table-icon.svg);
    background-size: auto 100%;
}

h1 {
    display: flex;
    align-items: center;
    gap: 0.25em;
    font-weight: 900;
}

h1, h2 {
    margin-bottom: 1rem;
}

header p {
    font-size: 1.5em;
    max-width: 600px;
}

form {
    padding: 2rem;
    margin: 5rem auto;
    border: solid 10px lightgray;
    max-width: 500px;
}

form.disabled {
    background: lightgray;
    opacity: 0.5;
}

form div {
    display: flex;
    flex-direction: column;
    align-items: center;
    text-align: center;
}

form input {
    flex: 1;
}

form button {
    border: none;
    background: lightgray;
    padding: 0.5rem;
    margin-top: 1rem;
}

.usage {
    display: flex;
    flex-direction: column;
    gap: 3rem;
    width: fit-content;
    margin: 5rem auto;
}

.usage > div {
    flex: 1;
}

.code {
    padding: 1rem;
    background-color: lightgray;
    width: fit-content;
    margin: 1rem auto;
    font-family: monospace;
}

.code p:before {
    content: "> ";
}

.code > * + * {
    margin-top: 1em;
}

img {
    max-width: 100%;
}

footer {
    text-align: center;
    padding: 3rem;
    background-color: black;
    font-size: 2rem;
}

footer a {
    text-decoration: none;
    color: white;
}

footer a:hover {
    text-decoration: underline;
}

@media (min-width: 800px) {
    .wrapper {
        padding: 5rem;
    }

    form div {
        flex-direction: row;
    }

    form button {
        margin-top: 0;
    }

    .usage {
        flex-direction: row;
    }
}
--------------------------------------------------------------------------------
/app/static/table-icon.svg:
--------------------------------------------------------------------------------
[SVG markup lost in extraction]
--------------------------------------------------------------------------------
/app/templates/index.html:
--------------------------------------------------------------------------------
[HTML markup lost in extraction; the recoverable page text follows]

A line-based framework to detect and extract tabular data in JSON format from raster images using computer vision and Tesseract OCR.

Send an image as form data to the following POST endpoint:
For example:
curl -F image=@myImage.jpg

docker pull abdullahibneat/table-extraction
docker run -d -p 5000:5000 abdullahibneat/table-extraction
# Visit http://localhost:5000
# or send post requests there