├── .gitignore
├── Dockerfile
├── README.md
├── TableExtractor
│   ├── __init__.py
│   └── modules
│       ├── LinesDetector.py
│       ├── PreProcessing.py
│       ├── RowsDetector.py
│       ├── TableBuilder.py
│       └── utils.py
├── app
│   ├── __init__.py
│   ├── static
│   │   ├── poster.jpg
│   │   ├── styles.css
│   │   └── table-icon.svg
│   └── templates
│       └── index.html
├── data
│   ├── digital_table_1.png
│   ├── digital_table_2.png
│   ├── digital_table_3.png
│   ├── sample_table.jpg
│   ├── sample_table_2.jpg
│   └── sample_table_3.jpg
├── heroku.yml
├── main.py
├── requirements-docker.txt
└── requirements.txt

/.gitignore:
--------------------------------------------------------------------------------
.vscode
.idea
tmp

# Created by https://www.toptal.com/developers/gitignore/api/python,windows,venv
# Edit at https://www.toptal.com/developers/gitignore?templates=python,windows,venv

### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
pytestdebug.log

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/
doc/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
pythonenv*

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# profiling data
.prof

### venv ###
# Virtualenv
# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
[Bb]in
[Ii]nclude
[Ll]ib
[Ll]ib64
[Ll]ocal
[Ss]cripts
pyvenv.cfg
pip-selfcheck.json

### Windows ###
# Windows thumbnail cache files
Thumbs.db
Thumbs.db:encryptable
ehthumbs.db
ehthumbs_vista.db

# Dump file
*.stackdump

# Folder config file
[Dd]esktop.ini

# Recycle Bin used on file shares
$RECYCLE.BIN/

# Windows Installer files
*.cab
*.msi
*.msix
*.msm
*.msp

# Windows shortcuts
*.lnk

# End of https://www.toptal.com/developers/gitignore/api/python,windows,venv
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
FROM python:3.7-slim-buster

RUN apt-get -y update \
    && apt-get -y upgrade \
    # tesserocr requirements
    && apt-get -y install tesseract-ocr libtesseract-dev libleptonica-dev

# Required for tesserocr:
# https://github.com/sirfz/tesserocr/issues/165#issuecomment-445789709
ENV LC_ALL=C
# Use port 5000 by default; cloud providers (e.g. Heroku) may override it
ENV PORT=5000

EXPOSE ${PORT}

WORKDIR /app

COPY . .

# Build the tesserocr wheel and install dependencies
RUN apt-get -y install pkg-config build-essential \
    # Use piwheels for ARM builds
    && pip3 install -r requirements-docker.txt --extra-index-url https://www.piwheels.org/simple \
    # Remove build dependencies
    && apt-get -y purge --auto-remove pkg-config build-essential

# Run the Flask app
CMD gunicorn -b 0.0.0.0:$PORT -w 4 app:app --timeout 120
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Table Extraction





## Overview

This framework was developed as part of my undergraduate final year project at university and allows for the extraction of tabular data from raster images. It uses **line information** to locate cells, and an algorithm arranges the cells in memory to reconstruct the tabular structure. It then uses the Tesseract OCR engine to extract the text and returns the entire table as JSON data. It achieved 89% cell detection accuracy when extracting prayer times from timetables (see the `data` folder for some examples).

The main drawbacks are as follows:

- It relies heavily on ruling lines: the table must have all column and row separators, and blurry images can reduce line detection accuracy
- Table region detection is quite rudimentary: it looks for the largest quadrilateral in the image
- It can only detect one table
- Tesseract needs more fine-tuning for better OCR processing, as sometimes text is not recognized properly

Below is a summary of how the framework works. This structure is reflected in `TableExtractor/__init__.py`.


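
For example, the extractor can be called directly from Python. A minimal sketch (assuming it is run from the repository root, and skipping OCR so cells come back numbered):

```python
import TableExtractor

# Without an ocrFunction, each cell's content is its cell number instead of text
result = TableExtractor.extractTable("data/sample_table.jpg")

print(result["rows"])   # number of detected rows
print(result["cells"])  # number of detected cells
print(result["table"])  # nested dict mapping headings to sub-headings/values
```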
## Docker

This is the recommended way to run this project, as the environment is all set up and ready to use. For convenience, Docker images are automatically built and released on [Docker Hub](https://hub.docker.com/repository/docker/abdullahibneat/table-extraction).

To run the Docker container locally:

```
docker pull abdullahibneat/table-extraction
docker run -d -p 5000:5000 abdullahibneat/table-extraction
```

Then visit http://localhost:5000 and you're ready to go!

When using a cloud provider, you can change the port by setting the `PORT` environment variable. On Heroku, the port is set automatically, so this repository can simply be pushed to the Heroku remote.

## Manual setup

### OCR setup

An OCR engine is NOT required to run the project, though without one the returned table object will contain cell numbers instead of the cell contents.

If you wish to skip the OCR process, **remove the tesserocr requirement from `requirements.txt`** and continue reading the "Get started" section.

This project uses [tesserocr](https://github.com/sirfz/tesserocr) as the Tesseract wrapper out of the box. Follow the instructions [here](https://github.com/sirfz/tesserocr) to set up tesserocr on your system.

Alternatively, use your own OCR implementation by removing the tesserocr requirement from `requirements.txt` and updating the "Advanced usage" code in `main.py` with your own implementation, as sketched below.
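
An OCR hook is simply a callable that takes a single cell image (a grayscale NumPy array) and returns its text. As an illustration, a hypothetical implementation backed by `pytesseract` (not one of this project's dependencies) might look like:

```python
import TableExtractor
import pytesseract  # stand-in for any OCR library of your choice

def myOcrFunction(cell):
    # "cell" is a grayscale NumPy array cropped to a single table cell
    return pytesseract.image_to_string(cell).strip()

result = TableExtractor.extractTable("data/sample_table.jpg", myOcrFunction)
```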
### Get started

1. Make sure Python 3.7.x is installed. `❗❗❗THIS IS IMPORTANT❗❗❗`
2. `Recommended:` Set up a Python 3.7 [virtual environment](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/)
3. Install the requirements (make sure you have read the "OCR setup" section above before running this command): `pip install -r requirements.txt`
4. Run the `main.py` file: `python3 main.py`

### Flask API server

A simple Flask API was written to interact with the table extractor. Run the `app` module with Flask:

```
FLASK_APP=app flask run
```

and visit the address (default: `http://localhost:5000`). Alternatively, send the image as form data (the field can have any name) in a `POST` request to the root endpoint:

```
curl -F image=@myImage.jpg http://localhost:5000
```
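
The same request can be made from Python (a sketch using the `requests` package, which is not part of this project's requirements):

```python
import requests

# The form field name is arbitrary; the server reads the first uploaded file
with open("myImage.jpg", "rb") as f:
    response = requests.post("http://localhost:5000", files={"image": f})

print(response.json())  # {"rows": ..., "cells": ..., "table": {...}} or {"error": "..."}
```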
--------------------------------------------------------------------------------
/TableExtractor/__init__.py:
--------------------------------------------------------------------------------
from .modules import PreProcessing, utils, LinesDetector, RowsDetector, TableBuilder
import cv2
import numpy as np

getOCRFunction = utils.getOCRFunction

def extractTable(imgPath, ocrFunction = None):
    # Dictionary to store data to be returned
    ret = {}

    img = cv2.imread(imgPath, 0)

    # cv2.imread returns None when the file cannot be decoded as an image
    if img is None:
        raise ValueError("The file provided must be an image of type jpg, jpeg or png.")

    # PROCESS IMAGE
    laplacian = PreProcessing.process(img)

    # FIND CONTOURS
    contours, _ = cv2.findContours(laplacian, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)

    # FIND TABLE REGION
    # It is assumed the table takes up most of the image,
    # thus it can be identified by finding the largest contour with 4 sides
    table_contour, table_contour_approx = utils.findLargestQuadrilateralContour(contours)

    if table_contour[0] is None:
        raise ValueError("No table detected.")

    # Sort points in clockwise order, compute table width and height
    table_pts, table_width, table_height = utils.processContour(table_contour_approx[0])

    # EXTRACT TABLE REGION
    # Start with a full black image
    mask = np.zeros(img.shape).astype(img.dtype)
    # Create a mask for the table region
    cv2.fillPoly(mask, table_contour, (255, 255, 255))
    # Apply the mask to the grayscale image, blacking out the region
    # outside of the table
    table_img = cv2.bitwise_and(img, mask)

    # WARP TABLE
    # Use warp to extract the table region from the processed image
    # by mapping table points to a new image of size table_width x table_height
    target_points = np.float32([[0, 0], [table_width, 0], [table_width, table_height], [0, table_height]])
    matrix = cv2.getPerspectiveTransform(table_pts, target_points)
    # Apply warp to the image to extract the table region
    warped = cv2.warpPerspective(table_img, matrix, (table_width, table_height))
    # Apply warp to mask
    warped_mask = cv2.warpPerspective(mask, matrix, (table_width, table_height))
    # Resize warped image and mask to have width 1500px
    scale_factor = 1500 / table_width
    warped = cv2.resize(warped, (0, 0), fx=scale_factor, fy=scale_factor)
    warped_mask = cv2.resize(warped_mask, (0, 0), fx=scale_factor, fy=scale_factor)
    warped = cv2.GaussianBlur(warped, (5, 5), 2)
    # Apply threshold
    warped = cv2.adaptiveThreshold(warped, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 9, 2)

    # FIND HORIZONTAL & VERTICAL LINES
    lines = LinesDetector.findLines(warped)
    # Since the function above might get rid of the black area outside the table
    # region, apply the mask again
    lines = cv2.bitwise_and(lines, warped_mask)

    # EXTRACT CELLS
    # Get each cell's contour
    cell_contours, _ = cv2.findContours(lines, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Sometimes the contour of the table is detected again, so filter large contours out
    warpedArea = warped.shape[0] * warped.shape[1] * 0.9

    def validContour(cnt):
        _, _, w, h = cv2.boundingRect(cnt)
        return w * h < warpedArea

    cell_contours = [cnt for cnt in cell_contours if validContour(cnt)]

    # Group cells by row
    # findRows returns a Python dictionary with
    #   key = y value of the row
    #   value = array of cell contours in the row
    rows = RowsDetector.findRows(cell_contours)

    # Compute number of rows and cells
    ret["rows"] = len(rows.values())
    ret["cells"] = sum([len(c) for c in rows.values()])

    # CREATE TABLE IMAGE WITHOUT LINES
    # This will help the OCR engine perform better.
    # Start with a full white image, add the cells as black rectangles and use the OR operation
    # to remove all the lines.
    text_mask = np.full(warped.shape, 255).astype(warped.dtype)
    # Merge (sum()) all the cell contours from rows (key-value dictionary)
    text_mask = cv2.drawContours(text_mask, sum(rows.values(), []), -1, (0, 0, 0), -1)
    # Use close operation to dilate and erode image, reducing overall noise
    text_only = cv2.morphologyEx(warped, cv2.MORPH_CLOSE, np.ones((3,3)))
    text_only = cv2.bitwise_or(text_only, text_mask)

    # RECONSTRUCT TABLE STRUCTURE
    try:
        ret["table"] = TableBuilder.reconstructTable(rows, text_only, ocrFunction)
    except Exception:
        raise ValueError("Error while parsing table, try again with a clearer picture.")

    return ret
--------------------------------------------------------------------------------
/TableExtractor/modules/LinesDetector.py:
--------------------------------------------------------------------------------
import cv2
import numpy as np

def findLines(img):
    # Adapted from https://docs.opencv.org/4.4.0/dd/dd7/tutorial_morph_lines_detection.html

    # Get image height and width to dynamically change
    # horizontal and vertical kernel sizes
    height, width = img.shape

    # Erode image to thicken lines
    eroded = cv2.erode(img, np.ones((3, 3)))

    # Kernel length as a fraction (3%) of the image size
    kernel_length = 3 / 100

    # To find horizontal lines, run a horizontal kernel (e.g. [1 1 1 1])
    # Dilation finds lines but shrinks their lengths, so it is
    # followed by erosion to restore the lines' original size
    horizontal_kernel = np.ones((1, int(width * kernel_length)))
    horizontal = cv2.morphologyEx(eroded, cv2.MORPH_CLOSE, horizontal_kernel)

    # To find vertical lines, run a vertical kernel
    vertical_kernel = np.ones((int(height * kernel_length), 1))
    vertical = cv2.morphologyEx(eroded, cv2.MORPH_CLOSE, vertical_kernel)

    lines = cv2.bitwise_and(vertical, horizontal)
    lines = cv2.erode(lines, np.ones((3, 3)), iterations=3)

    return lines
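
# Quick way to eyeball the detected line mask when tuning kernel_length
# (a debugging sketch, not used by the pipeline; run from the repository root
# with "python TableExtractor/modules/LinesDetector.py path/to/image.png")
if __name__ == "__main__":
    import sys
    img = cv2.imread(sys.argv[1], 0)
    # findLines expects a binarized image, as produced in TableExtractor/__init__.py
    img = cv2.adaptiveThreshold(img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 9, 2)
    cv2.imwrite("lines.png", findLines(img))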
--------------------------------------------------------------------------------
/TableExtractor/modules/PreProcessing.py:
--------------------------------------------------------------------------------
import cv2
import numpy as np

def process(img):
    # Blur image to remove noise
    # Determine kernel size from the image height and width
    height, width = img.shape
    kernel_size = max(int(height * 0.005), int(width * 0.005))
    # Kernel must have odd values because of GaussianBlur
    if kernel_size % 2 == 0:
        kernel_size += 1
    kernel = (kernel_size, kernel_size)
    blur = cv2.GaussianBlur(img, kernel, 1)

    # Use adaptive thresholding to have only black and white pixels
    # Without adaptive thresholding, shadows might black out regions in the image
    # Gaussian produces less noise compared to ADAPTIVE_THRESH_MEAN_C
    # Block size must be odd: kernel_size is odd, so kernel_size * 2 - 1 is odd too
    block_size = kernel_size * 2 - 1
    threshold = cv2.adaptiveThreshold(blur, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, block_size, 2)

    # Use the Laplacian to detect gradients in the image (i.e. lines)
    # This helps to improve table region detection in later stages
    laplacian = cv2.Laplacian(threshold, cv2.CV_64F)
    # Convert data type from 64F to unsigned 8-bit integer
    laplacian = np.uint8(np.absolute(laplacian))

    return laplacian
--------------------------------------------------------------------------------
/TableExtractor/modules/RowsDetector.py:
--------------------------------------------------------------------------------
import cv2
import numpy as np

def findRows(cell_contours):
    rows = {}

    for cnt in cell_contours:
        # Approximate contour to a rectangle, get x, y, width and height
        x, y, width, height = cv2.boundingRect(cnt)

        # Ignore cell contours with width or height < 15px
        if width < 15 or height < 15:
            continue

        # The contour could have a strange shape, so replace the original contour
        # with the approximated bounding rectangle
        # (int32 so OpenCV functions can consume these points on any platform)
        cnt = np.array([
            (x, y),                  # Top left
            (x + width, y),          # Top right
            (x + width, y + height), # Bottom right
            (x, y + height)          # Bottom left
        ], dtype=np.int32).reshape((4, 2))

        # Keep track of whether the contour has been assigned to a row
        added = False

        # Iterate over existing rows where:
        #   row = y-coordinate of the row
        for row in rows.keys():
            # Add this cell to the row that is on the same line (i.e. y-axis ± 50px)
            # as this cell's contour
            if (row - 50) <= y <= (row + 50):
                rows[row].append(cnt)
                added = True
                break

        # If the cell wasn't added to any row, create a new row keyed
        # by this cell's y-coordinate
        if not added:
            rows[y] = [cnt]

    # Sort rows top to bottom
    rows = dict(sorted(rows.items()))

    # Sort cells left to right
    for key, value in rows.items():
        rows[key] = sorted(value, key=lambda cnt: cv2.boundingRect(cnt)[0])

    return rows
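
# Self-check (a sketch): two cells within 50px on the y-axis are grouped into
# one row, while a third cell further down starts a new row
# (run with "python TableExtractor/modules/RowsDetector.py")
if __name__ == "__main__":
    def box(x, y):
        return np.array([
            (x, y), (x + 100, y), (x + 100, y + 40), (x, y + 40)
        ], dtype=np.int32).reshape((4, 1, 2))

    rows = findRows([box(0, 0), box(110, 8), box(0, 100)])
    print({y: len(cells) for y, cells in rows.items()})  # {0: 2, 100: 1}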
--------------------------------------------------------------------------------
/TableExtractor/modules/TableBuilder.py:
--------------------------------------------------------------------------------
from .utils import leafListToDict

def reconstructTable(rows, warped = None, ocrFunction = None):
    # Reconstruct the table following a top-to-bottom approach. Iterate over each row
    # and check the number of cells. If there are more cells than in the previous row,
    # this indicates columns have been split, and the new row is treated as a new
    # set of headings in the table.
    # Otherwise, if the same number of cells appears, the row contains new values
    # for the existing columns, so add each cell's content to its column.

    # EXAMPLE TABLE:
    # +-------+-------+
    # |   A   |   B   |
    # +---+---+---+---+
    # | C | D | E | F |
    # +---+---+---+---+
    # | 1 | 2 | 3 | 4 |
    # +---+---+---+---+

    # Store the table as a dictionary, where:
    #   key = column name
    #   value = list of cell values
    # In the example above, the table will look like the following:
    #
    # table = {
    #     A: {
    #         C: [1],
    #         D: [2],
    #     },
    #     B: {
    #         E: [3],
    #         F: [4]
    #     }
    # }
    table = {}

    # columns holds references to the value lists of the current headings.
    # For example, in the table above:
    #   after the first iteration:  columns = [[], []]
    #   after the second iteration: columns = [[], [], [], []]
    #   after the third iteration:  columns = [[1], [2], [3], [4]]
    columns = None
    columns_sizes = [] # Keep track of column sizes

    # Keep track of the cell number for use in case OCR fails
    cell_number = 0

    for cells in rows.values():
        cell_sizes = [] # Keep track of cell sizes

        # Extract each cell's content (OCR when available, cell number otherwise)
        cell_contents = []
        for cnt in cells:
            # Extract cell region from image
            x1, y1 = cnt[0]
            x2, y2 = cnt[2]
            cell_sizes.append(x2 - x1) # Add cell width to the list of cell sizes

            # Perform OCR if an image and ocrFunction are passed in
            if warped is not None and callable(ocrFunction):
                cell = warped[y1:y2, x1:x2]
                text = ocrFunction(cell)
            # Otherwise use the cell number
            else:
                text = str(cell_number)

            if text == "":
                text = "(failed) cell #" + str(cell_number)
            cell_contents.append(text)
            cell_number += 1

        if columns is None:
            # FIRST ITERATION
            # Add first row to the table
            for cell in cell_contents:
                table[cell] = []
            columns = list(table.values())
            columns_sizes = cell_sizes

        elif len(cell_contents) > len(columns):
            # MORE CELLS THAN THE PREVIOUS ROW
            # Columns have been split, add this row as new headings

            # Replace the previous columns with new dictionaries.
            # At this line, columns contains the lists of the last headings. Because new
            # headings have been found, the last headings are converted from lists to
            # dictionaries.
            columns = leafListToDict(table)

            # Keep track of the previous headings
            previous_headings = list(columns)

            # Create new columns for each of the new headings
            columns = [[] for _ in cell_contents]

            # Split the new headings into lists of equal size.
            # For instance, in the example table above they are split in groups of 2:
            #   - [C, D] are children of A
            #   - [E, F] are children of B
            # This is done by comparing the current column size (+1%) to the cell sizes.
            # When the column size is exceeded, a new column has started.
            current_column_index = 0 # Keep track of current column
            current_width = 0 # Aggregate all cell sizes to check against column size

            for i, heading in enumerate(cell_contents):
                current_width += cell_sizes[i] # Add current cell width to current column

                # If the cell doesn't fit in the current column, move to the next one
                if current_width > columns_sizes[current_column_index] * 1.01:
                    current_column_index += 1
                    current_width = cell_sizes[i]

                # Add this new heading as a child of the previous heading
                previous_headings[current_column_index][heading] = columns[i]

            # Reset columns sizes
            columns_sizes = cell_sizes

        elif len(cell_contents) == len(columns):
            # SAME NUMBER OF CELLS
            # Add all cells one by one to each column
            for i in range(len(cells)):
                columns[i].append(cell_contents[i])

        elif len(cell_contents) < len(columns):
            # FEWER CELLS THAN THE PREVIOUS ROW
            # This might happen if a table has the following structure:
            # +-------+-------+
            # |   A   |   B   |
            # +---+---+---+---+
            # | C | D | E | F |
            # +---+---+---+---+
            # | G             |
            # +---+---+---+---+
            # Here the second row has 4 cells and is followed by a row with a single
            # cell. Unfortunately I couldn't find a nice way to store such rows
            # in a JSON file, so the table extraction is stopped at this stage.
            break
    return table
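
# Worked example (a sketch) rebuilding the table from the comments above without
# an image or OCR, so cell contents fall back to the cell numbers 0-9
# (run from the repository root with "python -m TableExtractor.modules.TableBuilder")
if __name__ == "__main__":
    import numpy as np

    def box(x1, y, x2):
        # Rectangle contour in the same format RowsDetector produces
        return np.array([
            (x1, y), (x2, y), (x2, y + 40), (x1, y + 40)
        ], dtype=np.int32).reshape((4, 2))

    rows = {
        0: [box(0, 0, 200), box(200, 0, 400)],                                          # A, B
        50: [box(0, 50, 100), box(100, 50, 200), box(200, 50, 300), box(300, 50, 400)], # C, D, E, F
        100: [box(0, 100, 100), box(100, 100, 200), box(200, 100, 300), box(300, 100, 400)],
    }

    # Expected output: {'0': {'2': ['6'], '3': ['7']}, '1': {'4': ['8'], '5': ['9']}}
    print(reconstructTable(rows))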
--------------------------------------------------------------------------------
/TableExtractor/modules/utils.py:
--------------------------------------------------------------------------------
import cv2
import numpy as np
import re

# Function to find the largest 4-sided contour in an array of contours
def findLargestQuadrilateralContour(contours):
    # Sort contours from biggest area to smallest
    sorted_contours = sorted(contours, key=cv2.contourArea, reverse=True)

    biggest_contour = None
    biggest_contour_approx = None

    for cnt in sorted_contours:
        # Get the length of the perimeter
        perimeter = cv2.arcLength(cnt, True)

        # Approximate a shape that resembles the contour
        # This is needed because the image might be warped, thus
        # edges are curved and not perfectly straight
        approx = cv2.approxPolyDP(cnt, 0.01 * perimeter, True)

        # Check if the approximation contains only 4 sides
        # (i.e. quadrilateral)
        if len(approx) == 4:
            biggest_contour = cnt
            biggest_contour_approx = approx
            break

    return [biggest_contour], [biggest_contour_approx]


# Function to sort the points of a contour in clockwise order, starting from the top left
def processContour(approx):
    # Reshape from (4, 1, 2), i.e. array([[x, y]], ...), to (4, 2), i.e. array([x, y], ...)
    approx = approx.reshape((4, 2))

    # Sort points in clockwise order, starting from top left
    pts = np.zeros((4, 2), dtype=np.float32)

    # Add up the coordinates of each point
    # Smallest sum = top left point
    # Largest sum = bottom right point
    s = approx.sum(axis=1)
    pts[0] = approx[np.argmin(s)]
    pts[2] = approx[np.argmax(s)]

    # For the other 2 points, compute the difference (y - x) of each point
    # Smallest difference = top right point
    # Largest difference = bottom left point
    diff = np.diff(approx, axis=1)
    pts[1] = approx[np.argmin(diff)]
    pts[3] = approx[np.argmax(diff)]

    # Calculate smallest width and height
    width = int(min(pts[1][0] - pts[0][0], pts[2][0] - pts[3][0]))
    height = int(min(pts[3][1] - pts[0][1], pts[2][1] - pts[1][1]))

    return pts, width, height


# This function is used to recursively find the leaf entries in a dictionary, and replace
# the last list values with dictionaries. This is done because a new table heading should
# be represented by a dictionary type, whereas column values are stored in a list.
def leafListToDict(column):
    # If the column is a list...
    # This could happen if a heading has multiple values followed by a column split
    # E.g.
    # +-----------------+------------------+-------------------+
    # |        A        |        B         |         C         |
    # +-----------------+------------------+-------------------+
    # | value1          | value2           | value3            |
    # +-----------------+------------------+-------------------+
    # | value4          | value5           | value6            |
    # +--------+--------+--------+---------+---------+---------+
    # |   D    |   E    |   F    |    G    |    H    |    I    |
    # +--------+--------+--------+---------+---------+---------+
    # | value7 | value8 | value9 | value10 | value11 | value12 |
    # +--------+--------+--------+---------+---------+---------+
    # Column A has 2 values (value1, value4) followed by a column split (D, E)
    if type(column) is list:
        # If the last item in the list is a dictionary, iterate over that dictionary to
        # find the leaf list
        if type(column[-1]) is dict:
            return leafListToDict(column[-1])

        # Otherwise create a new dictionary and return it
        new_value = {}
        column.append(new_value)
        return [new_value]

    # If the values are all empty lists...
    # any(values) returns True if the iterable contains non-empty lists
    # E.g. any([[1], [2], [3]]) = True, any([[], [], []]) = False
    if not any(column.values()):
        # ...replace them with dictionaries
        for key in column:
            column[key] = {}
        return column.values()
    # Otherwise recursively iterate all the dictionaries until the leaf key-value pair is
    # reached. The double for-loop flattens the returned array,
    # e.g. [[a], [b], [c]] => [a, b, c]
    return [leaf for child in column.values() for leaf in leafListToDict(child)]

def getOCRFunction(api):
    # Strip any character that is not alphanumeric or a colon
    pattern = "[^a-zA-Z0-9:]"
    def ocrFunction(cell):
        # Add a white border around the cell to improve OCR results
        cell = cv2.copyMakeBorder(cell, 5, 5, 5, 5, cv2.BORDER_CONSTANT, value=(255, 255, 255))
        # Pass the cell image to tesserocr
        # More info: https://github.com/sirfz/tesserocr/issues/198#issuecomment-652572304
        height, width = cell.shape
        api.SetImageBytes(
            imagedata=cell.tobytes(),
            width=width,
            height=height,
            bytes_per_pixel=1,
            bytes_per_line=width
        )
        text = api.GetUTF8Text().strip()
        text = re.sub(pattern, "", text)
        return text
    return ocrFunction
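
# Self-checks (a sketch; run from the repository root with
# "python TableExtractor/modules/utils.py")
if __name__ == "__main__":
    # processContour: an out-of-order quadrilateral comes back sorted clockwise
    # from the top left, together with its smallest width and height
    quad = np.array([[[90, 110]], [[10, 10]], [[100, 20]], [[0, 100]]])
    pts, width, height = processContour(quad)
    print(pts.tolist())   # [[10.0, 10.0], [100.0, 20.0], [90.0, 110.0], [0.0, 100.0]]
    print(width, height)  # 90 90

    # leafListToDict: empty leaf lists are swapped for dictionaries in place
    table = {"A": [], "B": []}
    print(list(leafListToDict(table)))  # [{}, {}]
    print(table)                        # {'A': {}, 'B': {}}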
--------------------------------------------------------------------------------
/app/__init__.py:
--------------------------------------------------------------------------------
from flask import Flask, request, jsonify, render_template
from werkzeug.utils import secure_filename
from time import time
import TableExtractor
from os.path import exists
from os import remove, makedirs

#
# OCR setup
#
from tesserocr import PyTessBaseAPI, PSM, OEM

api = PyTessBaseAPI(lang="eng", psm=PSM.SINGLE_BLOCK, oem=OEM.LSTM_ONLY)
ocrFunction = TableExtractor.getOCRFunction(api)

#
# FLASK API
#
app = Flask(__name__)

# Restrict file upload to these extensions
ALLOWED_EXTENSIONS = {"jpg", "jpeg", "png"}
# Limit upload size to 16MB
app.config["MAX_CONTENT_LENGTH"] = 16 * 1024 * 1024
# "jsonify" sorts JSON keys alphabetically by default
# This turns off that behaviour to preserve the table columns' order
app.config['JSON_SORT_KEYS'] = False

# Function to check that uploaded files are of image type
def allowed_file(filename):
    return "." in filename and \
        filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS


@app.route("/", methods=["GET", "POST"])
def index():
    """
    This POST endpoint expects an image to be sent within form data. The upload field can have any name.

    E.g.
    <form action="/" method="POST" enctype="multipart/form-data">
        <input type="file" name="image" />
    </form>

    Note: the "enctype" is important if using an HTML form!
    """
    if(request.method == "GET"):
        return render_template("index.html")

    if(len(request.files) == 0):
        return jsonify(error="You must submit an image as form data.")

    # Get the first file from the form data
    file = request.files[next(iter(request.files))]
    filename = secure_filename(file.filename)

    # Check that a file was actually sent. Empty forms send an empty file.
    if(not file or filename == ""):
        return jsonify(error="You must submit an image as form data.")

    # Check file is an image
    if(not allowed_file(filename)):
        return jsonify(error="The image must be any of the following formats: " + ", ".join(ALLOWED_EXTENSIONS))

    tmp_path = None

    try:
        if(not exists("./tmp")):
            makedirs("tmp")

        # Save file in ./tmp directory
        tmp_filename = str(time()) + filename
        tmp_path = "./tmp/" + tmp_filename
        file.save(tmp_path)

        # Perform table extraction
        tableData = TableExtractor.extractTable(tmp_path, ocrFunction)
    except Exception:
        tableData = jsonify(error="An error occurred while parsing the image. Try again with a clearer picture.")
    finally:
        # Delete temporary file
        if(tmp_path is not None and exists(tmp_path)):
            remove(tmp_path)

    return tableData
--------------------------------------------------------------------------------
/app/static/poster.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/abdullahibneat/TableExtraction/c952eaa5e33178b38f7f6a6c33c5571569aa1bc7/app/static/poster.jpg
--------------------------------------------------------------------------------
/app/static/styles.css:
--------------------------------------------------------------------------------
@import url("https://fonts.googleapis.com/css2?family=Raleway:wght@400;700;900&display=swap");

* {
    margin: 0;
    padding: 0;
}

body {
    font-family: "Raleway", sans-serif;
}

.wrapper {
    padding: 3rem;
    max-width: 1200px;
    margin: 0 auto;
}

.table-icon {
    display: inline-block;
    width: 1em;
    height: 1em;
    background-image: url(/static/table-icon.svg);
    background-size: auto 100%;
}

h1 {
    display: flex;
    align-items: center;
    gap: 0.25em;
    font-weight: 900;
}

h1, h2 {
    margin-bottom: 1rem;
}

header p {
    font-size: 1.5em;
    max-width: 600px;
}

form {
    padding: 2rem;
    margin: 5rem auto;
    border: solid 10px lightgray;
    max-width: 500px;
}

form.disabled {
    background: lightgray;
    opacity: 0.5;
}

form div {
    display: flex;
    flex-direction: column;
    align-items: center;
    text-align: center;
}

form input {
    flex: 1;
}

form button {
    border: none;
    background: lightgray;
    padding: 0.5rem;
    margin-top: 1rem;
}

.usage {
    display: flex;
    flex-direction: column;
    gap: 3rem;
    width: fit-content;
    margin: 5rem auto;
}

.usage > div {
    flex: 1;
}

.code {
    padding: 1rem;
    background-color: lightgray;
    width: fit-content;
    margin: 1rem auto;
    font-family: monospace;
}

.code p:before {
    content: "> ";
}

.code > * + * {
    margin-top: 1em;
}

img {
    max-width: 100%;
}

footer {
    text-align: center;
    padding: 3rem;
    background-color: black;
    font-size: 2rem;
}

footer a {
    text-decoration: none;
    color: white;
}

footer a:hover {
    text-decoration: underline;
}

@media (min-width: 800px) {
    .wrapper {
        padding: 5rem;
    }

    form div {
        flex-direction: row;
    }

    form button {
        margin-top: 0;
    }

    .usage {
        flex-direction: row;
    }
}
--------------------------------------------------------------------------------
/app/static/table-icon.svg:
--------------------------------------------------------------------------------
[SVG markup lost in extraction]
--------------------------------------------------------------------------------
/app/templates/index.html:
--------------------------------------------------------------------------------
[HTML markup lost in extraction; the recoverable page text follows]

A line-based framework to detect and extract tabular data in JSON format from raster images using computer vision and Tesseract OCR.

Send an image as form data to the following POST endpoint:
For example:
curl -F image=@myImage.jpg

docker pull abdullahibneat/table-extraction
docker run -d -p 5000:5000 abdullahibneat/table-extraction
# Visit http://localhost:5000
# or send post requests there