├── .gitignore
├── LICENSE
├── Makefile
├── README.md
├── data
    └── example.jpg
├── main.py
├── requirements.txt
├── table.py
├── textcleaner
└── utils.py


/.gitignore:
--------------------------------------------------------------------------------
  1 | bin/
  2 | excel/
  3 | 
  4 | # Byte-compiled / optimized / DLL files
  5 | __pycache__/
  6 | *.py[cod]
  7 | *$py.class
  8 | 
  9 | # C extensions
 10 | *.so
 11 | 
 12 | # Distribution / packaging
 13 | .Python
 14 | env/
 15 | build/
 16 | develop-eggs/
 17 | dist/
 18 | downloads/
 19 | eggs/
 20 | .eggs/
 21 | lib/
 22 | lib64/
 23 | parts/
 24 | sdist/
 25 | var/
 26 | wheels/
 27 | *.egg-info/
 28 | .installed.cfg
 29 | *.egg
 30 | 
 31 | # PyInstaller
 32 | #  Usually these files are written by a python script from a template
 33 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 34 | *.manifest
 35 | *.spec
 36 | 
 37 | # Installer logs
 38 | pip-log.txt
 39 | pip-delete-this-directory.txt
 40 | 
 41 | # Unit test / coverage reports
 42 | htmlcov/
 43 | .tox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | .hypothesis/
 51 | 
 52 | # Translations
 53 | *.mo
 54 | *.pot
 55 | 
 56 | # Django stuff:
 57 | *.log
 58 | local_settings.py
 59 | 
 60 | # Flask stuff:
 61 | instance/
 62 | .webassets-cache
 63 | 
 64 | # Scrapy stuff:
 65 | .scrapy
 66 | 
 67 | # Sphinx documentation
 68 | docs/_build/
 69 | 
 70 | # PyBuilder
 71 | target/
 72 | 
 73 | # Jupyter Notebook
 74 | .ipynb_checkpoints
 75 | 
 76 | # pyenv
 77 | .python-version
 78 | 
 79 | # celery beat schedule file
 80 | celerybeat-schedule
 81 | 
 82 | # SageMath parsed files
 83 | *.sage.py
 84 | 
 85 | # dotenv
 86 | .env
 87 | 
 88 | # virtualenv
 89 | .venv
 90 | venv/
 91 | ENV/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 Brian Yang
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | run:
2 | 	python main.py $(target)
3 | clean:
4 | 	rm -rf excel bin __pycache__
5 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # spreadsheet-images
 2 | Take lab reports or any papers with tables in them, and instantly extract those tables and convert them to Excel spreadsheets.
 3 | 
 4 | ## Disclaimer
 5 | The textcleaner script was made by Fred Weinhaus. To use the textcleaner script in this repository for commercial use, redistribute it on the Internet, integrate it into free applications on the Internet, etc. you must contact Fred at fmw@alink.net for permission, or else you cannot use the textcleaner script. See the textcleaner file for more details. Users who use or fork this project have my permission to use and modify the rest of the code in this project, just not the textcleaner script. 
 6 | 
 7 | ## Installation
 8 | 1. Install Tesseract OCR (used to recognize the text in the tables).
 9 |     - `sudo apt-get install tesseract-ocr libtesseract-dev libleptonica-dev`
10 | 2. Install python libraries:
11 |     - `pip install -r requirements.txt`
12 | ## Run
13 | 1. Run `make target=<filepath>` (or if `make` is not installed, then run `python main.py <filepath>`) on the command line where filepath is the path to the target image or PDF.
14 | 
The resulting Excel spreadsheet is written to the `excel/` folder as `tables.xlsx`. Each table gets its own sheet in that workbook.
16 | 


--------------------------------------------------------------------------------
/data/example.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brian-yang/table-parser-opencv/3ba5df73c9615c59925462f541d51a6409a18bc0/data/example.jpg


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import cv2 as cv
  3 | import utils
  4 | from table import Table
  5 | from PIL import Image
  6 | import xlsxwriter
  7 | import sys
  8 | from pdf2image import convert_from_path
  9 | 
 10 | # =====================================================
 11 | # IMAGE LOADING
 12 | # =====================================================
 13 | if len(sys.argv) < 2:
 14 |     print("Usage: python main.py <img_path>")
 15 |     sys.exit(1)
 16 | 
 17 | path = sys.argv[1]
 18 | if not path.endswith(".pdf") and not path.endswith(".jpg"):
 19 |     print("Must use a pdf or a jpg image to run the program.")
 20 |     sys.exit(1)
 21 | 
 22 | if path.endswith(".pdf"):
 23 |     ext_img = convert_from_path(path)[0]
 24 | else:
 25 |     ext_img = Image.open(path)
 26 | 
 27 | ext_img.save("data/target.png", "PNG")
 28 | image = cv.imread("data/target.png")
 29 | 
 30 | # Convert resized RGB image to grayscale
 31 | NUM_CHANNELS = 3
 32 | if len(image.shape) == NUM_CHANNELS:
 33 |     grayscale = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
 34 | 
 35 | # =====================================================
 36 | # IMAGE FILTERING (using adaptive thresholding)
 37 | # =====================================================
 38 | """
 39 | ADAPTIVE THRESHOLDING
 40 | Thresholding changes pixels' color values to a specified pixel value if the current pixel value
 41 | is less than a threshold value, which could be:
 42 | 
 43 | 1. a specified global threshold value provided as an argument to the threshold function (simple thresholding),
 44 | 2. the mean value of the pixels in the neighboring area (adaptive thresholding - mean method),
 45 | 3. the weighted sum of neigborhood values where the weights are Gaussian windows (adaptive thresholding - Gaussian method).
 46 | 
 47 | The last two parameters to the adaptiveThreshold function are the size of the neighboring area and
 48 | the constant C which is subtracted from the mean or weighted mean calculated.
 49 | """
 50 | MAX_THRESHOLD_VALUE = 255
 51 | BLOCK_SIZE = 15
 52 | THRESHOLD_CONSTANT = 0
 53 | 
 54 | # Filter image
 55 | filtered = cv.adaptiveThreshold(~grayscale, MAX_THRESHOLD_VALUE, cv.ADAPTIVE_THRESH_MEAN_C, cv.THRESH_BINARY, BLOCK_SIZE, THRESHOLD_CONSTANT)
 56 | 
 57 | # =====================================================
 58 | # LINE ISOLATION
 59 | # =====================================================
 60 | """
 61 | HORIZONTAL AND VERTICAL LINE ISOLATION
 62 | To isolate the vertical and horizontal lines, 
 63 | 
 64 | 1. Set a scale.
 65 | 2. Create a structuring element.
 66 | 3. Isolate the lines by eroding and then dilating the image.
 67 | """
 68 | SCALE = 15
 69 | 
 70 | # Isolate horizontal and vertical lines using morphological operations
 71 | horizontal = filtered.copy()
 72 | vertical = filtered.copy()
 73 | 
 74 | horizontal_size = int(horizontal.shape[1] / SCALE)
 75 | horizontal_structure = cv.getStructuringElement(cv.MORPH_RECT, (horizontal_size, 1))
 76 | utils.isolate_lines(horizontal, horizontal_structure)
 77 | 
 78 | vertical_size = int(vertical.shape[0] / SCALE)
 79 | vertical_structure = cv.getStructuringElement(cv.MORPH_RECT, (1, vertical_size))
 80 | utils.isolate_lines(vertical, vertical_structure)
 81 | 
 82 | # =====================================================
 83 | # TABLE EXTRACTION
 84 | # =====================================================
 85 | # Create an image mask with just the horizontal
 86 | # and vertical lines in the image. Then find
 87 | # all contours in the mask.
 88 | mask = horizontal + vertical
 89 | (contours, _) = cv.findContours(mask, cv.RETR_EXTERNAL, cv.CHAIN_APPROX_SIMPLE)
 90 | 
 91 | # Find intersections between the lines
 92 | # to determine if the intersections are table joints.
 93 | intersections = cv.bitwise_and(horizontal, vertical)
 94 | 
 95 | # Get tables from the images
 96 | tables = [] # list of tables
 97 | for i in range(len(contours)):
 98 |     # Verify that region of interest is a table
 99 |     (rect, table_joints) = utils.verify_table(contours[i], intersections)
100 |     if rect == None or table_joints == None:
101 |         continue
102 | 
103 |     # Create a new instance of a table
104 |     table = Table(rect[0], rect[1], rect[2], rect[3])
105 | 
106 |     # Get an n-dimensional array of the coordinates of the table joints
107 |     joint_coords = []
108 |     for i in range(len(table_joints)):
109 |         joint_coords.append(table_joints[i][0][0])
110 |     joint_coords = np.asarray(joint_coords)
111 | 
112 |     # Returns indices of coordinates in sorted order
113 |     # Sorts based on parameters (aka keys) starting from the last parameter, then second-to-last, etc
114 |     sorted_indices = np.lexsort((joint_coords[:, 0], joint_coords[:, 1]))
115 |     joint_coords = joint_coords[sorted_indices]
116 | 
117 |     # Store joint coordinates in the table instance
118 |     table.set_joints(joint_coords)
119 | 
120 |     tables.append(table)
121 | 
122 |     #cv.rectangle(image, (table.x, table.y), (table.x + table.w, table.y + table.h), (0, 255, 0), 1, 8, 0)
123 |     #cv.imshow("tables", image)
124 |     #cv.waitKey(0)
125 | 
# =====================================================
# OCR AND WRITING TEXT TO EXCEL
# =====================================================
out = "bin/"
table_name = "table.jpg"
psm = 6   # forwarded to tesseract as --psm
oem = 3   # forwarded to tesseract as --oem
mult = 3  # factor by which each table crop is enlarged before OCR

# Working directories for intermediate images and the final workbook.
utils.mkdir(out)
utils.mkdir("bin/table/")
utils.mkdir("excel/")

workbook = xlsxwriter.Workbook('excel/tables.xlsx')

for table in tables:
    # One worksheet per detected table.
    worksheet = workbook.add_worksheet()
    table_entries = table.get_table_entries()

    # Crop the table out of the page and enlarge it.
    cropped = image[table.y:table.y + table.h, table.x:table.x + table.w]
    table_roi = cv.resize(cropped, (table.w * mult, table.h * mult))
    cv.imwrite(out + table_name, table_roi)

    # Cell counter; restarts for every table.
    num_img = 0
    for row_idx, row in enumerate(table_entries):
        for col_idx, entry in enumerate(row):
            # Scale the cell bounds by the same factor as the table crop.
            top = entry[1] * mult
            bottom = (entry[1] + entry[3]) * mult
            left = entry[0] * mult
            right = (entry[0] + entry[2]) * mult
            entry_roi = table_roi[top:bottom, left:right]

            cell_path = out + "table/cell" + str(num_img) + ".jpg"
            cv.imwrite(cell_path, entry_roi)

            cleaned_path = utils.run_textcleaner(cell_path, num_img)
            text = utils.run_tesseract(cleaned_path, num_img, psm, oem)
            num_img += 1

            worksheet.write(row_idx, col_idx, text)

workbook.close()
169 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy==1.16.0
2 | opencv-python==4.0.0.21
3 | pdf2image==1.4.0
4 | Pillow==8.1.1
5 | pytesseract==0.2.6
6 | XlsxWriter==1.1.2
7 | 


--------------------------------------------------------------------------------
/table.py:
--------------------------------------------------------------------------------
 1 | class Table:
 2 |     def __init__(self, x, y, w, h):
 3 |         self.x = x
 4 |         self.y = y
 5 |         self.w = w
 6 |         self.h = h
 7 |         self.joints = None
 8 | 
 9 |     def __str__(self):
10 |         return "(x: %d, y: %d, w: %d, h: %d)" % (self.x, self.x + self.w, self.y, self.y + self.h)
11 |     
12 |     # Stores the coordinates of the table joints.
13 |     # Assumes the n-dimensional array joints is sorted in ascending order.
14 |     def set_joints(self, joints):
15 |         if self.joints != None:
16 |             raise ValueError("Invalid setting of table joints array.")
17 | 
18 |         self.joints = []
19 |         row_y = joints[0][1]
20 |         row = []
21 |         for i in range(len(joints)):
22 |             if i == len(joints) - 1:
23 |                 row.append(joints[i])
24 |                 self.joints.append(row)
25 |                 break
26 | 
27 |             row.append(joints[i])
28 | 
29 |             # If the next joint has a new y-coordinate,
30 |             # start a new row.
31 |             if joints[i + 1][1] != row_y:
32 |                 self.joints.append(row)
33 |                 row_y = joints[i + 1][1]
34 |                 row = []
35 | 
36 |     # Prints the coordinates of the joints.
37 |     def print_joints(self):
38 |         if self.joints == None:
39 |             print("Joint coordinates not found.")
40 |             return
41 | 
42 |         print("[")
43 |         for row in self.joints:
44 |             print("\t" + str(row))
45 |         print("]")
46 | 
47 |     # Finds the bounds of table entries in the image by
48 |     # using the coordinates of the table joints.
49 |     def get_table_entries(self):
50 |         if self.joints == None:
51 |             print("Joint coordinates not found.")
52 |             return
53 | 
54 |         entry_coords = []
55 |         for i in range(0, len(self.joints) - 1):
56 |             entry_coords.append(self.get_entry_bounds_in_row(self.joints[i], self.joints[i + 1]))
57 | 
58 |         return entry_coords
59 | 
60 |     # Finds the bounds of table entries
61 |     # in each row based on the given sets of joints.
62 |     def get_entry_bounds_in_row(self, joints_A, joints_B):
63 |         row_entries = []
64 | 
65 |         # Since the sets of joints may not have the same 
66 |         # number of points, we pick the set with a lower number 
67 |         # of points to find the bounds from.
68 |         if len(joints_A) <= len(joints_B):
69 |             defining_bounds = joints_A
70 |             helper_bounds = joints_B
71 |         else:
72 |             defining_bounds = joints_B
73 |             helper_bounds = joints_A
74 | 
75 |         for i in range(0, len(defining_bounds) - 1):
76 |             x = defining_bounds[i][0]
77 |             y = defining_bounds[i][1]
78 |             w = defining_bounds[i + 1][0] - x # helper_bounds's (i + 1)th coordinate may not be the lower-right corner
79 |             h = helper_bounds[0][1] - y # helper_bounds has the same y-coordinate for all of its elements
80 | 
81 |             # If the calculated height is less than 0, 
82 |             # make the height positive and
83 |             # use the y-coordinate of the row above for the bounds
84 |             if h < 0:
85 |                 h = -h
86 |                 y = y - h
87 | 
88 |             row_entries.append([x, y, w, h])
89 | 
90 |         return row_entries
91 | 


--------------------------------------------------------------------------------
/textcleaner:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brian-yang/table-parser-opencv/3ba5df73c9615c59925462f541d51a6409a18bc0/textcleaner


--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
 1 | import cv2 as cv
 2 | import pytesseract as tess
 3 | from PIL import Image
 4 | import subprocess as s
 5 | import os
 6 | 
 7 | """
 8 | Apply morphology operations
 9 | """
10 | def isolate_lines(src, structuring_element):
11 | 	cv.erode(src, structuring_element, src, (-1, -1)) # makes white spots smaller
12 | 	cv.dilate(src, structuring_element, src, (-1, -1)) # makes white spots bigger
13 | 
MIN_TABLE_AREA = 50 # min table area to be considered a table
EPSILON = 3 # epsilon value for contour approximation
MIN_TABLE_JOINTS = 5 # min number of joints for a region to count as a table
def verify_table(contour, intersections, min_area=MIN_TABLE_AREA,
                 epsilon=EPSILON, min_joints=MIN_TABLE_JOINTS):
    """Decide whether the region inside a contour is a table.

    Args:
        contour: Contour to examine.
        intersections: Image in which joint candidates are searched; it is
            cropped to the contour's bounding rectangle.
        min_area: Contours with a smaller area are rejected.
        epsilon: Precision passed to the polygon approximation.
        min_joints: Regions with fewer joint contours are rejected.

    Returns:
        ``(rect, joints)`` where ``rect`` is the ``(x, y, w, h)`` bounding
        rectangle and ``joints`` are the contours found inside it, or
        ``(None, None)`` if the region is not considered a table.
    """
    if cv.contourArea(contour) < min_area:
        return (None, None)

    # approxPolyDP approximates a polygonal curve within the specified precision
    curve = cv.approxPolyDP(contour, epsilon, True)

    # boundingRect calculates the bounding rectangle of a point set (eg. a curve)
    rect = cv.boundingRect(curve)
    x, y, w, h = rect

    # Crop in row-column order, as the image is a numpy array:
    # image_mat[y: y + h, x: x + w]
    possible_table_region = intersections[y:y + h, x:x + w]

    # findContours returns (contours, hierarchy) in OpenCV 4 but
    # (image, contours, hierarchy) in OpenCV 3; the contours are the
    # second-to-last element in both layouts.
    possible_table_joints = cv.findContours(
        possible_table_region, cv.RETR_CCOMP, cv.CHAIN_APPROX_SIMPLE)[-2]

    # With too few joints the region is likely not a table.
    if len(possible_table_joints) < min_joints:
        return (None, None)

    return rect, possible_table_joints
46 | 
def mkdir(path):
    """Create the directory ``path`` (and missing parents) if needed.

    An already existing directory is left untouched. ``exist_ok=True`` is
    used instead of a separate existence check so that the directory being
    created between the check and the call cannot cause a failure.
    """
    os.makedirs(path, exist_ok=True)
53 | 
def showImg(name, matrix, durationMillis=0):
    """Show ``matrix`` in an OpenCV window called ``name``.

    ``durationMillis`` is handed to ``cv.waitKey`` after the window has
    been shown.
    """
    cv.imshow(name, matrix)
    cv.waitKey(durationMillis)
60 | 
def run_textcleaner(filename, img_id):
    """Clean an image with the ``./textcleaner`` script.

    Args:
        filename: Path of the image to clean.
        img_id: Identifier used to name the cleaned output file.

    Returns:
        The path of the cleaned image, or ``filename`` unchanged when the
        script could not be started, exited with a non-zero status, or did
        not produce the output file. Falling back keeps the caller from
        receiving a path that does not exist.
    """
    mkdir("bin/cleaned/")

    cleaned_file = "bin/cleaned/cleaned" + str(img_id) + ".jpg"
    command = ["./textcleaner", "-g", "-e", "none", "-f", str(10), "-o", str(5), filename, cleaned_file]

    try:
        status = s.call(command)
    except OSError as err:
        # Raised when the script is missing or not executable.
        print("textcleaner could not be started (" + str(err) + "); using the uncleaned image.")
        return filename

    if status != 0 or not os.path.exists(cleaned_file):
        print("textcleaner failed for " + filename + "; using the uncleaned image.")
        return filename

    return cleaned_file
72 | 
def run_tesseract(filename, img_id, psm, oem):
    """Run tesseract to perform optical character recognition (OCR).

    Args:
        filename: Path of the image to read text from.
        img_id: Not used by this function; kept for the callers' sake.
        psm: Value passed to tesseract as ``--psm``.
        oem: Value passed to tesseract as ``--oem``.

    Returns:
        The recognised text. If the first pass yields only whitespace, a
        second pass restricted to the digits 0-9 is run and its result is
        returned instead.
    """
    mkdir("bin/extracted/")

    language = 'eng'
    configuration = "--psm " + str(psm) + " --oem " + str(oem)

    # The context manager closes the underlying image file once OCR is done.
    with Image.open(filename) as image:
        text = tess.image_to_string(image, lang=language, config=configuration)
        if not text.strip():
            # Nothing recognised: retry allowing digits only.
            configuration += " -c tessedit_char_whitelist=0123456789"
            text = tess.image_to_string(image, lang=language, config=configuration)

    return text
91 | 


--------------------------------------------------------------------------------