├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── data └── example.jpg ├── main.py ├── requirements.txt ├── table.py ├── textcleaner └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | bin/ 2 | excel/ 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | env/ 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # SageMath parsed files 83 | *.sage.py 84 | 85 | # dotenv 86 | .env 87 | 88 | # virtualenv 89 | .venv 90 | venv/ 91 | ENV/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- 
/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Brian Yang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | run: 2 | python main.py $(target) 3 | clean: 4 | rm -rf excel bin __pycache__ 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # spreadsheet-images 2 | Take lab reports or any papers with tables in them, and instantly extract those tables and convert them to Excel spreadsheets. 3 | 4 | ## Disclaimer 5 | The textcleaner script was made by Fred Weinhaus. 
To use the textcleaner script in this repository commercially, redistribute it on the Internet, integrate it into free applications on the Internet, etc., you must contact Fred at fmw@alink.net for permission; otherwise you may not use the textcleaner script. See the textcleaner file for more details. Users who use or fork this project have my permission to use and modify the rest of the code in this project, just not the textcleaner script. 6 | 7 | ## Installation 8 | 1. Install Tesseract OCR (used to recognize the text in the tables). 9 | - `sudo apt-get install tesseract-ocr libtesseract-dev libleptonica-dev` 10 | 2. Install Python libraries: 11 | - `pip install -r requirements.txt` 12 | ## Run 13 | 1. Run `make target=<filepath>` (or, if `make` is not installed, run `python main.py <filepath>`) on the command line, where `<filepath>` is the path to the target image or PDF. 14 | 15 | The resulting Excel spreadsheet should be in the `excel/` folder, named `tables.xlsx`. Each table will have its own separate sheet when the file is opened. 
16 | -------------------------------------------------------------------------------- /data/example.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brian-yang/table-parser-opencv/3ba5df73c9615c59925462f541d51a6409a18bc0/data/example.jpg -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 as cv 3 | import utils 4 | from table import Table 5 | from PIL import Image 6 | import xlsxwriter 7 | import sys 8 | from pdf2image import convert_from_path 9 | 10 | # ===================================================== 11 | # IMAGE LOADING 12 | # ===================================================== 13 | if len(sys.argv) < 2: 14 | print("Usage: python main.py ") 15 | sys.exit(1) 16 | 17 | path = sys.argv[1] 18 | if not path.endswith(".pdf") and not path.endswith(".jpg"): 19 | print("Must use a pdf or a jpg image to run the program.") 20 | sys.exit(1) 21 | 22 | if path.endswith(".pdf"): 23 | ext_img = convert_from_path(path)[0] 24 | else: 25 | ext_img = Image.open(path) 26 | 27 | ext_img.save("data/target.png", "PNG") 28 | image = cv.imread("data/target.png") 29 | 30 | # Convert resized RGB image to grayscale 31 | NUM_CHANNELS = 3 32 | if len(image.shape) == NUM_CHANNELS: 33 | grayscale = cv.cvtColor(image, cv.COLOR_BGR2GRAY) 34 | 35 | # ===================================================== 36 | # IMAGE FILTERING (using adaptive thresholding) 37 | # ===================================================== 38 | """ 39 | ADAPTIVE THRESHOLDING 40 | Thresholding changes pixels' color values to a specified pixel value if the current pixel value 41 | is less than a threshold value, which could be: 42 | 43 | 1. a specified global threshold value provided as an argument to the threshold function (simple thresholding), 44 | 2. 
the mean value of the pixels in the neighboring area (adaptive thresholding - mean method), 45 | 3. the weighted sum of neigborhood values where the weights are Gaussian windows (adaptive thresholding - Gaussian method). 46 | 47 | The last two parameters to the adaptiveThreshold function are the size of the neighboring area and 48 | the constant C which is subtracted from the mean or weighted mean calculated. 49 | """ 50 | MAX_THRESHOLD_VALUE = 255 51 | BLOCK_SIZE = 15 52 | THRESHOLD_CONSTANT = 0 53 | 54 | # Filter image 55 | filtered = cv.adaptiveThreshold(~grayscale, MAX_THRESHOLD_VALUE, cv.ADAPTIVE_THRESH_MEAN_C, cv.THRESH_BINARY, BLOCK_SIZE, THRESHOLD_CONSTANT) 56 | 57 | # ===================================================== 58 | # LINE ISOLATION 59 | # ===================================================== 60 | """ 61 | HORIZONTAL AND VERTICAL LINE ISOLATION 62 | To isolate the vertical and horizontal lines, 63 | 64 | 1. Set a scale. 65 | 2. Create a structuring element. 66 | 3. Isolate the lines by eroding and then dilating the image. 67 | """ 68 | SCALE = 15 69 | 70 | # Isolate horizontal and vertical lines using morphological operations 71 | horizontal = filtered.copy() 72 | vertical = filtered.copy() 73 | 74 | horizontal_size = int(horizontal.shape[1] / SCALE) 75 | horizontal_structure = cv.getStructuringElement(cv.MORPH_RECT, (horizontal_size, 1)) 76 | utils.isolate_lines(horizontal, horizontal_structure) 77 | 78 | vertical_size = int(vertical.shape[0] / SCALE) 79 | vertical_structure = cv.getStructuringElement(cv.MORPH_RECT, (1, vertical_size)) 80 | utils.isolate_lines(vertical, vertical_structure) 81 | 82 | # ===================================================== 83 | # TABLE EXTRACTION 84 | # ===================================================== 85 | # Create an image mask with just the horizontal 86 | # and vertical lines in the image. Then find 87 | # all contours in the mask. 
88 | mask = horizontal + vertical 89 | (contours, _) = cv.findContours(mask, cv.RETR_EXTERNAL, cv.CHAIN_APPROX_SIMPLE) 90 | 91 | # Find intersections between the lines 92 | # to determine if the intersections are table joints. 93 | intersections = cv.bitwise_and(horizontal, vertical) 94 | 95 | # Get tables from the images 96 | tables = [] # list of tables 97 | for i in range(len(contours)): 98 | # Verify that region of interest is a table 99 | (rect, table_joints) = utils.verify_table(contours[i], intersections) 100 | if rect == None or table_joints == None: 101 | continue 102 | 103 | # Create a new instance of a table 104 | table = Table(rect[0], rect[1], rect[2], rect[3]) 105 | 106 | # Get an n-dimensional array of the coordinates of the table joints 107 | joint_coords = [] 108 | for i in range(len(table_joints)): 109 | joint_coords.append(table_joints[i][0][0]) 110 | joint_coords = np.asarray(joint_coords) 111 | 112 | # Returns indices of coordinates in sorted order 113 | # Sorts based on parameters (aka keys) starting from the last parameter, then second-to-last, etc 114 | sorted_indices = np.lexsort((joint_coords[:, 0], joint_coords[:, 1])) 115 | joint_coords = joint_coords[sorted_indices] 116 | 117 | # Store joint coordinates in the table instance 118 | table.set_joints(joint_coords) 119 | 120 | tables.append(table) 121 | 122 | #cv.rectangle(image, (table.x, table.y), (table.x + table.w, table.y + table.h), (0, 255, 0), 1, 8, 0) 123 | #cv.imshow("tables", image) 124 | #cv.waitKey(0) 125 | 126 | # ===================================================== 127 | # OCR AND WRITING TEXT TO EXCEL 128 | # ===================================================== 129 | out = "bin/" 130 | table_name = "table.jpg" 131 | psm = 6 132 | oem = 3 133 | mult = 3 134 | 135 | utils.mkdir(out) 136 | utils.mkdir("bin/table/") 137 | 138 | utils.mkdir("excel/") 139 | workbook = xlsxwriter.Workbook('excel/tables.xlsx') 140 | 141 | for table in tables: 142 | worksheet = 
class Table:
    """Bounding box of a detected table plus the grid of its line joints.

    Attributes:
        x, y: top-left corner of the table's bounding rectangle.
        w, h: width and height of the bounding rectangle.
        joints: None until set_joints() is called; afterwards a list of
            rows, each row being the list of joints sharing one y-coordinate.
    """

    def __init__(self, x, y, w, h):
        self.x = x
        self.y = y
        self.w = w
        self.h = h
        self.joints = None

    def __str__(self):
        # Each label is paired with the attribute of the same name.
        return "(x: %d, y: %d, w: %d, h: %d)" % (self.x, self.y, self.w, self.h)

    def set_joints(self, joints):
        """Store the table joints, grouped into rows by y-coordinate.

        Args:
            joints: sequence of (x, y) points, assumed to be sorted in
                ascending order (by y, then by x). May be empty, in which
                case the table simply has no rows.

        Raises:
            ValueError: if the joints have already been set on this table.
        """
        if self.joints is not None:
            raise ValueError("Invalid setting of table joints array.")

        rows = []
        for joint in joints:
            # A joint whose y differs from the current row's y opens a new row.
            if not rows or joint[1] != rows[-1][0][1]:
                rows.append([])
            rows[-1].append(joint)

        # Assign only after grouping succeeded so a failure cannot leave the
        # table half-initialised.
        self.joints = rows

    def print_joints(self):
        """Print the stored joint coordinates, one row per line."""
        if self.joints is None:
            print("Joint coordinates not found.")
            return

        print("[")
        for row in self.joints:
            print("\t" + str(row))
        print("]")

    def get_table_entries(self):
        """Return the bounds of every table entry, row by row.

        Returns:
            A list with one item per pair of adjacent joint rows; each item
            is the list of [x, y, w, h] bounds produced by
            get_entry_bounds_in_row(). If the joints were never set, a
            message is printed and None is returned.
        """
        if self.joints is None:
            print("Joint coordinates not found.")
            return

        return [
            self.get_entry_bounds_in_row(upper, lower)
            for upper, lower in zip(self.joints, self.joints[1:])
        ]

    def get_entry_bounds_in_row(self, joints_A, joints_B):
        """Return the [x, y, w, h] bounds of the entries between two joint rows.

        Args:
            joints_A: joints of one row.
            joints_B: joints of the adjacent row.

        Returns:
            A list of [x, y, w, h] lists, one per pair of neighbouring joints
            in the row that has fewer joints.
        """
        # Since the sets of joints may not have the same number of points,
        # the set with fewer points defines the horizontal bounds.
        if len(joints_A) <= len(joints_B):
            defining_bounds, helper_bounds = joints_A, joints_B
        else:
            defining_bounds, helper_bounds = joints_B, joints_A

        # Fewer than two joints cannot enclose an entry.
        if len(defining_bounds) < 2:
            return []

        # helper_bounds has the same y-coordinate for all of its elements.
        helper_y = helper_bounds[0][1]

        row_entries = []
        for left, right in zip(defining_bounds, defining_bounds[1:]):
            x = left[0]
            y = left[1]
            w = right[0] - x
            h = helper_y - y

            # If the calculated height is negative, the defining row is the
            # lower one: make the height positive and use the y-coordinate
            # of the row above for the bounds.
            if h < 0:
                h = -h
                y = y - h

            row_entries.append([x, y, w, h])

        return row_entries
MIN_TABLE_AREA = 50  # min contour area for a region to be considered a table
EPSILON = 3  # epsilon value for contour approximation
MIN_TABLE_JOINTS = 5  # fewer joints than this means the region is likely not a table


def verify_table(contour, intersections):
    """Verify whether the region inside a contour is a table.

    Args:
        contour: a contour found in the horizontal/vertical line mask.
        intersections: image of the points where horizontal and vertical
            lines overlap; it is sliced with the contour's bounding rect.

    Returns:
        (rect, joints) when the region looks like a table, where rect is the
        bounding rectangle in (x, y, w, h) format and joints are the contours
        found in the intersections image inside that rectangle (their
        coordinates are therefore relative to the rectangle). Returns
        (None, None) otherwise.
    """
    if cv.contourArea(contour) < MIN_TABLE_AREA:
        return (None, None)

    # approxPolyDP approximates a polygonal curve within the specified precision
    curve = cv.approxPolyDP(contour, EPSILON, True)

    # boundingRect calculates the bounding rectangle of a point set (eg. a curve)
    rect = cv.boundingRect(curve)
    x, y, w, h = rect

    # numpy indexing is in row-column order: image[y:y + h, x:x + w]
    possible_table_region = intersections[y:y + h, x:x + w]

    # findContours returns (contours, hierarchy) in OpenCV 2.x/4.x but
    # (image, contours, hierarchy) in 3.x; the contours are always the
    # second-to-last element.
    possible_table_joints = cv.findContours(
        possible_table_region, cv.RETR_CCOMP, cv.CHAIN_APPROX_SIMPLE
    )[-2]

    if len(possible_table_joints) < MIN_TABLE_JOINTS:
        return (None, None)

    return rect, possible_table_joints
def mkdir(path):
    """Create the directory *path* (including parents) if it doesn't already exist."""
    # exist_ok=True removes the race between checking for the directory and
    # creating it.
    os.makedirs(path, exist_ok=True)


def showImg(name, matrix, durationMillis=0):
    """Show *matrix* in an OpenCV window called *name*.

    durationMillis is passed straight to cv.waitKey.
    """
    cv.imshow(name, matrix)
    cv.waitKey(durationMillis)


def run_textcleaner(filename, img_id):
    """Clean the image *filename* with the textcleaner script.

    Returns:
        The path of the cleaned image, "bin/cleaned/cleaned<img_id>.jpg".
        The script's exit status is not checked.
    """
    mkdir("bin/cleaned/")

    # Run textcleaner
    cleaned_file = "bin/cleaned/cleaned" + str(img_id) + ".jpg"
    s.call(["./textcleaner", "-g", "-e", "none", "-f", str(10), "-o", str(5), filename, cleaned_file])

    return cleaned_file


def run_tesseract(filename, img_id, psm, oem):
    """Run tesseract to perform optical character recognition (OCR).

    Args:
        filename: path of the image to read.
        img_id: not used by this function.
        psm: value passed to tesseract's --psm option.
        oem: value passed to tesseract's --oem option.

    Returns:
        The recognised text. If the first pass yields only whitespace, the
        result of a second pass restricted to digits is returned instead.
    """
    mkdir("bin/extracted/")

    language = 'eng'
    configuration = "--psm " + str(psm) + " --oem " + str(oem)

    # The context manager closes the image's file handle once OCR is done.
    with Image.open(filename) as image:
        text = tess.image_to_string(image, lang=language, config=configuration)
        if not text.strip():
            # Nothing recognised: retry allowing digits only.
            configuration += " -c tessedit_char_whitelist=0123456789"
            text = tess.image_to_string(image, lang=language, config=configuration)

    return text