├── .gitignore ├── Form cleaner.ipynb ├── Intelligent Character Recognition.ipynb ├── LICENSE ├── Page detection.ipynb ├── Probabilistic Line Transformation.ipynb ├── README.md ├── files ├── form1.jpg ├── form10.jpg ├── form11.png ├── form2.jpg ├── form3.jpg ├── form4.jpg ├── form5.jpg ├── form6.jpg ├── form7.jpg ├── form8.jpg └── form9.jpg └── img └── common_segmentation_methods.png /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints/ 2 | -------------------------------------------------------------------------------- /Form cleaner.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Form cleaner" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import cv2\n", 17 | "import math\n", 18 | "import numpy as np\n", 19 | "import imutils\n", 20 | "import subprocess\n", 21 | "from IPython.display import Image\n", 22 | "\n", 23 | "inputFile = \"form11.png\"\n", 24 | "Image(filename='files/' + inputFile)" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## Detect scanned page (if applicable)\n", 32 | "- Detect edges\n", 33 | "- Use the edges in the image to find the contour (outline) representing the piece of paper being scanned.\n", 34 | "- Apply a perspective transform to obtain the top-down view of the document.\n", 35 | "\n", 36 | "Objective:\n", 37 | "- We want to work on the scanned page (if applicable) or the page itself.\n", 38 | "\n", 39 | "See:\n", 40 | "- http://bretahajek.com/2017/01/scanning-documents-photos-opencv/\n", 41 | "- https://www.pyimagesearch.com/2014/09/01/build-kick-ass-mobile-document-scanner-just-5-minutes/\n", 42 | "- https://github.com/Breta01/handwriting-ocr/blob/master/PageDetection.ipynb" 43 | ] 44 | }, 45 | { 
46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "img = cv2.imread(\"files/\" + inputFile)\n", 52 | "\n", 53 | "# TODO\n", 54 | "\n", 55 | "cv2.imwrite(\"files/result-0.jpg\", img)\n", 56 | "Image(filename='files/result-0.jpg')" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "## Remove form / table structure\n", 64 | "\n", 65 | "Objective:\n", 66 | "- Remove horizontal and vertical lines\n", 67 | "- Make it easier to detect text (remove unnecessary elements on page)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "options = \"files/result-0.jpg \"\n", 77 | "options += \"-type \"\n", 78 | "options += \"Grayscale \"\n", 79 | "options += \"-negate \"\n", 80 | "options += \"-define morphology:compose=darken \"\n", 81 | "options += \"-morphology Thinning 'Rectangle:15x1+0+0<' \"\n", 82 | "options += \"-negate \"\n", 83 | "options += \"files/result-1.jpg\"\n", 84 | "\n", 85 | "# Make sure to install imagemagick, otherwise the following line will fail\n", 86 | "subprocess.getoutput(\"convert \" + options)\n", 87 | "Image(filename='files/result-1.jpg')" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "## Remove noise and make text clearer\n", 95 | "\n", 96 | "Objectives:\n", 97 | "- Make text clearer\n", 98 | "- Apply OTSU threshold to clean up the result" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "img = cv2.imread(\"files/result-1.jpg\")\n", 108 | "gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)\n", 109 | "\n", 110 | "gray = cv2.blur(gray,(1,1))\n", 111 | "gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]\n", 112 | "\n", 113 | "cv2.imwrite(\"files/result-2.jpg\", gray)\n", 114 | 
"Image(filename='files/result-2.jpg')" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "def my_blur(img):\n", 124 | " img = cv2.adaptiveThreshold(img, 255,\n", 125 | " cv2.ADAPTIVE_THRESH_GAUSSIAN_C,\n", 126 | " cv2.THRESH_BINARY, 115, 4)\n", 127 | " \n", 128 | " k1 = np.ones((1,1),np.uint8)\n", 129 | " img = cv2.morphologyEx(img, cv2.MORPH_OPEN, k1)\n", 130 | "\n", 131 | " _,img = cv2.threshold(img,0,255,cv2.THRESH_BINARY_INV)\n", 132 | "\n", 133 | " k1 = np.ones((2,2),np.uint8)\n", 134 | " img = cv2.morphologyEx(img, cv2.MORPH_DILATE, k1)\n", 135 | " \n", 136 | " img = cv2.blur(img,(2,2))\n", 137 | " img = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]\n", 138 | " return img\n", 139 | "\n", 140 | "img1 = cv2.imread(\"files/result-0.jpg\")\n", 141 | "img1 = cv2.cvtColor(img1, cv2.COLOR_BGR2GRAY)\n", 142 | "img1 = my_blur(img1)\n", 143 | "\n", 144 | "img2 = cv2.imread(\"files/result-2.jpg\")\n", 145 | "img2 = cv2.cvtColor(img2, cv2.COLOR_BGR2GRAY)\n", 146 | "img2 = my_blur(img2)\n", 147 | "\n", 148 | "img = cv2.absdiff(img1, img2)\n", 149 | "img = cv2.bitwise_not(img)\n", 150 | "\n", 151 | "#kernel = np.ones((1,1),np.uint8)\n", 152 | "#img = cv2.morphologyEx(img, cv2.MORPH_OPEN, kernel)\n", 153 | "\n", 154 | "cv2.imwrite(\"files/result-3.jpg\", img)\n", 155 | "Image(filename='files/result-3.jpg')" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "## Shape detection and extraction\n", 163 | "\n", 164 | "Objective:\n", 165 | "- Extract text line\n", 166 | "\n", 167 | "Possibilities:\n", 168 | "- MSER\n", 169 | "- Threshold (OTSU) and findContours\n", 170 | "\n", 171 | "See:\n", 172 | "- http://opencvpython.blogspot.ca/2012/06/hi-this-article-is-tutorial-which-try.html" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 
180 | "source": [ 181 | "# TODO" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "## OCR and handwriting recognition\n", 189 | "\n", 190 | "See:\n", 191 | "- Tesseract 4\n", 192 | "- https://github.com/Breta01/handwriting-ocr (Handwriting recognition)" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "# TODO" 202 | ] 203 | } 204 | ], 205 | "metadata": { 206 | "kernelspec": { 207 | "display_name": "Python 3", 208 | "language": "python", 209 | "name": "python3" 210 | }, 211 | "language_info": { 212 | "codemirror_mode": { 213 | "name": "ipython", 214 | "version": 3 215 | }, 216 | "file_extension": ".py", 217 | "mimetype": "text/x-python", 218 | "name": "python", 219 | "nbconvert_exporter": "python", 220 | "pygments_lexer": "ipython3", 221 | "version": "3.6.3" 222 | } 223 | }, 224 | "nbformat": 4, 225 | "nbformat_minor": 2 226 | } 227 | -------------------------------------------------------------------------------- /Intelligent Character Recognition.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Intelligent Character Recognition\n", 8 | "(AKA ICR)\n", 9 | "\n", 10 | "See:\n", 11 | "- https://en.wikipedia.org/wiki/Intelligent_character_recognition\n", 12 | "\n", 13 | "This is not the same as OCR.\n", 14 | "\n", 15 | "> When we talk about OCR, we are really talking about converting data from an image that was created by a machine, whether it be a document created by an office application, and even an old document typewritten\n", 16 | ">\n", 17 | "> For ICR, pattern-matching goes to a different level. 
You cannot really have success deploying a solution requiring ICR without additional information to aid in the recognition process.\n", 18 | ">\n", 19 | "> https://www.parascript.com/blog/difference-ocr-icr/\n", 20 | "\n", 21 | "# Segmentation challenges\n", 22 | "\n", 23 | "A few common segmentation methods are listed below:\n", 24 | "![Common segmentation methods](img/common_segmentation_methods.png)\n", 25 | "\n", 26 | "See: https://www.scanstore.com/ICR_Guide/\n", 27 | "\n", 28 | "## Possible solutions\n", 29 | "\n", 30 | "1. Use the common pattern to detect text field (AKA Handprint Recognition)\n", 31 | "\n", 32 | "2. Use imagemagick to remove the form/table structure:\n", 33 | "\n", 34 | "```\n", 35 | "convert inputFile.jpg -type Grayscale -negate -define morphology:compose=darken -morphology Thinning 'Rectangle:15x1+0+0<' -negate outputFile.jpg\n", 36 | "```\n", 37 | "\n", 38 | "Advantages:\n", 39 | "- Remove most horizontal and vertical lines\n", 40 | "- Clearer text (remove unnecessary elements on page)\n", 41 | "\n", 42 | "Disadvantages:\n", 43 | "- It introduces extra spaces (becomes monospaced font)\n", 44 | "- It can remove some part of handwriting text\n", 45 | "- Hard to determine the perfect rectangle size (vary: 15)\n", 46 | "\n", 47 | "# Let's explore the available options on the market\n", 48 | "\n", 49 | "## queXF\n", 50 | "An open source, web based paper form verification and data entry system\n", 51 | "\n", 52 | "> The process of ICR is broken in to the following 7 steps:\n", 53 | "> - Character isolation\n", 54 | "> - Noise reduction\n", 55 | "> - Boundary removal\n", 56 | "> - Normalising\n", 57 | "> - Thinning\n", 58 | "> - Feature extraction\n", 59 | "> - Training or Recognition\n", 60 | ">\n", 61 | "> https://quexf.acspri.org.au/intelligent-character-recognition\n", 62 | "\n", 63 | "It can:\n", 64 | "- Automatically detects locations of boxes on the form (no need to manually overlay boxes)\n", 65 | "\n", 66 | "\n", 67 | "## NIST 
FORM-BASED HANDPRINT RECOGNITION SYSTEM (RELEASE 2.2)\n", 68 | "A public domain document processing system was developed by the National Institute of Standards and Technology (NIST) in 1994. The system is a standard reference form-based handprint recognition system for evaluating optical character recognition (OCR), and it is intended to provide a baseline of performance on an open application. \n", 69 | "\n", 70 | "NIST developed a new release in 2003 for internet distribution of its standard reference form-based handprint recognition system for evaluating optical character recognition (OCR). Release 2.2 incorporates all bug fixes generated for previous versions.\n", 71 | "\n", 72 | "https://www.nist.gov/services-resources/software/public-domain-ocr\n", 73 | "\n", 74 | "For source code and training data:\n", 75 | "- https://github.com/brainopener/nist-ocr\n", 76 | "\n", 77 | "According to the documentation:\n", 78 | "- To process a new type of form, trainreg must be run on a prototypical form and the output coordinate stored.\n", 79 | "\n", 80 | "## ABBYY Cloud OCR SDK\n", 81 | "\n", 82 | "https://ocrsdk.com/documentation/quick-start/text-fields/\n", 83 | "\n", 84 | "## Others\n", 85 | "- https://www.a2ia.com/fr/a2ia-fieldreader\n", 86 | "- http://www.recogniform.com/index.htm\n" 87 | ] 88 | } 89 | ], 90 | "metadata": { 91 | "kernelspec": { 92 | "display_name": "Python 3", 93 | "language": "python", 94 | "name": "python3" 95 | }, 96 | "language_info": { 97 | "codemirror_mode": { 98 | "name": "ipython", 99 | "version": 3 100 | }, 101 | "file_extension": ".py", 102 | "mimetype": "text/x-python", 103 | "name": "python", 104 | "nbconvert_exporter": "python", 105 | "pygments_lexer": "ipython3", 106 | "version": "3.6.3" 107 | } 108 | }, 109 | "nbformat": 4, 110 | "nbformat_minor": 2 111 | } 112 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 
MIT License 2 | 3 | Copyright (c) 2017 Philip Doxakis 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Page detection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import cv2\n", 10 | "import numpy as np\n", 11 | "import sys\n", 12 | "\n", 13 | "image = cv2.cvtColor(cv2.imread('files/form8.jpg'), cv2.COLOR_BGR2RGB)\n", 14 | "\n", 15 | "def resize(img, height=800):\n", 16 | " \"\"\" Resize image to given height \"\"\"\n", 17 | " rat = height / img.shape[0]\n", 18 | " return cv2.resize(img, (int(rat * img.shape[1]), height))\n", 19 | "\n", 20 | "# Resize and convert to grayscale\n", 21 | "img = cv2.cvtColor(resize(image), cv2.COLOR_BGR2GRAY)\n", 22 | "\n", 23 | "# Bilateral filter preserv edges\n", 24 | "img = cv2.bilateralFilter(img, 9, 75, 75)\n", 25 | "\n", 26 | "# Create black and white image based on adaptive threshold\n", 27 | "img = cv2.adaptiveThreshold(img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 115, 4)\n", 28 | "\n", 29 | "# Median filter clears small details\n", 30 | "img = cv2.medianBlur(img, 11)\n", 31 | "\n", 32 | "# Add black border in case that page is touching an image border\n", 33 | "img = cv2.copyMakeBorder(img, 5, 5, 5, 5, cv2.BORDER_CONSTANT, value=[0, 0, 0])\n", 34 | "\n", 35 | "edges = cv2.Canny(img, 200, 250)\n", 36 | "\n", 37 | "# Getting contours \n", 38 | "im2, contours, hierarchy = cv2.findContours(edges, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)\n", 39 | "\n", 40 | "# Finding contour of biggest rectangle\n", 41 | "# Otherwise return corners of original image\n", 42 | "# Don't forget on our 5px border!\n", 43 | "height = edges.shape[0]\n", 44 | "width = edges.shape[1]\n", 45 | "MAX_COUNTOUR_AREA = (width - 10) * (height - 10)\n", 46 | "\n", 47 | "# Page fill at least half of image, then saving max area found\n", 48 | "maxAreaFound = MAX_COUNTOUR_AREA * 0.5\n", 
49 | "\n", 50 | "# Saving page contour\n", 51 | "pageContour = np.array([[5, 5], [5, height-5], [width-5, height-5], [width-5, 5]])\n", 52 | "\n", 53 | "# Go through all contours\n", 54 | "for cnt in contours:\n", 55 | " # Simplify contour\n", 56 | " perimeter = cv2.arcLength(cnt, True)\n", 57 | " approx = cv2.approxPolyDP(cnt, 0.03 * perimeter, True)\n", 58 | "\n", 59 | " # Page has 4 corners and it is convex\n", 60 | " # Page area must be bigger than maxAreaFound \n", 61 | " if (len(approx) == 4 and\n", 62 | " cv2.isContourConvex(approx) and\n", 63 | " maxAreaFound < cv2.contourArea(approx) < MAX_COUNTOUR_AREA):\n", 64 | "\n", 65 | " maxAreaFound = cv2.contourArea(approx)\n", 66 | " pageContour = approx\n", 67 | "\n", 68 | "# Result in pageContour (numpy array of 4 points):\n", 69 | "\n", 70 | "\n", 71 | "def fourCornersSort(pts):\n", 72 | " \"\"\" Sort corners: top-left, bot-left, bot-right, top-right \"\"\"\n", 73 | " # Difference and sum of x and y value\n", 74 | " # Inspired by http://www.pyimagesearch.com\n", 75 | " diff = np.diff(pts, axis=1)\n", 76 | " summ = pts.sum(axis=1)\n", 77 | " \n", 78 | " # Top-left point has smallest sum...\n", 79 | " # np.argmin() returns INDEX of min\n", 80 | " return np.array([pts[np.argmin(summ)],\n", 81 | " pts[np.argmax(diff)],\n", 82 | " pts[np.argmax(summ)],\n", 83 | " pts[np.argmin(diff)]])\n", 84 | "\n", 85 | "\n", 86 | "def contourOffset(cnt, offset):\n", 87 | " \"\"\" Offset contour, by 5px border \"\"\"\n", 88 | " # Matrix addition\n", 89 | " cnt += offset\n", 90 | " \n", 91 | " # if value < 0 => replace it by 0\n", 92 | " cnt[cnt < 0] = 0\n", 93 | " return cnt\n", 94 | "\n", 95 | "\n", 96 | "# Sort and offset corners\n", 97 | "pageContour = fourCornersSort(pageContour[:, 0])\n", 98 | "pageContour = contourOffset(pageContour, (-5, -5))\n", 99 | "\n", 100 | "# Recalculate to original scale - start Points\n", 101 | "sPoints = pageContour.dot(image.shape[0] / 800)\n", 102 | " \n", 103 | "# Using Euclidean distance\n", 
104 | "# Calculate maximum height (maximal length of vertical edges) and width\n", 105 | "height = max(np.linalg.norm(sPoints[0] - sPoints[1]),\n", 106 | " np.linalg.norm(sPoints[2] - sPoints[3]))\n", 107 | "width = max(np.linalg.norm(sPoints[1] - sPoints[2]),\n", 108 | " np.linalg.norm(sPoints[3] - sPoints[0]))\n", 109 | "\n", 110 | "# Create target points\n", 111 | "tPoints = np.array([[0, 0],\n", 112 | " [0, height],\n", 113 | " [width, height],\n", 114 | " [width, 0]], np.float32)\n", 115 | "\n", 116 | "# getPerspectiveTransform() needs float32\n", 117 | "if sPoints.dtype != np.float32:\n", 118 | " sPoints = sPoints.astype(np.float32)\n", 119 | "\n", 120 | "# Warping perspective\n", 121 | "M = cv2.getPerspectiveTransform(sPoints, tPoints) \n", 122 | "newImage = cv2.warpPerspective(image, M, (int(width), int(height)))\n", 123 | "\n", 124 | "# Saving the result. Yay! (don't forget to convert colors back to BGR)\n", 125 | "cv2.imwrite(\"files/result-page-detection.jpg\", newImage)" 126 | ] 127 | } 128 | ], 129 | "metadata": { 130 | "kernelspec": { 131 | "display_name": "Python 3", 132 | "language": "python", 133 | "name": "python3" 134 | }, 135 | "language_info": { 136 | "codemirror_mode": { 137 | "name": "ipython", 138 | "version": 3 139 | }, 140 | "file_extension": ".py", 141 | "mimetype": "text/x-python", 142 | "name": "python", 143 | "nbconvert_exporter": "python", 144 | "pygments_lexer": "ipython3", 145 | "version": "3.6.3" 146 | } 147 | }, 148 | "nbformat": 4, 149 | "nbformat_minor": 2 150 | } 151 | -------------------------------------------------------------------------------- /Probabilistic Line Transformation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import cv2\n", 10 | "import math\n", 11 | "import numpy as np\n", 12 | "import imutils\n", 13 | "from IPython.display 
import Image" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "def detectOrientation(image):\n", 23 | " # convert the image to grayscale and flip the foreground\n", 24 | " # and background to ensure foreground is now \"white\" and\n", 25 | " # the background is \"black\"\n", 26 | " gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)\n", 27 | " gray = cv2.bitwise_not(gray)\n", 28 | "\n", 29 | " # threshold the image, setting all foreground pixels to\n", 30 | " # 255 and all background pixels to 0\n", 31 | " thresh = cv2.threshold(gray, 0, 255,\n", 32 | " cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]\n", 33 | " \n", 34 | " # grab the (x, y) coordinates of all pixel values that\n", 35 | " # are greater than zero, then use these coordinates to\n", 36 | " # compute a rotated bounding box that contains all\n", 37 | " # coordinates\n", 38 | " coords = np.column_stack(np.where(thresh > 0))\n", 39 | " angle = cv2.minAreaRect(coords)[-1]\n", 40 | "\n", 41 | " # the `cv2.minAreaRect` function returns values in the\n", 42 | " # range [-90, 0); as the rectangle rotates clockwise the\n", 43 | " # returned angle trends to 0 -- in this special case we\n", 44 | " # need to add 90 degrees to the angle\n", 45 | " if angle < -45:\n", 46 | " angle = -(90 + angle)\n", 47 | "\n", 48 | " # otherwise, just take the inverse of the angle to make\n", 49 | " # it positive\n", 50 | " else:\n", 51 | " angle = -angle\n", 52 | " \n", 53 | " # rotate the image to deskew it\n", 54 | " (h, w) = image.shape[:2]\n", 55 | " center = (w // 2, h // 2)\n", 56 | " M = cv2.getRotationMatrix2D(center, angle, 1.0)\n", 57 | " rotated = cv2.warpAffine(image, M, (w, h),\n", 58 | " flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)\n", 59 | " return (angle, rotated)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "inputPath = 
'files/form2.jpg'\n", 69 | "\n", 70 | "# Probabilistic Line Transformation.\n", 71 | "\n", 72 | "img = cv2.imread(inputPath)\n", 73 | "img = imutils.rotate(img, 0)\n", 74 | "\n", 75 | "gray = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)\n", 76 | "edges = cv2.Canny(gray,50,200,apertureSize = 3)\n", 77 | "minLineLength = 10\n", 78 | "maxLineGap = 100\n", 79 | "lines = cv2.HoughLinesP(edges,1,np.pi/360,100,minLineLength,maxLineGap)\n", 80 | "if lines is not None:\n", 81 | " for line in lines:\n", 82 | " for x1,y1,x2,y2 in line:\n", 83 | " cv2.line(img,(x1,y1),(x2,y2),(0,255,0),2,cv2.LINE_AA)\n", 84 | "\n", 85 | "outputPath = inputPath.replace('.jpg', '_output_ProbabilisticLineTransformation.jpg')\n", 86 | "cv2.imwrite(outputPath, img)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "print(detectOrientation(img)[0])" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [] 104 | } 105 | ], 106 | "metadata": { 107 | "kernelspec": { 108 | "display_name": "Python 3", 109 | "language": "python", 110 | "name": "python3" 111 | }, 112 | "language_info": { 113 | "codemirror_mode": { 114 | "name": "ipython", 115 | "version": 3 116 | }, 117 | "file_extension": ".py", 118 | "mimetype": "text/x-python", 119 | "name": "python", 120 | "nbconvert_exporter": "python", 121 | "pygments_lexer": "ipython3", 122 | "version": "3.6.3" 123 | } 124 | }, 125 | "nbformat": 4, 126 | "nbformat_minor": 2 127 | } 128 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Form Segmentation 2 | 3 | Let's explore how we can extract text from any forms / scanned pages. 4 | 5 | ## Objectives 6 | 7 | The goal is to find an algorithm that can extract the maximum information from a given page (jpg format). 
8 | So, we can feed it to another system. (Business logic, neural network, classifier, etc.) 9 | The overall process may not be perfect. 10 | But it would be great if it can find enough information to identify the type of document and the involved identities. 11 | 12 | - Parse any form / scanned page and extract any text data (printed text and handwriting text). 13 | So, no prior knowledge of the layout / structure of the document. 14 | 15 | - Automatic extraction process (no human interaction. So, it can scale out) 16 | 17 | - Somewhat fast (or the ability to speed up the task with more machines or CPU) 18 | 19 | ## Challenges 20 | 21 | There are many challenges to overcome. 22 | But the main problem is to identify which part of the form contains text. 23 | 24 | Some other challenges: 25 | 26 | - Black Border Removal 27 | - ICR (Intelligent Character Recognition): recognize and convert hand-drawn characters into text 28 | - Scanned page (Detect edges and apply a perspective transform to obtain the top-down view of the document) 29 | - Remove noise (blur, OTSU, adaptiveThreshold with opencv) 30 | - Shape detection and extraction 31 | - OCR (Not a real issue since we can use: Tesseract 4, great for printed text) 32 | - Handwriting recognition 33 | - Minimize errors 34 | -------------------------------------------------------------------------------- /files/form1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doxakis/form-segmentation/4395b4dd77acd17322bec5b9213dbe20c8d405c6/files/form1.jpg -------------------------------------------------------------------------------- /files/form10.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doxakis/form-segmentation/4395b4dd77acd17322bec5b9213dbe20c8d405c6/files/form10.jpg -------------------------------------------------------------------------------- /files/form11.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/doxakis/form-segmentation/4395b4dd77acd17322bec5b9213dbe20c8d405c6/files/form11.png -------------------------------------------------------------------------------- /files/form2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doxakis/form-segmentation/4395b4dd77acd17322bec5b9213dbe20c8d405c6/files/form2.jpg -------------------------------------------------------------------------------- /files/form3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doxakis/form-segmentation/4395b4dd77acd17322bec5b9213dbe20c8d405c6/files/form3.jpg -------------------------------------------------------------------------------- /files/form4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doxakis/form-segmentation/4395b4dd77acd17322bec5b9213dbe20c8d405c6/files/form4.jpg -------------------------------------------------------------------------------- /files/form5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doxakis/form-segmentation/4395b4dd77acd17322bec5b9213dbe20c8d405c6/files/form5.jpg -------------------------------------------------------------------------------- /files/form6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doxakis/form-segmentation/4395b4dd77acd17322bec5b9213dbe20c8d405c6/files/form6.jpg -------------------------------------------------------------------------------- /files/form7.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doxakis/form-segmentation/4395b4dd77acd17322bec5b9213dbe20c8d405c6/files/form7.jpg 
-------------------------------------------------------------------------------- /files/form8.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doxakis/form-segmentation/4395b4dd77acd17322bec5b9213dbe20c8d405c6/files/form8.jpg -------------------------------------------------------------------------------- /files/form9.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doxakis/form-segmentation/4395b4dd77acd17322bec5b9213dbe20c8d405c6/files/form9.jpg -------------------------------------------------------------------------------- /img/common_segmentation_methods.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doxakis/form-segmentation/4395b4dd77acd17322bec5b9213dbe20c8d405c6/img/common_segmentation_methods.png --------------------------------------------------------------------------------