├── .gitignore ├── Form cleaner.ipynb ├── Intelligent Character Recognition.ipynb ├── LICENSE ├── Page detection.ipynb ├── Probabilistic Line Transformation.ipynb ├── README.md ├── files ├── form1.jpg ├── form10.jpg ├── form11.png ├── form2.jpg ├── form3.jpg ├── form4.jpg ├── form5.jpg ├── form6.jpg ├── form7.jpg ├── form8.jpg └── form9.jpg └── img └── common_segmentation_methods.png /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints/ 2 | -------------------------------------------------------------------------------- /Form cleaner.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Form cleaner" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import cv2\n", 17 | "import math\n", 18 | "import numpy as np\n", 19 | "import imutils\n", 20 | "import subprocess\n", 21 | "from IPython.display import Image\n", 22 | "\n", 23 | "inputFile = \"form11.png\"\n", 24 | "Image(filename='files/' + inputFile)" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "## Detect scanned page (if applicable)\n", 32 | "- Detect edges\n", 33 | "- Use the edges in the image to find the contour (outline) representing the piece of paper being scanned.\n", 34 | "- Apply a perspective transform to obtain the top-down view of the document.\n", 35 | "\n", 36 | "Objective:\n", 37 | "- We want to work on the scanned page (if applicable) or the page itself.\n", 38 | "\n", 39 | "See:\n", 40 | "- http://bretahajek.com/2017/01/scanning-documents-photos-opencv/\n", 41 | "- https://www.pyimagesearch.com/2014/09/01/build-kick-ass-mobile-document-scanner-just-5-minutes/\n", 42 | "- https://github.com/Breta01/handwriting-ocr/blob/master/PageDetection.ipynb" 43 | ] 44 | }, 45 | { 
46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "img = cv2.imread(\"files/\" + inputFile)\n", 52 | "\n", 53 | "# TODO\n", 54 | "\n", 55 | "cv2.imwrite(\"files/result-0.jpg\", img)\n", 56 | "Image(filename='files/result-0.jpg')" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "## Remove form / table structure\n", 64 | "\n", 65 | "Objective:\n", 66 | "- Remove horizontal and vertical lines\n", 67 | "- Make it easier to detect text (remove unnecessary elements on page)" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "options = \"files/result-0.jpg \"\n", 77 | "options += \"-type \"\n", 78 | "options += \"Grayscale \"\n", 79 | "options += \"-negate \"\n", 80 | "options += \"-define morphology:compose=darken \"\n", 81 | "options += \"-morphology Thinning 'Rectangle:15x1+0+0<' \"\n", 82 | "options += \"-negate \"\n", 83 | "options += \"files/result-1.jpg\"\n", 84 | "\n", 85 | "# Make sure to install imagemagick, otherwise the following line will fail\n", 86 | "subprocess.getoutput(\"convert \" + options)\n", 87 | "Image(filename='files/result-1.jpg')" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": {}, 93 | "source": [ 94 | "## Remove noise and make text clearer\n", 95 | "\n", 96 | "Objectives:\n", 97 | "- Make text clearer\n", 98 | "- Apply OTSU threshold to clean up the result" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "img = cv2.imread(\"files/result-1.jpg\")\n", 108 | "gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)\n", 109 | "\n", 110 | "gray = cv2.blur(gray,(1,1))\n", 111 | "gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]\n", 112 | "\n", 113 | "cv2.imwrite(\"files/result-2.jpg\", gray)\n", 114 | 
"Image(filename='files/result-2.jpg')" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "def my_blur(img):\n", 124 | " img = cv2.adaptiveThreshold(img, 255,\n", 125 | " cv2.ADAPTIVE_THRESH_GAUSSIAN_C,\n", 126 | " cv2.THRESH_BINARY, 115, 4)\n", 127 | " \n", 128 | " k1 = np.ones((1,1),np.uint8)\n", 129 | " img = cv2.morphologyEx(img, cv2.MORPH_OPEN, k1)\n", 130 | "\n", 131 | " _,img = cv2.threshold(img,0,255,cv2.THRESH_BINARY_INV)\n", 132 | "\n", 133 | " k1 = np.ones((2,2),np.uint8)\n", 134 | " img = cv2.morphologyEx(img, cv2.MORPH_DILATE, k1)\n", 135 | " \n", 136 | " img = cv2.blur(img,(2,2))\n", 137 | " img = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]\n", 138 | " return img\n", 139 | "\n", 140 | "img1 = cv2.imread(\"files/result-0.jpg\")\n", 141 | "img1 = cv2.cvtColor(img1, cv2.COLOR_BGR2GRAY)\n", 142 | "img1 = my_blur(img1)\n", 143 | "\n", 144 | "img2 = cv2.imread(\"files/result-2.jpg\")\n", 145 | "img2 = cv2.cvtColor(img2, cv2.COLOR_BGR2GRAY)\n", 146 | "img2 = my_blur(img2)\n", 147 | "\n", 148 | "img = cv2.absdiff(img1, img2)\n", 149 | "img = cv2.bitwise_not(img)\n", 150 | "\n", 151 | "#kernel = np.ones((1,1),np.uint8)\n", 152 | "#img = cv2.morphologyEx(img, cv2.MORPH_OPEN, kernel)\n", 153 | "\n", 154 | "cv2.imwrite(\"files/result-3.jpg\", img)\n", 155 | "Image(filename='files/result-3.jpg')" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "## Shape detection and extraction\n", 163 | "\n", 164 | "Objective:\n", 165 | "- Extract text line\n", 166 | "\n", 167 | "Possibilities:\n", 168 | "- MSER\n", 169 | "- Threshold (OTSU) and findContours\n", 170 | "\n", 171 | "See:\n", 172 | "- http://opencvpython.blogspot.ca/2012/06/hi-this-article-is-tutorial-which-try.html" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 
180 | "source": [ 181 | "# TODO" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "## OCR and handwriting recognition\n", 189 | "\n", 190 | "See:\n", 191 | "- Tesseract 4\n", 192 | "- https://github.com/Breta01/handwriting-ocr (Handwriting recognition)" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "# TODO" 202 | ] 203 | } 204 | ], 205 | "metadata": { 206 | "kernelspec": { 207 | "display_name": "Python 3", 208 | "language": "python", 209 | "name": "python3" 210 | }, 211 | "language_info": { 212 | "codemirror_mode": { 213 | "name": "ipython", 214 | "version": 3 215 | }, 216 | "file_extension": ".py", 217 | "mimetype": "text/x-python", 218 | "name": "python", 219 | "nbconvert_exporter": "python", 220 | "pygments_lexer": "ipython3", 221 | "version": "3.6.3" 222 | } 223 | }, 224 | "nbformat": 4, 225 | "nbformat_minor": 2 226 | } 227 | -------------------------------------------------------------------------------- /Intelligent Character Recognition.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Intelligent Character Recognition\n", 8 | "(AKA ICR)\n", 9 | "\n", 10 | "See:\n", 11 | "- https://en.wikipedia.org/wiki/Intelligent_character_recognition\n", 12 | "\n", 13 | "This is not the same as OCR.\n", 14 | "\n", 15 | "> When we talk about OCR, we are really talking about converting data from an image that was created by a machine, whether it be a document created by an office application, and even an old document typewritten\n", 16 | ">\n", 17 | "> For ICR, pattern-matching goes to a different level. 
You cannot really have success deploying a solution requiring ICR without additional information to aid in the recognition process.\n", 18 | ">\n", 19 | "> https://www.parascript.com/blog/difference-ocr-icr/\n", 20 | "\n", 21 | "# Segmentation challenges\n", 22 | "\n", 23 | "A few common segmentation methods are listed below:\n", 24 | "![Common segmentation methods](img/common_segmentation_methods.png)\n", 25 | "\n", 26 | "See: https://www.scanstore.com/ICR_Guide/\n", 27 | "\n", 28 | "## Possible solutions\n", 29 | "\n", 30 | "1. Use the common pattern to detect text field (AKA Handprint Recognition)\n", 31 | "\n", 32 | "2. Use imagemagick to remove the form/table structure:\n", 33 | "\n", 34 | "```\n", 35 | "convert inputFile.jpg -type Grayscale -negate -define morphology:compose=darken -morphology Thinning 'Rectangle:15x1+0+0<' -negate outputFile.jpg\n", 36 | "```\n", 37 | "\n", 38 | "Advantages:\n", 39 | "- Remove most horizontal and vertical lines\n", 40 | "- Clearer text (remove unnecessary elements on page)\n", 41 | "\n", 42 | "Disadvantages:\n", 43 | "- It introduces extra spaces (becomes monospaced font)\n", 44 | "- It can remove some part of handwriting text\n", 45 | "- Hard to determine the perfect rectangle size (vary: 15)\n", 46 | "\n", 47 | "# Let's explore the available options on the market\n", 48 | "\n", 49 | "## queXF\n", 50 | "An open source, web based paper form verification and data entry system\n", 51 | "\n", 52 | "> The process of ICR is broken in to the following 7 steps:\n", 53 | "> - Character isolation\n", 54 | "> - Noise reduction\n", 55 | "> - Boundary removal\n", 56 | "> - Normalising\n", 57 | "> - Thinning\n", 58 | "> - Feature extraction\n", 59 | "> - Training or Recognition\n", 60 | ">\n", 61 | "> https://quexf.acspri.org.au/intelligent-character-recognition\n", 62 | "\n", 63 | "It can:\n", 64 | "- Automatically detects locations of boxes on the form (no need to manually overlay boxes)\n", 65 | "\n", 66 | "\n", 67 | "## NIST 
FORM-BASED HANDPRINT RECOGNITION SYSTEM (RELEASE 2.2)\n", 68 | "A public domain document processing system was developed by the National Institute of Standards and Technology (NIST) in 1994. The system is a standard reference form-based handprint recognition system for evaluating optical character recognition (OCR), and it is intended to provide a baseline of performance on an open application. \n", 69 | "\n", 70 | "NIST developed a new release in 2003 for internet distribution of its standard reference form-based handprint recognition system for evaluating optical character recognition (OCR). Release 2.2 incorporates all bug fixes generated for previous versions.\n", 71 | "\n", 72 | "https://www.nist.gov/services-resources/software/public-domain-ocr\n", 73 | "\n", 74 | "For source code and training data:\n", 75 | "- https://github.com/brainopener/nist-ocr\n", 76 | "\n", 77 | "According to the documentation:\n", 78 | "- To process a new type of form, trainreg must be run on a prototypical form and the output coordinate stored.\n", 79 | "\n", 80 | "## ABBYY Cloud OCR SDK\n", 81 | "\n", 82 | "https://ocrsdk.com/documentation/quick-start/text-fields/\n", 83 | "\n", 84 | "## Others\n", 85 | "- https://www.a2ia.com/fr/a2ia-fieldreader\n", 86 | "- http://www.recogniform.com/index.htm\n" 87 | ] 88 | } 89 | ], 90 | "metadata": { 91 | "kernelspec": { 92 | "display_name": "Python 3", 93 | "language": "python", 94 | "name": "python3" 95 | }, 96 | "language_info": { 97 | "codemirror_mode": { 98 | "name": "ipython", 99 | "version": 3 100 | }, 101 | "file_extension": ".py", 102 | "mimetype": "text/x-python", 103 | "name": "python", 104 | "nbconvert_exporter": "python", 105 | "pygments_lexer": "ipython3", 106 | "version": "3.6.3" 107 | } 108 | }, 109 | "nbformat": 4, 110 | "nbformat_minor": 2 111 | } 112 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 
MIT License 2 | 3 | Copyright (c) 2017 Philip Doxakis 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Page detection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import cv2\n", 10 | "import numpy as np\n", 11 | "import sys\n", 12 | "\n", 13 | "image = cv2.cvtColor(cv2.imread('files/form8.jpg'), cv2.COLOR_BGR2RGB)\n", 14 | "\n", 15 | "def resize(img, height=800):\n", 16 | " \"\"\" Resize image to given height \"\"\"\n", 17 | " rat = height / img.shape[0]\n", 18 | " return cv2.resize(img, (int(rat * img.shape[1]), height))\n", 19 | "\n", 20 | "# Resize and convert to grayscale\n", 21 | "img = cv2.cvtColor(resize(image), cv2.COLOR_BGR2GRAY)\n", 22 | "\n", 23 | "# Bilateral filter preserv edges\n", 24 | "img = cv2.bilateralFilter(img, 9, 75, 75)\n", 25 | "\n", 26 | "# Create black and white image based on adaptive threshold\n", 27 | "img = cv2.adaptiveThreshold(img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 115, 4)\n", 28 | "\n", 29 | "# Median filter clears small details\n", 30 | "img = cv2.medianBlur(img, 11)\n", 31 | "\n", 32 | "# Add black border in case that page is touching an image border\n", 33 | "img = cv2.copyMakeBorder(img, 5, 5, 5, 5, cv2.BORDER_CONSTANT, value=[0, 0, 0])\n", 34 | "\n", 35 | "edges = cv2.Canny(img, 200, 250)\n", 36 | "\n", 37 | "# Getting contours \n", 38 | "im2, contours, hierarchy = cv2.findContours(edges, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)\n", 39 | "\n", 40 | "# Finding contour of biggest rectangle\n", 41 | "# Otherwise return corners of original image\n", 42 | "# Don't forget on our 5px border!\n", 43 | "height = edges.shape[0]\n", 44 | "width = edges.shape[1]\n", 45 | "MAX_COUNTOUR_AREA = (width - 10) * (height - 10)\n", 46 | "\n", 47 | "# Page fill at least half of image, then saving max area found\n", 48 | "maxAreaFound = MAX_COUNTOUR_AREA * 0.5\n", 
49 | "\n", 50 | "# Saving page contour\n", 51 | "pageContour = np.array([[5, 5], [5, height-5], [width-5, height-5], [width-5, 5]])\n", 52 | "\n", 53 | "# Go through all contours\n", 54 | "for cnt in contours:\n", 55 | " # Simplify contour\n", 56 | " perimeter = cv2.arcLength(cnt, True)\n", 57 | " approx = cv2.approxPolyDP(cnt, 0.03 * perimeter, True)\n", 58 | "\n", 59 | " # Page has 4 corners and it is convex\n", 60 | " # Page area must be bigger than maxAreaFound \n", 61 | " if (len(approx) == 4 and\n", 62 | " cv2.isContourConvex(approx) and\n", 63 | " maxAreaFound < cv2.contourArea(approx) < MAX_COUNTOUR_AREA):\n", 64 | "\n", 65 | " maxAreaFound = cv2.contourArea(approx)\n", 66 | " pageContour = approx\n", 67 | "\n", 68 | "# Result in pageContour (numpy array of 4 points):\n", 69 | "\n", 70 | "\n", 71 | "def fourCornersSort(pts):\n", 72 | " \"\"\" Sort corners: top-left, bot-left, bot-right, top-right \"\"\"\n", 73 | " # Difference and sum of x and y value\n", 74 | " # Inspired by http://www.pyimagesearch.com\n", 75 | " diff = np.diff(pts, axis=1)\n", 76 | " summ = pts.sum(axis=1)\n", 77 | " \n", 78 | " # Top-left point has smallest sum...\n", 79 | " # np.argmin() returns INDEX of min\n", 80 | " return np.array([pts[np.argmin(summ)],\n", 81 | " pts[np.argmax(diff)],\n", 82 | " pts[np.argmax(summ)],\n", 83 | " pts[np.argmin(diff)]])\n", 84 | "\n", 85 | "\n", 86 | "def contourOffset(cnt, offset):\n", 87 | " \"\"\" Offset contour, by 5px border \"\"\"\n", 88 | " # Matrix addition\n", 89 | " cnt += offset\n", 90 | " \n", 91 | " # if value < 0 => replace it by 0\n", 92 | " cnt[cnt < 0] = 0\n", 93 | " return cnt\n", 94 | "\n", 95 | "\n", 96 | "# Sort and offset corners\n", 97 | "pageContour = fourCornersSort(pageContour[:, 0])\n", 98 | "pageContour = contourOffset(pageContour, (-5, -5))\n", 99 | "\n", 100 | "# Recalculate to original scale - start Points\n", 101 | "sPoints = pageContour.dot(image.shape[0] / 800)\n", 102 | " \n", 103 | "# Using Euclidean distance\n", 
104 | "# Calculate maximum height (maximal length of vertical edges) and width\n", 105 | "height = max(np.linalg.norm(sPoints[0] - sPoints[1]),\n", 106 | " np.linalg.norm(sPoints[2] - sPoints[3]))\n", 107 | "width = max(np.linalg.norm(sPoints[1] - sPoints[2]),\n", 108 | " np.linalg.norm(sPoints[3] - sPoints[0]))\n", 109 | "\n", 110 | "# Create target points\n", 111 | "tPoints = np.array([[0, 0],\n", 112 | " [0, height],\n", 113 | " [width, height],\n", 114 | " [width, 0]], np.float32)\n", 115 | "\n", 116 | "# getPerspectiveTransform() needs float32\n", 117 | "if sPoints.dtype != np.float32:\n", 118 | " sPoints = sPoints.astype(np.float32)\n", 119 | "\n", 120 | "# Warping perspective\n", 121 | "M = cv2.getPerspectiveTransform(sPoints, tPoints) \n", 122 | "newImage = cv2.warpPerspective(image, M, (int(width), int(height)))\n", 123 | "\n", 124 | "# Saving the result. Yay! (don't forget to convert colors back to BGR)\n", 125 | "cv2.imwrite(\"files/result-page-detection.jpg\", newImage)" 126 | ] 127 | } 128 | ], 129 | "metadata": { 130 | "kernelspec": { 131 | "display_name": "Python 3", 132 | "language": "python", 133 | "name": "python3" 134 | }, 135 | "language_info": { 136 | "codemirror_mode": { 137 | "name": "ipython", 138 | "version": 3 139 | }, 140 | "file_extension": ".py", 141 | "mimetype": "text/x-python", 142 | "name": "python", 143 | "nbconvert_exporter": "python", 144 | "pygments_lexer": "ipython3", 145 | "version": "3.6.3" 146 | } 147 | }, 148 | "nbformat": 4, 149 | "nbformat_minor": 2 150 | } 151 | -------------------------------------------------------------------------------- /Probabilistic Line Transformation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import cv2\n", 10 | "import math\n", 11 | "import numpy as np\n", 12 | "import imutils\n", 13 | "from IPython.display 
import Image" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": null, 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "def detectOrientation(image):\n", 23 | " # convert the image to grayscale and flip the foreground\n", 24 | " # and background to ensure foreground is now \"white\" and\n", 25 | " # the background is \"black\"\n", 26 | " gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)\n", 27 | " gray = cv2.bitwise_not(gray)\n", 28 | "\n", 29 | " # threshold the image, setting all foreground pixels to\n", 30 | " # 255 and all background pixels to 0\n", 31 | " thresh = cv2.threshold(gray, 0, 255,\n", 32 | " cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]\n", 33 | " \n", 34 | " # grab the (x, y) coordinates of all pixel values that\n", 35 | " # are greater than zero, then use these coordinates to\n", 36 | " # compute a rotated bounding box that contains all\n", 37 | " # coordinates\n", 38 | " coords = np.column_stack(np.where(thresh > 0))\n", 39 | " angle = cv2.minAreaRect(coords)[-1]\n", 40 | "\n", 41 | " # the `cv2.minAreaRect` function returns values in the\n", 42 | " # range [-90, 0); as the rectangle rotates clockwise the\n", 43 | " # returned angle trends to 0 -- in this special case we\n", 44 | " # need to add 90 degrees to the angle\n", 45 | " if angle < -45:\n", 46 | " angle = -(90 + angle)\n", 47 | "\n", 48 | " # otherwise, just take the inverse of the angle to make\n", 49 | " # it positive\n", 50 | " else:\n", 51 | " angle = -angle\n", 52 | " \n", 53 | " # rotate the image to deskew it\n", 54 | " (h, w) = image.shape[:2]\n", 55 | " center = (w // 2, h // 2)\n", 56 | " M = cv2.getRotationMatrix2D(center, angle, 1.0)\n", 57 | " rotated = cv2.warpAffine(image, M, (w, h),\n", 58 | " flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)\n", 59 | " return (angle, rotated)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "inputPath = 
'files/form2.jpg'\n", 69 | "\n", 70 | "# Probabilistic Line Transformation.\n", 71 | "\n", 72 | "img = cv2.imread(inputPath)\n", 73 | "img = imutils.rotate(img, 0)\n", 74 | "\n", 75 | "gray = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)\n", 76 | "edges = cv2.Canny(gray,50,200,apertureSize = 3)\n", 77 | "minLineLength = 10\n", 78 | "maxLineGap = 100\n", 79 | "lines = cv2.HoughLinesP(edges,1,np.pi/360,100,minLineLength,maxLineGap)\n", 80 | "if lines is not None:\n", 81 | " for line in lines:\n", 82 | " for x1,y1,x2,y2 in line:\n", 83 | " cv2.line(img,(x1,y1),(x2,y2),(0,255,0),2,cv2.LINE_AA)\n", 84 | "\n", 85 | "outputPath = inputPath.replace('.jpg', '_output_ProbabilisticLineTransformation.jpg')\n", 86 | "cv2.imwrite(outputPath, img)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "print(detectOrientation(img)[0])" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [] 104 | } 105 | ], 106 | "metadata": { 107 | "kernelspec": { 108 | "display_name": "Python 3", 109 | "language": "python", 110 | "name": "python3" 111 | }, 112 | "language_info": { 113 | "codemirror_mode": { 114 | "name": "ipython", 115 | "version": 3 116 | }, 117 | "file_extension": ".py", 118 | "mimetype": "text/x-python", 119 | "name": "python", 120 | "nbconvert_exporter": "python", 121 | "pygments_lexer": "ipython3", 122 | "version": "3.6.3" 123 | } 124 | }, 125 | "nbformat": 4, 126 | "nbformat_minor": 2 127 | } 128 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Form Segmentation 2 | 3 | Let's explore how we can extract text from any forms / scanned pages. 4 | 5 | ## Objectives 6 | 7 | The goal is to find an algorithm that can extract the maximum information from a given page (jpg format). 
8 | So, we can feed it to another system. (Business logic, neural network, classifier, etc.) 9 | The overall process may not be perfect. 10 | But it would be great if it can find enough information to identify the type of document and the involved identities. 11 | 12 | - Parse any form / scanned page and extract any text data (printed text and handwriting text). 13 | So, no prior knowledge of the layout / structure of the document. 14 | 15 | - Automatic extraction process (no human interaction. So, it can scale out) 16 | 17 | - Somewhat fast (or the ability to speed up the task with more machines or CPU) 18 | 19 | ## Challenges 20 | 21 | There are many challenges to overcome. 22 | But the main problem is to identify which part of the form contains text. 23 | 24 | Some other challenges: 25 | 26 | - Black Border Removal 27 | - ICR (Intelligent Character Recognition): recognize and convert hand-drawn characters into text 28 | - Scanned page (Detect edges and apply a perspective transform to obtain the top-down view of the document) 29 | - Remove noise (blur, OTSU, adaptiveThreshold with opencv) 30 | - Shape detection and extraction 31 | - OCR (Not a real issue since we can use: Tesseract 4, great for printed text) 32 | - Handwriting recognition 33 | - Minimize errors 34 | -------------------------------------------------------------------------------- /files/form1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doxakis/form-segmentation/4395b4dd77acd17322bec5b9213dbe20c8d405c6/files/form1.jpg -------------------------------------------------------------------------------- /files/form10.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doxakis/form-segmentation/4395b4dd77acd17322bec5b9213dbe20c8d405c6/files/form10.jpg -------------------------------------------------------------------------------- /files/form11.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/doxakis/form-segmentation/4395b4dd77acd17322bec5b9213dbe20c8d405c6/files/form11.png -------------------------------------------------------------------------------- /files/form2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doxakis/form-segmentation/4395b4dd77acd17322bec5b9213dbe20c8d405c6/files/form2.jpg -------------------------------------------------------------------------------- /files/form3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doxakis/form-segmentation/4395b4dd77acd17322bec5b9213dbe20c8d405c6/files/form3.jpg -------------------------------------------------------------------------------- /files/form4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doxakis/form-segmentation/4395b4dd77acd17322bec5b9213dbe20c8d405c6/files/form4.jpg -------------------------------------------------------------------------------- /files/form5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doxakis/form-segmentation/4395b4dd77acd17322bec5b9213dbe20c8d405c6/files/form5.jpg -------------------------------------------------------------------------------- /files/form6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doxakis/form-segmentation/4395b4dd77acd17322bec5b9213dbe20c8d405c6/files/form6.jpg -------------------------------------------------------------------------------- /files/form7.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doxakis/form-segmentation/4395b4dd77acd17322bec5b9213dbe20c8d405c6/files/form7.jpg 
-------------------------------------------------------------------------------- /files/form8.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doxakis/form-segmentation/4395b4dd77acd17322bec5b9213dbe20c8d405c6/files/form8.jpg -------------------------------------------------------------------------------- /files/form9.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doxakis/form-segmentation/4395b4dd77acd17322bec5b9213dbe20c8d405c6/files/form9.jpg -------------------------------------------------------------------------------- /img/common_segmentation_methods.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/doxakis/form-segmentation/4395b4dd77acd17322bec5b9213dbe20c8d405c6/img/common_segmentation_methods.png --------------------------------------------------------------------------------