├── Img ├── Aadhar.png ├── address.png ├── app.png ├── gif of visionapi.gif ├── opencv_ocr_pipeline.png └── visionraw.mp4 ├── OCR_source file.py └── README.md /Img/Aadhar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sakethbachu/ocr_using_tesseract/3de4bc701a7f4fe6b0da86be67ee153c4245217b/Img/Aadhar.png -------------------------------------------------------------------------------- /Img/address.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sakethbachu/ocr_using_tesseract/3de4bc701a7f4fe6b0da86be67ee153c4245217b/Img/address.png -------------------------------------------------------------------------------- /Img/app.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sakethbachu/ocr_using_tesseract/3de4bc701a7f4fe6b0da86be67ee153c4245217b/Img/app.png -------------------------------------------------------------------------------- /Img/gif of visionapi.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sakethbachu/ocr_using_tesseract/3de4bc701a7f4fe6b0da86be67ee153c4245217b/Img/gif of visionapi.gif -------------------------------------------------------------------------------- /Img/opencv_ocr_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sakethbachu/ocr_using_tesseract/3de4bc701a7f4fe6b0da86be67ee153c4245217b/Img/opencv_ocr_pipeline.png -------------------------------------------------------------------------------- /Img/visionraw.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sakethbachu/ocr_using_tesseract/3de4bc701a7f4fe6b0da86be67ee153c4245217b/Img/visionraw.mp4 
-------------------------------------------------------------------------------- /OCR_source file.py: -------------------------------------------------------------------------------- 1 | #With this code you can perform text detection and recognition using tesseract engine maintained by the Google 2 | #This is a 2 step process of extraction of ROI and performing recognition task on the extracted ROIs 3 | #Instead of tesseract engie you can aslo use the google vision api 4 | 5 | # import the necessary packages 6 | from imutils.object_detection import non_max_suppression 7 | import numpy as np 8 | import pytesseract 9 | import argparse 10 | import cv2 11 | # This function basically uses a deep learning based text detector in regions withing the image 12 | # The text detector gives probability of given area having text and also maps to the bounding box location in 13 | # text containing region of the image. 14 | def decode_predictions(scores, geometry): 15 | (numRows, numCols) = scores.shape[2:4] 16 | rects = [] 17 | confidences = [] 18 | 19 | # loop over the number of rows 20 | for y in range(0, numRows): 21 | scoresData = scores[0, 0, y] 22 | xData0 = geometry[0, 0, y] 23 | xData1 = geometry[0, 1, y] 24 | xData2 = geometry[0, 2, y] 25 | xData3 = geometry[0, 3, y] 26 | anglesData = geometry[0, 4, y] 27 | 28 | for x in range(0, numCols): 29 | # if our score does not have sufficient probability, 30 | if scoresData[x] < args["min_confidence"]: 31 | continue 32 | 33 | (offsetX, offsetY) = (x * 4.0, y * 4.0) 34 | 35 | angle = anglesData[x] 36 | cos = np.cos(angle) 37 | sin = np.sin(angle) 38 | 39 | h = xData0[x] + xData2[x] 40 | w = xData1[x] + xData3[x] 41 | 42 | # compute both the starting and ending (x, y)-coordinates 43 | # for the text prediction bounding box 44 | endX = int(offsetX + (cos * xData1[x]) + (sin * xData2[x])) 45 | endY = int(offsetY - (sin * xData1[x]) + (cos * xData2[x])) 46 | startX = int(endX - w) 47 | startY = int(endY - h) 48 | 49 | # add the 
bounding box coordinates and probability score 50 | # to our respective lists 51 | rects.append((startX, startY, endX, endY)) 52 | confidences.append(scoresData[x]) 53 | 54 | # return a tuple of the bounding boxes and associated confidences 55 | return (rects, confidences) 56 | 57 | # construct the argument parser and parse the arguments 58 | # all the necessary inputs will be taken from this block of code 59 | # the description given as a part of help tag will give you information about the input 60 | ap = argparse.ArgumentParser() 61 | ap.add_argument("-i", "--image", type=str, 62 | help="path to input image") 63 | ap.add_argument("-east", "--east", type=str, 64 | help="path to input EAST text detector") 65 | ap.add_argument("-c", "--min-confidence", type=float, default=0.5, 66 | help="minimum probability required to inspect a region") 67 | ap.add_argument("-w", "--width", type=int, default=320, 68 | help="nearest multiple of 32 for resized width") 69 | ap.add_argument("-e", "--height", type=int, default=320, 70 | help="nearest multiple of 32 for resized height") 71 | ap.add_argument("-p", "--padding", type=float, default=0.0, 72 | help="amount of padding to add to each border of ROI") 73 | args = vars(ap.parse_args()) 74 | 75 | # load the input image and grab the image dimensions 76 | image = cv2.imread(args["image"]) 77 | orig = image.copy() 78 | #getting the dimensions of the image 79 | (origH, origW) = image.shape[:2] 80 | 81 | # set the new width and height and then determine the ratio in change 82 | # for both the width and height 83 | (newW, newH) = (args["width"], args["height"]) 84 | rW = origW / float(newW) 85 | rH = origH / float(newH) 86 | 87 | # resize the image and grab the new image dimensions 88 | image = cv2.resize(image, (newW, newH)) 89 | (H, W) = image.shape[:2] 90 | 91 | 92 | layerNames = [ 93 | "feature_fusion/Conv_7/Sigmoid", 94 | "feature_fusion/concat_3"] 95 | 96 | # load the pre-trained EAST text detector 97 | print("EAST text 
detector...") 98 | net = cv2.dnn.readNet(args["east"]) 99 | 100 | 101 | blob = cv2.dnn.blobFromImage(image, 1.0, (W, H), 102 | (123.68, 116.78, 103.94), swapRB=True, crop=False) 103 | net.setInput(blob) 104 | (scores, geometry) = net.forward(layerNames) 105 | 106 | # decode the predictions, then apply non-maxima suppression to 107 | # suppress weak, overlapping bounding boxes 108 | (rects, confidences) = decode_predictions(scores, geometry) 109 | boxes = non_max_suppression(np.array(rects), probs=confidences) 110 | 111 | # initialize the list of results 112 | results = [] 113 | 114 | # loop over the bounding boxes 115 | for (startX, startY, endX, endY) in boxes: 116 | # scale the bounding box coordinates based on the respective 117 | # ratios 118 | startX = int(startX * rW) 119 | startY = int(startY * rH) 120 | endX = int(endX * rW) 121 | endY = int(endY * rH) 122 | 123 | 124 | # are computing the deltas in both the x and y directions 125 | dX = int((endX - startX) * args["padding"]) 126 | dY = int((endY - startY) * args["padding"]) 127 | 128 | # apply padding to each side of the bounding box, respectively 129 | startX = max(0, startX - dX) 130 | startY = max(0, startY - dY) 131 | endX = min(origW, endX + (dX * 2)) 132 | endY = min(origH, endY + (dY * 2)) 133 | 134 | # extract the actual padded ROI 135 | roi = orig[startY:endY, startX:endX] 136 | 137 | # in order to apply Tesseract v4 to OCR text we must supply 138 | # (1) a language, (2) an OEM flag of 4, indicating that the we 139 | # wish to use the LSTM neural net model for OCR, and finally 140 | # (3) an OEM value, in this case, 7 which implies that we are 141 | # treating the ROI as a single line of text 142 | config = ("-l eng --oem 1 --psm 7") 143 | text = pytesseract.image_to_string(roi, config=config) 144 | 145 | # add the bounding box coordinates and OCR'd text to the list 146 | # of results 147 | results.append(((startX, startY, endX, endY), text)) 148 | 149 | # sort the results bounding box coordinates 
from top to bottom 150 | results = sorted(results, key=lambda r:r[0][1]) 151 | 152 | # loop over the results 153 | for ((startX, startY, endX, endY), text) in results: 154 | # display the text OCR'd by Tesseract 155 | print("OCR TEXT") 156 | print("========") 157 | print("{}\n".format(text)) 158 | 159 | # strip out non-ASCII text so we can draw the text on the image 160 | # using OpenCV, then draw the text and a bounding box surrounding 161 | # the text region of the input image 162 | text = "".join([c if ord(c) < 128 else "" for c in text]).strip() 163 | output = orig.copy() 164 | cv2.rectangle(output, (startX, startY), (endX, endY), 165 | (0, 0, 255), 2) 166 | cv2.putText(output, text, (startX, startY - 20), 167 | cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 0, 255), 3) 168 | 169 | # show the output image 170 | #calling the imshow function 171 | cv2.imshow("Text Detection", output) 172 | cv2.waitKey(0) 173 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Optical Charachter Recognition using Tesseract 2 | [![Watch the video](https://github.com/sakethbachu/ocr_using_tesseract/blob/master/Img/gif%20of%20visionapi.gif)](https://github.com/sakethbachu/ocr_using_tesseract/blob/master/Img/visionraw.mp4) 3 | 4 | # Description 5 | Optical charachter recognition is one of the most important tasks in today's automated world. In this repository we have used both Tesseract and Vision-api by Google. We did not benchmark the results obtained by the methods. In this, we will perform both (1) text detection and (2) text recognition using OpenCV, Python, and Tesseract. To perform text detection we use OpenCV’s EAST deep learning model. Using this model we were able to detect and localize the bounding box coordinates of text contained in an image. 
The next step is to take each of these areas containing text and actually recognize and OCR the text using OpenCV and Tesseract. 6 | 7 | # Methods 8 | * In order to perform OpenCV OCR text recognition, we’ll use Tesseract v4 which includes a highly accurate deep learning-based model for text recognition. 9 | * Once we have detected the text regions with OpenCV, we’ll then extract each of the text ROIs and pass them into Tesseract, enabling us to build an entire OpenCV OCR pipeline. 10 | 11 | # Features 12 | The Tesseract v4 API uses an LSTM network for text recognition. The overall pipeline is given below. 13 | ![alt text](https://github.com/sakethbachu/ocr_using_tesseract/blob/master/Img/opencv_ocr_pipeline.png "Logo Title Text 1") 14 | 15 | # Contents of this repository 16 | * OCR_source file.py : This is the main Python source file that performs OCR using Tesseract. 17 | * Img : Contains the descriptive images and the omr-sheet. 18 | 19 | # Requirements 20 | * Python 21 | * OpenCV 22 | * Tesseract v4 23 | * Google Vision API 24 | 25 | # Usage 26 | Use this on a command line where Python is installed: `python "OCR_source file.py" --east frozen_east_text_detection.pb --image images/example_01.jpg` 27 | 28 | # Demo 29 | ![alt text](https://github.com/sakethbachu/ocr_using_tesseract/blob/master/Img/address.png "Logo Title Text 1") 30 | --------------------------------------------------------------------------------