├── __init__.py
├── models
│   ├── __init__.py
│   └── resnet.py
├── requirements.txt
├── images
│   ├── hello_world.png
│   └── vietnamxinchao.png
├── utils.py
├── prediction.py
└── train_model.py
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
from .utils import load_az_dataset, load_zero_nine_dataset
--------------------------------------------------------------------------------
/models/__init__.py:
--------------------------------------------------------------------------------
from .resnet import ResNet
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
opencv-python
matplotlib
scikit-learn
numpy
tensorflow
imutils
--------------------------------------------------------------------------------
/images/hello_world.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/housecricket/How-did-I-write-an-own-OCR-program-using-Keras-and-TensorFlow-in-Python/HEAD/images/hello_world.png
--------------------------------------------------------------------------------
/images/vietnamxinchao.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/housecricket/How-did-I-write-an-own-OCR-program-using-Keras-and-TensorFlow-in-Python/HEAD/images/vietnamxinchao.png
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
from tensorflow.keras.datasets import mnist
import numpy as np


def load_az_dataset(dataset_path):
    # initialize the lists of data and labels
    data = []
    labels = []

    # loop over the rows of the A-Z handwritten letter dataset
    for row in open(dataset_path):
        # parse the label and image from the row
        row = row.split(",")
        label = int(row[0])
        image = np.array([int(x) for x in row[1:]], dtype="uint8")

        # images are represented as single-channel (grayscale) images
        # that are 28x28=784 pixels -- take this flattened 784-d list
        # of numbers and reshape it into a 28x28 matrix
        image = image.reshape((28, 28))

        # update the lists of data and labels
        data.append(image)
        labels.append(label)

    # convert the data and labels to NumPy arrays
    data = np.array(data, dtype="float32")
    labels = np.array(labels, dtype="int")

    # return a 2-tuple of the A-Z data and labels
    return (data, labels)


def load_zero_nine_dataset():
    # load the MNIST dataset and stack the training data and testing
    # data together (we'll create our own training and testing splits
    # later in the project)
    ((trainData, trainLabels), (testData, testLabels)) = mnist.load_data()
    data = np.vstack([trainData, testData])
    labels = np.hstack([trainLabels, testLabels])

    # return a 2-tuple of the MNIST data and labels
    return (data, labels)
--------------------------------------------------------------------------------
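A note on the data contract: load_az_dataset expects one image per CSV row, a 0-25 letter label followed by 784 grayscale pixel values (the layout used by the Kaggle "A-Z Handwritten Alphabets" dump, which this loader appears to target). A minimal sketch that round-trips a synthetic row through the loader; the file name is illustrative only:

# sketch: write one synthetic row and load it back (assumes the
# Kaggle-style layout: label, then 784 pixel values)
import numpy as np
from utils import load_az_dataset

row = [0] + [0] * 784  # label 0 ("A") plus a blank 28x28 image
with open("tiny_az.csv", "w") as f:
    f.write(",".join(str(v) for v in row) + "\n")

data, labels = load_az_dataset("tiny_az.csv")
print(data.shape, labels)  # -> (1, 28, 28) [0]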
/prediction.py:
--------------------------------------------------------------------------------
# USAGE
# python prediction.py --model path_to_trained_model --image path_to_input_image

# import the necessary packages
from tensorflow.keras.models import load_model
from imutils.contours import sort_contours
import numpy as np
import argparse
import imutils
import cv2

# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-i", "--image", required=True,
    help="path to input image")
ap.add_argument("-m", "--model", type=str, required=True,
    help="path to trained model")
args = vars(ap.parse_args())

# load the handwriting OCR model
print("[INFO] loading handwriting OCR model...")
model = load_model(args["model"])

# load the input image from disk, convert it to grayscale, and blur
# it to reduce noise
image = cv2.imread(args["image"])
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
blurred = cv2.GaussianBlur(gray, (5, 5), 0)

# perform edge detection, find contours in the edge map, and sort the
# resulting contours from left-to-right
edged = cv2.Canny(blurred, 30, 150)
cnts = cv2.findContours(edged.copy(), cv2.RETR_EXTERNAL,
    cv2.CHAIN_APPROX_SIMPLE)
cnts = imutils.grab_contours(cnts)
cnts = sort_contours(cnts, method="left-to-right")[0]

# initialize the list of contour bounding boxes and associated
# characters that we'll be OCR'ing
chars = []

# loop over the contours
for c in cnts:
    # compute the bounding box of the contour
    (x, y, w, h) = cv2.boundingRect(c)

    # filter out bounding boxes, ensuring they are neither too small
    # nor too large
    if (w >= 5 and w <= 150) and (h >= 15 and h <= 120):
        # extract the character and threshold it to make the character
        # appear as *white* (foreground) on a *black* background, then
        # grab the width and height of the thresholded image
        roi = gray[y:y + h, x:x + w]
        thresh = cv2.threshold(roi, 0, 255,
            cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]
        (tH, tW) = thresh.shape

        # if the width is greater than the height, resize along the
        # width dimension
        if tW > tH:
            thresh = imutils.resize(thresh, width=32)

        # otherwise, resize along the height
        else:
            thresh = imutils.resize(thresh, height=32)

        # re-grab the image dimensions (now that it's been resized)
        # and then determine how much we need to pad the width and
        # height such that our image will be 32x32
        (tH, tW) = thresh.shape
        dX = int(max(0, 32 - tW) / 2.0)
        dY = int(max(0, 32 - tH) / 2.0)

        # pad the image and force 32x32 dimensions
        padded = cv2.copyMakeBorder(thresh, top=dY, bottom=dY,
            left=dX, right=dX, borderType=cv2.BORDER_CONSTANT,
            value=(0, 0, 0))
        padded = cv2.resize(padded, (32, 32))

        # prepare the padded image for classification via our
        # handwriting OCR model
        padded = padded.astype("float32") / 255.0
        padded = np.expand_dims(padded, axis=-1)

        # update our list of characters that will be OCR'd
        chars.append((padded, (x, y, w, h)))

# extract the bounding box locations and padded characters
boxes = [b[1] for b in chars]
chars = np.array([c[0] for c in chars], dtype="float32")

# OCR the characters using our handwriting recognition model
preds = model.predict(chars)

# define the list of label names
labelNames = "0123456789"
labelNames += "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
labelNames = [l for l in labelNames]

# loop over the predictions and bounding box locations together
for (pred, (x, y, w, h)) in zip(preds, boxes):
    # find the index of the label with the largest corresponding
    # probability, then extract the probability and label
    i = np.argmax(pred)
    prob = pred[i]
    label = labelNames[i]

    # draw the prediction on the image
    print("[INFO] {} - {:.2f}%".format(label, prob * 100))
    cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2)
    cv2.putText(image, label, (x - 10, y - 10),
        cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 255, 0), 2)

# show the image
cv2.imshow("Image", image)
cv2.waitKey(0)
--------------------------------------------------------------------------------
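The thresholding, resize, and pad steps inside the contour loop define the exact input contract the model expects (32x32x1, white-on-black, scaled to [0, 1]). A standalone restatement of that normalization as a reusable helper; the function name is mine, not something the repo defines:

# sketch: the ROI -> 32x32x1 normalization from prediction.py, factored
# out for reuse or unit testing (illustrative)
import cv2
import imutils
import numpy as np

def normalize_char(roi_gray):
    # binarize so the character is white (foreground) on black
    thresh = cv2.threshold(roi_gray, 0, 255,
        cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]

    # resize the longer side to 32 pixels, center-pad the other side,
    # then force an exact 32x32 shape
    (tH, tW) = thresh.shape
    if tW > tH:
        thresh = imutils.resize(thresh, width=32)
    else:
        thresh = imutils.resize(thresh, height=32)
    (tH, tW) = thresh.shape
    dX, dY = (32 - tW) // 2, (32 - tH) // 2
    padded = cv2.copyMakeBorder(thresh, dY, dY, dX, dX,
        cv2.BORDER_CONSTANT, value=0)
    padded = cv2.resize(padded, (32, 32))

    # scale to [0, 1] and add the channel axis the model expects
    return np.expand_dims(padded.astype("float32") / 255.0, axis=-1)

print(normalize_char(np.full((60, 40), 255, dtype="uint8")).shape)  # (32, 32, 1)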
/models/resnet.py:
--------------------------------------------------------------------------------
# import the necessary packages
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import AveragePooling2D
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import ZeroPadding2D
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import add
from tensorflow.keras.regularizers import l2
from tensorflow.keras import backend as K


class ResNet:
    @staticmethod
    def residual_module(data, K, stride, chanDim, red=False,
            reg=0.0001, bnEps=2e-5, bnMom=0.9):
        # the shortcut branch of the ResNet module should be
        # initialized as the input (identity) data
        shortcut = data

        # the first block of the ResNet module is the 1x1 CONVs
        bn1 = BatchNormalization(axis=chanDim, epsilon=bnEps,
            momentum=bnMom)(data)
        act1 = Activation("relu")(bn1)
        conv1 = Conv2D(int(K * 0.25), (1, 1), use_bias=False,
            kernel_regularizer=l2(reg))(act1)

        # the second block of the ResNet module is the 3x3 CONVs
        bn2 = BatchNormalization(axis=chanDim, epsilon=bnEps,
            momentum=bnMom)(conv1)
        act2 = Activation("relu")(bn2)
        conv2 = Conv2D(int(K * 0.25), (3, 3), strides=stride,
            padding="same", use_bias=False,
            kernel_regularizer=l2(reg))(act2)

        # the third block of the ResNet module is another set of 1x1
        # CONVs
        bn3 = BatchNormalization(axis=chanDim, epsilon=bnEps,
            momentum=bnMom)(conv2)
        act3 = Activation("relu")(bn3)
        conv3 = Conv2D(K, (1, 1), use_bias=False,
            kernel_regularizer=l2(reg))(act3)

        # if we are to reduce the spatial size, apply a CONV layer to
        # the shortcut
        if red:
            shortcut = Conv2D(K, (1, 1), strides=stride,
                use_bias=False, kernel_regularizer=l2(reg))(act1)

        # add together the shortcut and the final CONV
        x = add([conv3, shortcut])

        # return the addition as the output of the ResNet module
        return x

    @staticmethod
    def build(width, height, depth, classes, stages, filters,
            reg=0.0001, bnEps=2e-5, bnMom=0.9, dataset="cifar"):
        # initialize the input shape to be "channels last" and the
        # channels dimension itself
        inputShape = (height, width, depth)
        chanDim = -1

        # if we are using "channels first", update the input shape
        # and channels dimension
        if K.image_data_format() == "channels_first":
            inputShape = (depth, height, width)
            chanDim = 1

        # set the input and apply BN
        inputs = Input(shape=inputShape)
        x = BatchNormalization(axis=chanDim, epsilon=bnEps,
            momentum=bnMom)(inputs)

        # check if we are utilizing the CIFAR dataset
        if dataset == "cifar":
            # apply a single CONV layer
            x = Conv2D(filters[0], (3, 3), use_bias=False,
                padding="same", kernel_regularizer=l2(reg))(x)

        # check to see if we are using the Tiny ImageNet dataset
        elif dataset == "tiny_imagenet":
            # apply CONV => BN => ACT => POOL to reduce spatial size
            x = Conv2D(filters[0], (5, 5), use_bias=False,
                padding="same", kernel_regularizer=l2(reg))(x)
            x = BatchNormalization(axis=chanDim, epsilon=bnEps,
                momentum=bnMom)(x)
            x = Activation("relu")(x)
            x = ZeroPadding2D((1, 1))(x)
            x = MaxPooling2D((3, 3), strides=(2, 2))(x)

        # loop over the number of stages
        for i in range(0, len(stages)):
            # initialize the stride, then apply a residual module
            # used to reduce the spatial size of the input volume
            stride = (1, 1) if i == 0 else (2, 2)
            x = ResNet.residual_module(x, filters[i + 1], stride,
                chanDim, red=True, bnEps=bnEps, bnMom=bnMom)

            # loop over the number of layers in the stage
            for j in range(0, stages[i] - 1):
                # apply a ResNet module
                x = ResNet.residual_module(x, filters[i + 1],
                    (1, 1), chanDim, bnEps=bnEps, bnMom=bnMom)

        # apply BN => ACT => POOL
        x = BatchNormalization(axis=chanDim, epsilon=bnEps,
            momentum=bnMom)(x)
        x = Activation("relu")(x)
        x = AveragePooling2D((8, 8))(x)

        # softmax classifier
        x = Flatten()(x)
        x = Dense(classes, kernel_regularizer=l2(reg))(x)
        x = Activation("softmax")(x)

        # create the model
        model = Model(inputs, x, name="resnet")

        # return the constructed network architecture
        return model
--------------------------------------------------------------------------------
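For the OCR task this network is built with the arguments used in train_model.py below: a 32x32x1 input, 36 classes (10 digits + 26 letters), three stages of three residual modules, and filters (64, 64, 128, 256). A quick shape sanity check showing the spatial math works out (stages at strides 1, 2, 2 take 32x32 down to 8x8, matching the 8x8 average pool):

# sketch: instantiate the architecture exactly as train_model.py does
from models import ResNet

model = ResNet.build(32, 32, 1, 36, (3, 3, 3), (64, 64, 128, 256), reg=0.0005)
print(model.input_shape, model.output_shape)  # (None, 32, 32, 1) (None, 36)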
/train_model.py:
--------------------------------------------------------------------------------
# set the matplotlib backend so figures can be saved in the background
# (this must happen before pyplot is imported)
import matplotlib
matplotlib.use("Agg")

# import the necessary packages
import argparse
import cv2
import matplotlib.pyplot as plt
import numpy as np
from imutils import build_montages
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.preprocessing.image import ImageDataGenerator

from models import ResNet
from utils import load_az_dataset
from utils import load_zero_nine_dataset

# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-a", "--az", required=True,
    help="path to A-Z dataset")
ap.add_argument("-m", "--model", default="trained_ocr.model", type=str,
    help="path to output the trained handwriting recognition model")
ap.add_argument("-p", "--plot", type=str, default="plot.png",
    help="path to output the training history plot")
args = vars(ap.parse_args())

# initialize the number of epochs to train for, the initial learning
# rate, and the batch size
EPOCHS = 50
INIT_LR = 1e-1
BS = 128

# load the A-Z and MNIST datasets, respectively
print("[INFO] loading datasets...")
(azData, azLabels) = load_az_dataset(args["az"])
(digitsData, digitsLabels) = load_zero_nine_dataset()

# the MNIST dataset occupies the labels 0-9, so let's add 10 to every
# A-Z label to ensure the A-Z characters are not incorrectly labeled
# as digits
azLabels += 10

# stack the A-Z data and labels with the MNIST digits data and labels
data = np.vstack([azData, digitsData])
labels = np.hstack([azLabels, digitsLabels])

# each image in the A-Z and MNIST digits datasets is 28x28 pixels;
# however, the architecture we're using is designed for 32x32 images,
# so we need to resize them to 32x32
data = [cv2.resize(image, (32, 32)) for image in data]
data = np.array(data, dtype="float32")

# add a channel dimension to every image in the dataset and scale the
# pixel intensities of the images from [0, 255] down to [0, 1]
data = np.expand_dims(data, axis=-1)
data /= 255.0

# convert the labels from integers to one-hot vectors
le = LabelBinarizer()
labels = le.fit_transform(labels)

# account for skew in the labeled data
classTotals = labels.sum(axis=0)
classWeight = {}

# loop over all classes and calculate the class weight
for i in range(0, len(classTotals)):
    classWeight[i] = classTotals.max() / classTotals[i]

# partition the data into training and testing splits using 80% of
# the data for training and the remaining 20% for testing
(trainX, testX, trainY, testY) = train_test_split(data,
    labels, test_size=0.20, stratify=None, random_state=42)

# construct the image generator for data augmentation
aug = ImageDataGenerator(rotation_range=10, zoom_range=0.05,
    width_shift_range=0.1, height_shift_range=0.1, shear_range=0.15,
    horizontal_flip=False, fill_mode="nearest")

# initialize and compile our deep neural network
print("[INFO] compiling model...")
opt = SGD(learning_rate=INIT_LR, decay=INIT_LR / EPOCHS)
model = ResNet.build(32, 32, 1, len(le.classes_), (3, 3, 3),
    (64, 64, 128, 256), reg=0.0005)
model.compile(loss="categorical_crossentropy",
    optimizer=opt, metrics=["accuracy"])
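# note (illustrative, not part of the original script): the "decay"
# argument above is only accepted by the legacy Keras optimizers; in
# TF >= 2.11 the equivalent per-step schedule would be:
#
#   from tensorflow.keras.optimizers.schedules import InverseTimeDecay
#   schedule = InverseTimeDecay(INIT_LR, decay_steps=1,
#       decay_rate=INIT_LR / EPOCHS)
#   opt = SGD(learning_rate=schedule)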
# train the network
print("[INFO] training network...")
H = model.fit(
    aug.flow(trainX, trainY, batch_size=BS),
    validation_data=(testX, testY),
    steps_per_epoch=len(trainX) // BS,
    epochs=EPOCHS,
    class_weight=classWeight,
    verbose=1)

# define the list of label names
labelNames = "0123456789"
labelNames += "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
labelNames = [l for l in labelNames]

# evaluate the network
print("[INFO] evaluating network...")
predictions = model.predict(testX, batch_size=BS)
print(classification_report(testY.argmax(axis=1),
    predictions.argmax(axis=1), target_names=labelNames))

# save the model to disk
print("[INFO] saving model...")
model.save(args["model"], save_format="h5")

# construct a plot that plots and saves the training history
N = np.arange(0, EPOCHS)
plt.style.use("ggplot")
plt.figure()
plt.plot(N, H.history["loss"], label="train_loss")
plt.plot(N, H.history["val_loss"], label="val_loss")
plt.plot(N, H.history["accuracy"], label="train_acc")
plt.plot(N, H.history["val_accuracy"], label="val_acc")
plt.title("Training Loss and Accuracy")
plt.xlabel("Epoch #")
plt.ylabel("Loss/Accuracy")
plt.legend(loc="lower left")
plt.savefig(args["plot"])
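# quick summary (illustrative addition): pull the best validation
# accuracy out of the History object that was just plotted above
bestEpoch = int(np.argmax(H.history["val_accuracy"]))
print("[INFO] best val_accuracy: {:.4f} (epoch {})".format(
    H.history["val_accuracy"][bestEpoch], bestEpoch + 1))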
# randomly select a few testing characters to visualize
images = []
for i in np.random.choice(np.arange(0, len(testY)), size=(49,)):
    # classify the character
    probs = model.predict(testX[np.newaxis, i])
    prediction = probs.argmax(axis=1)
    label = labelNames[prediction[0]]

    # extract the image from the test data and initialize the text
    # label color as green (correct)
    image = (testX[i] * 255).astype("uint8")
    color = (0, 255, 0)

    # otherwise, the class label prediction is incorrect
    if prediction[0] != np.argmax(testY[i]):
        color = (0, 0, 255)

    # merge the channels into one image, resize the image from 32x32
    # to 96x96 so we can better see it, and then draw the predicted
    # label on the image
    image = cv2.merge([image] * 3)
    image = cv2.resize(image, (96, 96), interpolation=cv2.INTER_LINEAR)
    cv2.putText(image, label, (5, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.75,
        color, 2)

    # add the image to our list of output images
    images.append(image)

# construct a 7x7 montage of the images
montage = build_montages(images, (96, 96), (7, 7))[0]

# save the output montage to disk -- the Agg backend cannot open
# display windows (the output file name "montage.png" is an arbitrary
# choice)
plt.figure(figsize=(10, 10))
plt.imshow(cv2.cvtColor(montage, cv2.COLOR_BGR2RGB))
plt.axis("off")
plt.savefig("montage.png")
--------------------------------------------------------------------------------
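Putting it together: train first, then run recognition against the saved weights. The CSV path below is a placeholder for wherever the A-Z handwritten-letters dataset lives on disk (the loader in utils.py only assumes the label,pixels row format); the other paths are the script defaults:

    python train_model.py --az path_to_a_z_dataset.csv --model trained_ocr.model --plot plot.png
    python prediction.py --model trained_ocr.model --image images/hello_world.png

The second command runs the OCR pipeline on the bundled images/hello_world.png test image.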