├── __init__.py
├── models
│   ├── __init__.py
│   └── resnet.py
├── requirements.txt
├── images
│   ├── hello_world.png
│   └── vietnamxinchao.png
├── utils.py
├── prediction.py
└── train_model.py
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
from .utils import load_az_dataset, load_zero_nine_dataset
--------------------------------------------------------------------------------
/models/__init__.py:
--------------------------------------------------------------------------------
from .resnet import ResNet
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
opencv-python
matplotlib
scikit-learn
numpy
tensorflow
imutils
--------------------------------------------------------------------------------
/images/hello_world.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/housecricket/How-did-I-write-an-own-OCR-program-using-Keras-and-TensorFlow-in-Python/HEAD/images/hello_world.png
--------------------------------------------------------------------------------
/images/vietnamxinchao.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/housecricket/How-did-I-write-an-own-OCR-program-using-Keras-and-TensorFlow-in-Python/HEAD/images/vietnamxinchao.png
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
from tensorflow.keras.datasets import mnist
import numpy as np


def load_az_dataset(dataset_path):
    # initialize the lists of data and labels
    data = []
    labels = []

    # loop over the rows of the A-Z handwritten letter dataset
    for row in open(dataset_path):
        # parse the label and image from the row
        row = row.split(",")
        label = int(row[0])
        image = np.array([int(x) for x in row[1:]], dtype="uint8")

        # images are represented as single-channel (grayscale) images
        # that are 28x28=784 pixels -- take this flattened 784-d list
        # of numbers and reshape it into a 28x28 matrix
        image = image.reshape((28, 28))

        # update the lists of data and labels
        data.append(image)
        labels.append(label)

    # convert the data and labels to NumPy arrays
    data = np.array(data, dtype="float32")
    labels = np.array(labels, dtype="int")

    # return a 2-tuple of the A-Z data and labels
    return (data, labels)


def load_zero_nine_dataset():
    # load the MNIST dataset and stack the training data and testing
    # data together (we'll create our own training and testing splits
    # later in the project)
    ((trainData, trainLabels), (testData, testLabels)) = mnist.load_data()
    data = np.vstack([trainData, testData])
    labels = np.hstack([trainLabels, testLabels])

    # return a 2-tuple of the MNIST data and labels
    return (data, labels)
--------------------------------------------------------------------------------
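A note on the data contract: load_az_dataset expects one image per CSV row, a 0-25 letter label followed by 784 grayscale pixel values (the layout used by the Kaggle "A-Z Handwritten Alphabets" dump, which this loader appears to target). A minimal sketch that round-trips a synthetic row through the loader; the file name is illustrative only:

# sketch: write one synthetic row and load it back (assumes the
# Kaggle-style layout: label, then 784 pixel values)
import numpy as np
from utils import load_az_dataset

row = [0] + [0] * 784  # label 0 ("A") plus a blank 28x28 image
with open("tiny_az.csv", "w") as f:
    f.write(",".join(str(v) for v in row) + "\n")

data, labels = load_az_dataset("tiny_az.csv")
print(data.shape, labels)  # -> (1, 28, 28) [0]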
/prediction.py:
--------------------------------------------------------------------------------
# USAGE
# python prediction.py --model path_to_trained_model --image path_to_input_image

# import the necessary packages
from tensorflow.keras.models import load_model
from imutils.contours import sort_contours
import numpy as np
import argparse
import imutils
import cv2

# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-i", "--image", required=True,
    help="path to input image")
ap.add_argument("-m", "--model", type=str, required=True,
    help="path to trained model")
args = vars(ap.parse_args())

# load the handwriting OCR model
print("[INFO] loading handwriting OCR model...")
model = load_model(args["model"])

# load the input image from disk, convert it to grayscale, and blur
# it to reduce noise
image = cv2.imread(args["image"])
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
blurred = cv2.GaussianBlur(gray, (5, 5), 0)

# perform edge detection, find contours in the edge map, and sort the
# resulting contours from left-to-right
edged = cv2.Canny(blurred, 30, 150)
cnts = cv2.findContours(edged.copy(), cv2.RETR_EXTERNAL,
    cv2.CHAIN_APPROX_SIMPLE)
cnts = imutils.grab_contours(cnts)
cnts = sort_contours(cnts, method="left-to-right")[0]

# initialize the list of contour bounding boxes and associated
# characters that we'll be OCR'ing
chars = []

# loop over the contours
for c in cnts:
    # compute the bounding box of the contour
    (x, y, w, h) = cv2.boundingRect(c)

    # filter out bounding boxes, ensuring they are neither too small
    # nor too large
    if (w >= 5 and w <= 150) and (h >= 15 and h <= 120):
        # extract the character and threshold it to make the character
        # appear as *white* (foreground) on a *black* background, then
        # grab the width and height of the thresholded image
        roi = gray[y:y + h, x:x + w]
        thresh = cv2.threshold(roi, 0, 255,
            cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]
        (tH, tW) = thresh.shape

        # if the width is greater than the height, resize along the
        # width dimension
        if tW > tH:
            thresh = imutils.resize(thresh, width=32)

        # otherwise, resize along the height
        else:
            thresh = imutils.resize(thresh, height=32)

        # re-grab the image dimensions (now that it's been resized)
        # and then determine how much we need to pad the width and
        # height such that our image will be 32x32
        (tH, tW) = thresh.shape
        dX = int(max(0, 32 - tW) / 2.0)
        dY = int(max(0, 32 - tH) / 2.0)

        # pad the image and force 32x32 dimensions
        padded = cv2.copyMakeBorder(thresh, top=dY, bottom=dY,
            left=dX, right=dX, borderType=cv2.BORDER_CONSTANT,
            value=(0, 0, 0))
        padded = cv2.resize(padded, (32, 32))

        # prepare the padded image for classification via our
        # handwriting OCR model
        padded = padded.astype("float32") / 255.0
        padded = np.expand_dims(padded, axis=-1)

        # update our list of characters that will be OCR'd
        chars.append((padded, (x, y, w, h)))

# extract the bounding box locations and padded characters
boxes = [b[1] for b in chars]
chars = np.array([c[0] for c in chars], dtype="float32")

# OCR the characters using our handwriting recognition model
preds = model.predict(chars)

# define the list of label names
labelNames = "0123456789"
labelNames += "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
labelNames = [l for l in labelNames]

# loop over the predictions and bounding box locations together
for (pred, (x, y, w, h)) in zip(preds, boxes):
    # find the index of the label with the largest corresponding
    # probability, then extract the probability and label
    i = np.argmax(pred)
    prob = pred[i]
    label = labelNames[i]

    # draw the prediction on the image
    print("[INFO] {} - {:.2f}%".format(label, prob * 100))
    cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2)
    cv2.putText(image, label, (x - 10, y - 10),
        cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 255, 0), 2)

# show the image
cv2.imshow("Image", image)
cv2.waitKey(0)
--------------------------------------------------------------------------------
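The thresholding, resize, and pad steps inside the contour loop define the exact input contract the model expects (32x32x1, white-on-black, scaled to [0, 1]). A standalone restatement of that normalization as a reusable helper; the function name is mine, not something the repo defines:

# sketch: the ROI -> 32x32x1 normalization from prediction.py, factored
# out for reuse or unit testing (illustrative)
import cv2
import imutils
import numpy as np

def normalize_char(roi_gray):
    # binarize so the character is white (foreground) on black
    thresh = cv2.threshold(roi_gray, 0, 255,
        cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)[1]

    # resize the longer side to 32 pixels, center-pad the other side,
    # then force an exact 32x32 shape
    (tH, tW) = thresh.shape
    if tW > tH:
        thresh = imutils.resize(thresh, width=32)
    else:
        thresh = imutils.resize(thresh, height=32)
    (tH, tW) = thresh.shape
    dX, dY = (32 - tW) // 2, (32 - tH) // 2
    padded = cv2.copyMakeBorder(thresh, dY, dY, dX, dX,
        cv2.BORDER_CONSTANT, value=0)
    padded = cv2.resize(padded, (32, 32))

    # scale to [0, 1] and add the channel axis the model expects
    return np.expand_dims(padded.astype("float32") / 255.0, axis=-1)

print(normalize_char(np.full((60, 40), 255, dtype="uint8")).shape)  # (32, 32, 1)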
/models/resnet.py:
--------------------------------------------------------------------------------
# import the necessary packages
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import AveragePooling2D
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import ZeroPadding2D
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import add
from tensorflow.keras.regularizers import l2
from tensorflow.keras import backend as K


class ResNet:
    @staticmethod
    def residual_module(data, K, stride, chanDim, red=False,
            reg=0.0001, bnEps=2e-5, bnMom=0.9):
        # the shortcut branch of the ResNet module should be
        # initialized as the input (identity) data
        shortcut = data

        # the first block of the ResNet module is the 1x1 CONVs
        bn1 = BatchNormalization(axis=chanDim, epsilon=bnEps,
            momentum=bnMom)(data)
        act1 = Activation("relu")(bn1)
        conv1 = Conv2D(int(K * 0.25), (1, 1), use_bias=False,
            kernel_regularizer=l2(reg))(act1)

        # the second block of the ResNet module is the 3x3 CONVs
        bn2 = BatchNormalization(axis=chanDim, epsilon=bnEps,
            momentum=bnMom)(conv1)
        act2 = Activation("relu")(bn2)
        conv2 = Conv2D(int(K * 0.25), (3, 3), strides=stride,
            padding="same", use_bias=False,
            kernel_regularizer=l2(reg))(act2)

        # the third block of the ResNet module is another set of 1x1
        # CONVs
        bn3 = BatchNormalization(axis=chanDim, epsilon=bnEps,
            momentum=bnMom)(conv2)
        act3 = Activation("relu")(bn3)
        conv3 = Conv2D(K, (1, 1), use_bias=False,
            kernel_regularizer=l2(reg))(act3)

        # if we are to reduce the spatial size, apply a CONV layer to
        # the shortcut
        if red:
            shortcut = Conv2D(K, (1, 1), strides=stride,
                use_bias=False, kernel_regularizer=l2(reg))(act1)

        # add together the shortcut and the final CONV
        x = add([conv3, shortcut])

        # return the addition as the output of the ResNet module
        return x

    @staticmethod
    def build(width, height, depth, classes, stages, filters,
            reg=0.0001, bnEps=2e-5, bnMom=0.9, dataset="cifar"):
        # initialize the input shape to be "channels last" and the
        # channels dimension itself
        inputShape = (height, width, depth)
        chanDim = -1

        # if we are using "channels first", update the input shape
        # and channels dimension
        if K.image_data_format() == "channels_first":
            inputShape = (depth, height, width)
            chanDim = 1

        # set the input and apply BN
        inputs = Input(shape=inputShape)
        x = BatchNormalization(axis=chanDim, epsilon=bnEps,
            momentum=bnMom)(inputs)

        # check if we are utilizing the CIFAR dataset
        if dataset == "cifar":
            # apply a single CONV layer
            x = Conv2D(filters[0], (3, 3), use_bias=False,
                padding="same", kernel_regularizer=l2(reg))(x)

        # check to see if we are using the Tiny ImageNet dataset
        elif dataset == "tiny_imagenet":
            # apply CONV => BN => ACT => POOL to reduce spatial size
            x = Conv2D(filters[0], (5, 5), use_bias=False,
                padding="same", kernel_regularizer=l2(reg))(x)
            x = BatchNormalization(axis=chanDim, epsilon=bnEps,
                momentum=bnMom)(x)
            x = Activation("relu")(x)
            x = ZeroPadding2D((1, 1))(x)
            x = MaxPooling2D((3, 3), strides=(2, 2))(x)

        # loop over the number of stages
        for i in range(0, len(stages)):
            # initialize the stride, then apply a residual module
            # used to reduce the spatial size of the input volume
            stride = (1, 1) if i == 0 else (2, 2)
            x = ResNet.residual_module(x, filters[i + 1], stride,
                chanDim, red=True, bnEps=bnEps, bnMom=bnMom)

            # loop over the number of layers in the stage
            for j in range(0, stages[i] - 1):
                # apply a ResNet module
                x = ResNet.residual_module(x, filters[i + 1],
                    (1, 1), chanDim, bnEps=bnEps, bnMom=bnMom)

        # apply BN => ACT => POOL
        x = BatchNormalization(axis=chanDim, epsilon=bnEps,
            momentum=bnMom)(x)
        x = Activation("relu")(x)
        x = AveragePooling2D((8, 8))(x)

        # softmax classifier
        x = Flatten()(x)
        x = Dense(classes, kernel_regularizer=l2(reg))(x)
        x = Activation("softmax")(x)

        # create the model
        model = Model(inputs, x, name="resnet")

        # return the constructed network architecture
        return model
--------------------------------------------------------------------------------
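For the OCR task this network is built with the arguments used in train_model.py below: a 32x32x1 input, 36 classes (10 digits + 26 letters), three stages of three residual modules, and filters (64, 64, 128, 256). A quick shape sanity check showing the spatial math works out (stages at strides 1, 2, 2 take 32x32 down to 8x8, matching the 8x8 average pool):

# sketch: instantiate the architecture exactly as train_model.py does
from models import ResNet

model = ResNet.build(32, 32, 1, 36, (3, 3, 3), (64, 64, 128, 256), reg=0.0005)
print(model.input_shape, model.output_shape)  # (None, 32, 32, 1) (None, 36)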
/train_model.py:
--------------------------------------------------------------------------------
# set the matplotlib backend so figures can be saved in the background
# (this must happen before pyplot is imported)
import matplotlib
matplotlib.use("Agg")

# import the necessary packages
import argparse
import cv2
import matplotlib.pyplot as plt
import numpy as np
from imutils import build_montages
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.preprocessing.image import ImageDataGenerator

from models import ResNet
from utils import load_az_dataset
from utils import load_zero_nine_dataset

# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-a", "--az", required=True,
    help="path to A-Z dataset")
ap.add_argument("-m", "--model", default="trained_ocr.model", type=str,
    help="path to output the trained handwriting recognition model")
ap.add_argument("-p", "--plot", type=str, default="plot.png",
    help="path to output the training history plot")
args = vars(ap.parse_args())

# initialize the number of epochs to train for, the initial learning
# rate, and the batch size
EPOCHS = 50
INIT_LR = 1e-1
BS = 128

# load the A-Z and MNIST datasets, respectively
print("[INFO] loading datasets...")
(azData, azLabels) = load_az_dataset(args["az"])
(digitsData, digitsLabels) = load_zero_nine_dataset()

# the MNIST dataset occupies the labels 0-9, so let's add 10 to every
# A-Z label to ensure the A-Z characters are not incorrectly labeled
# as digits
azLabels += 10

# stack the A-Z data and labels with the MNIST digits data and labels
data = np.vstack([azData, digitsData])
labels = np.hstack([azLabels, digitsLabels])

# each image in the A-Z and MNIST digits datasets is 28x28 pixels;
# however, the architecture we're using is designed for 32x32 images,
# so we need to resize them to 32x32
data = [cv2.resize(image, (32, 32)) for image in data]
data = np.array(data, dtype="float32")

# add a channel dimension to every image in the dataset and scale the
# pixel intensities of the images from [0, 255] down to [0, 1]
data = np.expand_dims(data, axis=-1)
data /= 255.0

# convert the labels from integers to one-hot vectors
le = LabelBinarizer()
labels = le.fit_transform(labels)

# account for skew in the labeled data
classTotals = labels.sum(axis=0)
classWeight = {}

# loop over all classes and calculate the class weight
for i in range(0, len(classTotals)):
    classWeight[i] = classTotals.max() / classTotals[i]

# partition the data into training and testing splits using 80% of
# the data for training and the remaining 20% for testing
(trainX, testX, trainY, testY) = train_test_split(data,
    labels, test_size=0.20, stratify=None, random_state=42)

# construct the image generator for data augmentation
aug = ImageDataGenerator(rotation_range=10, zoom_range=0.05,
    width_shift_range=0.1, height_shift_range=0.1, shear_range=0.15,
    horizontal_flip=False, fill_mode="nearest")

# initialize and compile our deep neural network
print("[INFO] compiling model...")
opt = SGD(learning_rate=INIT_LR, decay=INIT_LR / EPOCHS)
model = ResNet.build(32, 32, 1, len(le.classes_), (3, 3, 3),
    (64, 64, 128, 256), reg=0.0005)
model.compile(loss="categorical_crossentropy",
    optimizer=opt, metrics=["accuracy"])
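# note (illustrative, not part of the original script): the "decay"
# argument above is only accepted by the legacy Keras optimizers; in
# TF >= 2.11 the equivalent per-step schedule would be:
#
#   from tensorflow.keras.optimizers.schedules import InverseTimeDecay
#   schedule = InverseTimeDecay(INIT_LR, decay_steps=1,
#       decay_rate=INIT_LR / EPOCHS)
#   opt = SGD(learning_rate=schedule)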
# train the network
print("[INFO] training network...")
H = model.fit(
    aug.flow(trainX, trainY, batch_size=BS),
    validation_data=(testX, testY),
    steps_per_epoch=len(trainX) // BS,
    epochs=EPOCHS,
    class_weight=classWeight,
    verbose=1)

# define the list of label names
labelNames = "0123456789"
labelNames += "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
labelNames = [l for l in labelNames]

# evaluate the network
print("[INFO] evaluating network...")
predictions = model.predict(testX, batch_size=BS)
print(classification_report(testY.argmax(axis=1),
    predictions.argmax(axis=1), target_names=labelNames))

# save the model to disk
print("[INFO] saving model...")
model.save(args["model"], save_format="h5")

# construct a plot that plots and saves the training history
N = np.arange(0, EPOCHS)
plt.style.use("ggplot")
plt.figure()
plt.plot(N, H.history["loss"], label="train_loss")
plt.plot(N, H.history["val_loss"], label="val_loss")
plt.plot(N, H.history["accuracy"], label="train_acc")
plt.plot(N, H.history["val_accuracy"], label="val_acc")
plt.title("Training Loss and Accuracy")
plt.xlabel("Epoch #")
plt.ylabel("Loss/Accuracy")
plt.legend(loc="lower left")
plt.savefig(args["plot"])
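# quick summary (illustrative addition): pull the best validation
# accuracy out of the History object that was just plotted above
bestEpoch = int(np.argmax(H.history["val_accuracy"]))
print("[INFO] best val_accuracy: {:.4f} (epoch {})".format(
    H.history["val_accuracy"][bestEpoch], bestEpoch + 1))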
# randomly select a few testing characters to visualize
images = []
for i in np.random.choice(np.arange(0, len(testY)), size=(49,)):
    # classify the character
    probs = model.predict(testX[np.newaxis, i])
    prediction = probs.argmax(axis=1)
    label = labelNames[prediction[0]]

    # extract the image from the test data and initialize the text
    # label color as green (correct)
    image = (testX[i] * 255).astype("uint8")
    color = (0, 255, 0)

    # otherwise, the class label prediction is incorrect
    if prediction[0] != np.argmax(testY[i]):
        color = (0, 0, 255)

    # merge the channels into one image, resize the image from 32x32
    # to 96x96 so we can better see it, and then draw the predicted
    # label on the image
    image = cv2.merge([image] * 3)
    image = cv2.resize(image, (96, 96), interpolation=cv2.INTER_LINEAR)
    cv2.putText(image, label, (5, 20), cv2.FONT_HERSHEY_SIMPLEX, 0.75,
        color, 2)

    # add the image to our list of output images
    images.append(image)

# construct a 7x7 montage of the images
montage = build_montages(images, (96, 96), (7, 7))[0]

# save the output montage to disk -- the Agg backend cannot open
# display windows (the output file name "montage.png" is an arbitrary
# choice)
plt.figure(figsize=(10, 10))
plt.imshow(cv2.cvtColor(montage, cv2.COLOR_BGR2RGB))
plt.axis("off")
plt.savefig("montage.png")
--------------------------------------------------------------------------------
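Putting it together: train first, then run recognition against the saved weights. The CSV path below is a placeholder for wherever the A-Z handwritten-letters dataset lives on disk (the loader in utils.py only assumes the label,pixels row format); the other paths are the script defaults:

    python train_model.py --az path_to_a_z_dataset.csv --model trained_ocr.model --plot plot.png
    python prediction.py --model trained_ocr.model --image images/hello_world.png

The second command runs the OCR pipeline on the bundled images/hello_world.png test image.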