├── README.md
├── chars74k_cnn.py
├── make_test.py
├── make_train.py
└── trim_images.sh


/README.md:
--------------------------------------------------------------------------------
 1 | # Chars74k_CNN
 2 | 
 3 | VGG-style convolutional neural network for case-sensitive character recognition on the Chars74k dataset. Currently achieves 83.4% accuracy on a holdout validation set of 6,220 images, matching the performance of Wang and Wu et al. (2012).
 4 | 
 5 | ### Architecture
 6 | 
 7 | The inputs are 64 x 64 greyscale images.
 8 | 6 convolution layers with filter size 3x3 and ReLU activations, with a max pooling layer after every second convolution layer.
 9 | 2 fully connected hidden layers with dropout, followed by a softmax output.
10 | 
11 | | Layer Type | Parameters |
12 | | -----------|----------- |
13 | | Input      | size: 64x64, channel: 1 |
14 | | convolution| kernel: 3x3, channel: 128 |
15 | | ReLU |  |
16 | | convolution| kernel: 3x3, channel: 128 |
17 | | ReLU | |
18 | | max pool | kernel: 2x2 |
19 | | convolution| kernel: 3x3, channel: 256 |
20 | | ReLU |  |
21 | | convolution| kernel: 3x3, channel: 256 |
22 | | ReLU |  |
23 | | max pool | kernel: 2x2 |
24 | | convolution| kernel: 3x3, channel: 512 |
25 | | ReLU |  |
26 | | convolution| kernel: 3x3, channel: 512 |
27 | | ReLU |  |
28 | | max pool | kernel: 2x2 |
29 | | fully connected | units: 2048 |
30 | | ReLU |  |
31 | | dropout | 0.5 |
32 | | fully connected | units: 2048 |
33 | | ReLU |  |
34 | | dropout | 0.5 |
35 | | softmax | units: 62 |
36 | 
37 | ### Data augmentation
38 | 
39 | Images are randomly transformed 'on the fly' while they are being prepared in each batch. The CPU will prepare each batch while the GPU will run the previous batch through the network.
40 | 
41 | * Random rotations between -10 and 10 degrees.
42 | * Random translation between -10 and 10 pixels in any direction.
43 | * Random zoom between factors of 1 and 1.3.
44 | * Random shearing between -25 and 25 degrees.
45 | * Random color inversion applied to 1/2 of images.
46 | * Sobel edge detector applied to 1/4 of images.
47 | 
48 | ![Imgur](http://i.imgur.com/vNkJrKi.png)![Imgur](http://i.imgur.com/0G8Khxv.gif)
49 | 
50 | ### To-do
51 | 
52 | * Stream data from SSD instead of holding all images in memory (need to install SSD first).
53 | * Try different network architectures and data pre-processing.
54 | 
55 | ### References
56 | 
57 | * Karen Simonyan, Andrew Zisserman, "Very Deep Convolutional Networks for Large-Scale Image Recognition", [link](http://arxiv.org/abs/1409.1556)
58 | * Alex Krizhevsky, Ilya Sutskever, Geoffrey E. Hinton, "ImageNet Classification with Deep Convolutional Neural Networks", [link](http://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks)
59 | * Sander Dieleman, "Classifying plankton with deep neural networks", [link](http://benanne.github.io/2015/03/17/plankton.html)
60 | 


--------------------------------------------------------------------------------
/chars74k_cnn.py:
--------------------------------------------------------------------------------
  1 | import time
  2 | import numpy as np
  3 | import pandas as pd
  4 | 
  5 | from sklearn.cross_validation import train_test_split
  6 | from sklearn.preprocessing import LabelBinarizer, LabelEncoder, MinMaxScaler
  7 | 
  8 | from keras.regularizers import l2
  9 | from keras.models import Sequential
 10 | from keras.layers.core import Dense, Activation, Dropout, Flatten
 11 | from keras.layers.convolutional import Convolution2D, MaxPooling2D
 12 | from keras.layers.advanced_activations import LeakyReLU, PReLU
 13 | from keras.utils import np_utils, generic_utils
 14 | from keras.optimizers import SGD
 15 | 
 16 | from random import randint, uniform
 17 | 
 18 | import seaborn as sns
 19 | from matplotlib import pyplot
 20 | from skimage.io import imshow
 21 | from skimage import transform, filters, exposure
 22 | 
 23 | PIXELS = 64
 24 | imageSize = PIXELS * PIXELS
 25 | num_features = imageSize
 26 | label_enc = LabelBinarizer()
 27 | 
 28 | BATCHSIZE = 128
 29 | 
 30 | def fast_warp(img, tf, output_shape, mode='nearest'):
 31 |     return transform._warps_cy._warp_fast(img, tf.params, output_shape=output_shape, mode=mode)
 32 | 
 33 | def batch_iterator(data, y, batchsize, model):
 34 |     '''
 35 |     Data augmentation batch iterator for feeding images into CNN.
 36 |     This example will randomly rotate all images in a given batch between -10 and 10 degrees
 37 |     and to random translations between -10 and 10 pixels in all directions.
 38 |     Random zooms between 1 and 1.3.
 39 |     Random shearing between -10 and 10 degrees.
 40 |     Randomly applies sobel edge detector to 1/4th of the images in each batch.
 41 |     Randomly inverts 1/2 of the images in each batch.
 42 |     '''
 43 | 
 44 |     n_samples = data.shape[0]
 45 |     loss = []
 46 |     for i in range((n_samples + batchsize -1) // batchsize):
 47 |         sl = slice(i * batchsize, (i + 1) * batchsize)
 48 |         X_batch = data[sl]
 49 |         y_batch = y[sl]
 50 | 
 51 |         # set empty copy to hold augmented images so that we don't overwrite
 52 |         X_batch_aug = np.empty(shape = (X_batch.shape[0], 1, PIXELS, PIXELS), dtype = 'float32')
 53 | 
 54 |         # random rotations betweein -10 and 10 degrees
 55 |         dorotate = randint(-10,10)
 56 | 
 57 |         # random translations
 58 |         trans_1 = randint(-10,10)
 59 |         trans_2 = randint(-10,10)
 60 | 
 61 |         # random zooms
 62 |         zoom = uniform(1, 1.3)
 63 | 
 64 |         # shearing
 65 |         shear_deg = uniform(-25, 25)
 66 | 
 67 |         # set the transform parameters for skimage.transform.warp
 68 |         # have to shift to center and then shift back after transformation otherwise
 69 |         # rotations will make image go out of frame
 70 |         center_shift   = np.array((PIXELS, PIXELS)) / 2. - 0.5
 71 |         tform_center   = transform.SimilarityTransform(translation=-center_shift)
 72 |         tform_uncenter = transform.SimilarityTransform(translation=center_shift)
 73 | 
 74 |         tform_aug = transform.AffineTransform(rotation = np.deg2rad(dorotate),
 75 |                                               scale =(1/zoom, 1/zoom),
 76 |                                               shear = np.deg2rad(shear_deg),
 77 |                                               translation = (trans_1, trans_2))
 78 | 
 79 |         tform = tform_center + tform_aug + tform_uncenter
 80 | 
 81 |         # images in the batch do the augmentation
 82 |         for j in range(X_batch.shape[0]):
 83 | 
 84 |             X_batch_aug[j][0] = fast_warp(X_batch[j][0], tform,
 85 |                                           output_shape = (PIXELS, PIXELS))
 86 | 
 87 |         # use sobel edge detector filter on one quarter of the images
 88 |         indices_sobel = np.random.choice(X_batch_aug.shape[0], X_batch_aug.shape[0] / 4, replace = False)
 89 |         for k in indices_sobel:
 90 |             img = X_batch_aug[k][0]
 91 |             X_batch_aug[k][0] = filters.sobel(img)
 92 | 
 93 |         # invert half of the images
 94 |         indices_invert = np.random.choice(X_batch_aug.shape[0], X_batch_aug.shape[0] / 2, replace = False)
 95 |         for l in indices_invert:
 96 |             img = X_batch_aug[l][0]
 97 |             X_batch_aug[l][0] = np.absolute(img - np.amax(img))
 98 | 
 99 |         # fit model on each batch
100 |         loss.append(model.train_on_batch(X_batch_aug, y_batch))
101 | 
102 |     return np.mean(loss)
103 | 
def load_data_cv(train_path):
    '''
    Load the saved training array and split it into train and hold-out sets.

    train_path -- path handed to ``np.load``; columns 0..num_features-1 of the
                  loaded array are read as pixel values and column
                  ``num_features`` as the class label.

    The labels are encoded with the module-level ``label_enc`` (which is
    fitted here) and cast to int32; the pixels are cast to float32 without any
    rescaling. The split uses ``train_test_split`` with its default settings.

    Returns (x_train, x_test, y_train, y_test) with the image arrays reshaped
    to (n, 1, PIXELS, PIXELS).
    '''

    print('Read data')
    # reading training data
    training = np.load(train_path)

    # split training labels and pre-process them
    training_targets = training[:, num_features]
    training_targets = label_enc.fit_transform(training_targets)
    training_targets = training_targets.astype('int32')

    # split training inputs; rescaling to 0..1 is currently disabled
    training_inputs = training[:, 0:num_features].astype('float32')
    #training_inputs = training_inputs / np.amax(training_inputs)

    # train test split
    x_train, x_test, y_train, y_test = train_test_split(training_inputs, training_targets)

    # one pre-formatted argument prints the same text under the print
    # statement and the print function
    print('train size: %s eval size: %s' % (x_train.shape[0], x_test.shape[0]))

    # reshaping training and testing data so it can be fed to convolutional layers
    x_train = x_train.reshape(x_train.shape[0], 1, PIXELS, PIXELS)
    x_test = x_test.reshape(x_test.shape[0], 1, PIXELS, PIXELS)

    return x_train, x_test, y_train, y_test
129 | 
def load_data_test(train_path, test_path):
    '''
    Load the full training set plus the unlabeled test set.

    train_path -- path handed to ``np.load``; columns 0..num_features-1 are
                  read as pixel values, column ``num_features`` as the label
    test_path  -- path handed to ``np.load``; read as pixel values only

    Labels are encoded with the module-level ``label_enc`` (fitted here) and
    cast to int32. Pixels are cast to float32 and are not rescaled.

    Returns (training_inputs, training_targets, testing_inputs); both image
    arrays are reshaped to (n, 1, PIXELS, PIXELS).
    '''

    print('Read data')
    train_table = np.load(train_path)

    # labels: encode, then cast
    encoded = label_enc.fit_transform(train_table[:, num_features])
    train_y = encoded.astype('int32')

    # pixel columns of the training rows
    train_flat = train_table[:, 0:num_features].astype('float32')

    # test rows hold pixel values only
    test_flat = np.load(test_path).astype('float32')

    # give both sets the (n, channel, height, width) layout used by the conv layers
    train_x = train_flat.reshape(train_flat.shape[0], 1, PIXELS, PIXELS)
    test_x = test_flat.reshape(test_flat.shape[0], 1, PIXELS, PIXELS)

    return train_x, train_y, test_x
153 | 
def build_model():
    '''
    Build and compile the VGG style CNN.

    Three stages of two 3x3 ReLU convolutions (128, 256 and 512 filters) each
    followed by 2x2 max pooling, then two 2048-unit ReLU dense layers with 0.5
    dropout and a 62-way softmax output. Compiled with categorical
    cross-entropy and Nesterov-momentum SGD.
    '''
    print('Creating the model')
    model = Sequential()

    needs_input_shape = True
    for n_filters in (128, 256, 512):
        for _ in range(2):
            if needs_input_shape:
                # only the very first layer declares the input shape
                conv = Convolution2D(n_filters, 3, 3,
                                     input_shape=(1, PIXELS, PIXELS),
                                     activation='relu')
                needs_input_shape = False
            else:
                conv = Convolution2D(n_filters, 3, 3, activation='relu')
            model.add(conv)
        model.add(MaxPooling2D(pool_size=(2, 2)))

    # flatten the convolutional feature maps for the fully connected layers
    model.add(Flatten())

    for _ in range(2):
        model.add(Dense(2048, activation='relu'))
        model.add(Dropout(0.5))

    model.add(Dense(62))
    model.add(Activation('softmax'))

    # optimizer settings
    optimizer = SGD(lr=0.03, decay=1e-4, momentum=0.9, nesterov=True)
    model.compile(loss='categorical_crossentropy', optimizer=optimizer)
    return model
191 | 
def main():
    '''
    Train the CNN and save a plot of the training curves.

    Loads the data, builds the model and runs up to 300 epochs of augmented
    training, lowering the learning rate at epochs 250 and 275. After every
    epoch the model is evaluated on the hold-out set. Ctrl-C stops training
    early; the curves collected so far are still plotted and written to
    'data/training_plot.png'.
    '''

    # switch the commented lines here to alternate between CV testing and making kaggle submission
    x_train, x_test, y_train, y_test = load_data_cv('data/train_32.npy')
    #x_train, y_train, x_test = load_data_test('data/train_32.npy', 'data/test_32.npy')

    model = build_model()

    print("Starting training")
    # epoch index -> learning rate that takes effect from that epoch onwards
    lr_schedule = {250: 0.003, 275: 0.0003}

    # batch iterator with 300 epochs
    train_loss = []
    valid_loss = []
    valid_acc = []
    try:
        for i in range(300):
            if i in lr_schedule:
                model.optimizer.lr.set_value(lr_schedule[i])
            start = time.time()
            loss = batch_iterator(x_train, y_train, BATCHSIZE, model)
            train_loss.append(loss)
            valid_avg = model.evaluate(x_test, y_test, show_accuracy = True, verbose = 0)
            valid_loss.append(valid_avg[0])
            valid_acc.append(valid_avg[1])
            end = time.time() - start
            # one pre-formatted argument prints the same text under the print
            # statement and the print function
            print('iter: %s | Tloss: %s' % (i, np.round(loss, decimals = 3)))
            # longer report, currently disabled:
            #print('iter: %s | Tloss: %s | Vloss: %s | Vacc: %s | time: %s' % (i, np.round(loss, decimals = 3), np.round(valid_avg[0], decimals = 3), np.round(valid_avg[1], decimals = 3), np.round(end, decimals = 1)))
    except KeyboardInterrupt:
        # deliberate: an interrupt just ends training and falls through to plotting
        pass

    train_loss = np.array(train_loss)
    valid_loss = np.array(valid_loss)
    valid_acc = np.array(valid_acc)
    sns.set_style("whitegrid")
    pyplot.plot(train_loss, linewidth = 3, label = 'train loss')
    pyplot.plot(valid_loss, linewidth = 3, label = 'valid loss')
    pyplot.legend(loc = 2)
    pyplot.ylim([0,4.5])
    # accuracy goes on a second y axis sharing the epoch axis
    pyplot.twinx()
    pyplot.plot(valid_acc, linewidth = 3, label = 'valid accuracy', color = 'r')
    pyplot.grid()
    pyplot.ylim([0,1])
    pyplot.legend(loc = 1)
    pyplot.savefig('data/training_plot.png')
    #pyplot.show()


    #print("Generating predections")
    #preds = model.predict(x_test, verbose=0)
    #np.save('data/preds4.npy', preds)
    #preds_orig = np.load('data/preds1.npy')
    #preds_two = np.load('data/preds2.npy')
    #preds_three = np.load('data/preds3.npy')
    #preds_avg = (preds + preds_orig + preds_two + preds_three) / 4.0
    #preds = label_enc.inverse_transform(preds_avg, threshold=0.5).astype(str)

    #submission = pd.read_csv('data/sampleSubmission.csv', dtype = str)
    #submission['Class'] = preds
    #submission.to_csv('preds/chars_74k_avg_preds.csv', index = False)

if __name__ == '__main__':
    main()
254 | 


--------------------------------------------------------------------------------
/make_test.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import numpy as np
 3 | import pandas as pd
 4 | import cPickle as pickle
 5 | from natsort import natsorted
 6 | 
 7 | from skimage import exposure
 8 | from matplotlib import pyplot
 9 | from skimage.io import imread
10 | from PIL import Image
11 | from skimage.io import imshow
12 | from skimage.filters import sobel
13 | from skimage import feature
14 | 
15 | from sklearn.preprocessing import StandardScaler
16 | 
# Directory that is scanned for the test images (*.Bmp files).
PATH = '/Volumes/Mildred/Kaggle/chars_74k/Data/test'

# Side length of the square images, and the number of values in one
# flattened image (one row of the saved array).
maxPixel = 64
num_features = imageSize = maxPixel * maxPixel
22 | 
def plot_sample(x):
    '''Display one flattened sample as a maxPixel x maxPixel image.'''
    side = maxPixel
    imshow(x.reshape(side, side))
    pyplot.show()
27 | 
def load_images(path):
    '''
    Read every '.Bmp' file in ``path`` into one row of a 2-D float array.

    Files are processed in natural sort order of their names. Each image is
    read as greyscale and flattened to ``num_features`` values, so an image
    with a different number of pixels makes the reshape fail.

    Returns an array of shape (number of files, num_features).
    '''
    print('reading file names ... ')
    names = natsorted([d for d in os.listdir(path) if d.endswith('.Bmp')])
    num_rows = len(names)
    print(num_rows)

    print('making dataset ... ')
    test_image = np.zeros((num_rows, num_features), dtype = float)
    for row, name in enumerate(names):
        # progress output: file name without its extension
        print(name.split('.')[0])

        image = imread(os.path.join(path, name), as_grey = True)

        test_image[row, 0:num_features] = np.reshape(image, (1, num_features))

    return test_image
49 | 
# Build the test array from the image directory.
test = load_images(PATH)

# Quick look at the first row and the overall shape.
first_row = test[0]
print(first_row)
print(test.shape)

# Persist the array to the current working directory.
np.save('test_32.npy', test)

# Visual and numeric sanity check of the first image.
plot_sample(first_row)
print(np.amax(first_row))
print(np.amin(first_row))
62 | 


--------------------------------------------------------------------------------
/make_train.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import numpy as np
 3 | import pandas as pd
 4 | import cPickle as pickle
 5 | from natsort import natsorted
 6 | 
 7 | from skimage import exposure
 8 | from matplotlib import pyplot
 9 | from skimage.io import imread
10 | from PIL import Image
11 | from skimage.io import imshow
12 | from skimage.filters import sobel
13 | from skimage import feature
14 | 
15 | from sklearn.preprocessing import StandardScaler
16 | 
# Directory that is scanned for the training images (*.Bmp files), and the CSV
# file that maps image IDs to their class labels.
PATH = '/Volumes/Mildred/Kaggle/chars_74k/Data/train'
LABELS = '/Volumes/Mildred/Kaggle/chars_74k/Data/trainLabels.csv'

# Side length of the square images, and the number of values in one
# flattened image.
maxPixel = 64
num_features = imageSize = maxPixel * maxPixel
23 | 
def plot_sample(x):
    '''Display one flattened sample as a maxPixel x maxPixel image.'''
    side = maxPixel
    imshow(x.reshape(side, side))
    pyplot.show()
28 | 
def load_images(path):
    '''
    Read every '.Bmp' file in ``path`` together with its class label.

    Files are processed in natural sort order of their names. Each image is
    read as greyscale and flattened to ``num_features`` values. The label is
    looked up in the module-level ``labels`` table (columns ``ID`` and
    ``Class``) by the file name without its extension, so ``labels`` must be
    defined before this function is called.

    Returns (train_image, levels): a float array of shape
    (number of files, num_features) and a string array of shape
    (number of files, 1) holding the labels.
    '''
    print('reading file names ... ')
    names = natsorted([d for d in os.listdir(path) if d.endswith('.Bmp')])
    num_rows = len(names)
    print(num_rows)

    print('making dataset ... ')
    train_image = np.zeros((num_rows, num_features), dtype = float)
    # NOTE(review): a zeros array with dtype=str appears to hold
    # one-character strings, so longer labels would be cut short - confirm
    levels = np.zeros((num_rows, 1), dtype = str)
    for row, name in enumerate(names):
        # image ID = file name without its extension
        image_id = name.split('.')[0]
        print(image_id)

        image = imread(os.path.join(path, name), as_grey = True)

        train_image[row, 0:num_features] = np.reshape(image, (1, num_features))

        levels[row] = labels.Class[labels.ID == image_id].values

    return train_image, levels
52 | 
# The label table has to exist before load_images runs: the function reads it
# as a module-level name.
labels = pd.read_csv(LABELS, dtype = str)
print(labels)

train, levels = load_images(PATH)

# Quick look at the first row and the shapes of both arrays.
first_row = train[0]
print(first_row)
print(train.shape)
print(levels.shape)

# Pixels and label column are stored side by side in a single array.
combined = np.hstack((train, levels))
np.save('train_32.npy', combined)

# Visual and numeric sanity check of the first image.
plot_sample(first_row)
print(np.amax(first_row))
print(np.amin(first_row))
69 | 


--------------------------------------------------------------------------------
/trim_images.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | IMAGES="/Volumes/Mildred/Kaggle/chars_74k/data/test/*.Bmp"
 4 | for file in $IMAGES
 5 | do
 6 | 	echo "$file"
 7 | 	convert $file -resize 64x64! -gravity center $file
 8 | 
 9 | done
10 | 


--------------------------------------------------------------------------------