├── runtime.txt ├── .gitignore ├── requirements.txt ├── helpers ├── draw_prob_dist.py ├── restructure_dataset.py └── get_prob_dist.py ├── README.md └── cnn_rnn_classifier.py /runtime.txt: -------------------------------------------------------------------------------- 1 | python-3.5.2 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | __pycache__ 3 | venv 4 | tiny-imagenet-100-A 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cycler==0.10.0 2 | h5py==2.6.0 3 | Keras==1.1.2 4 | matplotlib==1.5.3 5 | numpy==1.11.2 6 | Pillow==3.4.2 7 | pkg-resources==0.0.0 8 | protobuf==3.1.0 9 | pyparsing==2.1.10 10 | python-dateutil==2.6.0 11 | pytz==2016.10 12 | PyYAML==3.12 13 | scipy==0.18.1 14 | six==1.10.0 15 | tensorflow==0.12.0rc0 16 | Theano==0.8.2 17 | -------------------------------------------------------------------------------- /helpers/draw_prob_dist.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | if __name__ == '__main__': 4 | labels_list = [] 5 | top_1_list = [] 6 | top_5_list = [] 7 | with open('./cnn_rnn_results.txt', 'r') as f: 8 | # Get rid of first line 9 | f.readline() 10 | 11 | for line in f: 12 | line_list = line.strip().split() 13 | label, top_1, top_5 = line_list 14 | labels_list.append(label) 15 | top_1_list.append(float(top_1)) 16 | top_5_list.append(float(top_5)) 17 | 18 | index = [i for i in range(len(labels_list))] 19 | 20 | print("Top 1 Average: {}%".format(np.average(top_1_list))) 21 | print("Top 5 Average: {}%".format(np.average(top_5_list))) 22 | 23 | plt.figure(1, figsize=(20, 10)) 24 | plt.bar(index, top_1_list) 25 | plt.suptitle('Top-1 Accuracy per label') 26 | 27 | plt.ylabel('Probability (%)') 28 | plt.xlabel('Label') 29 | 30 | plt.xticks(index, labels_list, rotation='vertical') 31 | plt.subplots_adjust(bottom=0.15) 32 | 33 | plt.figure(2, figsize=(20, 10)) 34 | plt.bar(index, top_5_list) 35 | plt.suptitle('Top-5 Accuracy per label') 36 | 37 | plt.ylabel('Probability (%)') 38 | plt.xlabel('Label') 39 | 40 | plt.xticks(index, labels_list, rotation='vertical') 41 | plt.subplots_adjust(bottom=0.15) 42 | 43 | plt.show() 44 | 45 | -------------------------------------------------------------------------------- /helpers/restructure_dataset.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | import os 3 | import shutil 4 | 5 | dataset_train_path = './tiny-imagenet-100-A/train/' 6 | folders = [folder for folder in os.listdir(dataset_train_path)] 7 | 8 | for folder in folders: 9 | label_dir = os.path.join(dataset_train_path, folder) 10 | images_dir = os.path.join(label_dir, 'images/') 11 | images = [image for image in os.listdir(images_dir)] 12 | 13 | for img in images: 14 | img_path = os.path.join(images_dir, img) 15 | relocated_img = Image.open(img_path) 16 | relocated_img.save(os.path.join(label_dir, img)) 17 | 18 | shutil.rmtree(images_dir) 19 | os.unlink(os.path.join(label_dir, folder + '_boxes.txt')) 20 | 21 | dataset_val_path = './tiny-imagenet-100-A/val/' 22 | val_images_dir = os.path.join(dataset_val_path, 'images') 23 | 24 | labels_dict = {} 25 | with open(os.path.join(dataset_val_path, 'val_annotations.txt'), 'r') as f: 26 | for line in f: 27 | line_list = line.strip().split() 28 | label = line_list[1] 29 | labels_dict[line_list[0]] = label 30 | try: 31 | os.mkdir(os.path.join(dataset_val_path, label)) 32 | except OSError: 33 | pass 34 | 35 | val_images = [img for img in os.listdir(val_images_dir)] 36 | 37 | for val_image in val_images: 38 | img_path = os.path.join(val_images_dir, val_image) 39 | label = labels_dict[val_image] 40 | 41 | relocated_img = Image.open(img_path) 42 | relocated_img.save(os.path.join(dataset_val_path, label, val_image)) 43 | 44 | shutil.rmtree(val_images_dir) 45 | os.unlink(os.path.join(dataset_val_path, 'val_annotations.txt')) 46 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This is a practical example on how to combine both a CNN and a RNN to classify images. 2 | 3 | _NOTE: This classifier was tested with the tiny-imagenet-100 dataset only._ 4 | 5 | ## Network Architecture 6 | 7 | The network consists of two different branches: a CNN branch which uses the Xception model, pretrained 8 | on imagenet and provided by Keras (https://keras.io/applications/#xception) and another indepented RNN branch. 9 | 10 | Each one of these branches runs parallel to each other. 11 | 12 | Initially, the entire network takes an RGB image whose shape is 299x299x3. 13 | 14 | On the CNN branch, this image is taken as is (299x299x3) and passed through the pretrained Xception 15 | model until it reaches the final convolution block which has the bottleneck features, which is of size 16 | (batch_size, 2048). 17 | 18 | On the other branch, the 299x299x3 image is transformed into a grayscale image of size 299x299x1 to 19 | be able to properly split it into chunks to feed it into the RNN. Afterwards, this 299x299 image is 20 | reshaped into (23, 3887), where 23 is the timesteps and 3887 is the dim of each timestep. These values 21 | were chosen because 23*3887 == 299*299. The reshaped image is then passed through two LSTM 22 | layers, each of which are of (batch_size, 2048) output. 23 | 24 | Next, now that we have (batch_size, 2048) from both the CNN and RNN branches, these two outputs 25 | are merged using element-wise multiplication. The output of this multiplication is then fed to the 26 | classification layer which consists of 100 nodes (100 classes) and a softmax activation. 27 | 28 | ## Network Training 29 | 30 | The network was trained in two phases. In the first phase, all the layers of the CNN were frozen and only 31 | the last classification layer and the RNN network were trained. This was done using the RMSProp 32 | optimizer. 33 | 34 | In the second phase, all the layers of the entire network were unfrozen and finetuned using Adam 35 | optimizer with a learning rate of 0.0001. 36 | 37 | Using this two phase training technique, the cnn/rnn model combination is able to achieve a Top 5 Accuracy of 96.14% on 38 | a minified version of the ImageNet dataset that contains only 100 classes (tiny-imagenet-100) 39 | 40 | ## Dataset Structure 41 | 42 | Keras’ ImageDataGenerator flow_from_directory method 43 | expects the dataset to be in a certain structure. 44 | 45 | The restructure_dataset.py script in the helpers directory can be used 46 | to reorganize the original dataset (given it has the same structure as the tiny-imagenet-100 dataset) into the strucutre Keras 47 | expects. 48 | 49 | ## Image Preprocessing 50 | 51 | The Xception model expects images to be processed in a certain way. However, because 52 | Keras’ built in ImageDataGenerator is used, We could not easily preprocess the input while using the 53 | fit_generator() training method. 54 | 55 | Consequently, in cnn_rnn_classifier.py, a new class was created, CustomImageDataGenerator that inherits 56 | from ImageDataGenerator and has an overloaded standardize() method which is called by 57 | ImageDataGenerator before batch is yielded to fit_generator(). 58 | 59 | The standardize() method of CustomImageDataGenerator applies the Xception model’s required 60 | preprocessing on the input. 61 | -------------------------------------------------------------------------------- /helpers/get_prob_dist.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from keras.applications.xception import Xception, decode_predictions, preprocess_input 4 | from keras.callbacks import ModelCheckpoint 5 | from keras.engine import Input, merge 6 | from keras.layers import GlobalAveragePooling2D, Dense, Reshape, Lambda, K, LSTM 7 | from keras.preprocessing import image 8 | from keras.preprocessing.image import ImageDataGenerator, img_to_array, load_img 9 | from keras.models import Model 10 | from keras.callbacks import TensorBoard 11 | from keras.optimizers import Adam 12 | 13 | import time 14 | import numpy as np 15 | from keras.utils import np_utils 16 | 17 | np.random.seed(1337) 18 | 19 | 20 | def load_val_dataset(): 21 | validation_data_dir = './tiny-imagenet-100-A/val/' 22 | 23 | classes = [] 24 | for subdir in sorted(os.listdir(validation_data_dir)): 25 | if os.path.isdir(os.path.join(validation_data_dir, subdir)): 26 | classes.append(subdir) 27 | 28 | class_indices = dict(zip(classes, range(len(classes)))) 29 | 30 | X_val = [] 31 | 32 | # Extracting validation dat 33 | i = 0 34 | y_val = [] 35 | for subdir in classes: 36 | subpath = os.path.join(validation_data_dir, subdir) 37 | for fname in sorted(os.listdir(subpath)): 38 | y_val.append(class_indices[subdir]) 39 | 40 | # Load image as numpy array and append it to X_val 41 | img = load_img(os.path.join(subpath, fname), target_size=(img_width, img_height)) 42 | x = img_to_array(img) 43 | X_val.append(x) 44 | 45 | i += 1 46 | 47 | Y_val = np_utils.to_categorical(y_val) 48 | X_val = np.asarray(X_val, dtype='float32') 49 | return classes, X_val, Y_val 50 | 51 | 52 | def rgb_to_grayscale(input): 53 | """Average out each pixel across its 3 RGB layers resulting in a grayscale image""" 54 | return K.mean(input, axis=3) 55 | 56 | 57 | def rgb_to_grayscale_output_shape(input_shape): 58 | return input_shape[:-1] 59 | 60 | 61 | nb_val_samples = 5000 62 | 63 | img_width = 299 64 | img_height = 299 65 | 66 | print("Building model...") 67 | input_tensor = Input(shape=(img_width, img_height, 3)) 68 | 69 | # Creating CNN 70 | cnn_model = Xception(weights='imagenet', include_top=False, input_tensor=input_tensor) 71 | 72 | x = cnn_model.output 73 | cnn_bottleneck = GlobalAveragePooling2D()(x) 74 | 75 | # Creating RNN 76 | x = Lambda(rgb_to_grayscale, rgb_to_grayscale_output_shape)(input_tensor) 77 | x = Reshape((23, 3887))(x) # 23 timesteps, input dim of each timestep 3887 78 | x = LSTM(2048, return_sequences=True)(x) 79 | rnn_output = LSTM(2048)(x) 80 | 81 | # Merging both cnn bottleneck and rnn's output wise element wise multiplication 82 | x = merge([cnn_bottleneck, rnn_output], mode='mul') 83 | predictions = Dense(100, activation='softmax')(x) 84 | 85 | model = Model(input=input_tensor, output=predictions) 86 | 87 | model.load_weights("./finetuned_cnn_rnn_weights_2.hdf5") 88 | 89 | print("Model built") 90 | 91 | model.compile(optimizer=Adam(lr=0.0001), loss='categorical_crossentropy', 92 | metrics=['accuracy', 'top_k_categorical_accuracy']) 93 | 94 | classes, X_val, Y_val = load_val_dataset() 95 | 96 | n_labels = len(classes) 97 | 98 | n_imgs_by_label = np.zeros(n_labels, dtype=np.dtype(int)) 99 | n_top1_accurate_by_label = np.zeros(n_labels, dtype=np.dtype(int)) 100 | n_top5_accurate_by_label = np.zeros(n_labels, dtype=np.dtype(int)) 101 | 102 | # Loop over each validation image and calculate Top-1 and Top-5 Correct Classification Rate 103 | for i, img in enumerate(X_val): 104 | print(i) 105 | ground_truth = Y_val[i].argmax() 106 | n_imgs_by_label[ground_truth] += 1 107 | 108 | img = np.expand_dims(img, axis=0) 109 | img = preprocess_input(img) 110 | preds = model.predict(img) 111 | 112 | top_5_indices = (-preds).argsort()[:, :5] 113 | top_5_indices = top_5_indices[0] 114 | if ground_truth == top_5_indices[0]: 115 | n_top1_accurate_by_label[ground_truth] += 1 116 | if ground_truth in top_5_indices: 117 | n_top5_accurate_by_label[ground_truth] += 1 118 | 119 | # Create a text file that contains the top 1 and top 5 ACCR of each label 120 | results_path = '/home/shady-fanous/cnn_rnn_results.txt' 121 | with open(results_path, 'w+') as f: 122 | f.write('Label\tTop-1 Accuracy\tTop-5 Accuracy\n') 123 | for i, label in enumerate(classes): 124 | label_top1_accuracy = round(100.0 * n_top1_accurate_by_label[i] / n_imgs_by_label[i], 2) 125 | label_top5_accuracy = round(100.0 * n_top5_accurate_by_label[i] / n_imgs_by_label[i], 2) 126 | line = '{}\t{}\t{}\n'.format(label, label_top1_accuracy, label_top5_accuracy) 127 | f.write(line) 128 | -------------------------------------------------------------------------------- /cnn_rnn_classifier.py: -------------------------------------------------------------------------------- 1 | from keras.applications.xception import Xception 2 | from keras.callbacks import ModelCheckpoint 3 | from keras.engine import Input, merge 4 | from keras.layers import GlobalAveragePooling2D, Dense, Reshape, Lambda, K, LSTM 5 | from keras.preprocessing.image import ImageDataGenerator 6 | from keras.models import Model 7 | from keras.callbacks import TensorBoard 8 | from keras.optimizers import Adam 9 | 10 | import time 11 | import numpy as np 12 | 13 | np.random.seed(1337) 14 | 15 | 16 | class CustomImageDataGenerator(ImageDataGenerator): 17 | """ 18 | Because Xception utilizes a custom preprocessing method, the only way to utilize this 19 | preprocessing method using the ImageDataGenerator is to overload the standardize method. 20 | 21 | The standardize method gets applied to each batch before ImageDataGenerator yields that batch. 22 | """ 23 | 24 | def standardize(self, x): 25 | """ 26 | Taken from keras.applications.xception.preprocess_input 27 | """ 28 | if self.featurewise_center: 29 | x /= 255. 30 | x -= 0.5 31 | x *= 2. 32 | return x 33 | 34 | 35 | def get_training_generator(batch_size=128): 36 | train_data_dir = './tiny-imagenet-100-A/train/' 37 | validation_data_dir = './tiny-imagenet-100-A/val/' 38 | image_datagen = CustomImageDataGenerator(featurewise_center=True) 39 | 40 | train_generator = image_datagen.flow_from_directory( 41 | train_data_dir, 42 | target_size=(img_width, img_height), 43 | batch_size=batch_size 44 | ) 45 | 46 | val_generator = image_datagen.flow_from_directory( 47 | validation_data_dir, 48 | target_size=(img_width, img_height), 49 | batch_size=batch_size, 50 | shuffle=False 51 | ) 52 | 53 | return train_generator, val_generator 54 | 55 | 56 | def rgb_to_grayscale(input): 57 | """Average out each pixel across its 3 RGB layers resulting in a grayscale image""" 58 | return K.mean(input, axis=3) 59 | 60 | 61 | def rgb_to_grayscale_output_shape(input_shape): 62 | return input_shape[:-1] 63 | 64 | 65 | batch_size_phase_one = 32 66 | batch_size_phase_two = 16 67 | nb_val_samples = 5000 68 | 69 | nb_epochs = 30 70 | 71 | img_width = 299 72 | img_height = 299 73 | 74 | # Setting tensorbord callback 75 | now = time.strftime("%c") 76 | tensorboard_callback = TensorBoard(log_dir='./logs/' + 'cnn_rnn ' + now, histogram_freq=0, write_graph=True, 77 | write_images=False) 78 | 79 | # Loading dataset 80 | print("Loading the dataset with batch size of {}...".format(batch_size_phase_one)) 81 | train_generator, val_generator = get_training_generator(batch_size_phase_one) 82 | print("Dataset loaded") 83 | 84 | print("Building model...") 85 | input_tensor = Input(shape=(img_width, img_height, 3)) 86 | 87 | # Creating CNN 88 | cnn_model = Xception(weights='imagenet', include_top=False, input_tensor=input_tensor) 89 | 90 | x = cnn_model.output 91 | cnn_bottleneck = GlobalAveragePooling2D()(x) 92 | 93 | # Make CNN layers not trainable 94 | for layer in cnn_model.layers: 95 | layer.trainable = False 96 | 97 | # Creating RNN 98 | x = Lambda(rgb_to_grayscale, rgb_to_grayscale_output_shape)(input_tensor) 99 | x = Reshape((23, 3887))(x) # 23 timesteps, input dim of each timestep 3887 100 | x = LSTM(2048, return_sequences=True)(x) 101 | rnn_output = LSTM(2048)(x) 102 | 103 | # Merging both cnn bottleneck and rnn's output wise element wise multiplication 104 | x = merge([cnn_bottleneck, rnn_output], mode='mul') 105 | predictions = Dense(100, activation='softmax')(x) 106 | 107 | model = Model(input=input_tensor, output=predictions) 108 | 109 | print("Model built") 110 | 111 | model.compile(optimizer='rmsprop', loss='categorical_crossentropy') 112 | 113 | print("Starting training") 114 | checkpointer = ModelCheckpoint(filepath="./initial_cnn_rnn_weights_2.hdf5", verbose=1, save_best_only=True) 115 | model.fit_generator(train_generator, samples_per_epoch=4480, nb_epoch=nb_epochs, verbose=1, 116 | validation_data=val_generator, 117 | nb_val_samples=nb_val_samples, 118 | callbacks=[tensorboard_callback, checkpointer]) 119 | 120 | print("Initial training done, starting phase two (finetuning)") 121 | 122 | # Load two new generator with smaller batch size, needed because using the same batch size 123 | # for the fine tuning will result in GPU running out of memory and tensorflow raising an error 124 | print("Loading the dataset with batch size of {}...".format(batch_size_phase_two)) 125 | train_generator, val_generator = get_training_generator(batch_size_phase_two) 126 | print("Dataset loaded") 127 | 128 | # Load best weights from initial training 129 | model.load_weights("./initial_cnn_rnn_weights_2.hdf5") 130 | 131 | # Make all layers trainable for finetuning 132 | for layer in model.layers: 133 | layer.trainable = True 134 | 135 | model.compile(optimizer=Adam(lr=0.0001), loss='categorical_crossentropy', 136 | metrics=['accuracy', 'top_k_categorical_accuracy']) 137 | 138 | checkpointer = ModelCheckpoint(filepath="./finetuned_cnn_rnn_weights_2.hdf5", verbose=1, save_best_only=True, 139 | monitor='val_acc') 140 | model.fit_generator(train_generator, samples_per_epoch=2240, nb_epoch=nb_epochs, verbose=1, 141 | validation_data=val_generator, 142 | nb_val_samples=nb_val_samples, 143 | callbacks=[tensorboard_callback, checkpointer]) 144 | 145 | # Final evaluation of the model 146 | print("Training done, doing final evaluation...") 147 | 148 | model.load_weights("./finetuned_cnn_rnn_weights_2.hdf5") 149 | 150 | model.compile(optimizer=Adam(lr=0.0001), loss='categorical_crossentropy', 151 | metrics=['accuracy', 'top_k_categorical_accuracy']) 152 | 153 | scores = model.evaluate_generator(val_generator, val_samples=nb_val_samples) 154 | print(model.metrics_names, scores) 155 | print("%s: %.2f%%" % (model.metrics_names[1], scores[1] * 100)) 156 | --------------------------------------------------------------------------------