├── ImageCaptioning_Prediction.py
├── projectapp.py
├── README.md
├── Image_Captioning.py
└── ImageCaptioning_Model.py

/ImageCaptioning_Prediction.py:
--------------------------------------------------------------------------------
 1 | def prediction_function(img_path):
 2 |     import numpy as np
 3 |     import pandas as pd
 4 |     import matplotlib.pyplot as plt
 5 |     import os
 6 |     import keras
 7 |     import tensorflow as tf
 8 |     from keras.preprocessing.sequence import pad_sequences
 9 |     from keras.preprocessing.text import Tokenizer
10 |     from keras.layers import concatenate, BatchNormalization, Input
11 |     from keras.layers.merge import add
12 |     from keras.utils import to_categorical, plot_model
13 |     import io
14 |     import boto3
15 |     from smart_open import smart_open
16 |     import string
17 |     from keras.preprocessing.image import load_img, img_to_array
18 |     from PIL import Image
19 |     import numpy as np
20 |     from numpy.testing import assert_allclose
21 |     from keras.models import load_model
22 |     from keras.callbacks import ModelCheckpoint
23 |     import pickle
24 |     from keras.applications.resnet50 import ResNet50
25 |     from keras.optimizers import Adam
26 |     from keras.layers import Dense, Flatten, Input, Convolution2D, Dropout, LSTM, TimeDistributed, Embedding, Bidirectional, Activation, RepeatVector, Concatenate
27 |     from keras.models import Sequential, Model
28 |     from keras.utils import np_utils
29 |     from keras.preprocessing import image, sequence
30 |     from gtts import gTTS
31 |     import IPython.display as ipd
32 |     import pickle
33 | 
34 |     s3_resource = boto3.resource('s3')
35 | 
36 |     wordtoix, ixtoword, max_length = pickle.loads(s3_resource.Bucket("projectdata27").Object("project.pkl").get()['Body'].read())
37 | 
38 |     bucket = s3_resource.Bucket('projectdata27')
39 | 
40 |     client = boto3.client('s3')
41 |     client.download_file('projectdata27',
42 |                          'model4.h5',
43 |                          'model4.h5')
44 |     # returns a compiled model
45 |     # identical to the previous one
46 |     model = load_model('model4.h5')
47 | 
48 |     client.download_file('projectdata27',
49 |                          'modelR.h5',
50 |                          'modelR.h5')
51 |     modelR = load_model('modelR.h5')
52 | 
53 |     k = 'data/data/Images/' + img_path
54 |     imag = bucket.Object(k)
55 |     img_data = imag.get().get('Body').read()
56 |     img = Image.open(io.BytesIO(img_data))
57 |     img = img.resize((224, 224))
58 | 
59 |     im = img_to_array(img)
60 |     im = np.expand_dims(im, axis=0)
61 |     im = modelR.predict(im)
62 | 
63 |     in_text = 'startseq'
64 |     for i in range(max_length):
65 |         sequence = [wordtoix[w] for w in in_text.split() if w in wordtoix]
66 |         sequence = pad_sequences([sequence], maxlen=max_length)
67 |         yhat = model.predict([im, sequence], verbose=0)  # use the loaded caption model and the ResNet50 features computed above
68 |         yhat = np.argmax(yhat)
69 |         word = ixtoword[yhat]
70 |         in_text += ' ' + word
71 |         if word == 'endseq':
72 |             break
73 |     final = in_text.split()
74 |     final = final[1:-1]
75 |     final = ' '.join(final)
76 | 
77 |     print(final)
78 | 
79 |     language = 'en'
80 |     myobj = gTTS(text=final, lang=language, slow=False)
81 |     myobj.save("project.mp3")
82 |     os.system("project.mp3")
83 |     ipd.Audio("project.mp3", autoplay=True)
84 | 
85 | 
86 | 
87 | 
88 | if __name__ == '__main__':
89 |     i = input('Enter the image id')
90 |     prediction_function(i)
91 | 
92 | 
93 | 
94 | 
95 | 
--------------------------------------------------------------------------------
/projectapp.py:
--------------------------------------------------------------------------------
 1 | def prediction_function():
 2 |     import streamlit as st
 3 |     import numpy as np
 4 |     import os
 5 |     import keras
 6 |     import tensorflow as tf
 7 |     from keras.preprocessing.sequence import pad_sequences
 8 |     from keras.preprocessing.text import Tokenizer
 9 |     from keras.layers import concatenate, BatchNormalization, Input
10 |     from keras.layers.merge import add
11 |     from keras.utils import to_categorical, plot_model
12 |     import io
13 |     import boto3
14 |     from smart_open import smart_open
15 |     import string
16 |     from keras.preprocessing.image import load_img, img_to_array
17 |     from PIL import Image
18 |     import numpy as np
19 |     from numpy.testing import assert_allclose
20 |     from keras.models import load_model
21 |     from keras.callbacks import ModelCheckpoint
22 |     import pickle
23 |     from keras.applications.resnet50 import ResNet50
24 |     from keras.optimizers import Adam
25 |     from keras.layers import Dense, Flatten, Input, Convolution2D, Dropout, LSTM, TimeDistributed, Embedding, Bidirectional, Activation, RepeatVector, Concatenate
26 |     from keras.models import Sequential, Model
27 |     from keras.utils import np_utils
28 |     from keras.preprocessing import image, sequence
29 |     from gtts import gTTS
30 |     import pickle
31 |     import IPython.display as ipd
32 | 
33 |     st.title("Image upload")
34 |     image_file = st.file_uploader("Upload Image", type=["jpg"])
35 | 
36 |     if image_file is not None:
37 |         st.image(Image.open(image_file), width=200)  # st.image has no height argument; width alone preserves the aspect ratio
38 | 
39 |     if st.button("Generate Caption"):
40 |         s3_resource = boto3.resource('s3')
41 |         wordtoix, ixtoword, max_length = pickle.loads(s3_resource.Bucket("projectdata27").Object("project.pkl").get()['Body'].read())
42 | 
43 |         bucket = s3_resource.Bucket('projectdata27')
44 |         client = boto3.client('s3')
45 |         client.download_file('projectdata27',
46 |                              'model4.h5',
47 |                              'model4.h5')
48 |         # returns a compiled model
49 |         # identical to the previous one
50 |         model = load_model('model4.h5', compile=False)
51 |         client.download_file('projectdata27',
52 |                              'modelR.h5',
53 |                              'modelR.h5')
54 |         modelR = load_model('modelR.h5', compile=False)
55 |         img = Image.open(image_file)
56 |         img = img.resize((224, 224))
57 |         im = img_to_array(img)
58 |         im = np.expand_dims(im, axis=0)
59 |         im = modelR.predict(im)
60 |         in_text = 'startseq'
61 |         for i in range(max_length):
62 |             sequence = [wordtoix[w] for w in in_text.split() if w in wordtoix]
63 |             sequence = pad_sequences([sequence], maxlen=max_length)
64 |             yhat = model.predict([im, sequence], verbose=0)
65 |             yhat = np.argmax(yhat)
66 |             word = ixtoword[yhat]
67 |             in_text += ' ' + word
68 |             if word == 'endseq':
69 |                 break
70 |         final = in_text.split()
71 |         final = final[1:-1]
72 |         final = ' '.join(final)
73 |         st.success('Description: {}'.format(final))
74 |         st.write(final)
75 | 
76 |         language = 'en'
77 |         myobj = gTTS(text=final, lang=language, slow=False)
78 |         myobj.save("project.mp3")
79 |         audio_file = open('project.mp3', 'rb')
80 |         audio_bytes = audio_file.read()
81 |         st.audio(audio_bytes, format='audio/mp3')
82 | 
83 | 
84 | if __name__ == '__main__':
85 | 
86 |     prediction_function()
87 | 
88 | 
89 | 
90 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ## Image_captioning
 2 | # Image Caption Generator using Deep Learning
 3 | Image captioning is a research area of Artificial Intelligence (AI) that deals with understanding an image and generating a language description for it. Image understanding requires detecting and recognizing objects; it also requires understanding the scene type or location, object properties and their interactions.
Generating well-formed sentences requires both syntactic and semantic understanding of the language. In this project, a framework is developed that leverages the capabilities of artificial neural networks to “caption an image based on its significant features”. Generating image semantics involves not only recognizing the objects or the scene in the image, but also analyzing their states, understanding the relationships among them and producing a correct sentence. The model uses Convolutional Neural Networks (CNNs) to read image data and Long Short-Term Memory (LSTM) networks to learn the sentences/captions for an image. The Flickr8K dataset is used to train the model.
 4 | 
 5 | ## Problem Statement:
 6 | The objective is to generate a semantically and syntactically correct description of the objects or scene recognized in the image, and to capture the relationships among them.
 7 | 
 8 | ## Dataset:
 9 | 1. **Flickr8k_Dataset**: Contains a total of 8092 images in JPG format with different shapes and sizes, of which 6000 are used for training, 1000 for testing and 1000 for development.
10 | 2. **Flickr8k_text**: Contains text files describing the train and test sets. Flickr8k.token.txt contains 5 captions for each image, i.e. a total of 40460 captions.
11 | 
12 | ## Built With:
13 | Jupyter Interface on EMR
14 | 
15 | ## What is Image Captioning?
16 | Image Captioning is the process of generating a textual description of an image. It uses both Natural Language Processing and Computer Vision to generate the captions.
17 | ![1](https://user-images.githubusercontent.com/63635084/105141690-89dce100-5b1f-11eb-8d64-103f5905aa5c.png)
18 | 
19 | ## Model Architecture:
20 | ![WhatsApp Image 2021-01-20 at 1 34 45 PM](https://user-images.githubusercontent.com/63635084/105146486-1094bc80-5b26-11eb-9fbc-04128defd40d.jpeg)
21 | 
22 | ## Execution:
23 | 
24 | The problem needs two models and seamless integration between them. The network can be viewed as a combination of an encoder and a decoder. The encoder is a Convolutional Neural Network (CNN): the image is processed by the CNN and its features are extracted. The output of the CNN is connected to a Long Short-Term Memory (LSTM) network, a special kind of Recurrent Neural Network (RNN). LSTMs are capable of learning long-term dependencies. The model is built using Keras, a high-level deep learning library for Python that runs on top of TensorFlow; its API is simple and uses the TensorFlow backend.
25 | 
26 | Transfer learning is used for the CNN. Transfer learning is a major topic in machine learning that involves storing the knowledge gained on one problem and applying it to another. A pretrained network is used because CNN models are difficult to train from scratch, and doing so can be very computationally expensive, taking several hours even on a GPU. In the scientific community it is very common to take a model pretrained on a larger dataset and use it as a feature extractor.
27 | 
28 | The output of the image model acts as input to the language model. To learn the captions paired with the images, a Recurrent Neural Network (RNN) is used, specifically a Long Short-Term Memory (LSTM) network, a variation of the RNN with powerful update equations and backpropagation through time. The LSTM acts as the language model and decoder, trained on the feature vectors. LSTMs have had phenomenal influence and success in problems such as language modelling, speech recognition and translation.
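To make the encoder-decoder wiring concrete, here is a minimal Keras sketch of the decoder described above, mirroring the model built in `Image_Captioning.py` / `ImageCaptioning_Model.py`: a 2048-d ResNet50 feature vector on the image branch, a GloVe-initialised embedding plus LSTM on the text branch, and a softmax over the vocabulary that predicts the next word. The `vocab_size`, `max_length` and `emb_dim` values below are taken from the project's preprocessing (1650 retained words, maximum caption length 34, 200-d GloVe vectors) and are shown here as fixed constants only for illustration.

```python
from keras.layers import Input, Dense, Dropout, Embedding, LSTM, add
from keras.models import Model

vocab_size = 1650   # words kept after frequency filtering (computed from the captions in the project)
max_length = 34     # longest (padded) caption length (computed from the captions in the project)
emb_dim = 200       # GloVe 200-d vectors initialise the embedding layer

# Image branch: 2048-d ResNet50 feature vector -> dense representation
ip1 = Input(shape=(2048,))
fe1 = Dropout(0.2)(ip1)
fe2 = Dense(256, activation='relu')(fe1)

# Text branch: partial caption (word indices) -> embedding -> LSTM
ip2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size, emb_dim, mask_zero=True)(ip2)
se2 = Dropout(0.2)(se1)
se3 = LSTM(256)(se2)

# Merge both branches and predict the next word in the caption
decoder1 = add([fe2, se3])
decoder2 = Dense(512, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

model = Model(inputs=[ip1, ip2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
```

At inference time, captions are generated word by word: starting from `startseq`, the model repeatedly predicts the most likely next word given the image features and the partial caption, until it emits `endseq` or `max_length` is reached.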
29 | 30 | LSTM picks part of image and maps to the appropriate word in the caption. An embedding layer is created to get a vector representation for each word in the caption. Then the output vector is given as input to LSTM for the model to learn the neighbouring words for each word. Then the LSTM output is converted to fixed dimension using dense layer. Now, the outputs from both Language Model and Image model are combined, and input the vector to LSTM. LSTM learns the different captions for that image in training phase. The LSTM output is converted to the size of vocabulary size using the dense layer and activate the model using activation method. In testing phase, LSTM predict the captions for the image. LSTM predicts next word for the given image with the partial caption available at that stage. 31 | 32 | ## Network/Model: 33 | ![1](https://user-images.githubusercontent.com/63635084/105346093-1b754d00-5c0b-11eb-92df-a0efcdc415b1.JPG) 34 | ![cnn](https://user-images.githubusercontent.com/63635084/105346106-203a0100-5c0b-11eb-8d7d-3a1ea1679817.JPG) 35 | 36 | ## Code: 37 | [Click here for Code](https://github.com/AWS-Big-Data-Projects/IMAGE_CAPTION_GENERATOR/blob/master/Image_Captioning.py) 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /Image_Captioning.py: -------------------------------------------------------------------------------- 1 | ## Importing Required Libraries 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import matplotlib.pyplot as plt 6 | import os 7 | import keras 8 | import tensorflow as tf 9 | from keras.preprocessing.sequence import pad_sequences 10 | from keras.preprocessing.text import Tokenizer 11 | from keras.layers import concatenate, BatchNormalization, Input 12 | from keras.layers.merge import add 13 | from keras.utils import to_categorical, plot_model 14 | import io 15 | import boto3 16 | from smart_open import smart_open 17 | import string 18 | from keras.preprocessing.image import load_img, img_to_array 19 | from PIL import Image 20 | import numpy as np 21 | from tensorflow.keras.applications.inception_v3 import InceptionV3 22 | from numpy.testing import assert_allclose 23 | from keras.models import load_model 24 | from keras.callbacks import ModelCheckpoint 25 | import pickle 26 | from keras.applications.resnet50 import ResNet50 27 | from keras.optimizers import Adam 28 | from keras.layers import Dense, Flatten,Input, Convolution2D, Dropout, LSTM, TimeDistributed, Embedding, Bidirectional, Activation, RepeatVector,Concatenate 29 | from keras.models import Sequential, Model 30 | from keras.utils import np_utils 31 | from keras.preprocessing import image, sequence 32 | 33 | 34 | ## Loading Text Data""" 35 | 36 | token_path = 's3://projectdata27/data/data/captions.txt' 37 | text = smart_open(token_path, 'r', encoding = 'utf-8').read() 38 | 39 | 40 | ## Preprocessing Text Data 41 | 42 | descriptions = dict() 43 | for line in text.split('\n'): 44 | # split line by white space 45 | tokens = line.split(',') 46 | 47 | # take the first token as image id, the rest as description 48 | image_id, image_desc = tokens[0], tokens[1:] 49 | 50 | # extract filename from image id 51 | image_id = image_id.split('.')[0] 52 | 53 | # convert description tokens back to string 54 | image_desc = ' '.join(image_desc) 55 | if image_id not in descriptions.keys(): 56 | descriptions[image_id] = list() 57 | descriptions[image_id].append(image_desc) 58 | 59 | print(descriptions['3534548254_7bee952a0e']) 60 | 61 | 62 | # prepare 
translation table for removing punctuation 63 | 64 | table = str.maketrans('', '', string.punctuation) 65 | for key, desc_list in descriptions.items(): 66 | for i in range(len(desc_list)): 67 | desc = desc_list[i] 68 | # tokenize 69 | desc = desc.split() 70 | # convert to lower case 71 | desc = [word.lower() for word in desc] 72 | # remove punctuation from each token 73 | desc = [w.translate(table) for w in desc] 74 | # remove hanging 's' and 'a' 75 | desc = [word for word in desc if len(word)>1] 76 | # remove tokens with numbers in them 77 | desc = [word for word in desc if word.isalpha()] 78 | # store as string 79 | desc_list[i] = ' '.join(desc) 80 | 81 | del descriptions[''] 82 | 83 | 84 | t=[] 85 | token_path = 's3://projectdata27/data/data/trainimages.txt' 86 | train = smart_open(token_path, 'r', encoding = 'utf-8').read() 87 | for line in train.split('\n'): 88 | t.append(line[:-4]) 89 | 90 | t.remove('') 91 | 92 | 93 | vocabulary = set() 94 | for key in t: 95 | [vocabulary.update(d.split()) for d in descriptions[key]] 96 | print('Original Vocabulary Size: %d' % len(vocabulary)) 97 | 98 | # Create a list of all the training captions 99 | all_captions = [] 100 | for key, val in descriptions.items(): 101 | if key in t: 102 | for cap in val: 103 | all_captions.append(cap) 104 | 105 | 106 | # Consider only words which occur at least 10 times in the corpus 107 | 108 | word_count_threshold = 10 109 | word_counts = {} 110 | nsents = 0 111 | for sent in all_captions: 112 | nsents += 1 113 | for w in sent.split(' '): 114 | word_counts[w] = word_counts.get(w, 0) + 1 115 | 116 | vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold] 117 | print('preprocessed words %d ' % len(vocab)) 118 | 119 | 120 | #find the maximum length of a description in a dataset 121 | 122 | max_length = max(len(des.split()) for des in all_captions) 123 | max_length 124 | 125 | despc = dict() 126 | for key, des_list in descriptions.items(): 127 | if key in t: 128 | despc[key] = list() 129 | for line in des_list: 130 | desc = 'startseq ' + line + ' endseq' 131 | despc[key].append(desc) 132 | 133 | 134 | # word mapping to integers 135 | 136 | ixtoword = {} 137 | wordtoix = {} 138 | 139 | ix = 1 140 | for word in vocab: 141 | wordtoix[word] = ix 142 | ixtoword[ix] = word 143 | ix += 1 144 | 145 | 146 | # convert a dictionary of clean descriptions to a list of descriptions 147 | 148 | def to_lines(descriptions): 149 | all_desc = list() 150 | for key in t: 151 | [all_desc.append(d) for d in descriptions[key]] 152 | return all_desc 153 | # calculate the length of the description with the most words 154 | def max_length(descriptions): 155 | lines = to_lines(descriptions) 156 | return max(len(d.split()) for d in lines) 157 | # determine the maximum sequence length 158 | max_length = max_length(despc) 159 | print('Max Description Length: %d' % max_length) 160 | 161 | s3 = boto3.resource('s3') 162 | 163 | bucket = s3.Bucket('projectdata27') 164 | 165 | temp = captions[10].split(",") 166 | image = bucket.Object('data/data/Images/'+temp[0]) 167 | img_data = image.get().get('Body').read() 168 | img=Image.open(io.BytesIO(img_data)) 169 | plt.imshow(img) 170 | 171 | for ix in range(len(tokens[temp[0]])): 172 | print(tokens[temp[0]][ix]) 173 | 174 | modelR = load_model('modelR.h5') 175 | 176 | train_path='s3://projectdata27/data/data/trainimages.txt' 177 | x_train = smart_open(train_path, 'r', encoding = 'utf-8').read().split("\n") 178 | 179 | 180 | x_train.remove('') 181 | 182 | 183 | def preprocessing(img_path): 
184 | k='data/data/Images/'+img_path 185 | imag = bucket.Object(k) 186 | img_data = imag.get().get('Body').read() 187 | img=Image.open(io.BytesIO(img_data)) 188 | img=img.resize((224,224)) 189 | 190 | 191 | im = img_to_array(img) 192 | im = np.expand_dims(im, axis=0) 193 | return im 194 | 195 | train_data = {} 196 | 197 | for ix in x_train: 198 | img = preprocessing(ix) 199 | train_data[ix] = modelR.predict(img).reshape(2048) 200 | 201 | train_data 202 | 203 | 204 | # load glove vectors for embedding layer 205 | 206 | vocab_size=1650 207 | embeddings_index = {} 208 | 209 | 210 | g = smart_open('s3://projectdata27/glove.6B.200d.txt', 'r', encoding = 'utf-8').read() 211 | 212 | for line in g.split("\n"): 213 | values = line.split(" ") 214 | word = values[0] 215 | indices = np.asarray(values[1: ], dtype = 'float32') 216 | embeddings_index[word] = indices 217 | 218 | len(embeddings_index) 219 | 220 | emb_dim= 200 221 | emb_matrix = np.zeros((vocab_size, emb_dim)) 222 | for word, i in wordtoix.items(): 223 | emb_vec = embeddings_index.get(word) 224 | if emb_vec is not None: 225 | emb_matrix[i] = emb_vec 226 | emb_matrix.shape 227 | 228 | X1, X2, y = list(), list(), list() 229 | for key, des_list in despc.items(): 230 | if key in t: 231 | pic = train_data[key + '.jpg'] 232 | for cap in des_list: 233 | seq = [wordtoix[word] for word in cap.split(' ') if word in wordtoix] 234 | for i in range(1, len(seq)): 235 | in_seq, out_seq = seq[:i], seq[i] 236 | in_seq = pad_sequences([in_seq], maxlen = max_length)[0] 237 | out_seq = to_categorical([out_seq], num_classes = vocab_size)[0] 238 | # store 239 | X1.append(pic) 240 | X2.append(in_seq) 241 | y.append(out_seq) 242 | 243 | X2 = np.array(X2) 244 | X1 = np.array(X1) 245 | y = np.array(y) 246 | 247 | 248 | ip1 = Input(shape = (2048, )) 249 | fe1 = Dropout(0.2)(ip1) 250 | fe2 = Dense(256, activation = 'relu')(fe1) 251 | ip2 = Input(shape = (max_length, )) 252 | se1 = Embedding(vocab_size, emb_dim, mask_zero = True)(ip2) 253 | se2 = Dropout(0.2)(se1) 254 | se3 = LSTM(256)(se2) 255 | decoder1 = add([fe2, se3]) 256 | decoder2 = Dense(512, activation = 'relu')(decoder1) 257 | outputs = Dense(vocab_size, activation = 'softmax')(decoder2) 258 | model3 = Model(inputs = [ip1, ip2], outputs = outputs) 259 | 260 | model3.layers[2].set_weights([emb_matrix]) 261 | model3.layers[2].trainable = False 262 | model3.compile(loss = 'categorical_crossentropy', optimizer = 'adam',metrics=['accuracy']) 263 | 264 | 265 | # define the checkpoint 266 | 267 | filepath = "model3.h5" 268 | checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min') 269 | callbacks_list = [checkpoint] 270 | 271 | model3.fit([X1,X2], y, epochs=50, batch_size=256, callbacks=callbacks_list) 272 | 273 | def feature_extraction(img_path): 274 | k='data/data/Images/'+img_path 275 | imag = bucket.Object(k) 276 | img_data = imag.get().get('Body').read() 277 | img=Image.open(io.BytesIO(img_data)) 278 | img=img.resize((224,224)) 279 | 280 | im = img_to_array(img) 281 | im = np.expand_dims(im, axis=0) 282 | im = modelR.predict(im) 283 | return im 284 | 285 | def final_caption(photo): 286 | in_text = 'startseq' 287 | for i in range(max_length): 288 | sequence = [wordtoix[w] for w in in_text.split() if w in wordtoix] 289 | sequence = pad_sequences([sequence], maxlen=max_length) 290 | yhat = model3.predict([photo,sequence], verbose=0) 291 | yhat = np.argmax(yhat) 292 | word = ixtoword[yhat] 293 | in_text += ' ' + word 294 | if word == 'endseq': 295 | break 296 | final = 
in_text.split() 297 | final = final[1:-1] 298 | final = ' '.join(final) 299 | return final 300 | 301 | 302 | 303 | ## ResNet50 304 | 305 | from IPython.core.display import display, HTML 306 | display(HTML("""ResNet50 Architecture""")) 307 | 308 | modelR = ResNet50(include_top=False,weights='imagenet',input_shape=(224,224,3),pooling='avg') 309 | modelR.summary() 310 | 311 | R=load_model('modelR.h5') 312 | 313 | modelR = load_model('modelR.h5') 314 | 315 | 316 | 317 | ## Progressive Loading 318 | 319 | def data_generator(descriptions, photos, wordtoix, max_length, num_photos_per_batch): 320 | X1, X2, y = list(), list(), list() 321 | n=0 322 | # loop for ever over images 323 | while 1: 324 | for key, desc_list in descriptions.items(): 325 | n+=1 326 | # retrieve the photo feature 327 | photo = photos[key+'.jpg'] 328 | for desc in desc_list: 329 | # encode the sequence 330 | seq = [wordtoix[word] for word in desc.split(' ') if word in wordtoix] 331 | # split one sequence into multiple X, y pairs 332 | for i in range(1, len(seq)): 333 | # split into input and output pair 334 | in_seq, out_seq = seq[:i], seq[i] 335 | # pad input sequence 336 | in_seq = pad_sequences([in_seq], maxlen=max_length)[0] 337 | # encode output sequence 338 | out_seq = to_categorical([out_seq], num_classes=vocab_size)[0] 339 | # store 340 | X1.append(photo) 341 | X2.append(in_seq) 342 | y.append(out_seq) 343 | # yield the batch data 344 | if n==num_photos_per_batch: 345 | yield (np.array(X1), np.array(X2)), np.array(y) 346 | X1, X2, y = list(), list(), list() 347 | n=0 348 | 349 | embedding_dim=200 350 | inputs1 = Input(shape=(2048,)) 351 | fe1 = Dropout(0.5)(inputs1) 352 | fe2 = Dense(256, activation='relu')(fe1) 353 | inputs2 = Input(shape=(max_length,)) 354 | se1 = Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs2) 355 | se2 = Dropout(0.5)(se1) 356 | se3 = LSTM(256)(se2) 357 | decoder1 = add([fe2, se3]) 358 | decoder2 = Dense(512, activation='relu')(decoder1) 359 | outputs = Dense(vocab_size, activation='softmax')(decoder2) 360 | model2 = Model(inputs=[inputs1, inputs2], outputs=outputs) 361 | 362 | model2.layers[2].set_weights([emb_matrix]) 363 | model2.layers[2].trainable = False 364 | 365 | model2.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy']) 366 | 367 | epochs = 10 368 | number_pics_per_bath = 3 369 | steps = len(despc)//number_pics_per_bath 370 | 371 | for i in range(epochs): 372 | 373 | generator = data_generator(despc, train_data, wordtoix, max_length, number_pics_per_bath) 374 | 375 | 376 | model2.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1) 377 | model2.save('model2.h5') 378 | 379 | 380 | -------------------------------------------------------------------------------- /ImageCaptioning_Model.py: -------------------------------------------------------------------------------- 1 | ## Importing Required Libraries 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import matplotlib.pyplot as plt 6 | import os 7 | import keras 8 | import tensorflow as tf 9 | from keras.preprocessing.sequence import pad_sequences 10 | from keras.preprocessing.text import Tokenizer 11 | from keras.layers import concatenate, BatchNormalization, Input 12 | from keras.layers.merge import add 13 | from keras.utils import to_categorical, plot_model 14 | import io 15 | import boto3 16 | from smart_open import smart_open 17 | import string 18 | from keras.preprocessing.image import load_img, img_to_array 19 | from PIL import Image 20 | import numpy as np 21 | from 
numpy.testing import assert_allclose 22 | from keras.models import load_model 23 | from keras.callbacks import ModelCheckpoint 24 | import pickle 25 | from keras.applications.resnet50 import ResNet50 26 | from keras.optimizers import Adam 27 | from keras.layers import Dense, Flatten,Input, Convolution2D, Dropout, LSTM, TimeDistributed, Embedding, Bidirectional, Activation, RepeatVector,Concatenate 28 | from keras.models import Sequential, Model 29 | from keras.utils import np_utils 30 | from keras.preprocessing import image, sequence 31 | from gtts import gTTS 32 | import IPython.display as ipd 33 | 34 | """## Loading Text Data""" 35 | 36 | token_path = 's3://projectdata27/data/data/captions.txt' 37 | text = smart_open(token_path, 'r', encoding = 'utf-8').read() 38 | 39 | """## Preprocessing Text Data""" 40 | 41 | descriptions = dict() 42 | for line in text.split('\n'): 43 | # split line by white space 44 | tokens = line.split(',') 45 | 46 | # take the first token as image id, the rest as description 47 | image_id, image_desc = tokens[0], tokens[1:] 48 | 49 | # extract filename from image id 50 | image_id = image_id.split('.')[0] 51 | 52 | # convert description tokens back to string 53 | image_desc = ' '.join(image_desc) 54 | if image_id not in descriptions.keys(): 55 | descriptions[image_id] = list() 56 | descriptions[image_id].append(image_desc) 57 | 58 | print(descriptions['3534548254_7bee952a0e']) 59 | 60 | # prepare translation table for removing punctuation 61 | table = str.maketrans('', '', string.punctuation) 62 | for key, desc_list in descriptions.items(): 63 | for i in range(len(desc_list)): 64 | desc = desc_list[i] 65 | # tokenize 66 | desc = desc.split() 67 | # convert to lower case 68 | desc = [word.lower() for word in desc] 69 | # remove punctuation from each token 70 | desc = [w.translate(table) for w in desc] 71 | # remove hanging 's' and 'a' 72 | desc = [word for word in desc if len(word)>1] 73 | # remove tokens with numbers in them 74 | desc = [word for word in desc if word.isalpha()] 75 | # store as string 76 | desc_list[i] = ' '.join(desc) 77 | 78 | del descriptions[''] 79 | 80 | t=[] 81 | token_path = 's3://projectdata27/data/data/trainimages.txt' 82 | train = smart_open(token_path, 'r', encoding = 'utf-8').read() 83 | for line in train.split('\n'): 84 | t.append(line[:-4]) 85 | 86 | t.remove('') 87 | 88 | vocabulary = set() 89 | for key in t: 90 | [vocabulary.update(d.split()) for d in descriptions[key]] 91 | print('Original Vocabulary Size: %d' % len(vocabulary)) 92 | 93 | # Create a list of all the training captions 94 | all_captions = [] 95 | for key, val in descriptions.items(): 96 | if key in t: 97 | for cap in val: 98 | all_captions.append(cap) 99 | 100 | 101 | # Consider only words which occur at least 10 times in the corpus 102 | word_count_threshold = 10 103 | word_counts = {} 104 | nsents = 0 105 | for sent in all_captions: 106 | nsents += 1 107 | for w in sent.split(' '): 108 | word_counts[w] = word_counts.get(w, 0) + 1 109 | 110 | vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold] 111 | print('preprocessed words %d ' % len(vocab)) 112 | 113 | #find the maximum length of a description in a dataset 114 | max_length = max(len(des.split()) for des in all_captions) 115 | max_length 116 | 117 | despc = dict() 118 | for key, des_list in descriptions.items(): 119 | if key in t: 120 | despc[key] = list() 121 | for line in des_list: 122 | desc = 'startseq ' + line + ' endseq' 123 | despc[key].append(desc) 124 | 125 | 126 | # word mapping to 
integers
127 | ixtoword = {}
128 | wordtoix = {}
129 | 
130 | ix = 1
131 | for word in vocab:
132 |     wordtoix[word] = ix
133 |     ixtoword[ix] = word
134 |     ix += 1
135 | 
136 | # convert a dictionary of clean descriptions to a list of descriptions
137 | def to_lines(descriptions):
138 |     all_desc = list()
139 |     for key in t:
140 |         [all_desc.append(d) for d in descriptions[key]]
141 |     return all_desc
142 | # calculate the length of the description with the most words
143 | def max_length(descriptions):
144 |     lines = to_lines(descriptions)
145 |     return max(len(d.split()) for d in lines)
146 | # determine the maximum sequence length
147 | max_length = max_length(despc)
148 | print('Max Description Length: %d' % max_length)
149 | 
150 | bucket = 'projectdata27'
151 | key = 'project.pkl'
152 | pickle_byte_obj = pickle.dumps([wordtoix, ixtoword, max_length])
153 | s3_resource = boto3.resource('s3')
154 | s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)
155 | 
156 | wordtoix, ixtoword, max_length = pickle.loads(s3_resource.Bucket("projectdata27").Object("project.pkl").get()['Body'].read())
157 | 
158 | s3 = boto3.resource('s3')
159 | 
160 | bucket = s3.Bucket('projectdata27')
161 | 
162 | temp = descriptions['1000268201_693b08cb0e']
163 | image = bucket.Object('data/data/Images/' + '1000268201_693b08cb0e.jpg')
164 | img_data = image.get().get('Body').read()
165 | img = Image.open(io.BytesIO(img_data))
166 | plt.imshow(img)
167 | 
168 | for ix in temp:
169 |     print(ix)
170 | 
171 | caps = despc['1000268201_693b08cb0e']  # use a separate name so the list of training image ids in `t` is not overwritten
172 | for ix in caps:
173 |     print(ix)
174 | 
175 | train_path = 's3://projectdata27/data/data/trainimages.txt'
176 | x_train = smart_open(train_path, 'r', encoding = 'utf-8').read().split("\n")
177 | 
178 | x_train[:5]
179 | 
180 | x_train.remove('')
181 | 
182 | len(x_train)
183 | 
184 | modelR = ResNet50(include_top=False, weights='imagenet', input_shape=(224,224,3), pooling='avg')
185 | modelR.summary()
186 | modelR.save('modelR.h5')  # the feature extractor must be saved locally before it can be uploaded to S3
187 | client = boto3.client('s3')
188 | client.upload_file(Filename='modelR.h5',
189 |                    Bucket="projectdata27",
190 |                    Key='modelR.h5')
191 | 
192 | client.download_file('projectdata27',
193 |                      'modelR.h5',
194 |                      'modelR.h5')
195 | # returns a compiled model
196 | # identical to the previous one
197 | modelR = load_model('modelR.h5')
198 | 
199 | modelR.summary()
200 | 
201 | def preprocessing(img_path):
202 |     k = 'data/data/Images/' + img_path
203 |     imag = bucket.Object(k)
204 |     img_data = imag.get().get('Body').read()
205 |     img = Image.open(io.BytesIO(img_data))
206 |     img = img.resize((224, 224))
207 | 
208 | 
209 |     im = img_to_array(img)
210 |     im = np.expand_dims(im, axis=0)
211 |     return im
212 | 
213 | train_data = {}
214 | 
215 | for ix in x_train:
216 |     img = preprocessing(ix)
217 |     train_data[ix] = modelR.predict(img).reshape(2048)
218 | 
219 | train_data
220 | 
221 | # load glove vectors for embedding layer
222 | vocab_size = 1650
223 | embeddings_index = {}
224 | 
225 | 
226 | g = smart_open('s3://projectdata27/glove.6B.200d.txt', 'r', encoding = 'utf-8').read()
227 | 
228 | for line in g.split("\n"):
229 |     values = line.split(" ")
230 |     word = values[0]
231 |     indices = np.asarray(values[1:], dtype = 'float32')
232 |     embeddings_index[word] = indices
233 | 
234 | 
235 | emb_dim = 200
236 | emb_matrix = np.zeros((vocab_size, emb_dim))
237 | for word, i in wordtoix.items():
238 |     emb_vec = embeddings_index.get(word)
239 |     if emb_vec is not None:
240 |         emb_matrix[i] = emb_vec
241 | emb_matrix.shape
242 | 
243 | X1, X2, y = list(), list(), list()
244 | for key, des_list in despc.items():
245 |     if key in t:
246 |         pic = train_data[key 
+ '.jpg'] 247 | for cap in des_list: 248 | seq = [wordtoix[word] for word in cap.split(' ') if word in wordtoix] 249 | for i in range(1, len(seq)): 250 | in_seq, out_seq = seq[:i], seq[i] 251 | in_seq = pad_sequences([in_seq], maxlen = max_length)[0] 252 | out_seq = to_categorical([out_seq], num_classes = vocab_size)[0] 253 | # store 254 | X1.append(pic) 255 | X2.append(in_seq) 256 | y.append(out_seq) 257 | 258 | X2 = np.array(X2) 259 | X1 = np.array(X1) 260 | y = np.array(y) 261 | 262 | X1.shape 263 | 264 | X2.shape 265 | 266 | y.shape 267 | 268 | 232328*(2048+(34*200)) 269 | 270 | ip1 = Input(shape = (2048, )) 271 | fe1 = Dropout(0.2)(ip1) 272 | fe2 = Dense(256, activation = 'relu')(fe1) 273 | ip2 = Input(shape = (max_length, )) 274 | se1 = Embedding(vocab_size, emb_dim, mask_zero = True)(ip2) 275 | se2 = Dropout(0.2)(se1) 276 | se3 = LSTM(256)(se2) 277 | decoder1 = add([fe2, se3]) 278 | decoder2 = Dense(512, activation = 'relu')(decoder1) 279 | outputs = Dense(vocab_size, activation = 'softmax')(decoder2) 280 | model3 = Model(inputs = [ip1, ip2], outputs = outputs) 281 | 282 | model3.layers[2].set_weights([emb_matrix]) 283 | model3.layers[2].trainable = False 284 | model3.compile(loss = 'categorical_crossentropy', optimizer = 'adam',metrics=['accuracy']) 285 | 286 | 287 | # define the checkpoint 288 | filepath = "model4.h5" 289 | checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min') 290 | callbacks_list = [checkpoint] 291 | 292 | model3.fit([X1,X2], y, epochs=60, batch_size=256, callbacks=callbacks_list) 293 | 294 | client = boto3.client('s3') 295 | client.upload_file(Filename='model4.h5', 296 | Bucket="projectdata27", 297 | Key='model4.h5') 298 | 299 | client.download_file('projectdata27', 300 | 'model4.h5', 301 | 'model4.h5') 302 | # returns a compiled model 303 | # identical to the previous one 304 | model3 = load_model('model4.h5') 305 | 306 | model3.summary() 307 | 308 | 309 | ## Progresive loading 310 | 311 | def data_generator(descriptions, photos, wordtoix, max_length, num_photos_per_batch): 312 | X1, X2, y = list(), list(), list() 313 | n=0 314 | # loop for ever over images 315 | while 1: 316 | for key, desc_list in descriptions.items(): 317 | n+=1 318 | # retrieve the photo feature 319 | photo = photos[key+'.jpg'] 320 | for desc in desc_list: 321 | # encode the sequence 322 | seq = [wordtoix[word] for word in desc.split(' ') if word in wordtoix] 323 | # split one sequence into multiple X, y pairs 324 | for i in range(1, len(seq)): 325 | # split into input and output pair 326 | in_seq, out_seq = seq[:i], seq[i] 327 | # pad input sequence 328 | in_seq = pad_sequences([in_seq], maxlen=max_length)[0] 329 | # encode output sequence 330 | out_seq = to_categorical([out_seq], num_classes=vocab_size)[0] 331 | # store 332 | X1.append(photo) 333 | X2.append(in_seq) 334 | y.append(out_seq) 335 | # yield the batch data 336 | if n==num_photos_per_batch: 337 | yield (np.array(X1), np.array(X2)), np.array(y) 338 | X1, X2, y = list(), list(), list() 339 | n=0 340 | 341 | embedding_dim=200 342 | inputs1 = Input(shape=(2048,)) 343 | fe1 = Dropout(0.5)(inputs1) 344 | fe2 = Dense(256, activation='relu')(fe1) 345 | inputs2 = Input(shape=(max_length,)) 346 | se1 = Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs2) 347 | se2 = Dropout(0.5)(se1) 348 | se3 = LSTM(256)(se2) 349 | decoder1 = add([fe2, se3]) 350 | decoder2 = Dense(512, activation='relu')(decoder1) 351 | outputs = Dense(vocab_size, activation='softmax')(decoder2) 352 | model2 = 
Model(inputs=[inputs1, inputs2], outputs=outputs) 353 | 354 | model2.layers[2].set_weights([emb_matrix]) 355 | model2.layers[2].trainable = False 356 | 357 | model2.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy']) 358 | 359 | epochs = 10 360 | number_pics_per_bath = 3 361 | steps = len(despc)//number_pics_per_bath 362 | 363 | for i in range(epochs): 364 | 365 | generator = data_generator(despc, train_data, wordtoix, max_length, number_pics_per_bath) 366 | 367 | 368 | model2.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1) 369 | model2.save('model2.h5') 370 | 371 | epochs = 10 372 | number_pics_per_bath = 3 373 | steps = len(despc)//number_pics_per_bath 374 | 375 | for i in range(epochs): 376 | 377 | generator = data_generator(despc, train_data, wordtoix, max_length, number_pics_per_bath) 378 | 379 | 380 | model2.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1) 381 | model2.save('model2.h5') 382 | 383 | 384 | 385 | 386 | 387 | --------------------------------------------------------------------------------