├── ImageCaptioning_Prediction.py
├── projectapp.py
├── README.md
├── Image_Captioning.py
└── ImageCaptioning_Model.py

/ImageCaptioning_Prediction.py:
--------------------------------------------------------------------------------
 1 | def prediction_function(img_path):
 2 |     import numpy as np
 3 |     import pandas as pd
 4 |     import matplotlib.pyplot as plt
 5 |     import os
 6 |     import keras
 7 |     import tensorflow as tf
 8 |     from keras.preprocessing.sequence import pad_sequences
 9 |     from keras.preprocessing.text import Tokenizer
10 |     from keras.layers import concatenate, BatchNormalization, Input
11 |     from keras.layers.merge import add
12 |     from keras.utils import to_categorical, plot_model
13 |     import io
14 |     import boto3
15 |     from smart_open import smart_open
16 |     import string
17 |     from keras.preprocessing.image import load_img, img_to_array
18 |     from PIL import Image
19 |     import numpy as np
20 |     from numpy.testing import assert_allclose
21 |     from keras.models import load_model
22 |     from keras.callbacks import ModelCheckpoint
23 |     import pickle
24 |     from keras.applications.resnet50 import ResNet50
25 |     from keras.optimizers import Adam
26 |     from keras.layers import Dense, Flatten, Input, Convolution2D, Dropout, LSTM, TimeDistributed, Embedding, Bidirectional, Activation, RepeatVector, Concatenate
27 |     from keras.models import Sequential, Model
28 |     from keras.utils import np_utils
29 |     from keras.preprocessing import image, sequence
30 |     from gtts import gTTS
31 |     import IPython.display as ipd
32 |     import pickle
33 | 
34 |     s3_resource = boto3.resource('s3')
35 | 
36 |     wordtoix, ixtoword, max_length = pickle.loads(s3_resource.Bucket("projectdata27").Object("project.pkl").get()['Body'].read())
37 | 
38 |     bucket = s3_resource.Bucket('projectdata27')
39 | 
40 |     client = boto3.client('s3')
41 |     client.download_file('projectdata27',
42 |                          'model4.h5',
43 |                          'model4.h5')
44 |     # returns a compiled model
45 |     # identical to the previous one
46 |     model = load_model('model4.h5')
47 | 
48 |     client.download_file('projectdata27',
49 |                          'modelR.h5',
50 |                          'modelR.h5')
51 |     modelR = load_model('modelR.h5')
52 | 
53 |     k = 'data/data/Images/' + img_path
54 |     imag = bucket.Object(k)
55 |     img_data = imag.get().get('Body').read()
56 |     img = Image.open(io.BytesIO(img_data))
57 |     img = img.resize((224, 224))
58 | 
59 |     im = img_to_array(img)
60 |     im = np.expand_dims(im, axis=0)
61 |     im = modelR.predict(im)
62 | 
63 |     in_text = 'startseq'
64 |     for i in range(max_length):
65 |         sequence = [wordtoix[w] for w in in_text.split() if w in wordtoix]
66 |         sequence = pad_sequences([sequence], maxlen=max_length)
67 |         yhat = model.predict([im, sequence], verbose=0)  # use the loaded caption model and the ResNet50 features computed above
68 |         yhat = np.argmax(yhat)
69 |         word = ixtoword[yhat]
70 |         in_text += ' ' + word
71 |         if word == 'endseq':
72 |             break
73 |     final = in_text.split()
74 |     final = final[1:-1]
75 |     final = ' '.join(final)
76 | 
77 |     print(final)
78 | 
79 |     language = 'en'
80 |     myobj = gTTS(text=final, lang=language, slow=False)
81 |     myobj.save("project.mp3")
82 |     os.system("project.mp3")
83 |     ipd.Audio("project.mp3", autoplay=True)
84 | 
85 | 
86 | 
87 | 
88 | if __name__ == '__main__':
89 |     i = input('Enter the image id')
90 |     prediction_function(i)
91 | 
92 | 
93 | 
94 | 
95 | 
--------------------------------------------------------------------------------
/projectapp.py:
--------------------------------------------------------------------------------
 1 | def prediction_function():
 2 |     import streamlit as st
 3 |     import numpy as np
 4 |     import os
 5 |     import keras
 6 |     import tensorflow as tf
 7 |     from keras.preprocessing.sequence import pad_sequences
 8 |     from keras.preprocessing.text import Tokenizer
 9 |     from keras.layers import concatenate, BatchNormalization, Input
10 |     from keras.layers.merge import add
11 |     from keras.utils import to_categorical, plot_model
12 |     import io
13 |     import boto3
14 |     from smart_open import smart_open
15 |     import string
16 |     from keras.preprocessing.image import load_img, img_to_array
17 |     from PIL import Image
18 |     import numpy as np
19 |     from numpy.testing import assert_allclose
20 |     from keras.models import load_model
21 |     from keras.callbacks import ModelCheckpoint
22 |     import pickle
23 |     from keras.applications.resnet50 import ResNet50
24 |     from keras.optimizers import Adam
25 |     from keras.layers import Dense, Flatten, Input, Convolution2D, Dropout, LSTM, TimeDistributed, Embedding, Bidirectional, Activation, RepeatVector, Concatenate
26 |     from keras.models import Sequential, Model
27 |     from keras.utils import np_utils
28 |     from keras.preprocessing import image, sequence
29 |     from gtts import gTTS
30 |     import pickle
31 |     import IPython.display as ipd
32 | 
33 |     st.title("Image upload")
34 |     image_file = st.file_uploader("Upload Image", type=["jpg"])
35 | 
36 |     if image_file is not None:
37 |         st.image(Image.open(image_file), width=200)  # st.image has no height argument; width alone preserves the aspect ratio
38 | 
39 |     if st.button("Generate Caption"):
40 |         s3_resource = boto3.resource('s3')
41 |         wordtoix, ixtoword, max_length = pickle.loads(s3_resource.Bucket("projectdata27").Object("project.pkl").get()['Body'].read())
42 | 
43 |         bucket = s3_resource.Bucket('projectdata27')
44 |         client = boto3.client('s3')
45 |         client.download_file('projectdata27',
46 |                              'model4.h5',
47 |                              'model4.h5')
48 |         # returns a compiled model
49 |         # identical to the previous one
50 |         model = load_model('model4.h5', compile=False)
51 |         client.download_file('projectdata27',
52 |                              'modelR.h5',
53 |                              'modelR.h5')
54 |         modelR = load_model('modelR.h5', compile=False)
55 |         img = Image.open(image_file)
56 |         img = img.resize((224, 224))
57 |         im = img_to_array(img)
58 |         im = np.expand_dims(im, axis=0)
59 |         im = modelR.predict(im)
60 |         in_text = 'startseq'
61 |         for i in range(max_length):
62 |             sequence = [wordtoix[w] for w in in_text.split() if w in wordtoix]
63 |             sequence = pad_sequences([sequence], maxlen=max_length)
64 |             yhat = model.predict([im, sequence], verbose=0)
65 |             yhat = np.argmax(yhat)
66 |             word = ixtoword[yhat]
67 |             in_text += ' ' + word
68 |             if word == 'endseq':
69 |                 break
70 |         final = in_text.split()
71 |         final = final[1:-1]
72 |         final = ' '.join(final)
73 |         st.success('Description: {}'.format(final))
74 |         st.write(final)
75 | 
76 |         language = 'en'
77 |         myobj = gTTS(text=final, lang=language, slow=False)
78 |         myobj.save("project.mp3")
79 |         audio_file = open('project.mp3', 'rb')
80 |         audio_bytes = audio_file.read()
81 |         st.audio(audio_bytes, format='audio/mp3')
82 | 
83 | 
84 | if __name__ == '__main__':
85 | 
86 |     prediction_function()
87 | 
88 | 
89 | 
90 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ## Image_captioning
 2 | # Image Caption Generator using Deep Learning
 3 | Image captioning is a research area of Artificial Intelligence (AI) that deals with understanding an image and generating a language description for it. Image understanding requires detecting and recognizing objects; it also requires understanding the scene type or location, object properties and their interactions.
Generating well-formed sentences requires both syntactic and semantic understanding of the language. In this project, a framework is developed that leverages the capabilities of artificial neural networks to “caption an image based on its significant features”. Generating image semantics involves not only recognizing the objects or the scene in the image, but also analyzing their states, understanding the relationships among them and producing a correct sentence. The model uses Convolutional Neural Networks (CNNs) to read image data and Long Short-Term Memory (LSTM) networks to learn the sentences/captions for an image. The Flickr8K dataset is used to train the model.
 4 | 
 5 | ## Problem Statement:
 6 | The objective is to generate a semantically and syntactically correct description of the objects or scene recognized in the image, and to capture the relationships among them.
 7 | 
 8 | ## Dataset:
 9 | 1. **Flickr8k_Dataset**: Contains a total of 8092 images in JPG format with different shapes and sizes, of which 6000 are used for training, 1000 for testing and 1000 for development.
10 | 2. **Flickr8k_text**: Contains text files describing the train and test sets. Flickr8k.token.txt contains 5 captions for each image, i.e. a total of 40460 captions.
11 | 
12 | ## Built With:
13 | Jupyter Interface on EMR
14 | 
15 | ## What is Image Captioning?
16 | Image Captioning is the process of generating a textual description of an image. It uses both Natural Language Processing and Computer Vision to generate the captions.
17 | ![1](https://user-images.githubusercontent.com/63635084/105141690-89dce100-5b1f-11eb-8d64-103f5905aa5c.png)
18 | 
19 | ## Model Architecture:
20 | ![WhatsApp Image 2021-01-20 at 1 34 45 PM](https://user-images.githubusercontent.com/63635084/105146486-1094bc80-5b26-11eb-9fbc-04128defd40d.jpeg)
21 | 
22 | ## Execution:
23 | 
24 | The problem needs two models and seamless integration between them. The network can be viewed as a combination of an encoder and a decoder. The encoder is a Convolutional Neural Network (CNN): the image is processed by the CNN and its features are extracted. The output of the CNN is connected to a Long Short-Term Memory (LSTM) network, a special kind of Recurrent Neural Network (RNN). LSTMs are capable of learning long-term dependencies. The model is built using Keras, a high-level deep learning library for Python that runs on top of TensorFlow; its API is simple and uses the TensorFlow backend.
25 | 
26 | Transfer learning is used for the CNN. Transfer learning is a major topic in machine learning that involves storing the knowledge gained on one problem and applying it to another. A pretrained network is used because CNN models are difficult to train from scratch, and doing so can be very computationally expensive, taking several hours even on a GPU. In the scientific community it is very common to take a model pretrained on a larger dataset and use it as a feature extractor.
27 | 
28 | The output of the image model acts as input to the language model. To learn the captions paired with the images, a Recurrent Neural Network (RNN) is used, specifically a Long Short-Term Memory (LSTM) network, a variation of the RNN with powerful update equations and backpropagation through time. The LSTM acts as the language model and decoder, trained on the feature vectors. LSTMs have had phenomenal influence and success in problems such as language modelling, speech recognition and translation.
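To make the encoder-decoder wiring concrete, here is a minimal Keras sketch of the decoder described above, mirroring the model built in `Image_Captioning.py` / `ImageCaptioning_Model.py`: a 2048-d ResNet50 feature vector on the image branch, a GloVe-initialised embedding plus LSTM on the text branch, and a softmax over the vocabulary that predicts the next word. The `vocab_size`, `max_length` and `emb_dim` values below are taken from the project's preprocessing (1650 retained words, maximum caption length 34, 200-d GloVe vectors) and are shown here as fixed constants only for illustration.

```python
from keras.layers import Input, Dense, Dropout, Embedding, LSTM, add
from keras.models import Model

vocab_size = 1650   # words kept after frequency filtering (computed from the captions in the project)
max_length = 34     # longest (padded) caption length (computed from the captions in the project)
emb_dim = 200       # GloVe 200-d vectors initialise the embedding layer

# Image branch: 2048-d ResNet50 feature vector -> dense representation
ip1 = Input(shape=(2048,))
fe1 = Dropout(0.2)(ip1)
fe2 = Dense(256, activation='relu')(fe1)

# Text branch: partial caption (word indices) -> embedding -> LSTM
ip2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size, emb_dim, mask_zero=True)(ip2)
se2 = Dropout(0.2)(se1)
se3 = LSTM(256)(se2)

# Merge both branches and predict the next word in the caption
decoder1 = add([fe2, se3])
decoder2 = Dense(512, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

model = Model(inputs=[ip1, ip2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
```

At inference time, captions are generated word by word: starting from `startseq`, the model repeatedly predicts the most likely next word given the image features and the partial caption, until it emits `endseq` or `max_length` is reached.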
29 | 30 | LSTM picks part of image and maps to the appropriate word in the caption. An embedding layer is created to get a vector representation for each word in the caption. Then the output vector is given as input to LSTM for the model to learn the neighbouring words for each word. Then the LSTM output is converted to fixed dimension using dense layer. Now, the outputs from both Language Model and Image model are combined, and input the vector to LSTM. LSTM learns the different captions for that image in training phase. The LSTM output is converted to the size of vocabulary size using the dense layer and activate the model using activation method. In testing phase, LSTM predict the captions for the image. LSTM predicts next word for the given image with the partial caption available at that stage. 31 | 32 | ## Network/Model: 33 | ![1](https://user-images.githubusercontent.com/63635084/105346093-1b754d00-5c0b-11eb-92df-a0efcdc415b1.JPG) 34 | ![cnn](https://user-images.githubusercontent.com/63635084/105346106-203a0100-5c0b-11eb-8d7d-3a1ea1679817.JPG) 35 | 36 | ## Code: 37 | [Click here for Code](https://github.com/AWS-Big-Data-Projects/IMAGE_CAPTION_GENERATOR/blob/master/Image_Captioning.py) 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /Image_Captioning.py: -------------------------------------------------------------------------------- 1 | ## Importing Required Libraries 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import matplotlib.pyplot as plt 6 | import os 7 | import keras 8 | import tensorflow as tf 9 | from keras.preprocessing.sequence import pad_sequences 10 | from keras.preprocessing.text import Tokenizer 11 | from keras.layers import concatenate, BatchNormalization, Input 12 | from keras.layers.merge import add 13 | from keras.utils import to_categorical, plot_model 14 | import io 15 | import boto3 16 | from smart_open import smart_open 17 | import string 18 | from keras.preprocessing.image import load_img, img_to_array 19 | from PIL import Image 20 | import numpy as np 21 | from tensorflow.keras.applications.inception_v3 import InceptionV3 22 | from numpy.testing import assert_allclose 23 | from keras.models import load_model 24 | from keras.callbacks import ModelCheckpoint 25 | import pickle 26 | from keras.applications.resnet50 import ResNet50 27 | from keras.optimizers import Adam 28 | from keras.layers import Dense, Flatten,Input, Convolution2D, Dropout, LSTM, TimeDistributed, Embedding, Bidirectional, Activation, RepeatVector,Concatenate 29 | from keras.models import Sequential, Model 30 | from keras.utils import np_utils 31 | from keras.preprocessing import image, sequence 32 | 33 | 34 | ## Loading Text Data""" 35 | 36 | token_path = 's3://projectdata27/data/data/captions.txt' 37 | text = smart_open(token_path, 'r', encoding = 'utf-8').read() 38 | 39 | 40 | ## Preprocessing Text Data 41 | 42 | descriptions = dict() 43 | for line in text.split('\n'): 44 | # split line by white space 45 | tokens = line.split(',') 46 | 47 | # take the first token as image id, the rest as description 48 | image_id, image_desc = tokens[0], tokens[1:] 49 | 50 | # extract filename from image id 51 | image_id = image_id.split('.')[0] 52 | 53 | # convert description tokens back to string 54 | image_desc = ' '.join(image_desc) 55 | if image_id not in descriptions.keys(): 56 | descriptions[image_id] = list() 57 | descriptions[image_id].append(image_desc) 58 | 59 | print(descriptions['3534548254_7bee952a0e']) 60 | 61 | 62 | # prepare 
translation table for removing punctuation 63 | 64 | table = str.maketrans('', '', string.punctuation) 65 | for key, desc_list in descriptions.items(): 66 | for i in range(len(desc_list)): 67 | desc = desc_list[i] 68 | # tokenize 69 | desc = desc.split() 70 | # convert to lower case 71 | desc = [word.lower() for word in desc] 72 | # remove punctuation from each token 73 | desc = [w.translate(table) for w in desc] 74 | # remove hanging 's' and 'a' 75 | desc = [word for word in desc if len(word)>1] 76 | # remove tokens with numbers in them 77 | desc = [word for word in desc if word.isalpha()] 78 | # store as string 79 | desc_list[i] = ' '.join(desc) 80 | 81 | del descriptions[''] 82 | 83 | 84 | t=[] 85 | token_path = 's3://projectdata27/data/data/trainimages.txt' 86 | train = smart_open(token_path, 'r', encoding = 'utf-8').read() 87 | for line in train.split('\n'): 88 | t.append(line[:-4]) 89 | 90 | t.remove('') 91 | 92 | 93 | vocabulary = set() 94 | for key in t: 95 | [vocabulary.update(d.split()) for d in descriptions[key]] 96 | print('Original Vocabulary Size: %d' % len(vocabulary)) 97 | 98 | # Create a list of all the training captions 99 | all_captions = [] 100 | for key, val in descriptions.items(): 101 | if key in t: 102 | for cap in val: 103 | all_captions.append(cap) 104 | 105 | 106 | # Consider only words which occur at least 10 times in the corpus 107 | 108 | word_count_threshold = 10 109 | word_counts = {} 110 | nsents = 0 111 | for sent in all_captions: 112 | nsents += 1 113 | for w in sent.split(' '): 114 | word_counts[w] = word_counts.get(w, 0) + 1 115 | 116 | vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold] 117 | print('preprocessed words %d ' % len(vocab)) 118 | 119 | 120 | #find the maximum length of a description in a dataset 121 | 122 | max_length = max(len(des.split()) for des in all_captions) 123 | max_length 124 | 125 | despc = dict() 126 | for key, des_list in descriptions.items(): 127 | if key in t: 128 | despc[key] = list() 129 | for line in des_list: 130 | desc = 'startseq ' + line + ' endseq' 131 | despc[key].append(desc) 132 | 133 | 134 | # word mapping to integers 135 | 136 | ixtoword = {} 137 | wordtoix = {} 138 | 139 | ix = 1 140 | for word in vocab: 141 | wordtoix[word] = ix 142 | ixtoword[ix] = word 143 | ix += 1 144 | 145 | 146 | # convert a dictionary of clean descriptions to a list of descriptions 147 | 148 | def to_lines(descriptions): 149 | all_desc = list() 150 | for key in t: 151 | [all_desc.append(d) for d in descriptions[key]] 152 | return all_desc 153 | # calculate the length of the description with the most words 154 | def max_length(descriptions): 155 | lines = to_lines(descriptions) 156 | return max(len(d.split()) for d in lines) 157 | # determine the maximum sequence length 158 | max_length = max_length(despc) 159 | print('Max Description Length: %d' % max_length) 160 | 161 | s3 = boto3.resource('s3') 162 | 163 | bucket = s3.Bucket('projectdata27') 164 | 165 | temp = captions[10].split(",") 166 | image = bucket.Object('data/data/Images/'+temp[0]) 167 | img_data = image.get().get('Body').read() 168 | img=Image.open(io.BytesIO(img_data)) 169 | plt.imshow(img) 170 | 171 | for ix in range(len(tokens[temp[0]])): 172 | print(tokens[temp[0]][ix]) 173 | 174 | modelR = load_model('modelR.h5') 175 | 176 | train_path='s3://projectdata27/data/data/trainimages.txt' 177 | x_train = smart_open(train_path, 'r', encoding = 'utf-8').read().split("\n") 178 | 179 | 180 | x_train.remove('') 181 | 182 | 183 | def preprocessing(img_path): 
184 | k='data/data/Images/'+img_path 185 | imag = bucket.Object(k) 186 | img_data = imag.get().get('Body').read() 187 | img=Image.open(io.BytesIO(img_data)) 188 | img=img.resize((224,224)) 189 | 190 | 191 | im = img_to_array(img) 192 | im = np.expand_dims(im, axis=0) 193 | return im 194 | 195 | train_data = {} 196 | 197 | for ix in x_train: 198 | img = preprocessing(ix) 199 | train_data[ix] = modelR.predict(img).reshape(2048) 200 | 201 | train_data 202 | 203 | 204 | # load glove vectors for embedding layer 205 | 206 | vocab_size=1650 207 | embeddings_index = {} 208 | 209 | 210 | g = smart_open('s3://projectdata27/glove.6B.200d.txt', 'r', encoding = 'utf-8').read() 211 | 212 | for line in g.split("\n"): 213 | values = line.split(" ") 214 | word = values[0] 215 | indices = np.asarray(values[1: ], dtype = 'float32') 216 | embeddings_index[word] = indices 217 | 218 | len(embeddings_index) 219 | 220 | emb_dim= 200 221 | emb_matrix = np.zeros((vocab_size, emb_dim)) 222 | for word, i in wordtoix.items(): 223 | emb_vec = embeddings_index.get(word) 224 | if emb_vec is not None: 225 | emb_matrix[i] = emb_vec 226 | emb_matrix.shape 227 | 228 | X1, X2, y = list(), list(), list() 229 | for key, des_list in despc.items(): 230 | if key in t: 231 | pic = train_data[key + '.jpg'] 232 | for cap in des_list: 233 | seq = [wordtoix[word] for word in cap.split(' ') if word in wordtoix] 234 | for i in range(1, len(seq)): 235 | in_seq, out_seq = seq[:i], seq[i] 236 | in_seq = pad_sequences([in_seq], maxlen = max_length)[0] 237 | out_seq = to_categorical([out_seq], num_classes = vocab_size)[0] 238 | # store 239 | X1.append(pic) 240 | X2.append(in_seq) 241 | y.append(out_seq) 242 | 243 | X2 = np.array(X2) 244 | X1 = np.array(X1) 245 | y = np.array(y) 246 | 247 | 248 | ip1 = Input(shape = (2048, )) 249 | fe1 = Dropout(0.2)(ip1) 250 | fe2 = Dense(256, activation = 'relu')(fe1) 251 | ip2 = Input(shape = (max_length, )) 252 | se1 = Embedding(vocab_size, emb_dim, mask_zero = True)(ip2) 253 | se2 = Dropout(0.2)(se1) 254 | se3 = LSTM(256)(se2) 255 | decoder1 = add([fe2, se3]) 256 | decoder2 = Dense(512, activation = 'relu')(decoder1) 257 | outputs = Dense(vocab_size, activation = 'softmax')(decoder2) 258 | model3 = Model(inputs = [ip1, ip2], outputs = outputs) 259 | 260 | model3.layers[2].set_weights([emb_matrix]) 261 | model3.layers[2].trainable = False 262 | model3.compile(loss = 'categorical_crossentropy', optimizer = 'adam',metrics=['accuracy']) 263 | 264 | 265 | # define the checkpoint 266 | 267 | filepath = "model3.h5" 268 | checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min') 269 | callbacks_list = [checkpoint] 270 | 271 | model3.fit([X1,X2], y, epochs=50, batch_size=256, callbacks=callbacks_list) 272 | 273 | def feature_extraction(img_path): 274 | k='data/data/Images/'+img_path 275 | imag = bucket.Object(k) 276 | img_data = imag.get().get('Body').read() 277 | img=Image.open(io.BytesIO(img_data)) 278 | img=img.resize((224,224)) 279 | 280 | im = img_to_array(img) 281 | im = np.expand_dims(im, axis=0) 282 | im = modelR.predict(im) 283 | return im 284 | 285 | def final_caption(photo): 286 | in_text = 'startseq' 287 | for i in range(max_length): 288 | sequence = [wordtoix[w] for w in in_text.split() if w in wordtoix] 289 | sequence = pad_sequences([sequence], maxlen=max_length) 290 | yhat = model3.predict([photo,sequence], verbose=0) 291 | yhat = np.argmax(yhat) 292 | word = ixtoword[yhat] 293 | in_text += ' ' + word 294 | if word == 'endseq': 295 | break 296 | final = 
in_text.split() 297 | final = final[1:-1] 298 | final = ' '.join(final) 299 | return final 300 | 301 | 302 | 303 | ## ResNet50 304 | 305 | from IPython.core.display import display, HTML 306 | display(HTML("""ResNet50 Architecture""")) 307 | 308 | modelR = ResNet50(include_top=False,weights='imagenet',input_shape=(224,224,3),pooling='avg') 309 | modelR.summary() 310 | 311 | R=load_model('modelR.h5') 312 | 313 | modelR = load_model('modelR.h5') 314 | 315 | 316 | 317 | ## Progressive Loading 318 | 319 | def data_generator(descriptions, photos, wordtoix, max_length, num_photos_per_batch): 320 | X1, X2, y = list(), list(), list() 321 | n=0 322 | # loop for ever over images 323 | while 1: 324 | for key, desc_list in descriptions.items(): 325 | n+=1 326 | # retrieve the photo feature 327 | photo = photos[key+'.jpg'] 328 | for desc in desc_list: 329 | # encode the sequence 330 | seq = [wordtoix[word] for word in desc.split(' ') if word in wordtoix] 331 | # split one sequence into multiple X, y pairs 332 | for i in range(1, len(seq)): 333 | # split into input and output pair 334 | in_seq, out_seq = seq[:i], seq[i] 335 | # pad input sequence 336 | in_seq = pad_sequences([in_seq], maxlen=max_length)[0] 337 | # encode output sequence 338 | out_seq = to_categorical([out_seq], num_classes=vocab_size)[0] 339 | # store 340 | X1.append(photo) 341 | X2.append(in_seq) 342 | y.append(out_seq) 343 | # yield the batch data 344 | if n==num_photos_per_batch: 345 | yield (np.array(X1), np.array(X2)), np.array(y) 346 | X1, X2, y = list(), list(), list() 347 | n=0 348 | 349 | embedding_dim=200 350 | inputs1 = Input(shape=(2048,)) 351 | fe1 = Dropout(0.5)(inputs1) 352 | fe2 = Dense(256, activation='relu')(fe1) 353 | inputs2 = Input(shape=(max_length,)) 354 | se1 = Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs2) 355 | se2 = Dropout(0.5)(se1) 356 | se3 = LSTM(256)(se2) 357 | decoder1 = add([fe2, se3]) 358 | decoder2 = Dense(512, activation='relu')(decoder1) 359 | outputs = Dense(vocab_size, activation='softmax')(decoder2) 360 | model2 = Model(inputs=[inputs1, inputs2], outputs=outputs) 361 | 362 | model2.layers[2].set_weights([emb_matrix]) 363 | model2.layers[2].trainable = False 364 | 365 | model2.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy']) 366 | 367 | epochs = 10 368 | number_pics_per_bath = 3 369 | steps = len(despc)//number_pics_per_bath 370 | 371 | for i in range(epochs): 372 | 373 | generator = data_generator(despc, train_data, wordtoix, max_length, number_pics_per_bath) 374 | 375 | 376 | model2.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1) 377 | model2.save('model2.h5') 378 | 379 | 380 | -------------------------------------------------------------------------------- /ImageCaptioning_Model.py: -------------------------------------------------------------------------------- 1 | ## Importing Required Libraries 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import matplotlib.pyplot as plt 6 | import os 7 | import keras 8 | import tensorflow as tf 9 | from keras.preprocessing.sequence import pad_sequences 10 | from keras.preprocessing.text import Tokenizer 11 | from keras.layers import concatenate, BatchNormalization, Input 12 | from keras.layers.merge import add 13 | from keras.utils import to_categorical, plot_model 14 | import io 15 | import boto3 16 | from smart_open import smart_open 17 | import string 18 | from keras.preprocessing.image import load_img, img_to_array 19 | from PIL import Image 20 | import numpy as np 21 | from 
numpy.testing import assert_allclose 22 | from keras.models import load_model 23 | from keras.callbacks import ModelCheckpoint 24 | import pickle 25 | from keras.applications.resnet50 import ResNet50 26 | from keras.optimizers import Adam 27 | from keras.layers import Dense, Flatten,Input, Convolution2D, Dropout, LSTM, TimeDistributed, Embedding, Bidirectional, Activation, RepeatVector,Concatenate 28 | from keras.models import Sequential, Model 29 | from keras.utils import np_utils 30 | from keras.preprocessing import image, sequence 31 | from gtts import gTTS 32 | import IPython.display as ipd 33 | 34 | """## Loading Text Data""" 35 | 36 | token_path = 's3://projectdata27/data/data/captions.txt' 37 | text = smart_open(token_path, 'r', encoding = 'utf-8').read() 38 | 39 | """## Preprocessing Text Data""" 40 | 41 | descriptions = dict() 42 | for line in text.split('\n'): 43 | # split line by white space 44 | tokens = line.split(',') 45 | 46 | # take the first token as image id, the rest as description 47 | image_id, image_desc = tokens[0], tokens[1:] 48 | 49 | # extract filename from image id 50 | image_id = image_id.split('.')[0] 51 | 52 | # convert description tokens back to string 53 | image_desc = ' '.join(image_desc) 54 | if image_id not in descriptions.keys(): 55 | descriptions[image_id] = list() 56 | descriptions[image_id].append(image_desc) 57 | 58 | print(descriptions['3534548254_7bee952a0e']) 59 | 60 | # prepare translation table for removing punctuation 61 | table = str.maketrans('', '', string.punctuation) 62 | for key, desc_list in descriptions.items(): 63 | for i in range(len(desc_list)): 64 | desc = desc_list[i] 65 | # tokenize 66 | desc = desc.split() 67 | # convert to lower case 68 | desc = [word.lower() for word in desc] 69 | # remove punctuation from each token 70 | desc = [w.translate(table) for w in desc] 71 | # remove hanging 's' and 'a' 72 | desc = [word for word in desc if len(word)>1] 73 | # remove tokens with numbers in them 74 | desc = [word for word in desc if word.isalpha()] 75 | # store as string 76 | desc_list[i] = ' '.join(desc) 77 | 78 | del descriptions[''] 79 | 80 | t=[] 81 | token_path = 's3://projectdata27/data/data/trainimages.txt' 82 | train = smart_open(token_path, 'r', encoding = 'utf-8').read() 83 | for line in train.split('\n'): 84 | t.append(line[:-4]) 85 | 86 | t.remove('') 87 | 88 | vocabulary = set() 89 | for key in t: 90 | [vocabulary.update(d.split()) for d in descriptions[key]] 91 | print('Original Vocabulary Size: %d' % len(vocabulary)) 92 | 93 | # Create a list of all the training captions 94 | all_captions = [] 95 | for key, val in descriptions.items(): 96 | if key in t: 97 | for cap in val: 98 | all_captions.append(cap) 99 | 100 | 101 | # Consider only words which occur at least 10 times in the corpus 102 | word_count_threshold = 10 103 | word_counts = {} 104 | nsents = 0 105 | for sent in all_captions: 106 | nsents += 1 107 | for w in sent.split(' '): 108 | word_counts[w] = word_counts.get(w, 0) + 1 109 | 110 | vocab = [w for w in word_counts if word_counts[w] >= word_count_threshold] 111 | print('preprocessed words %d ' % len(vocab)) 112 | 113 | #find the maximum length of a description in a dataset 114 | max_length = max(len(des.split()) for des in all_captions) 115 | max_length 116 | 117 | despc = dict() 118 | for key, des_list in descriptions.items(): 119 | if key in t: 120 | despc[key] = list() 121 | for line in des_list: 122 | desc = 'startseq ' + line + ' endseq' 123 | despc[key].append(desc) 124 | 125 | 126 | # word mapping to 
integers
127 | ixtoword = {}
128 | wordtoix = {}
129 | 
130 | ix = 1
131 | for word in vocab:
132 |     wordtoix[word] = ix
133 |     ixtoword[ix] = word
134 |     ix += 1
135 | 
136 | # convert a dictionary of clean descriptions to a list of descriptions
137 | def to_lines(descriptions):
138 |     all_desc = list()
139 |     for key in t:
140 |         [all_desc.append(d) for d in descriptions[key]]
141 |     return all_desc
142 | # calculate the length of the description with the most words
143 | def max_length(descriptions):
144 |     lines = to_lines(descriptions)
145 |     return max(len(d.split()) for d in lines)
146 | # determine the maximum sequence length
147 | max_length = max_length(despc)
148 | print('Max Description Length: %d' % max_length)
149 | 
150 | bucket = 'projectdata27'
151 | key = 'project.pkl'
152 | pickle_byte_obj = pickle.dumps([wordtoix, ixtoword, max_length])
153 | s3_resource = boto3.resource('s3')
154 | s3_resource.Object(bucket, key).put(Body=pickle_byte_obj)
155 | 
156 | wordtoix, ixtoword, max_length = pickle.loads(s3_resource.Bucket("projectdata27").Object("project.pkl").get()['Body'].read())
157 | 
158 | s3 = boto3.resource('s3')
159 | 
160 | bucket = s3.Bucket('projectdata27')
161 | 
162 | temp = descriptions['1000268201_693b08cb0e']
163 | image = bucket.Object('data/data/Images/' + '1000268201_693b08cb0e.jpg')
164 | img_data = image.get().get('Body').read()
165 | img = Image.open(io.BytesIO(img_data))
166 | plt.imshow(img)
167 | 
168 | for ix in temp:
169 |     print(ix)
170 | 
171 | caps = despc['1000268201_693b08cb0e']  # use a separate name so the list of training image ids in `t` is not overwritten
172 | for ix in caps:
173 |     print(ix)
174 | 
175 | train_path = 's3://projectdata27/data/data/trainimages.txt'
176 | x_train = smart_open(train_path, 'r', encoding = 'utf-8').read().split("\n")
177 | 
178 | x_train[:5]
179 | 
180 | x_train.remove('')
181 | 
182 | len(x_train)
183 | 
184 | modelR = ResNet50(include_top=False, weights='imagenet', input_shape=(224,224,3), pooling='avg')
185 | modelR.summary()
186 | modelR.save('modelR.h5')  # the feature extractor must be saved locally before it can be uploaded to S3
187 | client = boto3.client('s3')
188 | client.upload_file(Filename='modelR.h5',
189 |                    Bucket="projectdata27",
190 |                    Key='modelR.h5')
191 | 
192 | client.download_file('projectdata27',
193 |                      'modelR.h5',
194 |                      'modelR.h5')
195 | # returns a compiled model
196 | # identical to the previous one
197 | modelR = load_model('modelR.h5')
198 | 
199 | modelR.summary()
200 | 
201 | def preprocessing(img_path):
202 |     k = 'data/data/Images/' + img_path
203 |     imag = bucket.Object(k)
204 |     img_data = imag.get().get('Body').read()
205 |     img = Image.open(io.BytesIO(img_data))
206 |     img = img.resize((224, 224))
207 | 
208 | 
209 |     im = img_to_array(img)
210 |     im = np.expand_dims(im, axis=0)
211 |     return im
212 | 
213 | train_data = {}
214 | 
215 | for ix in x_train:
216 |     img = preprocessing(ix)
217 |     train_data[ix] = modelR.predict(img).reshape(2048)
218 | 
219 | train_data
220 | 
221 | # load glove vectors for embedding layer
222 | vocab_size = 1650
223 | embeddings_index = {}
224 | 
225 | 
226 | g = smart_open('s3://projectdata27/glove.6B.200d.txt', 'r', encoding = 'utf-8').read()
227 | 
228 | for line in g.split("\n"):
229 |     values = line.split(" ")
230 |     word = values[0]
231 |     indices = np.asarray(values[1:], dtype = 'float32')
232 |     embeddings_index[word] = indices
233 | 
234 | 
235 | emb_dim = 200
236 | emb_matrix = np.zeros((vocab_size, emb_dim))
237 | for word, i in wordtoix.items():
238 |     emb_vec = embeddings_index.get(word)
239 |     if emb_vec is not None:
240 |         emb_matrix[i] = emb_vec
241 | emb_matrix.shape
242 | 
243 | X1, X2, y = list(), list(), list()
244 | for key, des_list in despc.items():
245 |     if key in t:
246 |         pic = train_data[key 
+ '.jpg'] 247 | for cap in des_list: 248 | seq = [wordtoix[word] for word in cap.split(' ') if word in wordtoix] 249 | for i in range(1, len(seq)): 250 | in_seq, out_seq = seq[:i], seq[i] 251 | in_seq = pad_sequences([in_seq], maxlen = max_length)[0] 252 | out_seq = to_categorical([out_seq], num_classes = vocab_size)[0] 253 | # store 254 | X1.append(pic) 255 | X2.append(in_seq) 256 | y.append(out_seq) 257 | 258 | X2 = np.array(X2) 259 | X1 = np.array(X1) 260 | y = np.array(y) 261 | 262 | X1.shape 263 | 264 | X2.shape 265 | 266 | y.shape 267 | 268 | 232328*(2048+(34*200)) 269 | 270 | ip1 = Input(shape = (2048, )) 271 | fe1 = Dropout(0.2)(ip1) 272 | fe2 = Dense(256, activation = 'relu')(fe1) 273 | ip2 = Input(shape = (max_length, )) 274 | se1 = Embedding(vocab_size, emb_dim, mask_zero = True)(ip2) 275 | se2 = Dropout(0.2)(se1) 276 | se3 = LSTM(256)(se2) 277 | decoder1 = add([fe2, se3]) 278 | decoder2 = Dense(512, activation = 'relu')(decoder1) 279 | outputs = Dense(vocab_size, activation = 'softmax')(decoder2) 280 | model3 = Model(inputs = [ip1, ip2], outputs = outputs) 281 | 282 | model3.layers[2].set_weights([emb_matrix]) 283 | model3.layers[2].trainable = False 284 | model3.compile(loss = 'categorical_crossentropy', optimizer = 'adam',metrics=['accuracy']) 285 | 286 | 287 | # define the checkpoint 288 | filepath = "model4.h5" 289 | checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min') 290 | callbacks_list = [checkpoint] 291 | 292 | model3.fit([X1,X2], y, epochs=60, batch_size=256, callbacks=callbacks_list) 293 | 294 | client = boto3.client('s3') 295 | client.upload_file(Filename='model4.h5', 296 | Bucket="projectdata27", 297 | Key='model4.h5') 298 | 299 | client.download_file('projectdata27', 300 | 'model4.h5', 301 | 'model4.h5') 302 | # returns a compiled model 303 | # identical to the previous one 304 | model3 = load_model('model4.h5') 305 | 306 | model3.summary() 307 | 308 | 309 | ## Progresive loading 310 | 311 | def data_generator(descriptions, photos, wordtoix, max_length, num_photos_per_batch): 312 | X1, X2, y = list(), list(), list() 313 | n=0 314 | # loop for ever over images 315 | while 1: 316 | for key, desc_list in descriptions.items(): 317 | n+=1 318 | # retrieve the photo feature 319 | photo = photos[key+'.jpg'] 320 | for desc in desc_list: 321 | # encode the sequence 322 | seq = [wordtoix[word] for word in desc.split(' ') if word in wordtoix] 323 | # split one sequence into multiple X, y pairs 324 | for i in range(1, len(seq)): 325 | # split into input and output pair 326 | in_seq, out_seq = seq[:i], seq[i] 327 | # pad input sequence 328 | in_seq = pad_sequences([in_seq], maxlen=max_length)[0] 329 | # encode output sequence 330 | out_seq = to_categorical([out_seq], num_classes=vocab_size)[0] 331 | # store 332 | X1.append(photo) 333 | X2.append(in_seq) 334 | y.append(out_seq) 335 | # yield the batch data 336 | if n==num_photos_per_batch: 337 | yield (np.array(X1), np.array(X2)), np.array(y) 338 | X1, X2, y = list(), list(), list() 339 | n=0 340 | 341 | embedding_dim=200 342 | inputs1 = Input(shape=(2048,)) 343 | fe1 = Dropout(0.5)(inputs1) 344 | fe2 = Dense(256, activation='relu')(fe1) 345 | inputs2 = Input(shape=(max_length,)) 346 | se1 = Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs2) 347 | se2 = Dropout(0.5)(se1) 348 | se3 = LSTM(256)(se2) 349 | decoder1 = add([fe2, se3]) 350 | decoder2 = Dense(512, activation='relu')(decoder1) 351 | outputs = Dense(vocab_size, activation='softmax')(decoder2) 352 | model2 = 
Model(inputs=[inputs1, inputs2], outputs=outputs) 353 | 354 | model2.layers[2].set_weights([emb_matrix]) 355 | model2.layers[2].trainable = False 356 | 357 | model2.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy']) 358 | 359 | epochs = 10 360 | number_pics_per_bath = 3 361 | steps = len(despc)//number_pics_per_bath 362 | 363 | for i in range(epochs): 364 | 365 | generator = data_generator(despc, train_data, wordtoix, max_length, number_pics_per_bath) 366 | 367 | 368 | model2.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1) 369 | model2.save('model2.h5') 370 | 371 | epochs = 10 372 | number_pics_per_bath = 3 373 | steps = len(despc)//number_pics_per_bath 374 | 375 | for i in range(epochs): 376 | 377 | generator = data_generator(despc, train_data, wordtoix, max_length, number_pics_per_bath) 378 | 379 | 380 | model2.fit_generator(generator, epochs=1, steps_per_epoch=steps, verbose=1) 381 | model2.save('model2.h5') 382 | 383 | 384 | 385 | 386 | 387 | --------------------------------------------------------------------------------