├── README.md ├── img ├── dynamic.jpg └── framework.jpg ├── model ├── DataUtil.py ├── InitUtil.py ├── ModelUtil.py ├── SEModelUtil.py └── __init__.py ├── mqa_video+subtitle+question.py ├── mqa_video+subtitle+update+question.py ├── mqa_video+subtitle.py ├── mqa_video+subtitlel+update.py └── train_split.json /README.md: -------------------------------------------------------------------------------- 1 | # Layered Memory Network (LMN) 2 | The LMN model ranked 1st place on [MovieQA Video+Subtt-based Answering Challenge 2017](http://movieqa.cs.toronto.edu/workshops/iccv2017/) ([The Joint Video and Language Understanding Workshop, ICCV 2017](https://sites.google.com/site/describingmovies/workshop-at-iccv-17)). 3 | 4 | - The flowchart of Layered Memory Network (LMN). 5 | 6 | ![LMN](https://raw.githubusercontent.com/bowong/Layered-Memory-Network/master/img/framework.jpg) 7 | 8 | - The framework of Dynamic Subtitle Memory module with update mechanism. 9 | 10 | ![DSM](https://raw.githubusercontent.com/bowong/Layered-Memory-Network/master/img/dynamic.jpg) 11 | 12 | 13 | 14 | ## Train 15 | 16 | 17 | ``` 18 | python mqa_video+subtitle+update+question.py 19 | ``` 20 | 21 | ## Paper 22 | 23 | Bo Wang, Youjiang Xu, Yahong Han, Richang Hong. ["Movie Question Answering: Remembering the Textual Cues for Layered Visual Contents."](https://arxiv.org/abs/1804.09412) AAAI, 2018. [[Paper]](https://arxiv.org/abs/1804.09412) 24 | ``` 25 | @inproceedings{Wang2018, 26 | author = {Bo Wang and Youjiang Xu and Yahong Han and Richang Hong}, 27 | title = {Movie Question Answering: Remembering the Textual Cues for Layered Visual Contents}, 28 | booktitle = {AAAI}, 29 | year = {2018}, 30 | } 31 | ``` 32 | 33 | -------------------------------------------------------------------------------- /img/dynamic.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bowong/Layered-Memory-Network/86364077c40de7674088248b81ef805d7bfa7f4d/img/dynamic.jpg -------------------------------------------------------------------------------- /img/framework.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bowong/Layered-Memory-Network/86364077c40de7674088248b81ef805d7bfa7f4d/img/framework.jpg -------------------------------------------------------------------------------- /model/DataUtil.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | #import MovieQA_benchmark as MovieQA 4 | import re 5 | import h5py 6 | import tensorflow as tf 7 | import math 8 | from nltk.stem.snowball import SnowballStemmer 9 | from collections import Counter 10 | 11 | re_alphanumeric = re.compile('[^a-z0-9 -]+') 12 | re_multispace = re.compile(' +') 13 | snowball = SnowballStemmer('english') 14 | 15 | def preprocess_sentence(line): 16 | '''strip all punctuation, keep only alphanumerics 17 | ''' 18 | line = re_alphanumeric.sub('', line) 19 | line = re_multispace.sub(' ', line) 20 | return line 21 | 22 | def normalize_documents(stories, v2i, max_words=40): 23 | """Normalize all stories in the dictionary, get list of words per sentence. 
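    Each sentence is lower-cased, stripped of punctuation, truncated to
    `max_words` tokens, and then encoded as one row of an int32 matrix of word
    indices (index 0 is the padding symbol; out-of-vocabulary words map to
    v2i['UNK']).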
24 | """ 25 | for movie in stories.keys(): 26 | for s, sentence in enumerate(stories[movie]): 27 | sentence = sentence.lower() 28 | sentence = preprocess_sentence(sentence.strip()) 29 | sentence = sentence.split(' ')[:max_words] 30 | stories[movie][s] = sentence 31 | 32 | max_sentences = max([len(story) for story in stories.values()]) 33 | max_words = max([len(sent) for story in stories.values() for sent in story]) 34 | 35 | processed_stories = {} 36 | for imdb_key, story in stories.items(): 37 | processed_stories[imdb_key] = np.zeros((max_sentences,max_words), dtype='int32') 38 | for jj, sentence in enumerate(story): 39 | for kk, word in enumerate(sentence): 40 | if v2i.has_key(word): 41 | processed_stories[imdb_key][jj, kk] = v2i[word] 42 | else: 43 | processed_stories[imdb_key][jj, kk] = v2i['UNK'] 44 | 45 | return processed_stories,max_sentences,max_words 46 | 47 | def preprocess_stories(stories,max_words=40): 48 | for movie in stories.keys(): 49 | for s, sentence in enumerate(stories[movie]): 50 | sentence = sentence.lower() 51 | sentence = preprocess_sentence(sentence) 52 | sentence = sentence.split(' ')[:max_words] 53 | stories[movie][s] = sentence 54 | return stories 55 | 56 | def create_vocabulary(QAs, stories, word_thresh=2, v2i={'': 0, 'UNK':1}): 57 | ''' 58 | v2i = {'': 0, 'UNK':1} # vocabulary to index 59 | ''' 60 | print 'Create vocabulary...' 61 | 62 | # Get all story words 63 | all_words = [word for story in stories for sent in story for word in sent] 64 | print('number of words: %d' %len(all_words)) 65 | 66 | 67 | QA_words = {} 68 | for QA in QAs: 69 | temp = {} 70 | q_w = preprocess_sentence(QA.question.strip().lower()).split(' ') 71 | a_w = [preprocess_sentence(answer.strip().lower()).split(' ') for answer in QA.answers] 72 | temp['q_w'] = q_w 73 | temp['a_w'] = a_w 74 | temp['qid'] = QA.qid 75 | temp['imdb_key'] = QA.imdb_key 76 | temp['question'] = QA.question 77 | temp['answers'] = QA.answers 78 | temp['correct_index'] = QA.correct_index 79 | # temp['plot_alignment'] = QA.plot_alignment 80 | temp['video_clips'] = QA.video_clips 81 | 82 | 83 | QA_words[QA.qid]=temp 84 | 85 | all_words.extend(q_w) 86 | for answer in a_w: 87 | all_words.extend(answer) 88 | 89 | 90 | # threshold vocabulary, at least N instances of every word 91 | vocab = Counter(all_words) 92 | vocab = [k for k in vocab.keys() if vocab[k] >= word_thresh] 93 | 94 | # create vocabulary index 95 | for w in vocab: 96 | if w not in v2i.keys(): 97 | v2i[w] = len(v2i) 98 | 99 | print('Created a vocabulary of %d words. Threshold removed %.2f %% words'\ 100 | %(len(v2i), 100*(1. * len(set(all_words))-len(v2i))/len(all_words))) 101 | 102 | return QA_words, v2i 103 | 104 | def create_vocabulary_word2vec(QAs, stories, word_thresh=2, w2v_vocab=None, v2i={'': 0, 'UNK':1}): 105 | ''' 106 | v2i = {'': 0, 'UNK':1} # vocabulary to index 107 | ''' 108 | print 'Create vocabulary...' 
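    # Vocabulary construction mirrors create_vocabulary() above: a word is kept
    # only if it occurs at least `word_thresh` times across stories, questions
    # and answers, and (when w2v_vocab is given) only if word2vec has a vector
    # for it.  Toy illustration (hypothetical data, not from MovieQA):
    #   Counter(['the', 'the', 'dog']) with word_thresh=2 keeps only 'the',
    #   so v2i grows to {'': 0, 'UNK': 1, 'the': 2} and 'dog' later maps to 'UNK'.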
109 | 110 | if w2v_vocab is not None: 111 | print "Adding words based on word2vec" 112 | else: 113 | print "Adding all words" 114 | 115 | # Get all story words 116 | all_words = [word for story in stories for sent in story for word in sent] 117 | print('number of total words: %d' %len(all_words)) 118 | 119 | 120 | for QA in QAs: 121 | q_w = preprocess_sentence(QA.question.strip().lower()).split(' ') 122 | a_w = [preprocess_sentence(answer.strip().lower()).split(' ') for answer in QA.answers] 123 | 124 | all_words.extend(q_w) 125 | for answer in a_w: 126 | all_words.extend(answer) 127 | 128 | 129 | # threshold vocabulary, at least N instances of every word 130 | vocab = Counter(all_words) 131 | vocab = [k for k in vocab.keys() if vocab[k] >= word_thresh] 132 | 133 | # create vocabulary index 134 | for w in vocab: 135 | if w not in v2i.keys(): 136 | if w2v_vocab is None: 137 | # if word2vec is not provided, just dump the word to vocab 138 | v2i[w] = len(v2i) 139 | elif w2v_vocab is not None and w in w2v_vocab: 140 | # check if word in vocab, or else ignore 141 | v2i[w] = len(v2i) 142 | 143 | print('Created a vocabulary of %d words. Threshold removed %.2f %% words'\ 144 | %(len(v2i), 100*(1. * len(set(all_words))-len(v2i))/len(all_words))) 145 | 146 | return v2i 147 | 148 | def data_in_matrix_form(stories, v2i,max_sentences=None,max_words=None): 149 | """Make the QA data set compatible for memory networks by 150 | converting to matrix format (index into LUT vocabulary). 151 | """ 152 | 153 | def add_word_or_UNK(): 154 | if v2i.has_key(word): 155 | return v2i[word] 156 | else: 157 | return v2i['UNK'] 158 | 159 | # Encode stories 160 | if max_sentences is None: 161 | max_sentences = max([len(story) for story in stories.values()]) 162 | if max_words is None: 163 | max_words = max([len(sent) for story in stories.values() for sent in story]) 164 | 165 | storyM = {} 166 | for imdb_key, story in stories.iteritems(): 167 | storyM[imdb_key] = np.zeros((max_sentences, max_words), dtype='int32') 168 | for jj, sentence in enumerate(story): 169 | for kk, word in enumerate(sentence): 170 | storyM[imdb_key][jj, kk] = add_word_or_UNK() 171 | 172 | print "#stories:", len(storyM) 173 | print "storyM shape (movie 1):", storyM.values()[0].shape 174 | 175 | 176 | return storyM,max_sentences,max_words 177 | 178 | 179 | 180 | def S2I(sen, v2i, fixed_len): 181 | ''' 182 | len_qa: fixed length of question or answer 183 | ''' 184 | if type(sen)!=list: 185 | sen = preprocess_sentence(sen.strip().lower()).split(' ') 186 | res = [] 187 | for idx, w in enumerate(sen): 188 | if idx right/wrong 206 | 207 | return: questions, answers, ground_truth 208 | both of them are numeric indexed 209 | ground_truth is one hot vector 210 | ''' 211 | 212 | batch_size = len(batch_qas_list) 213 | questions = np.zeros((batch_size,nql),dtype='int32') 214 | answers = np.zeros((batch_size,numOfChoices,nqa),dtype='int32') 215 | ground_truth = np.zeros((batch_size,numOfChoices),dtype='int32') 216 | 217 | for idx, qa in enumerate(batch_qas_list): 218 | # set question 219 | qid = qa.qid 220 | questions[idx][:]=S2I(qa.question, v2i,nql) 221 | 222 | 223 | # set anwsers 224 | if numOfChoices==2: 225 | ground_answer_pos = np.random.randint(0,numOfChoices) 226 | ground_truth[idx][ground_answer_pos]=1 227 | 228 | # set correct answer 229 | correct_index = int(qa.correct_index) 230 | answers[idx][ground_answer_pos][:] = S2I(qa.answers[correct_index], v2i, nqa) 231 | 232 | 233 | 234 | wrong_index = np.random.randint(0,5) 235 | 
while(wrong_index==correct_index): 236 | wrong_index = np.random.randint(0,5) 237 | 238 | # set wrong answer 239 | answers[idx][1-ground_answer_pos][:]=S2I(qa.answers[wrong_index], v2i, nqa) 240 | elif numOfChoices==5: 241 | 242 | # set correct answer 243 | correct_index = int(qa.correct_index) 244 | ground_truth[idx][correct_index]=1 245 | for ans_idx, ans in enumerate(qa.answers): 246 | answers[idx][ans_idx][:]=S2I(ans, v2i, nqa) 247 | 248 | else: 249 | raise ValueError('Invalid numOfChoices: ' + numOfChoices) 250 | 251 | return questions,answers,ground_truth 252 | 253 | def getBatchTestIndexedQAs(batch_qas_list,v2i, nql=16, nqa=10, numOfChoices=2): 254 | ''' 255 | batch_qas_list: list of qas 256 | v2i: vocabulary to index 257 | nql: length of question 258 | nqa: length of answer 259 | numOfChoices: number of Choices utilized per QA, default set to 2 ==> right/wrong 260 | 261 | return: questions, answers, ground_truth 262 | both of them are numeric indexed 263 | ground_truth is one hot vector 264 | ''' 265 | 266 | batch_size = len(batch_qas_list) 267 | questions = np.zeros((batch_size,nql),dtype='int32') 268 | answers = np.zeros((batch_size,numOfChoices,nqa),dtype='int32') 269 | 270 | for idx, qa in enumerate(batch_qas_list): 271 | # set question 272 | qid = qa.qid 273 | questions[idx][:]=S2I(qa.question, v2i,nql) 274 | 275 | # set anwsers 276 | for ans_idx, ans in enumerate(qa.answers): 277 | answers[idx][ans_idx][:]=S2I(ans, v2i, nqa) 278 | 279 | 280 | return questions,answers 281 | 282 | def getBatchVideoFeature(batch_qas_list, hf, feature_shape): 283 | ''' 284 | video-based QA 285 | there are video clips in all QA pairs. 286 | ''' 287 | 288 | batch_size = len(batch_qas_list) 289 | input_video = np.zeros((batch_size,)+tuple(feature_shape),dtype='float32') 290 | 291 | timesteps = feature_shape[0] 292 | 293 | for idx, qa in enumerate(batch_qas_list): 294 | qid = qa.qid 295 | video_clips = qa.video_clips 296 | imdb_key = qa.imdb_key 297 | 298 | 299 | 300 | clips_features = [] 301 | if len(video_clips) != 0: 302 | for clip in video_clips: 303 | dataset = imdb_key+'/'+clip 304 | if imdb_key in hf.keys() and clip in hf[imdb_key].keys(): 305 | clips_features.extend(hf[dataset][:]) # clips_features.shape 306 | 307 | 308 | if(len(clips_features)<=0): 309 | # if there are not vlid features 310 | for clip in hf[imdb_key].keys(): 311 | dataset = imdb_key+'/'+clip 312 | clips_features.extend(hf[dataset][:]) # clips_features.shape 313 | 314 | 315 | if(len(clips_features)>=timesteps): 316 | interval = int(math.floor((len(clips_features)-1)/(timesteps-1))) 317 | input_video[idx] = clips_features[0::interval][0:timesteps] 318 | else: 319 | input_video[idx][:len(clips_features)] = clips_features 320 | for last_idx in xrange(len(clips_features),timesteps): 321 | input_video[idx][last_idx]=clips_features[-1] 322 | 323 | 324 | # if qid not in hf_out.keys(): 325 | # dset = hf_out.create_dataset(qid, feature_shape, dtype='f') 326 | # dset[:] = input_video[idx] 327 | 328 | 329 | return input_video 330 | 331 | def getBatchVideoFeatureFromQid(batch_qas_list, hf, feature_shape): 332 | ''' 333 | video-based QA 334 | there are video clips in all QA pairs. 
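    This variant assumes the visual features for every question were
    pre-extracted and stored in the HDF5 file under the question id, so each
    batch entry is read directly as hf[qid][:] instead of being re-sampled
    from the individual clips as in getBatchVideoFeature above.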
335 | ''' 336 | 337 | batch_size = len(batch_qas_list) 338 | input_video = np.zeros((batch_size,)+tuple(feature_shape),dtype='float32') 339 | 340 | timesteps = feature_shape[0] 341 | for idx, qa in enumerate(batch_qas_list): 342 | qid = qa.qid 343 | input_video[idx] = hf[qid][:] 344 | return input_video 345 | 346 | rng = np.random 347 | rng.seed(1234) 348 | def getw2v(batch_qa_list,w2v,v2i,d_w2v = 300): 349 | 350 | 351 | voc_size = len(v2i) 352 | 353 | 354 | pca_mat = None 355 | #print "Initialize LUTs as word2vec and use linear projection layer" 356 | 357 | 358 | LUT = np.zeros((voc_size, d_w2v), dtype='float32') 359 | found_words = 0 360 | 361 | for w, v in v2i.iteritems(): 362 | if w in w2v.vocab: 363 | LUT[v] = w2v.get_vector(w) 364 | found_words +=1 365 | else: 366 | LUT[v] = rng.randn(d_w2v) 367 | LUT[v] = LUT[v] / (np.linalg.norm(LUT[v]) + 1e-6) 368 | 369 | #print "Found %d / %d words" %(found_words, len(v2i)) 370 | 371 | 372 | # word 0 is blanked out, word 1 is 'UNK' 373 | LUT[0] = np.zeros((d_w2v)) 374 | 375 | # if linear projection layer is not the same shape as LUT, then initialize with PCA 376 | 377 | 378 | # setup LUT! 379 | T_w2v = tf.constant(LUT.astype('float32')) 380 | 381 | 382 | word_shape = (26033, 300) 383 | 384 | w2v_new = np.zeros((batch_qa_list,)+word_shape,dtype='int32') 385 | 386 | 387 | for idx in xrange(batch_qa_list): 388 | 389 | 390 | 391 | 392 | w2v_new[idx][:] = LUT[:] 393 | 394 | return w2v_new 395 | 396 | #w2v_new = tf.tile(w2v_new, [input_shape[0],1,1]) 397 | 398 | 399 | def getBatchIndexedStories(batch_qa_list,stories,v2i,story_shape): 400 | batch_size = len(batch_qa_list) 401 | input_stories = np.zeros((batch_size,)+story_shape,dtype='int32') 402 | 403 | for idx, qa in enumerate(batch_qa_list): 404 | imdb_key = qa.imdb_key 405 | interval = int(math.floor((len(stories[imdb_key])-1)/(story_shape[0]-1))) 406 | 407 | if interval != 0: 408 | for k in xrange(story_shape[0]): 409 | # if(k=timesteps): 451 | interval = int(math.floor((len(clips_features)-1)/(timesteps-1))) 452 | input_video[idx] = clips_features[0::interval][0:timesteps] 453 | else: 454 | input_video[idx][:len(clips_features)] = clips_features 455 | for last_idx in xrange(len(clips_features),timesteps): 456 | input_video[idx][last_idx]=clips_features[-1] 457 | 458 | false_clips_features = np.random.permutation(false_clips_features) 459 | 460 | false_frame_pos = np.random.permutation(range(0,timesteps))[:false_frame_num] 461 | for _,ffp in enumerate(false_frame_pos): 462 | input_video[idx][ffp] = false_clips_features[ffp] 463 | rfr_lables[idx,ffp,0] = 1 464 | rfr_lables[idx,ffp,1] = 0 465 | 466 | return input_video, rfr_lables 467 | 468 | 469 | def split_stories(full_stories,train_movies,val_movies): 470 | train_stories = {} 471 | val_stories = {} 472 | for tm in train_movies: 473 | train_stories[tm] = full_stories[tm] 474 | for vm in val_movies: 475 | val_stories[vm] = full_stories[vm] 476 | 477 | print('num of train stories:',len(train_stories)) 478 | print('num of val stories:',len(val_stories)) 479 | return train_stories,val_stories 480 | 481 | def getBatchIndexedQAs_return(batch_qas_list,v2i, nql=16, nqa=10, numOfChoices=2): 482 | ''' 483 | batch_qas_list: list of qas 484 | QA_words: all the QAs, contains question words and answer words 485 | v2i: vocabulary to index 486 | nql: length of question 487 | nqa: length of answer 488 | numOfChoices: number of Choices utilized per QA, default set to 2 ==> right/wrong 489 | 490 | return: questions, answers, ground_truth 491 | both of them are 
numeric indexed 492 | ground_truth is one hot vector 493 | ''' 494 | 495 | batch_size = len(batch_qas_list) 496 | questions = np.zeros((batch_size,nql),dtype='int32') 497 | answers = np.zeros((batch_size,numOfChoices,nqa),dtype='int32') 498 | ground_truth = np.zeros((batch_size,numOfChoices),dtype='int32') 499 | 500 | for idx, qa in enumerate(batch_qas_list): 501 | # set question 502 | 503 | questions[idx][:]=qa.question 504 | 505 | if numOfChoices==5: 506 | 507 | # set correct answer 508 | #correct_index = qa.correct_index 509 | ground_truth[idx]=qa.correct_index 510 | for ans_idx, ans in enumerate(qa.answers): 511 | answers[idx][ans_idx][:]=ans 512 | 513 | 514 | else: 515 | raise ValueError('Invalid numOfChoices: ' + numOfChoices) 516 | 517 | return questions,answers,ground_truth 518 | 519 | def getTestBatchIndexedQAs_return(batch_qas_list,v2i, nql=16, nqa=10, numOfChoices=2): 520 | 521 | batch_size = len(batch_qas_list) 522 | questions = np.zeros((batch_size,nql),dtype='int32') 523 | answers = np.zeros((batch_size,numOfChoices,nqa),dtype='int32') 524 | 525 | for idx, qa in enumerate(batch_qas_list): 526 | 527 | questions[idx][:]=qa.question 528 | 529 | if numOfChoices==5: 530 | 531 | for ans_idx, ans in enumerate(qa.answers): 532 | answers[idx][ans_idx][:]=ans 533 | else: 534 | raise ValueError('Invalid numOfChoices: ' + numOfChoices) 535 | 536 | return questions,answers 537 | def main(): 538 | 539 | task = 'video-based' # video-based or subtitle-based 540 | 541 | mqa = MovieQA.DataLoader() 542 | 543 | 544 | # get 'subtitile-based' QA task dataset 545 | stories, subtitle_QAs = mqa.get_story_qa_data('train', 'subtitle') 546 | 547 | # Create vocabulary 548 | QA_words, v2i = create_vocabulary(subtitle_QAs, stories, word_thresh=2, v2i={'': 0, 'UNK':1}) 549 | 550 | # get 'video-based' QA task training set 551 | vl_qa, video_QAs = mqa.get_video_list('train', 'qa_clips') # key: 'train:', value: list of related clips 552 | # vl_qa, _ = mqa.get_video_list('train', 'all_clips') # key:moive vid, value:list of related movid all_clips 553 | 554 | 555 | 556 | all_video_train_list = video_QAs 557 | 558 | batch_size = 20 559 | total_train_qa = len(all_video_train_list) 560 | num_batch = int(round(total_train_qa*1.0/batch_size)) 561 | 562 | total_epoch = 100 563 | 564 | hf = h5py.File('/home/wb/movie_feature.hdf5','r') 565 | feature_shape = (10,1024) 566 | for epoch in xrange(total_epoch): 567 | #shuffle 568 | np.random.shuffle(all_video_train_list) 569 | for batch_idx in xrange(num_batch): 570 | batch_qa = all_video_train_list[batch_idx*batch_size:min((batch_idx+1)*batch_size,total_train_qa)] 571 | questions,answers,ground_truth = getBatchIndexedQAs(batch_qa,QA_words,v2i, nql=16, nqa=10, numOfChoices=2) 572 | input_video = getBatchVideoFeature(batch_qa, QA_words, hf, feature_shape) 573 | print(input_video) 574 | print(ground_truth) 575 | break 576 | break 577 | 578 | 579 | if __name__=='__main__': 580 | main() -------------------------------------------------------------------------------- /model/InitUtil.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.environ["CUDA_VISIBLE_DEVICES"]="0" 4 | 5 | import tensorflow as tf 6 | import numpy as np 7 | 8 | def get_fans(shape): 9 | if len(shape) == 2: 10 | fan_in = shape[0] 11 | fan_out = shape[1] 12 | elif len(shape) == 4 or len(shape) == 5: 13 | receptive_field_size = np.prod(shape[2:]) 14 | fan_in = shape[1] * receptive_field_size 15 | fan_out = shape[0] * receptive_field_size 16 | 17 | else: 18 
| # No specific assumptions. 19 | fan_in = np.sqrt(np.prod(shape)) 20 | fan_out = np.sqrt(np.prod(shape)) 21 | return fan_in, fan_out 22 | 23 | 24 | def uniform(shape, scale=0.05, name=None, seed=None): #tf.float32 25 | if seed is None: 26 | # ensure that randomness is conditioned by the Numpy RNG 27 | seed = np.random.randint(10e8) 28 | 29 | value = tf.random_uniform_initializer( 30 | -scale, scale, dtype=tf.float32, seed=seed)(shape) 31 | 32 | return tf.Variable(value,name=name) 33 | 34 | 35 | 36 | def glorot_uniform(shape, name=None): 37 | fan_in, fan_out = get_fans(shape) 38 | s = np.sqrt(6. / (fan_in + fan_out)) 39 | return uniform(shape, s, name=name) 40 | 41 | 42 | def orthogonal(shape, scale=1.1, name=None): 43 | """Orthogonal initializer. 44 | 45 | # References 46 | Saxe et al., http://arxiv.org/abs/1312.6120 47 | """ 48 | flat_shape = (shape[0], np.prod(shape[1:])) 49 | a = np.random.normal(0.0, 1.0, flat_shape) 50 | u, _, v = np.linalg.svd(a, full_matrices=False) 51 | # Pick the one with the correct shape. 52 | q = u if u.shape == flat_shape else v 53 | q = q.reshape(shape) 54 | return tf.Variable(scale * q[:shape[0], :shape[1]], dtype=tf.float32, name=name) 55 | 56 | def init_weight_variable(shape, init_method='glorot_uniform', name=None): 57 | # initial = tf.truncated_normal(shape, stddev=0.1, name=name) 58 | if init_method == 'uniform': 59 | return uniform(shape, scale=0.05, name=name, seed=None) 60 | elif init_method == 'glorot_uniform': 61 | return glorot_uniform(shape, name=name) 62 | elif init_method == 'orthogonal': 63 | return orthogonal(shape, scale=1.1, name=name) 64 | else: 65 | raise ValueError('Invalid init_method: ' + init_method) 66 | 67 | def init_bias_variable(shape,name=None): 68 | initial = tf.constant(0.1,shape=shape, name=name) 69 | return tf.Variable(initial, name=name) -------------------------------------------------------------------------------- /model/ModelUtil.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.environ["CUDA_VISIBLE_DEVICES"]="0" 4 | 5 | import tensorflow as tf 6 | 7 | import numpy as np 8 | from sklearn.decomposition import PCA 9 | import cPickle as pickle 10 | 11 | def get_fans(shape): 12 | if len(shape) == 2: 13 | fan_in = shape[0] 14 | fan_out = shape[1] 15 | elif len(shape) == 4 or len(shape) == 5: 16 | receptive_field_size = np.prod(shape[2:]) 17 | fan_in = shape[1] * receptive_field_size 18 | fan_out = shape[0] * receptive_field_size 19 | 20 | else: 21 | # No specific assumptions. 22 | fan_in = np.sqrt(np.prod(shape)) 23 | fan_out = np.sqrt(np.prod(shape)) 24 | return fan_in, fan_out 25 | 26 | 27 | def uniform(shape, scale=0.05, name=None, seed=None): #tf.float32 28 | if seed is None: 29 | # ensure that randomness is conditioned by the Numpy RNG 30 | seed = np.random.randint(10e8) 31 | 32 | value = tf.random_uniform_initializer( 33 | -scale, scale, dtype=tf.float32, seed=seed)(shape) 34 | 35 | return tf.Variable(value) 36 | 37 | 38 | 39 | def glorot_uniform(shape, name=None): 40 | fan_in, fan_out = get_fans(shape) 41 | s = np.sqrt(6. / (fan_in + fan_out)) 42 | return uniform(shape, s, name=name) 43 | 44 | 45 | def orthogonal(shape, scale=1.1, name=None): 46 | """Orthogonal initializer. 47 | 48 | # References 49 | Saxe et al., http://arxiv.org/abs/1312.6120 50 | """ 51 | flat_shape = (shape[0], np.prod(shape[1:])) 52 | a = np.random.normal(0.0, 1.0, flat_shape) 53 | u, _, v = np.linalg.svd(a, full_matrices=False) 54 | # Pick the one with the correct shape. 
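    # np.linalg.svd(a, full_matrices=False) returns u of shape (nrows, k) and
    # v of shape (k, ncols) with k = min(nrows, ncols); both factors have
    # orthonormal rows/columns, and exactly one of them has shape flat_shape
    # (both do when the matrix is square), so that one is kept below.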
55 | q = u if u.shape == flat_shape else v 56 | q = q.reshape(shape) 57 | return tf.Variable(scale * q[:shape[0], :shape[1]], dtype=tf.float32, name=name) 58 | 59 | def init_weight_variable(shape, init_method='glorot_uniform', name=None): 60 | # initial = tf.truncated_normal(shape, stddev=0.1, name=name) 61 | if init_method == 'uniform': 62 | return uniform(shape, scale=0.05, name=name, seed=None) 63 | elif init_method == 'glorot_uniform': 64 | return glorot_uniform(shape, name=name) 65 | elif init_method == 'orthogonal': 66 | return orthogonal(shape, scale=1.1, name=name) 67 | else: 68 | raise ValueError('Invalid init_method: ' + init_method) 69 | 70 | def init_bias_variable(shape,name=None): 71 | initial = tf.constant(0.1,shape=shape, name=name) 72 | return tf.Variable(initial) 73 | 74 | 75 | def matmul_wx(x, w, b, output_dims): 76 | 77 | return tf.matmul(x, w)+tf.reshape(b,(1,output_dims)) 78 | 79 | 80 | def matmul_uh(u,h_tm1): 81 | return tf.matmul(h_tm1,u) 82 | 83 | 84 | 85 | def get_init_state(x, output_dims): 86 | initial_state = tf.zeros_like(x) 87 | initial_state = tf.reduce_sum(initial_state,axis=[1,2]) 88 | initial_state = tf.expand_dims(initial_state,dim=-1) 89 | initial_state = tf.tile(initial_state,[1,output_dims]) 90 | return initial_state 91 | 92 | 93 | def getVideoEncoder(x, output_dims, return_sequences=False): 94 | ''' 95 | function: getVideoEncoder 96 | parameters: 97 | 98 | x: batch_size, timesteps , dims 99 | output_dims: the output of the GRU dimensions 100 | num_class: number of class : ucf-101: 101 101 | return: 102 | the last GRU state, 103 | or 104 | the sequences of the hidden states 105 | 106 | ''' 107 | input_shape = x.get_shape().as_list() 108 | assert len(input_shape)==3 109 | timesteps = input_shape[1] 110 | input_dims = input_shape[2] 111 | 112 | # get initial state 113 | initial_state = get_init_state(x, output_dims) 114 | 115 | # initialize the parameters 116 | # W_r,U_r,b_r; W_z, U_z, b_z; W_h, U_h, b_h 117 | W_r = init_weight_variable((input_dims,output_dims),init_method='glorot_uniform',name="W_r") 118 | W_z = init_weight_variable((input_dims,output_dims),init_method='glorot_uniform',name="W_z") 119 | W_h = init_weight_variable((input_dims,output_dims),init_method='glorot_uniform',name="W_h") 120 | 121 | U_r = init_weight_variable((output_dims,output_dims),init_method='orthogonal',name="U_r") 122 | U_z = init_weight_variable((output_dims,output_dims),init_method='orthogonal',name="U_z") 123 | U_h = init_weight_variable((output_dims,output_dims),init_method='orthogonal',name="U_h") 124 | 125 | b_r = init_bias_variable((output_dims,),name="b_r") 126 | b_z = init_bias_variable((output_dims,),name="b_z") 127 | b_h = init_bias_variable((output_dims,),name="b_h") 128 | 129 | 130 | # batch_size x timesteps x dim -> timesteps x batch_size x dim 131 | axis = [1,0]+list(range(2,3)) # axis = [1,0,2] 132 | x = tf.transpose(x, perm=axis) # permutate the input_x --> timestemp, batch_size, input_dims 133 | 134 | input_x = tf.TensorArray( 135 | dtype=x.dtype, 136 | size=timesteps, 137 | tensor_array_name='input_x') 138 | 139 | if hasattr(input_x, 'unstack'): 140 | input_x = input_x.unstack(x) 141 | else: 142 | input_x = input_x.unpack(x) 143 | 144 | 145 | hidden_state = tf.TensorArray( 146 | dtype=tf.float32, 147 | size=timesteps, 148 | tensor_array_name='hidden_state') 149 | 150 | 151 | 152 | 153 | def step(time, hidden_state, h_tm1): 154 | x_t = input_x.read(time) # batch_size * dim 155 | 156 | preprocess_x_r = matmul_wx(x_t, W_r, b_r, output_dims) 157 | 
preprocess_x_z = matmul_wx(x_t, W_z, b_z, output_dims) 158 | preprocess_x_h = matmul_wx(x_t, W_h, b_h, output_dims) 159 | 160 | r = tf.nn.sigmoid(preprocess_x_r+ matmul_uh(U_r,h_tm1)) 161 | z = tf.nn.sigmoid(preprocess_x_z+ matmul_uh(U_z,h_tm1)) 162 | hh = tf.nn.tanh(preprocess_x_h+ matmul_uh(U_h,h_tm1)) 163 | 164 | h = (1-z)*hh + z*h_tm1 165 | 166 | hidden_state = hidden_state.write(time, h) 167 | 168 | return (time+1,hidden_state,h) 169 | 170 | 171 | 172 | 173 | time = tf.constant(0, dtype='int32', name='time') 174 | 175 | 176 | ret_out = tf.while_loop( 177 | cond=lambda time, *_: time < timesteps, 178 | body=step, 179 | loop_vars=(time, hidden_state, initial_state), 180 | parallel_iterations=32, 181 | swap_memory=True) 182 | 183 | output = ret_out[1] 184 | last_output = ret_out[-1] 185 | 186 | if hasattr(hidden_state, 'stack'): 187 | hidden_state = hidden_state.stack() 188 | 189 | axis = [1,0] + list(range(2,3)) 190 | outputs = tf.transpose(hidden_state,perm=axis) 191 | 192 | 193 | if return_sequences: 194 | return outputs 195 | else: 196 | return last_output 197 | 198 | 199 | def getEmbedding(words, size_voc, word_embedding_size): 200 | ''' 201 | function: getEmbedding 202 | parameters: 203 | words: int, word index ; or a np.int32 list ## sample(null) * input_words_sequential 204 | size_voc: size of vocabulary 205 | embedding_size: the dimension after embedding 206 | return: 207 | embeded_words:the embeded words with shape (sample * timesteps * embedding dims) 208 | mask: each element in mask vector is 0 or 1, indicate there is a word or a padding zero 209 | ''' 210 | 211 | W_e = tf.get_variable('W_e',(size_voc,word_embedding_size),initializer=tf.random_uniform_initializer(-0.05,0.05)) # share the embedding matrix 212 | embeded_words = tf.gather(W_e, words) 213 | mask = tf.not_equal(words,0) 214 | return embeded_words, mask 215 | 216 | 217 | 218 | def getQuestionEncoder(embeded_words, output_dims, mask, return_sequences=False): 219 | 220 | ''' 221 | function: getQuestionEncoder 222 | parameters: 223 | embeded_words: sample*timestep*dim 224 | output_dims: the GRU hidden dim 225 | mask: bool type , samples * timestep 226 | return: 227 | the last GRU state, 228 | or 229 | the sequences of the hidden states 230 | ''' 231 | input_shape = embeded_words.get_shape().as_list() 232 | assert len(input_shape)==3 233 | 234 | timesteps = input_shape[1] 235 | input_dims = input_shape[2] 236 | # get initial state 237 | initial_state = get_init_state(embeded_words, output_dims) 238 | 239 | 240 | # initialize the parameters 241 | # W_r,U_r,b_r; W_z, U_z, b_z; W_h, U_h, b_h 242 | W_r = init_weight_variable((input_dims,output_dims),init_method='glorot_uniform',name="W_q_r") 243 | W_z = init_weight_variable((input_dims,output_dims),init_method='glorot_uniform',name="W_q_z") 244 | W_h = init_weight_variable((input_dims,output_dims),init_method='glorot_uniform',name="W_q_h") 245 | 246 | U_r = init_weight_variable((output_dims,output_dims),init_method='orthogonal',name="U_q_r") 247 | U_z = init_weight_variable((output_dims,output_dims),init_method='orthogonal',name="U_q_z") 248 | U_h = init_weight_variable((output_dims,output_dims),init_method='orthogonal',name="U_q_h") 249 | 250 | b_r = init_bias_variable((output_dims,),name="b_q_r") 251 | b_z = init_bias_variable((output_dims,),name="b_q_z") 252 | b_h = init_bias_variable((output_dims,),name="b_q_h") 253 | 254 | 255 | # batch_size x timesteps x dim -> timesteps x batch_size x dim 256 | axis = [1,0]+list(range(2,3)) # axis = [1,0,2] 257 | embeded_words = 
tf.transpose(embeded_words, perm=axis) # permutate the input_x --> timestemp, batch_size, input_dims 258 | 259 | 260 | 261 | input_embeded_words = tf.TensorArray( 262 | dtype=embeded_words.dtype, 263 | size=timesteps, 264 | tensor_array_name='input_embeded_words_q') 265 | 266 | 267 | if hasattr(input_embeded_words, 'unstack'): 268 | input_embeded_words = input_embeded_words.unstack(embeded_words) 269 | else: 270 | input_embeded_words = input_embeded_words.unpack(embeded_words) 271 | 272 | 273 | # preprocess mask 274 | if len(mask.get_shape()) == len(input_shape)-1: 275 | mask = tf.expand_dims(mask,dim=-1) 276 | 277 | mask = tf.transpose(mask,perm=axis) 278 | 279 | input_mask = tf.TensorArray( 280 | dtype=mask.dtype, 281 | size=timesteps, 282 | tensor_array_name='input_mask_q' 283 | ) 284 | 285 | if hasattr(input_mask, 'unstack'): 286 | input_mask = input_mask.unstack(mask) 287 | else: 288 | input_mask = input_mask.unpack(mask) 289 | 290 | 291 | hidden_state_q = tf.TensorArray( 292 | dtype=tf.float32, 293 | size=timesteps, 294 | tensor_array_name='hidden_state_q') 295 | 296 | 297 | 298 | def step(time, hidden_state_q, h_tm1): 299 | x_t = input_embeded_words.read(time) # batch_size * dim 300 | mask_t = input_mask.read(time) 301 | 302 | preprocess_x_r = matmul_wx(x_t, W_r, b_r, output_dims) 303 | preprocess_x_z = matmul_wx(x_t, W_z, b_z, output_dims) 304 | preprocess_x_h = matmul_wx(x_t, W_h, b_h, output_dims) 305 | 306 | r = tf.nn.sigmoid(preprocess_x_r+ matmul_uh(U_r,h_tm1)) 307 | z = tf.nn.sigmoid(preprocess_x_z+ matmul_uh(U_z,h_tm1)) 308 | hh = tf.nn.tanh(preprocess_x_h+ matmul_uh(U_h,h_tm1)) 309 | 310 | 311 | h = (1-z)*hh + z*h_tm1 312 | tiled_mask_t = tf.tile(mask_t, tf.stack([1, h.get_shape().as_list()[1]])) 313 | 314 | h = tf.where(tiled_mask_t, h, h_tm1) 315 | 316 | hidden_state_q = hidden_state_q.write(time, h) 317 | 318 | return (time+1,hidden_state_q,h) 319 | 320 | 321 | 322 | 323 | time = tf.constant(0, dtype='int32', name='time') 324 | 325 | 326 | ret_out = tf.while_loop( 327 | cond=lambda time, *_: time < timesteps, 328 | body=step, 329 | loop_vars=(time, hidden_state_q, initial_state), 330 | parallel_iterations=32, 331 | swap_memory=True) 332 | 333 | 334 | hidden_state_q = ret_out[1] 335 | last_output = ret_out[-1] 336 | 337 | if hasattr(hidden_state_q, 'stack'): 338 | outputs = hidden_state_q.stack() 339 | print('stack') 340 | else: 341 | outputs = hidden_state_q.pack() 342 | 343 | axis = [1,0] + list(range(2,3)) 344 | outputs = tf.transpose(outputs,perm=axis) 345 | 346 | if return_sequences: 347 | return outputs 348 | else: 349 | return last_output 350 | 351 | 352 | 353 | 354 | def getAnswerEmbedding(words, size_voc, word_embedding_size): 355 | ''' 356 | function: getAnswerEmbedding 357 | parameters: 358 | words: int, word index ; or a np.int32 list ## sample(null) * numebrOfChoice * timesteps 359 | size_voc: size of vocabulary 360 | embedding_size: the dimension after embedding 361 | return: 362 | the embeded answers with shape(batch_size, numberOfChoices, timesteps, word_embedding_size) 363 | ''' 364 | assert len(words.get_shape().as_list())==3 # 365 | input_shape = words.get_shape().as_list() 366 | numberOfChoices = input_shape[1] 367 | timesteps = input_shape[2] 368 | 369 | mask = tf.not_equal(words,0) 370 | 371 | words = tf.reshape(words, (-1,timesteps)) 372 | W_e = tf.get_variable('W_e',(size_voc,word_embedding_size),initializer=tf.random_uniform_initializer(-0.05,0.05)) # share the embedding matrix 373 | embeded_words = tf.gather(W_e, words) 374 | 375 | 376 | 
embeded_words = tf.reshape(embeded_words,(-1,numberOfChoices,timesteps,word_embedding_size)) 377 | 378 | return embeded_words, mask 379 | 380 | 381 | 382 | def getAnswerEncoder(embeded_words, output_dims, mask, return_sequences=False): 383 | ''' 384 | function: getAnswerEncoder 385 | parameters: 386 | embeded_words: samples * numberOfChoices * timesteps * dim 387 | output_dim: output of GRU, the dimension of answering vector 388 | mask : bool type, mask the embeded_words 389 | num_class: number of classifier 390 | return: 391 | the last encoded answers with shape(batch_size, numberOfChoices, output_dims) 392 | or 393 | the sequences.... with shape(batch_size, numberOfChoices, numberOfChoices, output_dims) 394 | ''' 395 | input_shape = embeded_words.get_shape().as_list() 396 | assert len(input_shape)==4 397 | 398 | 399 | numberOfChoices = input_shape[1] 400 | timesteps = input_shape[2] 401 | input_dims = input_shape[3] 402 | 403 | # get initial state 404 | embeded_words = tf.reshape(embeded_words,(-1,timesteps,input_dims)) 405 | initial_state = get_init_state(embeded_words, output_dims) 406 | 407 | axis = [1,0,2] 408 | embeded_words = tf.transpose(embeded_words, perm=axis) # permutate the 'embeded_words' --> timesteps x batch_size x numberOfChoices x dim 409 | # embeded_words = tf.reshape(embeded_words,(timesteps,-1,input_dims)) # reshape the 'embeded_words' --> timesteps x (batch x numberOfChoices) x dim 410 | 411 | # initialize the parameters 412 | # W_r,U_r,b_r; W_z, U_z, b_z; W_h, U_h, b_h 413 | W_r = init_weight_variable((input_dims,output_dims),init_method='glorot_uniform',name="W_a_r") 414 | W_z = init_weight_variable((input_dims,output_dims),init_method='glorot_uniform',name="W_a_z") 415 | W_h = init_weight_variable((input_dims,output_dims),init_method='glorot_uniform',name="W_a_h") 416 | 417 | U_r = init_weight_variable((output_dims,output_dims),init_method='orthogonal',name="U_a_r") 418 | U_z = init_weight_variable((output_dims,output_dims),init_method='orthogonal',name="U_a_z") 419 | U_h = init_weight_variable((output_dims,output_dims),init_method='orthogonal',name="U_a_h") 420 | 421 | b_r = init_bias_variable((output_dims,),name="b_a_r") 422 | b_z = init_bias_variable((output_dims,),name="b_a_z") 423 | b_h = init_bias_variable((output_dims,),name="b_a_h") 424 | 425 | 426 | 427 | input_embeded_words = tf.TensorArray( 428 | dtype=embeded_words.dtype, 429 | size=timesteps, 430 | tensor_array_name='input_embeded_words_a') 431 | 432 | 433 | if hasattr(input_embeded_words, 'unstack'): 434 | input_embeded_words = input_embeded_words.unstack(embeded_words) 435 | else: 436 | input_embeded_words = input_embeded_words.unpack(embeded_words) 437 | 438 | 439 | # preprocess mask 440 | if len(mask.get_shape()) == len(input_shape)-1: 441 | mask = tf.expand_dims(mask,dim=-1) 442 | 443 | axis = [2,0,1,3] 444 | mask = tf.transpose(mask,perm=axis) 445 | mask = tf.reshape(mask, (timesteps,-1,1)) 446 | 447 | input_mask = tf.TensorArray( 448 | dtype=mask.dtype, 449 | size=timesteps, 450 | tensor_array_name='input_mask_q' 451 | ) 452 | 453 | if hasattr(input_mask, 'unstack'): 454 | input_mask = input_mask.unstack(mask) 455 | else: 456 | input_mask = input_mask.unpack(mask) 457 | 458 | 459 | hidden_state_q = tf.TensorArray( 460 | dtype=tf.float32, 461 | size=timesteps, 462 | tensor_array_name='hidden_state_a') 463 | 464 | # if hasattr(hidden_state, 'unstack'): 465 | # hidden_state = hidden_state.unstack(hidden_state) 466 | # else: 467 | # hidden_state = hidden_state.unpack(hidden_state) 468 | 469 | 470 | 
def step(time, hidden_state_q, h_tm1): 471 | x_t = input_embeded_words.read(time) # batch_size * dim 472 | mask_t = input_mask.read(time) 473 | 474 | preprocess_x_r = matmul_wx(x_t, W_r, b_r, output_dims) 475 | preprocess_x_z = matmul_wx(x_t, W_z, b_z, output_dims) 476 | preprocess_x_h = matmul_wx(x_t, W_h, b_h, output_dims) 477 | 478 | r = tf.nn.sigmoid(preprocess_x_r+ matmul_uh(U_r,h_tm1)) 479 | z = tf.nn.sigmoid(preprocess_x_z+ matmul_uh(U_z,h_tm1)) 480 | hh = tf.nn.tanh(preprocess_x_h+ matmul_uh(U_h,h_tm1)) 481 | 482 | 483 | h = (1-z)*hh + z*h_tm1 484 | tiled_mask_t = tf.tile(mask_t, tf.stack([1, h.get_shape().as_list()[1]])) 485 | 486 | h = tf.where(tiled_mask_t, h, h_tm1) 487 | 488 | hidden_state_q = hidden_state_q.write(time, h) 489 | 490 | return (time+1,hidden_state_q,h) 491 | 492 | 493 | 494 | 495 | time = tf.constant(0, dtype='int32', name='time') 496 | 497 | 498 | ret_out = tf.while_loop( 499 | cond=lambda time, *_: time < timesteps, 500 | body=step, 501 | loop_vars=(time, hidden_state_q, initial_state), 502 | parallel_iterations=32, 503 | swap_memory=True) 504 | 505 | 506 | hidden_state_q = ret_out[1] 507 | last_output = ret_out[-1] 508 | 509 | 510 | 511 | if hasattr(hidden_state_q, 'stack'): 512 | outputs = hidden_state_q.stack() 513 | print('stack') 514 | else: 515 | outputs = hidden_state_q.pack() 516 | 517 | outputs = tf.reshape(outputs,(timesteps,-1,numberOfChoices,output_dims)) 518 | axis = [1,2,0]+list(range(3,4)) 519 | outputs = tf.transpose(outputs,perm=axis) 520 | 521 | last_output = tf.reshape(last_output,(-1,numberOfChoices,output_dims)) 522 | print('outputs:....',outputs.get_shape().as_list()) 523 | if return_sequences: 524 | return outputs 525 | else: 526 | return last_output 527 | 528 | 529 | 530 | 531 | def getMemoryNetworks(embeded_stories, embeded_question, d_lproj, T_B=None, return_sequences=False): 532 | 533 | ''' 534 | embeded_stories: (batch_size, num_of_sentence, num_of_words, embeded_words_dims) 535 | embeded_question:(batch_size, embeded_words_dims) 536 | output_dims: the dimension of stories 537 | ''' 538 | stories_shape = embeded_stories.get_shape().as_list() 539 | embeded_question_shape = embeded_question.get_shape().as_list() 540 | num_of_sentence = stories_shape[-3] 541 | input_dims = stories_shape[-1] 542 | output_dims = embeded_question_shape[-1] 543 | 544 | 545 | embeded_stories = getAverageRepresentation(embeded_stories, T_B, d_lproj) 546 | 547 | 548 | embeded_question = tf.tile(tf.expand_dims(embeded_question,dim=1),[1,num_of_sentence,1]) 549 | 550 | sen_weight = tf.reduce_sum(embeded_question*embeded_stories,reduction_indices=-1,keep_dims=True) 551 | 552 | sen_weight = tf.nn.softmax(sen_weight,dim=1) 553 | sen_weight = tf.tile(sen_weight,[1,1,output_dims]) 554 | if return_sequences: 555 | embeded_stories = embeded_stories*sen_weight 556 | else: 557 | embeded_stories = tf.reduce_sum(embeded_stories*sen_weight,reduction_indices=1) # (batch_size, output_dims) 558 | 559 | return embeded_stories 560 | 561 | def getMemoryNetworksMaxPooling(embeded_stories, embeded_question, d_lproj, T_B=None): 562 | 563 | ''' 564 | embeded_stories: (batch_size, num_of_sentence, num_of_words, embeded_words_dims) 565 | embeded_question:(batch_size, embeded_words_dims) 566 | output_dims: the dimension of stories 567 | ''' 568 | stories_shape = embeded_stories.get_shape().as_list() 569 | embeded_question_shape = embeded_question.get_shape().as_list() 570 | num_of_sentence = stories_shape[-3] 571 | input_dims = stories_shape[-1] 572 | output_dims = 
embeded_question_shape[-1] 573 | 574 | 575 | embeded_stories = getAverageRepresentation(embeded_stories, T_B, d_lproj) 576 | 577 | 578 | embeded_question = tf.tile(tf.expand_dims(embeded_question,dim=1),[1,num_of_sentence,1]) 579 | 580 | sen_weight = tf.reduce_sum(embeded_question*embeded_stories,reduction_indices=-1,keep_dims=True) 581 | 582 | sen_weight = tf.nn.softmax(sen_weight,dim=1) 583 | sen_weight = tf.tile(sen_weight,[1,1,output_dims]) 584 | 585 | embeded_stories = tf.reduce_max(embeded_stories*sen_weight,reduction_indices=1) # (batch_size, output_dims) 586 | 587 | return embeded_stories 588 | 589 | rng = np.random 590 | rng.seed(1234) 591 | 592 | def init_linear_projection(rng, nrows, ncols, pca_mat=None): 593 | """ Linear projection (for example when using fixed w2v as LUT """ 594 | if nrows == ncols: 595 | P = np.eye(nrows) 596 | print "Linear projection: initialized as identity matrix" 597 | else: 598 | assert([nrows, ncols] == pca_mat.shape, 'PCA matrix not of same size as RxC') 599 | P = 0.1 * pca_mat 600 | print "Linear projection: initialized with 0.1 PCA" 601 | 602 | return P.astype('float32') 603 | 604 | def setWord2VecModelConfiguration(v2i,w2v,d_w2v,d_lproj): 605 | ''' 606 | v2i: vocab(word) to int(index) 607 | w2v: word to vector 608 | d_w2v:dimension of w2v 609 | d_lproj: dimension of projection 610 | ''' 611 | voc_size = len(v2i) 612 | np_mask = np.vstack((np.zeros(d_w2v),np.ones((voc_size-1,d_w2v)))) 613 | T_mask = tf.constant(np_mask, tf.float32, name='LUT_mask') 614 | 615 | pca_mat = None 616 | print "Initialize LUTs as word2vec and use linear projection layer" 617 | 618 | 619 | LUT = np.zeros((voc_size, d_w2v), dtype='float32') 620 | found_words = 0 621 | 622 | for w, v in v2i.iteritems(): 623 | if w in w2v.vocab: 624 | LUT[v] = w2v.get_vector(w) 625 | found_words +=1 626 | else: 627 | LUT[v] = rng.randn(d_w2v) 628 | LUT[v] = LUT[v] / (np.linalg.norm(LUT[v]) + 1e-6) 629 | 630 | print "Found %d / %d words" %(found_words, len(v2i)) 631 | 632 | 633 | # word 0 is blanked out, word 1 is 'UNK' 634 | LUT[0] = np.zeros((d_w2v)) 635 | 636 | # if linear projection layer is not the same shape as LUT, then initialize with PCA 637 | if d_lproj != LUT.shape[1]: 638 | pca = PCA(n_components=d_lproj, whiten=True) 639 | pca_mat = pca.fit_transform(LUT.T) # 300 x 100? 640 | 641 | # setup LUT! 
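    # The word2vec LUT is frozen as a TF constant; only the linear projection
    # T_B below is trainable.  T_mask (row 0 all zeros, built above) is used in
    # getEmbeddingWithWord2Vec() to cancel the contribution of the padding index.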
642 | T_w2v = tf.constant(LUT.astype('float32')) 643 | 644 | T_B = tf.Variable(init_linear_projection(rng, d_w2v, d_lproj, pca_mat), name='B') 645 | 646 | 647 | 648 | return T_B, T_w2v, T_mask, pca_mat 649 | 650 | 651 | def getEmbeddingWithWord2Vec(words, T_w2v, T_mask): 652 | input_shape = words.get_shape().as_list() 653 | 654 | mask = tf.not_equal(words,0) 655 | 656 | embeded_words = tf.gather(T_w2v,words)*tf.gather(T_mask,words) 657 | 658 | return embeded_words, mask 659 | 660 | def getAverageRepresentation(sentence, T_B, d_lproj): 661 | sentence = tf.reduce_sum(sentence,reduction_indices=-2) 662 | 663 | 664 | sentence_shape = sentence.get_shape().as_list() 665 | if len(sentence_shape)==2: 666 | sentence = tf.matmul(sentence,T_B) 667 | elif len(sentence_shape)==3: 668 | sentence = tf.reshape(sentence,(-1,sentence_shape[-1])) 669 | sentence = tf.matmul(sentence,T_B) 670 | sentence = tf.reshape(sentence,(-1,sentence_shape[1],d_lproj)) 671 | else: 672 | raise ValueError('Invalid sentence_shape:'+sentence_shape) 673 | 674 | sentence = tf.nn.l2_normalize(sentence,-1) 675 | return sentence 676 | 677 | 678 | def getMultiModel(visual_feature, question_feature, answer_feature, common_space_dim): 679 | ''' 680 | fucntion: getMultiModel 681 | parameters: 682 | visual_feature: batch_size * visual_encoded_dim 683 | question_feature: batch_size * question_encoded_dim 684 | answer_feature: batch_zize * numberOfChoices * answer_encoded_dim 685 | common_space_dim: embedding the visual,question,answer to the common space 686 | return: the embeded vectors(v,q,a) 687 | ''' 688 | visual_shape = visual_feature.get_shape().as_list() 689 | question_shape = question_feature.get_shape().as_list() 690 | answer_shape = answer_feature.get_shape().as_list() 691 | 692 | # build the transformed matrix 693 | W_v = init_weight_variable((visual_shape[1],common_space_dim),init_method='glorot_uniform',name="W_v") 694 | W_q = init_weight_variable((question_shape[1],common_space_dim),init_method='glorot_uniform',name="W_q") 695 | W_a = init_weight_variable((answer_shape[2],common_space_dim),init_method='glorot_uniform',name="W_a") 696 | 697 | 698 | 699 | answer_feature = tf.reshape(answer_feature,(-1,answer_shape[2])) 700 | 701 | # encoder the features into common space 702 | T_v = tf.matmul(visual_feature,W_v) 703 | T_q = tf.matmul(question_feature,W_q) 704 | T_a = tf.matmul(answer_feature,W_a) 705 | 706 | T_a = tf.reshape(T_a,(-1,answer_shape[1],common_space_dim)) 707 | 708 | return T_v,T_q,T_a 709 | 710 | def getRankingLoss(T_v, T_q, T_a, answer_index=None, alpha = 0.2 ,isTest=False): 711 | 712 | ''' 713 | function: getRankingLoss 714 | parameters: 715 | answer_index: the ground truth index, one hot vector 716 | return: 717 | loss: tf.float32 718 | ''' 719 | 720 | T_v_shape = T_v.get_shape().as_list() 721 | T_q_shape = T_q.get_shape().as_list() 722 | T_a_shape = T_a.get_shape().as_list() 723 | 724 | numOfChoices = T_a_shape[1] 725 | common_space_dim = T_a_shape[2] 726 | 727 | assert T_q_shape == T_v_shape 728 | 729 | T_v = tf.nn.l2_normalize(T_v,1) 730 | T_q = tf.nn.l2_normalize(T_q,1) 731 | T_a = tf.nn.l2_normalize(T_a,2) 732 | 733 | T_p = tf.nn.l2_normalize(T_v+T_q,1) 734 | 735 | 736 | 737 | # answer_index = tf.tile(tf.expand_dims(answer_index,dim=-1),[1,1,T_q_shape[-1]]) # sample * numOfChoices * common_space_dim 738 | 739 | 740 | T_p = tf.tile(tf.expand_dims(T_p,dim=1),[1,numOfChoices,1]) 741 | 742 | # T_p = tf.nn.l2_normalize(T_p*T_a,2) 743 | T_p = T_p*T_a 744 | T_p = tf.reduce_sum(T_p, reduction_indices=-1) 745 | 
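    # At this point T_p is the dot product (equivalently the cosine similarity,
    # since all three embeddings are l2-normalized) between the fused
    # video+question vector and each candidate answer, i.e. a
    # (batch_size, numOfChoices) score matrix.  In training mode a max-margin
    # hinge loss with margin `alpha` is built below.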
746 | scores = T_p 747 | 748 | if not isTest: 749 | assert answer_index is not None 750 | positive = tf.reduce_sum(T_p*answer_index, reduction_indices=1, keep_dims=True) # sample , get the positive score 751 | positive = tf.tile(positive,[1,numOfChoices]) 752 | 753 | loss = (alpha - positive + T_p)*(1-answer_index) 754 | 755 | loss = tf.maximum(0.,loss) 756 | 757 | loss = tf.reduce_sum(loss,reduction_indices=-1) 758 | 759 | return loss,scores 760 | else: 761 | return scores 762 | 763 | 764 | def getClassifierLoss(T_s, T_q, T_a, answer_index=None, isTest=False): 765 | 766 | ''' 767 | function: getRankingLoss 768 | parameters: 769 | answer_index: the ground truth index, one hot vector 770 | return: 771 | loss: tf.float32 772 | ''' 773 | 774 | T_s_shape = T_s.get_shape().as_list() 775 | T_q_shape = T_q.get_shape().as_list() 776 | T_a_shape = T_a.get_shape().as_list() 777 | 778 | numOfChoices = T_a_shape[1] 779 | common_space_dim = T_a_shape[2] 780 | 781 | assert T_q_shape == T_s_shape 782 | 783 | T_s = tf.nn.l2_normalize(T_s+T_q,1) 784 | T_a = tf.nn.l2_normalize(T_a,2) 785 | 786 | T_s = tf.tile(tf.expand_dims(T_s,dim=1),[1,numOfChoices,1]) 787 | 788 | # T_s = tf.nn.l2_normalize(T_s*T_a,2) 789 | T_h = T_s*T_a 790 | T_h = tf.reduce_sum(T_h, reduction_indices=-1) 791 | 792 | scores = T_h 793 | 794 | if not isTest: 795 | assert answer_index is not None 796 | loss = tf.nn.softmax_cross_entropy_with_logits(labels = answer_index, logits = scores) 797 | # acc_value = tf.metrics.accuracy(scores, answer_index) 798 | return loss,scores 799 | else: 800 | return scores 801 | 802 | 803 | 804 | def getVideoSemanticEmbedding(x,w2v,T_B,pca_mat=None): 805 | ''' 806 | x: input video cnn feature with size of (batch_size, timesteps, channels, height, width) 807 | w2v: word 2 vec (|v|,dim) 808 | ''' 809 | input_shape = x.get_shape().as_list() 810 | w2v_shape = w2v.get_shape().as_list() 811 | assert(len(input_shape)==5) 812 | axis = [0,1,3,4,2] 813 | x = tf.transpose(x,perm=axis) 814 | x = tf.reshape(x,(-1,input_shape[2])) 815 | # x = tf.nn.l2_normalize(x,-1) 816 | 817 | if pca_mat is not None: 818 | linear_proj = tf.Variable(0.1*pca_mat,dtype='float32',name='visual_linear_proj') 819 | else: 820 | linear_proj = init_weight_variable((input_shape[2],w2v_shape[-1]), init_method='uniform', name='visual_linear_proj') 821 | 822 | x = tf.matmul(x,linear_proj) 823 | x = tf.nn.l2_normalize(x,-1) 824 | 825 | w2v_cov = tf.matmul(tf.transpose(w2v,perm=[1,0]),w2v) 826 | 827 | x = tf.matmul(x,w2v_cov) # (batch_size*timesteps*height*width, |V|) 828 | 829 | x = tf.reshape(x,(-1,input_shape[1],input_shape[3],input_shape[4],w2v_shape[-1])) 830 | axis = [0,1,4,2,3] 831 | x = tf.transpose(x,perm=axis) 832 | 833 | # can be extended to different architecture 834 | x = tf.reduce_sum(x,reduction_indices=[1,3,4]) 835 | x = tf.nn.l2_normalize(x,-1) 836 | 837 | x = tf.matmul(x,T_B) 838 | 839 | 840 | 841 | return x 842 | 843 | 844 | 845 | if __name__=='__main__': 846 | print('video question answering model module!') 847 | 848 | 849 | 850 | 851 | 852 | -------------------------------------------------------------------------------- /model/SEModelUtil.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ["CUDA_VISIBLE_DEVICES"]="0" 3 | import tensorflow as tf 4 | import numpy as np 5 | from sklearn.decomposition import PCA 6 | import ModelUtil 7 | import InitUtil 8 | 9 | 10 | def ndim(x): 11 | """Returns the number of axes in a tensor, as an integer. 
12 | 13 | # Arguments 14 | x: Tensor or variable. 15 | 16 | # Returns 17 | Integer (scalar), number of axes. 18 | 19 | # Examples 20 | ```python 21 | >>> from keras import backend as K 22 | >>> input = K.placeholder(shape=(2, 4, 5)) 23 | >>> val = np.array([[1, 2], [3, 4]]) 24 | >>> kvar = K.variable(value=val) 25 | >>> K.ndim(input) 26 | 3 27 | >>> K.ndim(kvar) 28 | 2 29 | ``` 30 | """ 31 | if isinstance(x, tf.SparseTensor): 32 | return x._dims 33 | 34 | dims = x.get_shape()._dims 35 | if dims is not None: 36 | return len(dims) 37 | return None 38 | def batch_dot(x, y, axes=None): 39 | """Batchwise dot product. 40 | 41 | `batch_dot` is used to compute dot product of `x` and `y` when 42 | `x` and `y` are data in batch, i.e. in a shape of 43 | `(batch_size, :)`. 44 | `batch_dot` results in a tensor or variable with less dimensions 45 | than the input. If the number of dimensions is reduced to 1, 46 | we use `expand_dims` to make sure that ndim is at least 2. 47 | 48 | # Arguments 49 | x, y: Keras tensors or variables with `ndim >= 2` 50 | axes: list of (or single) int with target dimensions. 51 | The lengths of `axes[0]` and `axes[1]` should be the same. 52 | 53 | # Returns 54 | A tensor with shape equal to the concatenation of `x`'s shape 55 | (less the dimension that was summed over) and `y`'s shape 56 | (less the batch dimension and the dimension that was summed over). 57 | If the final rank is 1, we reshape it to `(batch_size, 1)`. 58 | 59 | # Examples 60 | Assume `x = [[1, 2], [3, 4]]` and `y = [[5, 6], [7, 8]]` 61 | `batch_dot(x, y, axes=1) = [[17, 53]]` which is the main diagonal 62 | of `x.dot(y.T)`, although we never have to calculate the off-diagonal 63 | elements. 64 | 65 | Shape inference: 66 | Let `x`'s shape be `(100, 20)` and `y`'s shape be `(100, 30, 20)`. 67 | If `axes` is (1, 2), to find the output shape of resultant tensor, 68 | loop through each dimension in `x`'s shape and `y`'s shape: 69 | 70 | * `x.shape[0]` : 100 : append to output shape 71 | * `x.shape[1]` : 20 : do not append to output shape, 72 | dimension 1 of `x` has been summed over. (`dot_axes[0]` = 1) 73 | * `y.shape[0]` : 100 : do not append to output shape, 74 | always ignore first dimension of `y` 75 | * `y.shape[1]` : 30 : append to output shape 76 | * `y.shape[2]` : 20 : do not append to output shape, 77 | dimension 2 of `y` has been summed over. (`dot_axes[1]` = 2) 78 | `output_shape` = `(100, 30)` 79 | 80 | ```python 81 | >>> x_batch = K.ones(shape=(32, 20, 1)) 82 | >>> y_batch = K.ones(shape=(32, 30, 20)) 83 | >>> xy_batch_dot = K.batch_dot(x_batch, y_batch, axes=[1, 2]) 84 | >>> K.int_shape(xy_batch_dot) 85 | (32, 1, 30) 86 | ``` 87 | """ 88 | if isinstance(axes, int): 89 | axes = (axes, axes) 90 | #print('1') 91 | if ndim(x) == 2 and ndim(y) == 2: 92 | if tf_major_version >= 1: 93 | if axes[0] == axes[1]: 94 | out = tf.reduce_sum(tf.multiply(x, y), axes[0]) 95 | else: 96 | out = tf.reduce_sum(tf.multiply(tf.transpose(x, [1, 0]), y), axes[1]) 97 | else: 98 | if axes[0] == axes[1]: 99 | out = tf.reduce_sum(tf.mul(x, y), axes[0]) 100 | else: 101 | out = tf.reduce_sum(tf.mul(tf.transpose(x, [1, 0]), y), axes[1]) 102 | else: 103 | if axes is not None: 104 | #print('2') 105 | adj_x = None if axes[0] == ndim(x) - 1 else True 106 | adj_y = True if axes[1] == ndim(y) - 1 else None 107 | else: 108 | #print('3') 109 | adj_x = None 110 | adj_y = None 111 | # TODO: remove later. 
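        # Compatibility shim, apparently adapted from the Keras backend: very old
        # TensorFlow exposed tf.batch_matmul (whose keyword names changed between
        # releases, hence the adj_a/adj_b vs adj_x/adj_y fallback), whereas
        # TF >= 1.0 folds the batched case into tf.matmul with adjoint_a/adjoint_b.
        # Note that the 2-D branch above reads tf_major_version, which is assumed
        # to be defined elsewhere, e.g. as int(tf.__version__.split('.')[0]).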
112 | if hasattr(tf, 'batch_matmul'): 113 | try: 114 | out = tf.batch_matmul(x, y, adj_a=adj_x, adj_b=adj_y) 115 | #print('4') 116 | except TypeError: 117 | out = tf.batch_matmul(x, y, adj_x=adj_x, adj_y=adj_y) 118 | else: 119 | out = tf.matmul(x, y, adjoint_a=adj_x, adjoint_b=adj_y) 120 | if ndim(out) == 1: 121 | out = expand_dims(out, 1) 122 | return out 123 | 124 | 125 | def getVideoDualSemanticEmbeddingWithQuestionAttention(x,w2v,embedded_stories_words,embedded_question,T_B,pca_mat=None): 126 | ''' 127 | x: input video cnn feature with size of (batch_size, timesteps, channels, height, width) 128 | w2v: word 2 vec (|v|,dim) 129 | ''' 130 | input_shape = x.get_shape().as_list() 131 | w2v_shape = w2v.get_shape().as_list() 132 | assert(len(input_shape)==5) 133 | axis = [0,1,3,4,2] 134 | x = tf.transpose(x,perm=axis) 135 | x = tf.reshape(x,(-1,input_shape[2])) 136 | 137 | if pca_mat is not None: 138 | linear_proj = tf.Variable(0.1*pca_mat,dtype='float32',name='visual_linear_proj') 139 | else: 140 | linear_proj = InitUtil.init_weight_variable((input_shape[2],w2v_shape[-1]), init_method='uniform', name='visual_linear_proj') 141 | 142 | x = tf.matmul(x,linear_proj) 143 | x = tf.nn.l2_normalize(x,-1) 144 | 145 | w2v_cov = tf.matmul(tf.transpose(w2v,perm=[1,0]),w2v) 146 | x = tf.matmul(x,w2v_cov) # (batch_size*timesteps*height*width, |V|) 147 | 148 | 149 | #----------------------- 150 | 151 | x = tf.reshape(x,(-1,input_shape[1],input_shape[3],input_shape[4],w2v_shape[-1])) 152 | axis = [0,1,4,2,3] 153 | x = tf.transpose(x,perm=axis) 154 | 155 | # can be extended to different architecture 156 | x = tf.reduce_sum(x,reduction_indices=[3,4]) 157 | x = tf.nn.l2_normalize(x,-1) 158 | 159 | #----------------------- 160 | stories_cov = batch_dot(tf.transpose(embedded_stories_words,perm=[0,2,1]),embedded_stories_words) 161 | x = batch_dot(x,stories_cov) 162 | #----------------------- 163 | x = tf.nn.l2_normalize(x,-1) 164 | 165 | embedded_question = tf.tile(tf.expand_dims(embedded_question,dim=1),[1,input_shape[1],1]) 166 | 167 | 168 | frame_weight = tf.reduce_sum(x*embedded_question,reduction_indices=-1,keep_dims=True) 169 | frame_weight = tf.nn.softmax(frame_weight,dim=1) 170 | 171 | frame_weight =tf.tile(frame_weight,[1,1,w2v_shape[-1]]) 172 | 173 | x = tf.reduce_sum(x*frame_weight,reduction_indices=1) 174 | 175 | x = tf.matmul(x,T_B) 176 | 177 | x = tf.nn.l2_normalize(x,-1) 178 | return x 179 | 180 | 181 | 182 | def getVideoDualSemanticEmbeddingWithQuestionAttention_up(x,w2v,embedded_stories_words,embedded_question,T_B,pca_mat=None): 183 | ''' 184 | x: input video cnn feature with size of (batch_size, timesteps, channels, height, width) 185 | w2v: word 2 vec (|v|,dim) 186 | ''' 187 | input_shape = x.get_shape().as_list() 188 | w2v_shape = w2v.get_shape().as_list() 189 | assert(len(input_shape)==5) 190 | axis = [0,1,3,4,2] 191 | x = tf.transpose(x,perm=axis) 192 | x = tf.reshape(x,(-1,input_shape[2])) 193 | 194 | 195 | if pca_mat is not None: 196 | linear_proj = tf.Variable(0.1*pca_mat,dtype='float32',name='visual_linear_proj') 197 | else: 198 | linear_proj = InitUtil.init_weight_variable((input_shape[2],w2v_shape[-1]), init_method='uniform', name='visual_linear_proj') 199 | 200 | x = tf.matmul(x,linear_proj) 201 | x = tf.nn.l2_normalize(x,-1) 202 | 203 | #----------------------- 204 | w2v_cov = tf.matmul(tf.transpose(w2v,perm=[1,0]),w2v) 205 | x = tf.matmul(x,w2v_cov) # (batch_size*timesteps*height*width, |V|) 206 | 207 | #----------------------- 208 | 209 | x = 
tf.reshape(x,(-1,input_shape[1],input_shape[3],input_shape[4],w2v_shape[-1])) 210 | axis = [0,1,4,2,3] 211 | x = tf.transpose(x,perm=axis) 212 | 213 | # can be extended to different architecture 214 | x = tf.reduce_sum(x,reduction_indices=[3,4]) 215 | x = tf.nn.l2_normalize(x,-1) 216 | 217 | 218 | 219 | #----------------------- 220 | 221 | 222 | stories_cov = batch_dot(tf.transpose(embedded_stories_words,perm=[0,2,1]),embedded_stories_words) 223 | x_out = batch_dot(x,stories_cov) 224 | 225 | 226 | 227 | #----------------------- 228 | x = tf.nn.l2_normalize(x_out,-1) 229 | 230 | embedded_question_use = tf.tile(tf.expand_dims(embedded_question,dim=1),[1,input_shape[1],1]) 231 | 232 | 233 | frame_weight = tf.reduce_sum(x*embedded_question_use,reduction_indices=-1,keep_dims=True) 234 | 235 | frame_weight = tf.nn.softmax(frame_weight,dim=1) 236 | 237 | 238 | 239 | frame_weight =tf.tile(frame_weight,[1,1,w2v_shape[-1]]) 240 | 241 | x_weight_new = tf.reduce_sum(x*frame_weight,reduction_indices=1) 242 | 243 | 244 | x_weight_use = tf.expand_dims(x_weight_new, dim = 1) 245 | 246 | story_weight = tf.matmul(x_weight_use,tf.transpose(embedded_stories_words,perm=[0,2,1])) 247 | 248 | story_weight = tf.nn.relu(story_weight) 249 | 250 | embedded_stories_words = tf.multiply(tf.transpose(story_weight,perm=[0,2,1]), embedded_stories_words) 251 | stories_cov = batch_dot(tf.transpose(embedded_stories_words,perm=[0,2,1]),embedded_stories_words) 252 | 253 | x = batch_dot(x,stories_cov) 254 | 255 | 256 | 257 | 258 | x = tf.nn.l2_normalize(x,-1) 259 | 260 | 261 | frame_weight = tf.reduce_sum(x*embedded_question_use,reduction_indices=-1,keep_dims=True) 262 | 263 | frame_weight = tf.nn.softmax(frame_weight,dim=1) 264 | 265 | frame_weight =tf.tile(frame_weight,[1,1,w2v_shape[-1]]) 266 | 267 | x = tf.reduce_sum(x*frame_weight,reduction_indices=1) 268 | 269 | 270 | x = tf.matmul(x,T_B) 271 | 272 | x = tf.nn.l2_normalize(x,-1) 273 | return x 274 | 275 | 276 | def getAverageRepresentation(sentence, T_B, d_lproj): 277 | sentence = tf.reduce_sum(sentence,reduction_indices=-2) 278 | 279 | sentence_shape = sentence.get_shape().as_list() 280 | if len(sentence_shape)==2: 281 | sentence = tf.matmul(sentence,T_B) 282 | elif len(sentence_shape)==3: 283 | sentence = tf.reshape(sentence,(-1,sentence_shape[-1])) 284 | sentence = tf.matmul(sentence,T_B) 285 | sentence = tf.reshape(sentence,(-1,sentence_shape[1],d_lproj)) 286 | else: 287 | raise ValueError('Invalid sentence_shape:'+sentence_shape) 288 | 289 | sentence = tf.nn.l2_normalize(sentence,-1) 290 | return sentence 291 | 292 | 293 | 294 | 295 | 296 | def getMemoryNetworks(embeded_stories, embeded_question, d_lproj, return_sequences=False): 297 | 298 | ''' 299 | embeded_stories: (batch_size, num_of_sentence, num_of_words, embeded_words_dims) 300 | embeded_question:(batch_size, embeded_words_dims) 301 | output_dims: the dimension of stories 302 | ''' 303 | stories_shape = embeded_stories.get_shape().as_list() 304 | embeded_question_shape = embeded_question.get_shape().as_list() 305 | num_of_sentence = stories_shape[-3] 306 | input_dims = stories_shape[-1] 307 | output_dims = embeded_question_shape[-1] 308 | 309 | 310 | embeded_stories = tf.reduce_sum(embeded_stories,reduction_indices=-2) 311 | embeded_stories = tf.nn.l2_normalize(embeded_stories,-2) 312 | 313 | 314 | embeded_question = tf.tile(tf.expand_dims(embeded_question,dim=1),[1,num_of_sentence,1]) 315 | 316 | sen_weight = tf.reduce_sum(embeded_question*embeded_stories,reduction_indices=-1,keep_dims=True) 317 | 318 | 
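# Question-guided sentence attention: the dot-product scores computed above are
# normalized with a softmax over the sentence axis and then used either to re-weight the
# sentence embeddings (return_sequences=True) or to pool them into a single story vector.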
sen_weight = tf.nn.softmax(sen_weight,dim=1) 319 | sen_weight = tf.tile(sen_weight,[1,1,output_dims]) 320 | if return_sequences: 321 | embeded_stories = embeded_stories*sen_weight 322 | else: 323 | embeded_stories = tf.reduce_sum(embeded_stories*sen_weight,reduction_indices=1) # (batch_size, output_dims) 324 | 325 | return embeded_stories 326 | 327 | 328 | 329 | 330 | def getVideoDualSemanticEmbeddingWithQuestionAttention_question_guid(embeded_stories, d_lproj, x,w2v,embedded_stories_words,embedded_question,T_B,pca_mat=None,return_sequences=True): 331 | ''' 332 | x: input video cnn feature with size of (batch_size, timesteps, channels, height, width) 333 | w2v: word 2 vec (|v|,dim) 334 | ''' 335 | input_shape = x.get_shape().as_list() 336 | w2v_shape = w2v.get_shape().as_list() 337 | assert(len(input_shape)==5) 338 | axis = [0,1,3,4,2] 339 | x = tf.transpose(x,perm=axis) 340 | x = tf.reshape(x,(-1,input_shape[2])) 341 | 342 | if pca_mat is not None: 343 | linear_proj = tf.Variable(0.1*pca_mat,dtype='float32',name='visual_linear_proj') 344 | else: 345 | linear_proj = InitUtil.init_weight_variable((input_shape[2],w2v_shape[-1]), init_method='uniform', name='visual_linear_proj') 346 | 347 | x = tf.matmul(x,linear_proj) 348 | x = tf.nn.l2_normalize(x,-1) 349 | 350 | 351 | 352 | #----------------------- 353 | w2v_cov = tf.matmul(tf.transpose(w2v,perm=[1,0]),w2v) 354 | x = tf.matmul(x,w2v_cov) # (batch_size*timesteps*height*width, |V|) 355 | 356 | #----------------------- 357 | 358 | x = tf.reshape(x,(-1,input_shape[1],input_shape[3],input_shape[4],w2v_shape[-1])) 359 | axis = [0,1,4,2,3] 360 | x = tf.transpose(x,perm=axis) 361 | 362 | # can be extended to different architecture 363 | x = tf.reduce_sum(x,reduction_indices=[3,4]) 364 | x = tf.nn.l2_normalize(x,-1) 365 | 366 | 367 | 368 | #----------------------- 369 | 370 | stories_cov = batch_dot(tf.transpose(embedded_stories_words,perm=[0,2,1]),embedded_stories_words) 371 | x_out = batch_dot(x,stories_cov) 372 | 373 | 374 | 375 | #----------------------- 376 | x = tf.nn.l2_normalize(x_out,-1) 377 | 378 | embedded_question_use = tf.tile(tf.expand_dims(embedded_question,dim=1),[1,input_shape[1],1]) 379 | 380 | 381 | frame_weight = tf.reduce_sum(x*embedded_question_use,reduction_indices=-1,keep_dims=True) 382 | 383 | frame_weight = tf.nn.softmax(frame_weight,dim=1) 384 | 385 | 386 | 387 | frame_weight =tf.tile(frame_weight,[1,1,w2v_shape[-1]]) 388 | 389 | x_weight_new = tf.reduce_sum(x*frame_weight,reduction_indices=1) 390 | 391 | 392 | x_weight_use = tf.expand_dims(x_weight_new, dim = 1) 393 | 394 | story_weight = tf.matmul(x_weight_use,tf.transpose(embedded_stories_words,perm=[0,2,1])) 395 | 396 | story_weight = tf.nn.relu(story_weight) 397 | 398 | embedded_stories_words = tf.multiply(tf.transpose(story_weight,perm=[0,2,1]), embedded_stories_words) 399 | 400 | stories_cov = batch_dot(tf.transpose(embedded_stories_words,perm=[0,2,1]),embedded_stories_words) 401 | 402 | x = batch_dot(x,stories_cov) 403 | 404 | x = tf.nn.l2_normalize(x_out,-1) 405 | 406 | #------------------------------------------------------------------------------------------------------------- 407 | 408 | stories_shape = embeded_stories.get_shape().as_list() 409 | embeded_question_shape = embedded_question.get_shape().as_list() 410 | num_of_sentence = stories_shape[-3] 411 | input_dims = stories_shape[-1] 412 | output_dims = embeded_question_shape[-1] 413 | 414 | print('embeded_question_shape', embeded_question_shape) 415 | print('num_of_sentence', num_of_sentence) 416 | 417 | 
print('output_dims', output_dims) 418 | print('stories_shape', stories_shape) 419 | 420 | 421 | embeded_question = tf.tile(tf.expand_dims(embedded_question,dim=1),[1,num_of_sentence,1]) 422 | 423 | sen_weight = tf.reduce_sum(embeded_question*embedded_stories_words,reduction_indices=-1,keep_dims=True) 424 | 425 | 426 | sen_weight = tf.nn.relu(sen_weight) 427 | sen_weight = tf.tile(sen_weight,[1,1,output_dims]) 428 | if return_sequences: 429 | embeded_stories_used = embedded_stories_words*sen_weight 430 | else: 431 | embeded_stories_used = tf.reduce_sum(embedded_stories_words*sen_weight,reduction_indices=1) 432 | 433 | 434 | #------------------------------------------------------------------------------------------------------------- 435 | stories_cov = batch_dot(tf.transpose(embeded_stories_used,perm=[0,2,1]),embeded_stories_used) 436 | 437 | x = batch_dot(x,stories_cov) 438 | 439 | 440 | 441 | 442 | #----------------------- 443 | x = tf.nn.l2_normalize(x,-1) 444 | 445 | 446 | 447 | 448 | frame_weight = tf.reduce_sum(x*embedded_question_use,reduction_indices=-1,keep_dims=True) 449 | 450 | frame_weight = tf.nn.softmax(frame_weight,dim=1) 451 | 452 | frame_weight =tf.tile(frame_weight,[1,1,w2v_shape[-1]]) 453 | 454 | x = tf.reduce_sum(x*frame_weight,reduction_indices=1) 455 | 456 | 457 | #----------------------------------------------- 458 | 459 | x = tf.matmul(x,T_B) 460 | 461 | x = tf.nn.l2_normalize(x,-1) 462 | 463 | return x 464 | 465 | 466 | -------------------------------------------------------------------------------- /model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bowong/Layered-Memory-Network/86364077c40de7674088248b81ef805d7bfa7f4d/model/__init__.py -------------------------------------------------------------------------------- /mqa_video+subtitle+question.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import h5py 4 | import math 5 | #import MovieQA_benchmark as MovieQA 6 | from model import DataUtil 7 | from model import ModelUtil 8 | from model import SEModelUtil 9 | import word2vec as w2v 10 | 11 | os.environ["CUDA_VISIBLE_DEVICES"]="1" 12 | 13 | import tensorflow as tf 14 | from sklearn.decomposition import PCA 15 | import cPickle as pickle 16 | import time 17 | import json 18 | from collections import namedtuple 19 | 20 | def build_model(input_video, input_stories, input_question, input_answer, 21 | v2i,w2v_model,pca_mat=None,d_w2v=300,d_lproj=300, 22 | answer_index = None, lr=0.01, question_guided=False): 23 | 24 | with tf.variable_scope('video_subtitle_hierarchical_frame_clip_question') as scope: 25 | 26 | T_B, T_w2v, T_mask, pca_mat_ = ModelUtil.setWord2VecModelConfiguration(v2i,w2v_model,d_w2v,d_lproj) 27 | # encode question 28 | embedded_question_words, mask_q = ModelUtil.getEmbeddingWithWord2Vec(input_question, T_w2v, T_mask) 29 | embedded_question = SEModelUtil.getAverageRepresentation(embedded_question_words,T_B,d_lproj) 30 | 31 | # encode stories 32 | embedded_stories_words, mask_s = ModelUtil.getEmbeddingWithWord2Vec(input_stories, T_w2v, T_mask) 33 | embedded_stories = ModelUtil.getMemoryNetworks(embedded_stories_words, embedded_question, d_lproj, T_B=T_B, return_sequences=True) 34 | 35 | # encode video 36 | 37 | embedded_video = SEModelUtil.getVideoDualSemanticEmbeddingWithQuestionAttention(input_video, T_w2v, embedded_stories, embedded_question, T_B, pca_mat=pca_mat) # batch x timesteps x d_w2v 38 | 
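# embedded_video: frame CNN features are projected into the word2vec space (with the
# PCA-initialized linear_proj), correlated with the subtitle memory, and pooled over
# frames by question-guided attention, yielding one answer-space embedding per example.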
39 | 40 | # encode answers 41 | embedded_answer_words, mask_a = ModelUtil.getEmbeddingWithWord2Vec(input_answer, T_w2v, T_mask) 42 | embedded_answer = SEModelUtil.getAverageRepresentation(embedded_answer_words,T_B,d_lproj) 43 | 44 | # get video loss 45 | video_loss,video_scores = ModelUtil.getClassifierLoss(embedded_video, embedded_question, embedded_answer, answer_index=answer_index) 46 | 47 | 48 | # train module 49 | loss = tf.reduce_mean(video_loss) 50 | 51 | optimizer = tf.train.GradientDescentOptimizer(lr) 52 | train = optimizer.minimize(loss) 53 | return train,loss,video_scores 54 | 55 | def linear_project_pca_initialization(hf, feature_shape, d_w2v=300, output_path=None): 56 | 57 | print('--utilize PCA to initialize the embedding matrix of feature to d_w2v') 58 | samples = [] 59 | for imdb_key in hf.keys(): 60 | feature = hf[imdb_key][:] 61 | axis = [0,2,3,1] 62 | feature = np.transpose(feature, tuple(axis)) 63 | feature = np.reshape(feature,(-1,feature_shape[1])) 64 | feature = np.random.permutation(feature) 65 | samples.extend(feature[:50]) 66 | print('samples:',len(samples)) 67 | 68 | pca = PCA(n_components=d_w2v, whiten=True) 69 | pca_mat = pca.fit_transform(np.asarray(samples).T) # 1024 x 300 70 | 71 | pickle.dump(pca_mat,open(output_path,'w')) 72 | print('pca_amt dump to file:',output_path) 73 | return pca_mat 74 | 75 | 76 | def exe_model(sess, data, batch_size, v2i, hf, feature_shape, stories, story_shape, 77 | loss, scores, input_video, input_question, input_stories, input_answer, y, numberOfChoices=5, train=None, nql=25, nqa=32): 78 | if train is not None: 79 | np.random.shuffle(data) 80 | 81 | total_data = len(data) 82 | num_batch = int(round(total_data*1.0/batch_size)) 83 | 84 | total_correct_num = 0 85 | total_loss = 0.0 86 | for batch_idx in xrange(num_batch): 87 | batch_qa = data[batch_idx*batch_size:min((batch_idx+1)*batch_size,total_data)] 88 | 89 | data_q,data_a,data_y = DataUtil.getBatchIndexedQAs_return(batch_qa,v2i, nql=nql, nqa=nqa, numOfChoices=numberOfChoices) 90 | data_s = DataUtil.getBatchIndexedStories(batch_qa,stories,v2i,story_shape) 91 | data_v = DataUtil.getBatchVideoFeatureFromQid(batch_qa, hf, feature_shape) 92 | if train is not None: 93 | _, l, s = sess.run([train,loss,scores],feed_dict={input_video:data_v, input_stories:data_s, input_question:data_q, input_answer:data_a, y:data_y}) 94 | else: 95 | l, s = sess.run([loss,scores],feed_dict={input_video:data_v, input_stories:data_s, input_question:data_q, input_answer:data_a, y:data_y}) 96 | 97 | 98 | num_correct = np.sum(np.where(np.argmax(s,axis=-1)==np.argmax(data_y,axis=-1),1,0)) 99 | total_correct_num += num_correct 100 | total_loss += l 101 | 102 | total_acc = total_correct_num*1.0/total_data 103 | total_loss = total_loss/num_batch 104 | return total_acc, total_loss 105 | 106 | 107 | 108 | def train_model(train_stories,val_stories,v2i,trained_video_QAs,val_video_QAs,hf,f_type,nql=25,nqa=32,numberOfChoices=5, 109 | feature_shape=(16,1024,7,7), 110 | batch_size=8,total_epoch=100, 111 | lr=0.01,pretrained_model=False,pca_mat_init_file=None): 112 | 113 | 114 | w2v_mqa_model_filename = '/data1/wb/movie_plots_1364.d-300.mc1.w2v' 115 | w2v_model = w2v.load(w2v_mqa_model_filename, kind='bin') 116 | 117 | ''' 118 | model parameters 119 | ''' 120 | size_voc = len(v2i) 121 | max_sentences = 3660 122 | max_words = 40 123 | 124 | story_shape = (max_sentences,max_words) 125 | 126 | size_voc = len(v2i) 127 | 128 | 129 | print('building model ...') 130 | 131 | if os.path.exists(pca_mat_init_file): 132 | pca_mat = 
pickle.load(open(pca_mat_init_file,'r')) 133 | else: 134 | pca_mat = linear_project_pca_initialization(hf, feature_shape, d_w2v=300, output_path=pca_mat_init_file) 135 | 136 | print('pca_mat.shape:',pca_mat.shape) 137 | 138 | input_video = tf.placeholder(tf.float32, shape=(None,)+feature_shape,name='input_video') 139 | input_stories = tf.placeholder(tf.int32, shape=(None, max_sentences, max_words),name='input_stories') 140 | input_question = tf.placeholder(tf.int32, shape=(None,nql), name='input_question') 141 | input_answer = tf.placeholder(tf.int32, shape=(None,numberOfChoices,nqa), name='input_answer') 142 | 143 | y = tf.placeholder(tf.float32,shape=(None, numberOfChoices)) 144 | 145 | train,loss,scores = build_model(input_video, input_stories, input_question, input_answer, v2i,w2v_model, 146 | pca_mat=pca_mat, 147 | d_w2v=300,d_lproj=300, 148 | answer_index=y, lr=lr) 149 | 150 | ''' 151 | configure && runtime environment 152 | ''' 153 | config = tf.ConfigProto() 154 | config.gpu_options.per_process_gpu_memory_fraction = 0.5 155 | config.log_device_placement=False 156 | 157 | sess = tf.Session(config=config) 158 | 159 | init = tf.global_variables_initializer() 160 | sess.run(init) 161 | 162 | ''' 163 | training parameters 164 | ''' 165 | 166 | with open('train_split.json') as fid: 167 | trdev = json.load(fid) 168 | 169 | 170 | def getTrainDevSplit(trained_video_QAs,trdev): 171 | train_data = [] 172 | dev_data = [] 173 | for k, qa in enumerate(trained_video_QAs): 174 | 175 | if qa.imdb_key in trdev['train']: 176 | train_data.append(qa) 177 | else: 178 | dev_data.append(qa) 179 | return train_data,dev_data 180 | 181 | train_data,dev_data = getTrainDevSplit(trained_video_QAs,trdev) 182 | 183 | 184 | with sess.as_default(): 185 | saver = tf.train.Saver(sharded=True,max_to_keep=total_epoch) 186 | if pretrained_model is not None: 187 | saver.restore(sess, pretrained_model) 188 | print('restore pre trained file:' + pretrained_model) 189 | for epoch in xrange(total_epoch): 190 | 191 | # # shuffle 192 | print('Epoch: %d/%d, Batch_size: %d' %(epoch+1,total_epoch,batch_size)) 193 | # train phase 194 | tic = time.time() 195 | total_acc, total_loss = exe_model(sess, train_data, batch_size, v2i, hf, feature_shape, train_stories, story_shape, 196 | loss, scores, input_video, input_question, input_stories, input_answer, y, numberOfChoices=5, train=train, nql=25, nqa=32) 197 | print(' --Train--, Loss: %.5f, Acc: %.5f.......Time:%.3f' %(total_loss,total_acc,time.time()-tic)) 198 | 199 | # dev phase 200 | tic = time.time() 201 | total_acc, total_loss = exe_model(sess, dev_data, batch_size, v2i, hf, feature_shape, train_stories, story_shape, 202 | loss, scores, input_video, input_question, input_stories, input_answer, y, numberOfChoices=5, train=None, nql=25, nqa=32) 203 | print(' --Train-val--, Loss: %.5f, Acc: %.5f.......Time:%.3f' %(total_loss,total_acc,time.time()-tic)) 204 | # eval phase 205 | 206 | tic = time.time() 207 | total_acc, total_loss = exe_model(sess, val_video_QAs, batch_size, v2i, hf, feature_shape, val_stories, story_shape, 208 | loss, scores, input_video, input_question, input_stories, input_answer, y, numberOfChoices=5, train=None, nql=25, nqa=32) 209 | print(' --Val--, Loss: %.5f, Acc: %.5f.......Time:%.3f' %(total_loss,total_acc,time.time()-tic)) 210 | 211 | 212 | 213 | #save model 214 | export_path = '/data1/wb/saved_model/vqa_baseline/video+subtitle'+'/'+f_type+'_b'+str(batch_size)+'/'+'lr'+str(lr)+'_f'+str(feature_shape[0]) 215 | if not os.path.exists(export_path): 216 | 
os.makedirs(export_path) 217 | print('mkdir %s' %export_path) 218 | save_path = saver.save(sess, export_path+'/'+'E'+str(epoch+1)+'_A'+str(total_acc)+'.ckpt') 219 | print("Model saved in file: %s" % save_path) 220 | 221 | 222 | def trans(all): 223 | 224 | qa_list = [] 225 | for dicts in all: 226 | 227 | qa_list.append( 228 | QAInfo(dicts['qid'], dicts['questions'], dicts['answers'] , dicts['ground_truth'], 229 | dicts['imdb_key'], dicts['video_clips'])) 230 | return qa_list 231 | 232 | 233 | if __name__ == '__main__': 234 | 235 | # 'video+subtitle task' 236 | 237 | nql=25 # sequences length for question 238 | nqa=32 # sequences length for anwser 239 | numberOfChoices = 5 # for input choices, one for correct, one for wrong answer 240 | QAInfo = namedtuple('QAInfo','qid question answers correct_index imdb_key video_clips') 241 | 242 | 243 | v2i = pickle.load(open("/data1/wb/movieQA_v2i.pkl","rb")) 244 | qa_train = trans(pickle.load(open("/data1/wb/process_train.pkl","rb"))) 245 | qa_val = trans(pickle.load(open("/data1/wb/process_val.pkl","rb"))) 246 | train_stories = pickle.load(open("/data1/wb/train_stories.pkl","rb")) 247 | val_stories = pickle.load(open("/data1/wb/val_stories.pkl","rb")) 248 | 249 | lr = 0.01 250 | 251 | ''' 252 | --------------------------------- 253 | 224x224 vgg all clips feature 254 | ''' 255 | 256 | video_feature_dims=512 257 | timesteps_v=32 # sequences length for video 258 | hight = 7 259 | width = 7 260 | feature_shape = (timesteps_v,video_feature_dims,hight,width) 261 | 262 | f_type = '224x224_VGG' 263 | feature_path = '/data1/wb/224x224_movie_all_clips_vgg_'+str(timesteps_v)+'f.h5' 264 | pca_mat_init_file = '/data1/wb/224x224_vgg_pca_mat.pkl' 265 | hf = h5py.File(feature_path,'r') 266 | 267 | 268 | pretrained_model = None 269 | train_model(train_stories,val_stories,v2i,qa_train,qa_val,hf,f_type,nql=25,nqa=32,numberOfChoices=5, 270 | feature_shape=feature_shape,lr=lr, 271 | batch_size=8,total_epoch=20, 272 | pretrained_model=pretrained_model,pca_mat_init_file=pca_mat_init_file) 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | -------------------------------------------------------------------------------- /mqa_video+subtitle+update+question.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import h5py 4 | import math 5 | #import MovieQA_benchmark as MovieQA 6 | from model import DataUtil 7 | from model import ModelUtil 8 | from model import SEModelUtil 9 | import word2vec as w2v 10 | 11 | os.environ["CUDA_VISIBLE_DEVICES"]="1" 12 | 13 | import tensorflow as tf 14 | from sklearn.decomposition import PCA 15 | import cPickle as pickle 16 | import time 17 | import json 18 | from collections import namedtuple 19 | 20 | def build_model(input_video, input_stories, input_question, input_answer, 21 | v2i,w2v_model,pca_mat=None,d_w2v=300,d_lproj=300, 22 | answer_index = None, lr=0.01, question_guided=False): 23 | 24 | 25 | with tf.variable_scope('video_subtitle_hierarchical_frame_clip') as scope: 26 | 27 | 28 | T_B, T_w2v, T_mask, pca_mat_ = ModelUtil.setWord2VecModelConfiguration(v2i,w2v_model,d_w2v,d_lproj) 29 | # encode question 30 | embedded_question_words, mask_q = ModelUtil.getEmbeddingWithWord2Vec(input_question, T_w2v, T_mask) 31 | embedded_question = SEModelUtil.getAverageRepresentation(embedded_question_words,T_B,d_lproj) 32 | 33 | # encode stories 34 | embedded_stories_words, mask_s = ModelUtil.getEmbeddingWithWord2Vec(input_stories, T_w2v, T_mask) 35 | 36 | embeded_stories = 
SEModelUtil.getAverageRepresentation(embedded_stories_words, T_B, d_lproj) 37 | 38 | embedded_video = SEModelUtil.getVideoDualSemanticEmbeddingWithQuestionAttention_question_guid(embedded_stories_words, d_lproj,input_video, T_w2v, embeded_stories, embedded_question, T_B, pca_mat=pca_mat, return_sequences=True) 39 | 40 | # encode answers 41 | embedded_answer_words, mask_a = ModelUtil.getEmbeddingWithWord2Vec(input_answer, T_w2v, T_mask) 42 | embedded_answer = SEModelUtil.getAverageRepresentation(embedded_answer_words,T_B,d_lproj) 43 | 44 | # get video loss 45 | video_loss,video_scores = ModelUtil.getClassifierLoss(embedded_video, embedded_question, embedded_answer, answer_index=answer_index) 46 | 47 | # train module 48 | loss = tf.reduce_mean(video_loss) 49 | 50 | optimizer = tf.train.GradientDescentOptimizer(lr) 51 | 52 | train = optimizer.minimize(loss) 53 | return train,loss,video_scores 54 | 55 | def linear_project_pca_initialization(hf, feature_shape, d_w2v=300, output_path=None): 56 | 57 | print('--utilize PCA to initialize the embedding matrix of feature to d_w2v') 58 | samples = [] 59 | for imdb_key in hf.keys(): 60 | feature = hf[imdb_key][:] 61 | axis = [0,2,3,1] 62 | feature = np.transpose(feature, tuple(axis)) 63 | feature = np.reshape(feature,(-1,feature_shape[1])) 64 | feature = np.random.permutation(feature) 65 | samples.extend(feature[:50]) 66 | print('samples:',len(samples)) 67 | 68 | pca = PCA(n_components=d_w2v, whiten=True) 69 | pca_mat = pca.fit_transform(np.asarray(samples).T) # 1024 x 300 70 | 71 | pickle.dump(pca_mat,open(output_path,'w')) 72 | print('pca_amt dump to file:',output_path) 73 | return pca_mat 74 | 75 | 76 | def exe_model(sess, data, batch_size, v2i, hf, feature_shape, stories, story_shape, 77 | loss, scores, input_video, input_question, input_stories, input_answer, y, numberOfChoices=5, train=None, nql=25, nqa=32): 78 | if train is not None: 79 | np.random.shuffle(data) 80 | 81 | total_data = len(data) 82 | num_batch = int(round(total_data*1.0/batch_size)) 83 | 84 | total_correct_num = 0 85 | total_loss = 0.0 86 | for batch_idx in xrange(num_batch): 87 | batch_qa = data[batch_idx*batch_size:min((batch_idx+1)*batch_size,total_data)] 88 | 89 | data_q,data_a,data_y = DataUtil.getBatchIndexedQAs_return(batch_qa,v2i, nql=nql, nqa=nqa, numOfChoices=numberOfChoices) 90 | data_s = DataUtil.getBatchIndexedStories(batch_qa,stories,v2i,story_shape) 91 | data_v = DataUtil.getBatchVideoFeatureFromQid(batch_qa, hf, feature_shape) 92 | if train is not None: 93 | _, l, s = sess.run([train,loss,scores],feed_dict={input_video:data_v, input_stories:data_s, input_question:data_q, input_answer:data_a, y:data_y}) 94 | else: 95 | l, s = sess.run([loss,scores],feed_dict={input_video:data_v, input_stories:data_s, input_question:data_q, input_answer:data_a, y:data_y}) 96 | 97 | num_correct = np.sum(np.where(np.argmax(s,axis=-1)==np.argmax(data_y,axis=-1),1,0)) 98 | total_correct_num += num_correct 99 | total_loss += l 100 | total_acc = total_correct_num*1.0/total_data 101 | total_loss = total_loss/num_batch 102 | return total_acc, total_loss 103 | 104 | 105 | 106 | def train_model(train_stories,val_stories,v2i,trained_video_QAs,val_video_QAs,hf,f_type,nql=25,nqa=32,numberOfChoices=5, 107 | feature_shape=(16,1024,7,7), 108 | batch_size=8,total_epoch=100, 109 | lr=0.01,pretrained_model=False,pca_mat_init_file=None): 110 | 111 | 112 | w2v_mqa_model_filename = '/home/wb/movie_plots_1364.d-300.mc1.w2v' 113 | w2v_model = w2v.load(w2v_mqa_model_filename, kind='bin') 114 | 115 | 116 | 
''' 117 | model parameters 118 | ''' 119 | size_voc = len(v2i) 120 | 121 | max_sentences = 3660 122 | 123 | max_words = 40 124 | 125 | story_shape = (max_sentences,max_words) 126 | 127 | size_voc = len(v2i) 128 | 129 | 130 | print('building model ...') 131 | 132 | if os.path.exists(pca_mat_init_file): 133 | pca_mat = pickle.load(open(pca_mat_init_file,'r')) 134 | else: 135 | pca_mat = linear_project_pca_initialization(hf, feature_shape, d_w2v=300, output_path=pca_mat_init_file) 136 | 137 | print('pca_mat.shape:',pca_mat.shape) 138 | 139 | input_video = tf.placeholder(tf.float32, shape=(None,)+feature_shape,name='input_video') 140 | input_stories = tf.placeholder(tf.int32, shape=(None, max_sentences, max_words),name='input_stories') 141 | input_question = tf.placeholder(tf.int32, shape=(None,nql), name='input_question') 142 | input_answer = tf.placeholder(tf.int32, shape=(None,numberOfChoices,nqa), name='input_answer') 143 | 144 | y = tf.placeholder(tf.float32,shape=(None, numberOfChoices)) 145 | 146 | train,loss,scores = build_model(input_video, input_stories, input_question, input_answer, v2i,w2v_model, 147 | pca_mat=pca_mat, 148 | d_w2v=300,d_lproj=300, 149 | answer_index=y, lr=lr) 150 | 151 | ''' 152 | configure && runtime environment 153 | ''' 154 | config = tf.ConfigProto() 155 | config.gpu_options.per_process_gpu_memory_fraction = 0.5 156 | config.log_device_placement=False 157 | sess = tf.Session(config=config) 158 | init = tf.global_variables_initializer() 159 | sess.run(init) 160 | 161 | ''' 162 | training parameters 163 | ''' 164 | 165 | with open('train_split.json') as fid: 166 | trdev = json.load(fid) 167 | 168 | 169 | def getTrainDevSplit(trained_video_QAs,trdev): 170 | train_data = [] 171 | dev_data = [] 172 | for k, qa in enumerate(trained_video_QAs): 173 | 174 | if qa.imdb_key in trdev['train']: 175 | train_data.append(qa) 176 | else: 177 | dev_data.append(qa) 178 | return train_data,dev_data 179 | 180 | train_data,dev_data = getTrainDevSplit(trained_video_QAs,trdev) 181 | 182 | 183 | with sess.as_default(): 184 | saver = tf.train.Saver(sharded=True,max_to_keep=total_epoch) 185 | if pretrained_model is not None: 186 | saver.restore(sess, pretrained_model) 187 | print('restore pre trained file:' + pretrained_model) 188 | for epoch in xrange(total_epoch): 189 | 190 | # # shuffle 191 | print('Epoch: %d/%d, Batch_size: %d' %(epoch+1,total_epoch,batch_size)) 192 | # train phase 193 | tic = time.time() 194 | total_acc, total_loss = exe_model(sess, train_data, batch_size, v2i, hf, feature_shape, train_stories, story_shape, 195 | loss, scores, input_video, input_question, input_stories, input_answer, y, numberOfChoices=5, train=train, nql=25, nqa=32) 196 | print(' --Train--, Loss: %.5f, Acc: %.5f.......Time:%.3f' %(total_loss,total_acc,time.time()-tic)) 197 | 198 | # dev phase 199 | tic = time.time() 200 | total_acc, total_loss = exe_model(sess, dev_data, batch_size, v2i, hf, feature_shape, train_stories, story_shape, 201 | loss, scores, input_video, input_question, input_stories, input_answer, y, numberOfChoices=5, train=None, nql=25, nqa=32) 202 | print(' --Train-val--, Loss: %.5f, Acc: %.5f.......Time:%.3f' %(total_loss,total_acc,time.time()-tic)) 203 | # eval phase 204 | tic = time.time() 205 | total_acc, total_loss = exe_model(sess, val_video_QAs, batch_size, v2i, hf, feature_shape, val_stories, story_shape, 206 | loss, scores, input_video, input_question, input_stories, input_answer, y, numberOfChoices=5, train=None, nql=25, nqa=32) 207 | print(' --Val--, Loss: %.5f, Acc: 
%.5f.......Time:%.3f' %(total_loss,total_acc,time.time()-tic)) 208 | 209 | #save model 210 | export_path = '/data1/wb/saved_model/vqa_baseline/video+subtitle'+'/'+f_type+'_b'+str(batch_size)+'/'+'lr'+str(lr)+'_f'+str(feature_shape[0]) 211 | if not os.path.exists(export_path): 212 | os.makedirs(export_path) 213 | print('mkdir %s' %export_path) 214 | save_path = saver.save(sess, export_path+'/'+'E'+str(epoch+1)+'_A'+str(total_acc)+'.ckpt') 215 | print("Model saved in file: %s" % save_path) 216 | 217 | 218 | def trans(all): 219 | 220 | qa_list = [] 221 | for dicts in all: 222 | 223 | qa_list.append( 224 | QAInfo(dicts['qid'], dicts['questions'], dicts['answers'] , dicts['ground_truth'], 225 | dicts['imdb_key'], dicts['video_clips'])) 226 | return qa_list 227 | 228 | 229 | if __name__ == '__main__': 230 | 231 | # 'video+subtitle task' 232 | 233 | nql=25 # sequences length for question 234 | nqa=32 # sequences length for anwser 235 | numberOfChoices = 5 # for input choices, one for correct, one for wrong answer 236 | QAInfo = namedtuple('QAInfo','qid question answers correct_index imdb_key video_clips') 237 | 238 | 239 | v2i = pickle.load(open("/data1/wb/movieQA_v2i.pkl","rb")) 240 | qa_train = trans(pickle.load(open("/data1/wb/process_train.pkl","rb"))) 241 | qa_val = trans(pickle.load(open("/data1/wb/process_val.pkl","rb"))) 242 | train_stories = pickle.load(open("/data1/wb/train_stories.pkl","rb")) 243 | val_stories = pickle.load(open("/data1/wb/val_stories.pkl","rb")) 244 | 245 | lr = 0.01 246 | 247 | 248 | ''' 249 | --------------------------------- 250 | 224x224 vgg all clips feature 251 | ''' 252 | 253 | video_feature_dims=512 254 | timesteps_v=32 # sequences length for video 255 | hight = 7 256 | width = 7 257 | feature_shape = (timesteps_v,video_feature_dims,hight,width) 258 | 259 | f_type = '224x224_VGG' 260 | feature_path = '/data1/wb/224x224_movie_all_clips_vgg_'+str(timesteps_v)+'f.h5' 261 | pca_mat_init_file = '/data1/wb/224x224_vgg_pca_mat.pkl' 262 | 263 | 264 | hf = h5py.File(feature_path,'r') 265 | 266 | pretrained_model = None 267 | train_model(train_stories,val_stories,v2i,qa_train,qa_val,hf,f_type,nql=25,nqa=32,numberOfChoices=5, 268 | feature_shape=feature_shape,lr=lr, 269 | batch_size=8,total_epoch=40, 270 | pretrained_model=pretrained_model,pca_mat_init_file=pca_mat_init_file) 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | -------------------------------------------------------------------------------- /mqa_video+subtitle.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import h5py 4 | import math 5 | #import MovieQA_benchmark as MovieQA 6 | from model import DataUtil 7 | from model import ModelUtil 8 | from model import SEModelUtil 9 | import word2vec as w2v 10 | 11 | os.environ["CUDA_VISIBLE_DEVICES"]="1" 12 | 13 | import tensorflow as tf 14 | from sklearn.decomposition import PCA 15 | import cPickle as pickle 16 | import time 17 | import json 18 | from collections import namedtuple 19 | 20 | def build_model(input_video, input_stories, input_question, input_answer, 21 | v2i,w2v_model,pca_mat=None,d_w2v=300,d_lproj=300, 22 | answer_index = None, lr=0.01, question_guided=False): 23 | 24 | 25 | with tf.variable_scope('video_subtitle_hierarchical_frame_clip') as scope: 26 | 27 | 28 | T_B, T_w2v, T_mask, pca_mat_ = ModelUtil.setWord2VecModelConfiguration(v2i,w2v_model,d_w2v,d_lproj) 29 | # encode question 30 | embedded_question_words, mask_q = 
ModelUtil.getEmbeddingWithWord2Vec(input_question, T_w2v, T_mask) 31 | embedded_question = SEModelUtil.getAverageRepresentation(embedded_question_words,T_B,d_lproj) 32 | 33 | # encode stories 34 | embedded_stories_words, mask_s = ModelUtil.getEmbeddingWithWord2Vec(input_stories, T_w2v, T_mask) 35 | embeded_stories = SEModelUtil.getAverageRepresentation(embedded_stories_words, T_B, d_lproj) 36 | # encode video 37 | embedded_video = SEModelUtil.getVideoDualSemanticEmbeddingWithQuestionAttention(input_video, T_w2v, embeded_stories, embedded_question, T_B, pca_mat=pca_mat) # batch x timesteps x d_w2v 38 | 39 | # encode answers 40 | embedded_answer_words, mask_a = ModelUtil.getEmbeddingWithWord2Vec(input_answer, T_w2v, T_mask) 41 | embedded_answer = SEModelUtil.getAverageRepresentation(embedded_answer_words,T_B,d_lproj) 42 | 43 | # get video loss 44 | video_loss,video_scores = ModelUtil.getClassifierLoss(embedded_video, embedded_question, embedded_answer, answer_index=answer_index) 45 | 46 | # train module 47 | loss = tf.reduce_mean(video_loss) 48 | optimizer = tf.train.GradientDescentOptimizer(lr) 49 | train = optimizer.minimize(loss) 50 | return train,loss,video_scores 51 | 52 | def linear_project_pca_initialization(hf, feature_shape, d_w2v=300, output_path=None): 53 | 54 | print('--utilize PCA to initialize the embedding matrix of feature to d_w2v') 55 | samples = [] 56 | for imdb_key in hf.keys(): 57 | feature = hf[imdb_key][:] 58 | axis = [0,2,3,1] 59 | feature = np.transpose(feature, tuple(axis)) 60 | feature = np.reshape(feature,(-1,feature_shape[1])) 61 | feature = np.random.permutation(feature) 62 | samples.extend(feature[:50]) 63 | print('samples:',len(samples)) 64 | 65 | pca = PCA(n_components=d_w2v, whiten=True) 66 | pca_mat = pca.fit_transform(np.asarray(samples).T) # 1024 x 300 67 | 68 | pickle.dump(pca_mat,open(output_path,'w')) 69 | print('pca_amt dump to file:',output_path) 70 | return pca_mat 71 | 72 | 73 | def exe_model(sess, data, batch_size, v2i, hf, feature_shape, stories, story_shape, 74 | loss, scores, input_video, input_question, input_stories, input_answer, y, numberOfChoices=5, train=None, nql=25, nqa=32): 75 | if train is not None: 76 | np.random.shuffle(data) 77 | 78 | total_data = len(data) 79 | num_batch = int(round(total_data*1.0/batch_size)) 80 | 81 | total_correct_num = 0 82 | total_loss = 0.0 83 | for batch_idx in xrange(num_batch): 84 | batch_qa = data[batch_idx*batch_size:min((batch_idx+1)*batch_size,total_data)] 85 | 86 | data_q,data_a,data_y = DataUtil.getBatchIndexedQAs_return(batch_qa,v2i, nql=nql, nqa=nqa, numOfChoices=numberOfChoices) 87 | data_s = DataUtil.getBatchIndexedStories(batch_qa,stories,v2i,story_shape) 88 | data_v = DataUtil.getBatchVideoFeatureFromQid(batch_qa, hf, feature_shape) 89 | if train is not None: 90 | _, l, s = sess.run([train,loss,scores],feed_dict={input_video:data_v, input_stories:data_s, input_question:data_q, input_answer:data_a, y:data_y}) 91 | else: 92 | l, s = sess.run([loss,scores],feed_dict={input_video:data_v, input_stories:data_s, input_question:data_q, input_answer:data_a, y:data_y}) 93 | 94 | 95 | num_correct = np.sum(np.where(np.argmax(s,axis=-1)==np.argmax(data_y,axis=-1),1,0)) 96 | total_correct_num += num_correct 97 | total_loss += l 98 | 99 | total_acc = total_correct_num*1.0/total_data 100 | total_loss = total_loss/num_batch 101 | return total_acc, total_loss 102 | 103 | 104 | def train_model(train_stories,val_stories,v2i,trained_video_QAs,val_video_QAs,hf,f_type,nql=25,nqa=32,numberOfChoices=5, 105 | 
feature_shape=(16,1024,7,7), 106 | batch_size=8,total_epoch=100, 107 | lr=0.01,pretrained_model=False,pca_mat_init_file=None): 108 | 109 | 110 | w2v_mqa_model_filename = '/data1/wb/movie_plots_1364.d-300.mc1.w2v' 111 | w2v_model = w2v.load(w2v_mqa_model_filename, kind='bin') 112 | 113 | ''' 114 | model parameters 115 | ''' 116 | size_voc = len(v2i) 117 | max_sentences = 3660 118 | max_words = 40 119 | story_shape = (max_sentences,max_words) 120 | size_voc = len(v2i) 121 | 122 | 123 | print('building model ...') 124 | 125 | if os.path.exists(pca_mat_init_file): 126 | pca_mat = pickle.load(open(pca_mat_init_file,'r')) 127 | else: 128 | pca_mat = linear_project_pca_initialization(hf, feature_shape, d_w2v=300, output_path=pca_mat_init_file) 129 | 130 | print('pca_mat.shape:',pca_mat.shape) 131 | 132 | input_video = tf.placeholder(tf.float32, shape=(None,)+feature_shape,name='input_video') 133 | input_stories = tf.placeholder(tf.int32, shape=(None, max_sentences, max_words),name='input_stories') 134 | input_question = tf.placeholder(tf.int32, shape=(None,nql), name='input_question') 135 | input_answer = tf.placeholder(tf.int32, shape=(None,numberOfChoices,nqa), name='input_answer') 136 | 137 | y = tf.placeholder(tf.float32,shape=(None, numberOfChoices)) 138 | 139 | train,loss,scores = build_model(input_video, input_stories, input_question, input_answer, v2i,w2v_model, 140 | pca_mat=pca_mat, 141 | d_w2v=300,d_lproj=300, 142 | answer_index=y, lr=lr) 143 | 144 | ''' 145 | configure && runtime environment 146 | ''' 147 | config = tf.ConfigProto() 148 | config.gpu_options.per_process_gpu_memory_fraction = 0.4 149 | config.log_device_placement=False 150 | 151 | sess = tf.Session(config=config) 152 | 153 | init = tf.global_variables_initializer() 154 | sess.run(init) 155 | 156 | ''' 157 | training parameters 158 | ''' 159 | 160 | with open('train_split.json') as fid: 161 | trdev = json.load(fid) 162 | 163 | 164 | def getTrainDevSplit(trained_video_QAs,trdev): 165 | train_data = [] 166 | dev_data = [] 167 | for k, qa in enumerate(trained_video_QAs): 168 | 169 | if qa.imdb_key in trdev['train']: 170 | train_data.append(qa) 171 | else: 172 | dev_data.append(qa) 173 | return train_data,dev_data 174 | 175 | train_data,dev_data = getTrainDevSplit(trained_video_QAs,trdev) 176 | 177 | 178 | 179 | with sess.as_default(): 180 | saver = tf.train.Saver(sharded=True,max_to_keep=total_epoch) 181 | if pretrained_model is not None: 182 | saver.restore(sess, pretrained_model) 183 | print('restore pre trained file:' + pretrained_model) 184 | for epoch in xrange(total_epoch): 185 | 186 | # # shuffle 187 | print('Epoch: %d/%d, Batch_size: %d' %(epoch+1,total_epoch,batch_size)) 188 | # train phase 189 | tic = time.time() 190 | total_acc, total_loss = exe_model(sess, train_data, batch_size, v2i, hf, feature_shape, train_stories, story_shape, 191 | loss, scores, input_video, input_question, input_stories, input_answer, y, numberOfChoices=5, train=train, nql=25, nqa=32) 192 | print(' --Train--, Loss: %.5f, Acc: %.5f.......Time:%.3f' %(total_loss,total_acc,time.time()-tic)) 193 | 194 | # dev phase 195 | tic = time.time() 196 | total_acc, total_loss = exe_model(sess, dev_data, batch_size, v2i, hf, feature_shape, train_stories, story_shape, 197 | loss, scores, input_video, input_question, input_stories, input_answer, y, numberOfChoices=5, train=None, nql=25, nqa=32) 198 | print(' --Train-val--, Loss: %.5f, Acc: %.5f.......Time:%.3f' %(total_loss,total_acc,time.time()-tic)) 199 | # eval phase 200 | tic = time.time() 201 | 
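# Validation pass: the same exe_model helper is reused with train=None, so the data is
# not shuffled and no optimizer step is run; only loss and accuracy are reported.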
total_acc, total_loss = exe_model(sess, val_video_QAs, batch_size, v2i, hf, feature_shape, val_stories, story_shape, 202 | loss, scores, input_video, input_question, input_stories, input_answer, y, numberOfChoices=5, train=None, nql=25, nqa=32) 203 | print(' --Val--, Loss: %.5f, Acc: %.5f.......Time:%.3f' %(total_loss,total_acc,time.time()-tic)) 204 | 205 | #save model 206 | export_path = '/data1/wb/saved_model/vqa_baseline/video+subtitle'+'/'+f_type+'_b'+str(batch_size)+'/'+'lr'+str(lr)+'_f'+str(feature_shape[0]) 207 | if not os.path.exists(export_path): 208 | os.makedirs(export_path) 209 | print('mkdir %s' %export_path) 210 | save_path = saver.save(sess, export_path+'/'+'E'+str(epoch+1)+'_A'+str(total_acc)+'.ckpt') 211 | print("Model saved in file: %s" % save_path) 212 | 213 | 214 | def trans(all): 215 | 216 | qa_list = [] 217 | for dicts in all: 218 | 219 | qa_list.append( 220 | QAInfo(dicts['qid'], dicts['questions'], dicts['answers'] , dicts['ground_truth'], 221 | dicts['imdb_key'], dicts['video_clips'])) 222 | return qa_list 223 | 224 | 225 | if __name__ == '__main__': 226 | 227 | # 'video+subtitle task' 228 | 229 | nql=25 # sequences length for question 230 | nqa=32 # sequences length for anwser 231 | numberOfChoices = 5 # for input choices, one for correct, one for wrong answer 232 | QAInfo = namedtuple('QAInfo','qid question answers correct_index imdb_key video_clips') 233 | 234 | v2i = pickle.load(open("/data1/wb/movieQA_v2i.pkl","rb")) 235 | qa_train = trans(pickle.load(open("/data1/wb/process_train.pkl","rb"))) 236 | qa_val = trans(pickle.load(open("/data1/wb/process_val.pkl","rb"))) 237 | train_stories = pickle.load(open("/data1/wb/train_stories.pkl","rb")) 238 | val_stories = pickle.load(open("/data1/wb/val_stories.pkl","rb")) 239 | 240 | lr = 0.01 241 | 242 | ''' 243 | --------------------------------- 244 | 224x224 vgg all clips feature 245 | ''' 246 | 247 | video_feature_dims=512 248 | timesteps_v=32 # sequences length for video 249 | hight = 7 250 | width = 7 251 | feature_shape = (timesteps_v,video_feature_dims,hight,width) 252 | 253 | f_type = '224x224_VGG' 254 | feature_path = '/data1/wb/224x224_movie_all_clips_vgg_'+str(timesteps_v)+'f.h5' 255 | pca_mat_init_file = '/data1/wb/224x224_vgg_pca_mat.pkl' 256 | 257 | hf = h5py.File(feature_path,'r') 258 | 259 | pretrained_model = None 260 | train_model(train_stories,val_stories,v2i,qa_train,qa_val,hf,f_type,nql=25,nqa=32,numberOfChoices=5, 261 | feature_shape=feature_shape,lr=lr, 262 | batch_size=8,total_epoch=20, 263 | pretrained_model=pretrained_model,pca_mat_init_file=pca_mat_init_file) 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | -------------------------------------------------------------------------------- /mqa_video+subtitlel+update.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import h5py 4 | import math 5 | #import MovieQA_benchmark as MovieQA 6 | from model import DataUtil 7 | from model import ModelUtil 8 | from model import SEModelUtil 9 | import word2vec as w2v 10 | 11 | os.environ["CUDA_VISIBLE_DEVICES"]="1" 12 | 13 | import tensorflow as tf 14 | from sklearn.decomposition import PCA 15 | import cPickle as pickle 16 | import time 17 | import json 18 | from collections import namedtuple 19 | 20 | def build_model(input_video, input_stories, input_question, input_answer, 21 | v2i,w2v_model,pca_mat=None,d_w2v=300,d_lproj=300, 22 | answer_index = None, lr=0.01, question_guided=False): 23 | 24 | 25 | with 
tf.variable_scope('video_subtitle_hierarchical_frame_clip') as scope: 26 | 27 | 28 | T_B, T_w2v, T_mask, pca_mat_ = ModelUtil.setWord2VecModelConfiguration(v2i,w2v_model,d_w2v,d_lproj) 29 | # encode question 30 | embedded_question_words, mask_q = ModelUtil.getEmbeddingWithWord2Vec(input_question, T_w2v, T_mask) 31 | embedded_question = SEModelUtil.getAverageRepresentation(embedded_question_words,T_B,d_lproj) 32 | 33 | # encode stories 34 | embedded_stories_words, mask_s = ModelUtil.getEmbeddingWithWord2Vec(input_stories, T_w2v, T_mask) 35 | embeded_stories = SEModelUtil.getAverageRepresentation(embedded_stories_words, T_B, d_lproj) 36 | # encode video 37 | embedded_video = SEModelUtil.getVideoDualSemanticEmbeddingWithQuestionAttention_up(input_video, T_w2v, embeded_stories, embedded_question, T_B, pca_mat=pca_mat) # batch x timesteps x d_w2v 38 | 39 | 40 | # encode answers 41 | embedded_answer_words, mask_a = ModelUtil.getEmbeddingWithWord2Vec(input_answer, T_w2v, T_mask) 42 | embedded_answer = SEModelUtil.getAverageRepresentation(embedded_answer_words,T_B,d_lproj) 43 | 44 | # get video loss 45 | video_loss,video_scores = ModelUtil.getClassifierLoss(embedded_video, embedded_question, embedded_answer, answer_index=answer_index) 46 | 47 | # train module 48 | loss = tf.reduce_mean(video_loss) 49 | 50 | optimizer = tf.train.GradientDescentOptimizer(lr) 51 | 52 | 53 | train = optimizer.minimize(loss) 54 | return train,loss,video_scores 55 | 56 | def linear_project_pca_initialization(hf, feature_shape, d_w2v=300, output_path=None): 57 | 58 | print('--utilize PCA to initialize the embedding matrix of feature to d_w2v') 59 | samples = [] 60 | for imdb_key in hf.keys(): 61 | feature = hf[imdb_key][:] 62 | axis = [0,2,3,1] 63 | feature = np.transpose(feature, tuple(axis)) 64 | feature = np.reshape(feature,(-1,feature_shape[1])) 65 | feature = np.random.permutation(feature) 66 | samples.extend(feature[:50]) 67 | print('samples:',len(samples)) 68 | 69 | pca = PCA(n_components=d_w2v, whiten=True) 70 | pca_mat = pca.fit_transform(np.asarray(samples).T) # 1024 x 300 71 | 72 | pickle.dump(pca_mat,open(output_path,'w')) 73 | print('pca_amt dump to file:',output_path) 74 | return pca_mat 75 | 76 | 77 | def exe_model(sess, data, batch_size, v2i, hf, feature_shape, stories, story_shape, 78 | loss, scores, input_video, input_question, input_stories, input_answer, y, numberOfChoices=5, train=None, nql=25, nqa=32): 79 | if train is not None: 80 | np.random.shuffle(data) 81 | 82 | total_data = len(data) 83 | num_batch = int(round(total_data*1.0/batch_size)) 84 | 85 | total_correct_num = 0 86 | total_loss = 0.0 87 | for batch_idx in xrange(num_batch): 88 | batch_qa = data[batch_idx*batch_size:min((batch_idx+1)*batch_size,total_data)] 89 | 90 | data_q,data_a,data_y = DataUtil.getBatchIndexedQAs_return(batch_qa,v2i, nql=nql, nqa=nqa, numOfChoices=numberOfChoices) 91 | data_s = DataUtil.getBatchIndexedStories(batch_qa,stories,v2i,story_shape) 92 | data_v = DataUtil.getBatchVideoFeatureFromQid(batch_qa, hf, feature_shape) 93 | if train is not None: 94 | _, l, s = sess.run([train,loss,scores],feed_dict={input_video:data_v, input_stories:data_s, input_question:data_q, input_answer:data_a, y:data_y}) 95 | else: 96 | l, s = sess.run([loss,scores],feed_dict={input_video:data_v, input_stories:data_s, input_question:data_q, input_answer:data_a, y:data_y}) 97 | 98 | num_correct = np.sum(np.where(np.argmax(s,axis=-1)==np.argmax(data_y,axis=-1),1,0)) 99 | total_correct_num += num_correct 100 | total_loss += l 101 | 102 | 
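# Epoch-level metrics: accuracy is computed over all QA examples, while the loss is
# averaged over the number of mini-batches.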
total_acc = total_correct_num*1.0/total_data 103 | total_loss = total_loss/num_batch 104 | return total_acc, total_loss 105 | 106 | 107 | 108 | def train_model(train_stories,val_stories,v2i,trained_video_QAs,val_video_QAs,hf,f_type,nql=25,nqa=32,numberOfChoices=5, 109 | feature_shape=(32,1024,7,7), 110 | batch_size=8,total_epoch=100, 111 | lr=0.01,pretrained_model=False,pca_mat_init_file=None): 112 | 113 | 114 | w2v_mqa_model_filename = '/home/wb/movie_plots_1364.d-300.mc1.w2v' 115 | w2v_model = w2v.load(w2v_mqa_model_filename, kind='bin') 116 | 117 | 118 | ''' 119 | model parameters 120 | ''' 121 | size_voc = len(v2i) 122 | 123 | 124 | max_sentences = 3660 125 | 126 | max_words = 40 127 | 128 | story_shape = (max_sentences,max_words) 129 | 130 | size_voc = len(v2i) 131 | 132 | 133 | 134 | print('building model ...') 135 | 136 | if os.path.exists(pca_mat_init_file): 137 | pca_mat = pickle.load(open(pca_mat_init_file,'r')) 138 | else: 139 | pca_mat = linear_project_pca_initialization(hf, feature_shape, d_w2v=300, output_path=pca_mat_init_file) 140 | 141 | print('pca_mat.shape:',pca_mat.shape) 142 | 143 | input_video = tf.placeholder(tf.float32, shape=(None,)+feature_shape,name='input_video') 144 | input_stories = tf.placeholder(tf.int32, shape=(None, max_sentences, max_words),name='input_stories') 145 | input_question = tf.placeholder(tf.int32, shape=(None,nql), name='input_question') 146 | input_answer = tf.placeholder(tf.int32, shape=(None,numberOfChoices,nqa), name='input_answer') 147 | 148 | y = tf.placeholder(tf.float32,shape=(None, numberOfChoices)) 149 | 150 | train,loss,scores = build_model(input_video, input_stories, input_question, input_answer, v2i,w2v_model, 151 | pca_mat=pca_mat, 152 | d_w2v=300,d_lproj=300, 153 | answer_index=y, lr=lr) 154 | 155 | ''' 156 | configure && runtime environment 157 | ''' 158 | config = tf.ConfigProto() 159 | config.gpu_options.per_process_gpu_memory_fraction = 0.5 160 | config.log_device_placement=False 161 | 162 | sess = tf.Session(config=config) 163 | 164 | init = tf.global_variables_initializer() 165 | sess.run(init) 166 | 167 | ''' 168 | training parameters 169 | ''' 170 | 171 | with open('train_split.json') as fid: 172 | trdev = json.load(fid) 173 | 174 | 175 | def getTrainDevSplit(trained_video_QAs,trdev): 176 | train_data = [] 177 | dev_data = [] 178 | for k, qa in enumerate(trained_video_QAs): 179 | 180 | if qa.imdb_key in trdev['train']: 181 | train_data.append(qa) 182 | else: 183 | dev_data.append(qa) 184 | return train_data,dev_data 185 | 186 | train_data,dev_data = getTrainDevSplit(trained_video_QAs,trdev) 187 | 188 | 189 | with sess.as_default(): 190 | saver = tf.train.Saver(sharded=True,max_to_keep=total_epoch) 191 | if pretrained_model is not None: 192 | saver.restore(sess, pretrained_model) 193 | print('restore pre trained file:' + pretrained_model) 194 | for epoch in xrange(total_epoch): 195 | 196 | # # shuffle 197 | print('Epoch: %d/%d, Batch_size: %d' %(epoch+1,total_epoch,batch_size)) 198 | # train phase 199 | tic = time.time() 200 | total_acc, total_loss = exe_model(sess, train_data, batch_size, v2i, hf, feature_shape, train_stories, story_shape, 201 | loss, scores, input_video, input_question, input_stories, input_answer, y, numberOfChoices=5, train=train, nql=25, nqa=32) 202 | print(' --Train--, Loss: %.5f, Acc: %.5f.......Time:%.3f' %(total_loss,total_acc,time.time()-tic)) 203 | 204 | # dev phase 205 | tic = time.time() 206 | total_acc, total_loss = exe_model(sess, dev_data, batch_size, v2i, hf, feature_shape, train_stories, 
story_shape, 207 | loss, scores, input_video, input_question, input_stories, input_answer, y, numberOfChoices=5, train=None, nql=25, nqa=32) 208 | print(' --Train-val--, Loss: %.5f, Acc: %.5f.......Time:%.3f' %(total_loss,total_acc,time.time()-tic)) 209 | # eval phase 210 | tic = time.time() 211 | total_acc, total_loss = exe_model(sess, val_video_QAs, batch_size, v2i, hf, feature_shape, val_stories, story_shape, 212 | loss, scores, input_video, input_question, input_stories, input_answer, y, numberOfChoices=5, train=None, nql=25, nqa=32) 213 | print(' --Val--, Loss: %.5f, Acc: %.5f.......Time:%.3f' %(total_loss,total_acc,time.time()-tic)) 214 | 215 | 216 | 217 | #save model 218 | export_path = '/data1/wb/saved_model/vqa_baseline/video+subtitle'+'/'+f_type+'_b'+str(batch_size)+'/'+'lr'+str(lr)+'_f'+str(feature_shape[0]) 219 | if not os.path.exists(export_path): 220 | os.makedirs(export_path) 221 | print('mkdir %s' %export_path) 222 | save_path = saver.save(sess, export_path+'/'+'E'+str(epoch+1)+'_A'+str(total_acc)+'.ckpt') 223 | print("Model saved in file: %s" % save_path) 224 | 225 | 226 | def trans(all): 227 | 228 | qa_list = [] 229 | for dicts in all: 230 | 231 | qa_list.append( 232 | QAInfo(dicts['qid'], dicts['questions'], dicts['answers'] , dicts['ground_truth'], 233 | dicts['imdb_key'], dicts['video_clips'])) 234 | return qa_list 235 | 236 | 237 | if __name__ == '__main__': 238 | # 'video+subtitle task' 239 | 240 | 241 | 242 | nql=25 # sequences length for question 243 | nqa=32 # sequences length for anwser 244 | numberOfChoices = 5 # for input choices, one for correct, one for wrong answer 245 | QAInfo = namedtuple('QAInfo','qid question answers correct_index imdb_key video_clips') 246 | 247 | 248 | v2i = pickle.load(open("/data1/wb/movieQA_v2i.pkl","rb")) 249 | qa_train = trans(pickle.load(open("/data1/wb/process_train.pkl","rb"))) 250 | qa_val = trans(pickle.load(open("/data1/wb/process_val.pkl","rb"))) 251 | train_stories = pickle.load(open("/data1/wb/train_stories.pkl","rb")) 252 | val_stories = pickle.load(open("/data1/wb/val_stories.pkl","rb")) 253 | 254 | lr = 0.01 255 | 256 | 257 | ''' 258 | --------------------------------- 259 | 224x224 vgg all clips feature 260 | ''' 261 | 262 | video_feature_dims=512 263 | timesteps_v=32 # sequences length for video 264 | hight = 7 265 | width = 7 266 | feature_shape = (timesteps_v,video_feature_dims,hight,width) 267 | 268 | f_type = '224x224_VGG' 269 | feature_path = '/data1/wb/224x224_movie_all_clips_vgg_'+str(timesteps_v)+'f.h5' 270 | pca_mat_init_file = '/data1/wb/224x224_vgg_pca_mat.pkl' 271 | 272 | 273 | hf = h5py.File(feature_path,'r') 274 | 275 | pretrained_model = None 276 | train_model(train_stories,val_stories,v2i,qa_train,qa_val,hf,f_type,nql=25,nqa=32,numberOfChoices=5, 277 | feature_shape=feature_shape,lr=lr, 278 | batch_size=8,total_epoch=30, 279 | pretrained_model=pretrained_model,pca_mat_init_file=pca_mat_init_file) 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | -------------------------------------------------------------------------------- /train_split.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": [ 3 | "tt0118842", 4 | "tt0417741", 5 | "tt0119488", 6 | "tt0362269", 7 | "tt0480025", 8 | "tt0114924", 9 | "tt0125439", 10 | "tt0256415", 11 | "tt0970416", 12 | "tt1193138", 13 | "tt2194499", 14 | "tt0140352", 15 | "tt0074285", 16 | "tt0780504", 17 | "tt1675434", 18 | "tt1068680", 19 | "tt0234215", 20 | "tt1197624", 21 | "tt0486822", 22 | "tt0307987", 23 | 
"tt0322259", 24 | "tt1174732", 25 | "tt0804522", 26 | "tt0119396", 27 | "tt1232829", 28 | "tt1454468", 29 | "tt0800039", 30 | "tt0455824", 31 | "tt0119141", 32 | "tt0120888", 33 | "tt0120611", 34 | "tt0443453", 35 | "tt0276751", 36 | "tt0383574", 37 | "tt2614684", 38 | "tt2719848", 39 | "tt0270980", 40 | "tt1800241", 41 | "tt2334873", 42 | "tt0425210", 43 | "tt0096874", 44 | "tt0475394", 45 | "tt0241527", 46 | "tt0147800", 47 | "tt0343818", 48 | "tt0268978", 49 | "tt0120815", 50 | "tt1399103", 51 | "tt0458352", 52 | "tt0319061", 53 | "tt0120915", 54 | "tt1284575", 55 | "tt2402927", 56 | "tt0218967", 57 | "tt0830570", 58 | "tt0163025", 59 | "tt0099685", 60 | "tt0375679", 61 | "tt0476964", 62 | "tt0324554", 63 | "tt0331811", 64 | "tt0083866", 65 | "tt0112681", 66 | "tt1535970", 67 | "tt0169547", 68 | "tt0111161", 69 | "tt0144084", 70 | "tt1099212", 71 | "tt0343660", 72 | "tt0091042", 73 | "tt0790636", 74 | "tt1401152", 75 | "tt0068646", 76 | "tt0780571", 77 | "tt0104036", 78 | "tt0133152", 79 | "tt2294629", 80 | "tt2278388", 81 | "tt1276104", 82 | "tt1131734", 83 | "tt0167261", 84 | "tt1092026", 85 | "tt0095953", 86 | "tt0332452", 87 | "tt0240890", 88 | "tt0947798", 89 | "tt1133985", 90 | "tt0458525", 91 | "tt0903624", 92 | "tt0988595", 93 | "tt1178663", 94 | "tt0457939", 95 | "tt1228705", 96 | "tt0119282", 97 | "tt0367882", 98 | "tt1632708", 99 | "tt0086190", 100 | "tt0325980", 101 | "tt0433400", 102 | "tt0107614", 103 | "tt0120890", 104 | "tt1499658", 105 | "tt0119822", 106 | "tt1229822", 107 | "tt0499549", 108 | "tt0416320", 109 | "tt1201607", 110 | "tt1033643", 111 | "tt1877832", 112 | "tt1285016", 113 | "tt0212720", 114 | "tt1189340", 115 | "tt1385826", 116 | "tt0452623", 117 | "tt0421715", 118 | "tt1205489", 119 | "tt0452625", 120 | "tt1024648", 121 | "tt1646971", 122 | "tt0114709", 123 | "tt1598822", 124 | "tt0087469", 125 | "tt0116282", 126 | "tt0221027", 127 | "tt0295297", 128 | "tt0091867", 129 | "tt0206634", 130 | "tt0242653", 131 | "tt0910970", 132 | "tt0337978", 133 | "tt0335119", 134 | "tt1045658", 135 | "tt0328107", 136 | "tt1981115", 137 | "tt0313542", 138 | "tt0166924", 139 | "tt0440963", 140 | "tt1790885", 141 | "tt0120737", 142 | "tt1261945", 143 | "tt0139134", 144 | "tt1375666", 145 | "tt0290334", 146 | "tt0167404", 147 | "tt1446714", 148 | "tt1343092", 149 | "tt1951264", 150 | "tt0866439", 151 | "tt1637725", 152 | "tt1046173", 153 | "tt0478311", 154 | "tt1659337", 155 | "tt0316654", 156 | "tt0365907", 157 | "tt1104001", 158 | "tt1979320", 159 | "tt0109831", 160 | "tt2294449", 161 | "tt0103064", 162 | "tt0388795", 163 | "tt0114814", 164 | "tt0409847", 165 | "tt0120689", 166 | "tt0120828", 167 | "tt0106918", 168 | "tt2923316", 169 | "tt2310332", 170 | "tt1692486", 171 | "tt0119643", 172 | "tt0317198", 173 | "tt1837562", 174 | "tt2382396", 175 | "tt0145734", 176 | "tt0480242", 177 | "tt0381061", 178 | "tt0449088", 179 | "tt0086879", 180 | "tt1570728", 181 | "tt1727770", 182 | "tt0119654", 183 | "tt0146882", 184 | "tt0993846", 185 | "tt1726592", 186 | "tt0108185", 187 | "tt0240772", 188 | "tt0408306", 189 | "tt1000774", 190 | "tt0118556", 191 | "tt0385752", 192 | "tt0108160", 193 | "tt0245238", 194 | "tt1504320", 195 | "tt0244353", 196 | "tt0118571", 197 | "tt0418279", 198 | "tt0133093", 199 | "tt0372237", 200 | "tt1840309", 201 | "tt0281358", 202 | "tt0404203", 203 | "tt1542344", 204 | "tt1229340", 205 | "tt0800320", 206 | "tt0108052", 207 | "tt0112384", 208 | "tt0163187", 209 | "tt0071315", 210 | "tt0112697", 211 | "tt0433416", 212 | "tt0441773", 213 | "tt0107290", 214 | 
"tt1058017", 215 | "tt0800369", 216 | "tt0083658", 217 | "tt0172495", 218 | "tt0100405", 219 | "tt0113161", 220 | "tt0097165", 221 | "tt0907657", 222 | "tt0118564", 223 | "tt0314331", 224 | "tt0335266", 225 | "tt0120338", 226 | "tt0097576", 227 | "tt0080684", 228 | "tt0414055", 229 | "tt0411061", 230 | "tt0171433", 231 | "tt0359950", 232 | "tt0120586", 233 | "tt0305711", 234 | "tt0208092", 235 | "tt0104257", 236 | "tt0200550", 237 | "tt0298203", 238 | "tt1010048", 239 | "tt1798709", 240 | "tt0213149", 241 | "tt0217630", 242 | "tt0075314", 243 | "tt0349903" 244 | ], 245 | "dev": [ 246 | "tt0258463", 247 | "tt0410400", 248 | "tt1853728", 249 | "tt3346224", 250 | "tt1454029", 251 | "tt0379786", 252 | "tt0959337", 253 | "tt0450259", 254 | "tt0118715", 255 | "tt0089218", 256 | "tt0333780", 257 | "tt0467406", 258 | "tt0376994", 259 | "tt0120735", 260 | "tt1001508", 261 | "tt1725986", 262 | "tt0246772", 263 | "tt0493464", 264 | "tt0083987", 265 | "tt0209475", 266 | "tt0397078", 267 | "tt2024544", 268 | "tt1065073", 269 | "tt1120985", 270 | "tt0161860", 271 | "tt0294357", 272 | "tt1170358", 273 | "tt0443706" 274 | ] 275 | } --------------------------------------------------------------------------------