├── README.md ├── img ├── dynamic.jpg └── framework.jpg ├── model ├── DataUtil.py ├── InitUtil.py ├── ModelUtil.py ├── SEModelUtil.py └── __init__.py ├── mqa_video+subtitle+question.py ├── mqa_video+subtitle+update+question.py ├── mqa_video+subtitle.py ├── mqa_video+subtitlel+update.py └── train_split.json /README.md: -------------------------------------------------------------------------------- 1 | # Layered Memory Network (LMN) 2 | The LMN model ranked 1st place on [MovieQA Video+Subtt-based Answering Challenge 2017](http://movieqa.cs.toronto.edu/workshops/iccv2017/) ([The Joint Video and Language Understanding Workshop, ICCV 2017](https://sites.google.com/site/describingmovies/workshop-at-iccv-17)). 3 | 4 | - The flowchart of Layered Memory Network (LMN). 5 | 6 | ![LMN](https://raw.githubusercontent.com/bowong/Layered-Memory-Network/master/img/framework.jpg) 7 | 8 | - The framework of Dynamic Subtitle Memory module with update mechanism. 9 | 10 | ![DSM](https://raw.githubusercontent.com/bowong/Layered-Memory-Network/master/img/dynamic.jpg) 11 | 12 | 13 | 14 | ## Train 15 | 16 | 17 | ``` 18 | python mqa_video+subtitle+update+question.py 19 | ``` 20 | 21 | ## Paper 22 | 23 | Bo Wang, Youjiang Xu, Yahong Han, Richang Hong. ["Movie Question Answering: Remembering the Textual Cues for Layered Visual Contents."](https://arxiv.org/abs/1804.09412) AAAI, 2018. [[Paper]](https://arxiv.org/abs/1804.09412) 24 | ``` 25 | @inproceedings{Wang2018, 26 | author = {Bo Wang and Youjiang Xu and Yahong Han and Richang Hong}, 27 | title = {Movie Question Answering: Remembering the Textual Cues for Layered Visual Contents}, 28 | booktitle = {AAAI}, 29 | year = {2018}, 30 | } 31 | ``` 32 | 33 | -------------------------------------------------------------------------------- /img/dynamic.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bowong/Layered-Memory-Network/86364077c40de7674088248b81ef805d7bfa7f4d/img/dynamic.jpg -------------------------------------------------------------------------------- /img/framework.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bowong/Layered-Memory-Network/86364077c40de7674088248b81ef805d7bfa7f4d/img/framework.jpg -------------------------------------------------------------------------------- /model/DataUtil.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | #import MovieQA_benchmark as MovieQA 4 | import re 5 | import h5py 6 | import tensorflow as tf 7 | import math 8 | from nltk.stem.snowball import SnowballStemmer 9 | from collections import Counter 10 | 11 | re_alphanumeric = re.compile('[^a-z0-9 -]+') 12 | re_multispace = re.compile(' +') 13 | snowball = SnowballStemmer('english') 14 | 15 | def preprocess_sentence(line): 16 | '''strip all punctuation, keep only alphanumerics 17 | ''' 18 | line = re_alphanumeric.sub('', line) 19 | line = re_multispace.sub(' ', line) 20 | return line 21 | 22 | def normalize_documents(stories, v2i, max_words=40): 23 | """Normalize all stories in the dictionary, get list of words per sentence. 
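    Each sentence is lower-cased, stripped of punctuation, truncated to
    `max_words` tokens, and then encoded as one row of an int32 matrix of word
    indices (index 0 is the padding symbol; out-of-vocabulary words map to
    v2i['UNK']).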
24 | """ 25 | for movie in stories.keys(): 26 | for s, sentence in enumerate(stories[movie]): 27 | sentence = sentence.lower() 28 | sentence = preprocess_sentence(sentence.strip()) 29 | sentence = sentence.split(' ')[:max_words] 30 | stories[movie][s] = sentence 31 | 32 | max_sentences = max([len(story) for story in stories.values()]) 33 | max_words = max([len(sent) for story in stories.values() for sent in story]) 34 | 35 | processed_stories = {} 36 | for imdb_key, story in stories.items(): 37 | processed_stories[imdb_key] = np.zeros((max_sentences,max_words), dtype='int32') 38 | for jj, sentence in enumerate(story): 39 | for kk, word in enumerate(sentence): 40 | if v2i.has_key(word): 41 | processed_stories[imdb_key][jj, kk] = v2i[word] 42 | else: 43 | processed_stories[imdb_key][jj, kk] = v2i['UNK'] 44 | 45 | return processed_stories,max_sentences,max_words 46 | 47 | def preprocess_stories(stories,max_words=40): 48 | for movie in stories.keys(): 49 | for s, sentence in enumerate(stories[movie]): 50 | sentence = sentence.lower() 51 | sentence = preprocess_sentence(sentence) 52 | sentence = sentence.split(' ')[:max_words] 53 | stories[movie][s] = sentence 54 | return stories 55 | 56 | def create_vocabulary(QAs, stories, word_thresh=2, v2i={'': 0, 'UNK':1}): 57 | ''' 58 | v2i = {'': 0, 'UNK':1} # vocabulary to index 59 | ''' 60 | print 'Create vocabulary...' 61 | 62 | # Get all story words 63 | all_words = [word for story in stories for sent in story for word in sent] 64 | print('number of words: %d' %len(all_words)) 65 | 66 | 67 | QA_words = {} 68 | for QA in QAs: 69 | temp = {} 70 | q_w = preprocess_sentence(QA.question.strip().lower()).split(' ') 71 | a_w = [preprocess_sentence(answer.strip().lower()).split(' ') for answer in QA.answers] 72 | temp['q_w'] = q_w 73 | temp['a_w'] = a_w 74 | temp['qid'] = QA.qid 75 | temp['imdb_key'] = QA.imdb_key 76 | temp['question'] = QA.question 77 | temp['answers'] = QA.answers 78 | temp['correct_index'] = QA.correct_index 79 | # temp['plot_alignment'] = QA.plot_alignment 80 | temp['video_clips'] = QA.video_clips 81 | 82 | 83 | QA_words[QA.qid]=temp 84 | 85 | all_words.extend(q_w) 86 | for answer in a_w: 87 | all_words.extend(answer) 88 | 89 | 90 | # threshold vocabulary, at least N instances of every word 91 | vocab = Counter(all_words) 92 | vocab = [k for k in vocab.keys() if vocab[k] >= word_thresh] 93 | 94 | # create vocabulary index 95 | for w in vocab: 96 | if w not in v2i.keys(): 97 | v2i[w] = len(v2i) 98 | 99 | print('Created a vocabulary of %d words. Threshold removed %.2f %% words'\ 100 | %(len(v2i), 100*(1. * len(set(all_words))-len(v2i))/len(all_words))) 101 | 102 | return QA_words, v2i 103 | 104 | def create_vocabulary_word2vec(QAs, stories, word_thresh=2, w2v_vocab=None, v2i={'': 0, 'UNK':1}): 105 | ''' 106 | v2i = {'': 0, 'UNK':1} # vocabulary to index 107 | ''' 108 | print 'Create vocabulary...' 
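    # Vocabulary construction mirrors create_vocabulary() above: a word is kept
    # only if it occurs at least `word_thresh` times across stories, questions
    # and answers, and (when w2v_vocab is given) only if word2vec has a vector
    # for it.  Toy illustration (hypothetical data, not from MovieQA):
    #   Counter(['the', 'the', 'dog']) with word_thresh=2 keeps only 'the',
    #   so v2i grows to {'': 0, 'UNK': 1, 'the': 2} and 'dog' later maps to 'UNK'.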
109 | 110 | if w2v_vocab is not None: 111 | print "Adding words based on word2vec" 112 | else: 113 | print "Adding all words" 114 | 115 | # Get all story words 116 | all_words = [word for story in stories for sent in story for word in sent] 117 | print('number of total words: %d' %len(all_words)) 118 | 119 | 120 | for QA in QAs: 121 | q_w = preprocess_sentence(QA.question.strip().lower()).split(' ') 122 | a_w = [preprocess_sentence(answer.strip().lower()).split(' ') for answer in QA.answers] 123 | 124 | all_words.extend(q_w) 125 | for answer in a_w: 126 | all_words.extend(answer) 127 | 128 | 129 | # threshold vocabulary, at least N instances of every word 130 | vocab = Counter(all_words) 131 | vocab = [k for k in vocab.keys() if vocab[k] >= word_thresh] 132 | 133 | # create vocabulary index 134 | for w in vocab: 135 | if w not in v2i.keys(): 136 | if w2v_vocab is None: 137 | # if word2vec is not provided, just dump the word to vocab 138 | v2i[w] = len(v2i) 139 | elif w2v_vocab is not None and w in w2v_vocab: 140 | # check if word in vocab, or else ignore 141 | v2i[w] = len(v2i) 142 | 143 | print('Created a vocabulary of %d words. Threshold removed %.2f %% words'\ 144 | %(len(v2i), 100*(1. * len(set(all_words))-len(v2i))/len(all_words))) 145 | 146 | return v2i 147 | 148 | def data_in_matrix_form(stories, v2i,max_sentences=None,max_words=None): 149 | """Make the QA data set compatible for memory networks by 150 | converting to matrix format (index into LUT vocabulary). 151 | """ 152 | 153 | def add_word_or_UNK(): 154 | if v2i.has_key(word): 155 | return v2i[word] 156 | else: 157 | return v2i['UNK'] 158 | 159 | # Encode stories 160 | if max_sentences is None: 161 | max_sentences = max([len(story) for story in stories.values()]) 162 | if max_words is None: 163 | max_words = max([len(sent) for story in stories.values() for sent in story]) 164 | 165 | storyM = {} 166 | for imdb_key, story in stories.iteritems(): 167 | storyM[imdb_key] = np.zeros((max_sentences, max_words), dtype='int32') 168 | for jj, sentence in enumerate(story): 169 | for kk, word in enumerate(sentence): 170 | storyM[imdb_key][jj, kk] = add_word_or_UNK() 171 | 172 | print "#stories:", len(storyM) 173 | print "storyM shape (movie 1):", storyM.values()[0].shape 174 | 175 | 176 | return storyM,max_sentences,max_words 177 | 178 | 179 | 180 | def S2I(sen, v2i, fixed_len): 181 | ''' 182 | len_qa: fixed length of question or answer 183 | ''' 184 | if type(sen)!=list: 185 | sen = preprocess_sentence(sen.strip().lower()).split(' ') 186 | res = [] 187 | for idx, w in enumerate(sen): 188 | if idx right/wrong 206 | 207 | return: questions, answers, ground_truth 208 | both of them are numeric indexed 209 | ground_truth is one hot vector 210 | ''' 211 | 212 | batch_size = len(batch_qas_list) 213 | questions = np.zeros((batch_size,nql),dtype='int32') 214 | answers = np.zeros((batch_size,numOfChoices,nqa),dtype='int32') 215 | ground_truth = np.zeros((batch_size,numOfChoices),dtype='int32') 216 | 217 | for idx, qa in enumerate(batch_qas_list): 218 | # set question 219 | qid = qa.qid 220 | questions[idx][:]=S2I(qa.question, v2i,nql) 221 | 222 | 223 | # set anwsers 224 | if numOfChoices==2: 225 | ground_answer_pos = np.random.randint(0,numOfChoices) 226 | ground_truth[idx][ground_answer_pos]=1 227 | 228 | # set correct answer 229 | correct_index = int(qa.correct_index) 230 | answers[idx][ground_answer_pos][:] = S2I(qa.answers[correct_index], v2i, nqa) 231 | 232 | 233 | 234 | wrong_index = np.random.randint(0,5) 235 | 
while(wrong_index==correct_index): 236 | wrong_index = np.random.randint(0,5) 237 | 238 | # set wrong answer 239 | answers[idx][1-ground_answer_pos][:]=S2I(qa.answers[wrong_index], v2i, nqa) 240 | elif numOfChoices==5: 241 | 242 | # set correct answer 243 | correct_index = int(qa.correct_index) 244 | ground_truth[idx][correct_index]=1 245 | for ans_idx, ans in enumerate(qa.answers): 246 | answers[idx][ans_idx][:]=S2I(ans, v2i, nqa) 247 | 248 | else: 249 | raise ValueError('Invalid numOfChoices: ' + numOfChoices) 250 | 251 | return questions,answers,ground_truth 252 | 253 | def getBatchTestIndexedQAs(batch_qas_list,v2i, nql=16, nqa=10, numOfChoices=2): 254 | ''' 255 | batch_qas_list: list of qas 256 | v2i: vocabulary to index 257 | nql: length of question 258 | nqa: length of answer 259 | numOfChoices: number of Choices utilized per QA, default set to 2 ==> right/wrong 260 | 261 | return: questions, answers, ground_truth 262 | both of them are numeric indexed 263 | ground_truth is one hot vector 264 | ''' 265 | 266 | batch_size = len(batch_qas_list) 267 | questions = np.zeros((batch_size,nql),dtype='int32') 268 | answers = np.zeros((batch_size,numOfChoices,nqa),dtype='int32') 269 | 270 | for idx, qa in enumerate(batch_qas_list): 271 | # set question 272 | qid = qa.qid 273 | questions[idx][:]=S2I(qa.question, v2i,nql) 274 | 275 | # set anwsers 276 | for ans_idx, ans in enumerate(qa.answers): 277 | answers[idx][ans_idx][:]=S2I(ans, v2i, nqa) 278 | 279 | 280 | return questions,answers 281 | 282 | def getBatchVideoFeature(batch_qas_list, hf, feature_shape): 283 | ''' 284 | video-based QA 285 | there are video clips in all QA pairs. 286 | ''' 287 | 288 | batch_size = len(batch_qas_list) 289 | input_video = np.zeros((batch_size,)+tuple(feature_shape),dtype='float32') 290 | 291 | timesteps = feature_shape[0] 292 | 293 | for idx, qa in enumerate(batch_qas_list): 294 | qid = qa.qid 295 | video_clips = qa.video_clips 296 | imdb_key = qa.imdb_key 297 | 298 | 299 | 300 | clips_features = [] 301 | if len(video_clips) != 0: 302 | for clip in video_clips: 303 | dataset = imdb_key+'/'+clip 304 | if imdb_key in hf.keys() and clip in hf[imdb_key].keys(): 305 | clips_features.extend(hf[dataset][:]) # clips_features.shape 306 | 307 | 308 | if(len(clips_features)<=0): 309 | # if there are not vlid features 310 | for clip in hf[imdb_key].keys(): 311 | dataset = imdb_key+'/'+clip 312 | clips_features.extend(hf[dataset][:]) # clips_features.shape 313 | 314 | 315 | if(len(clips_features)>=timesteps): 316 | interval = int(math.floor((len(clips_features)-1)/(timesteps-1))) 317 | input_video[idx] = clips_features[0::interval][0:timesteps] 318 | else: 319 | input_video[idx][:len(clips_features)] = clips_features 320 | for last_idx in xrange(len(clips_features),timesteps): 321 | input_video[idx][last_idx]=clips_features[-1] 322 | 323 | 324 | # if qid not in hf_out.keys(): 325 | # dset = hf_out.create_dataset(qid, feature_shape, dtype='f') 326 | # dset[:] = input_video[idx] 327 | 328 | 329 | return input_video 330 | 331 | def getBatchVideoFeatureFromQid(batch_qas_list, hf, feature_shape): 332 | ''' 333 | video-based QA 334 | there are video clips in all QA pairs. 
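    This variant assumes the visual features for every question were
    pre-extracted and stored in the HDF5 file under the question id, so each
    batch entry is read directly as hf[qid][:] instead of being re-sampled
    from the individual clips as in getBatchVideoFeature above.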
335 | ''' 336 | 337 | batch_size = len(batch_qas_list) 338 | input_video = np.zeros((batch_size,)+tuple(feature_shape),dtype='float32') 339 | 340 | timesteps = feature_shape[0] 341 | for idx, qa in enumerate(batch_qas_list): 342 | qid = qa.qid 343 | input_video[idx] = hf[qid][:] 344 | return input_video 345 | 346 | rng = np.random 347 | rng.seed(1234) 348 | def getw2v(batch_qa_list,w2v,v2i,d_w2v = 300): 349 | 350 | 351 | voc_size = len(v2i) 352 | 353 | 354 | pca_mat = None 355 | #print "Initialize LUTs as word2vec and use linear projection layer" 356 | 357 | 358 | LUT = np.zeros((voc_size, d_w2v), dtype='float32') 359 | found_words = 0 360 | 361 | for w, v in v2i.iteritems(): 362 | if w in w2v.vocab: 363 | LUT[v] = w2v.get_vector(w) 364 | found_words +=1 365 | else: 366 | LUT[v] = rng.randn(d_w2v) 367 | LUT[v] = LUT[v] / (np.linalg.norm(LUT[v]) + 1e-6) 368 | 369 | #print "Found %d / %d words" %(found_words, len(v2i)) 370 | 371 | 372 | # word 0 is blanked out, word 1 is 'UNK' 373 | LUT[0] = np.zeros((d_w2v)) 374 | 375 | # if linear projection layer is not the same shape as LUT, then initialize with PCA 376 | 377 | 378 | # setup LUT! 379 | T_w2v = tf.constant(LUT.astype('float32')) 380 | 381 | 382 | word_shape = (26033, 300) 383 | 384 | w2v_new = np.zeros((batch_qa_list,)+word_shape,dtype='int32') 385 | 386 | 387 | for idx in xrange(batch_qa_list): 388 | 389 | 390 | 391 | 392 | w2v_new[idx][:] = LUT[:] 393 | 394 | return w2v_new 395 | 396 | #w2v_new = tf.tile(w2v_new, [input_shape[0],1,1]) 397 | 398 | 399 | def getBatchIndexedStories(batch_qa_list,stories,v2i,story_shape): 400 | batch_size = len(batch_qa_list) 401 | input_stories = np.zeros((batch_size,)+story_shape,dtype='int32') 402 | 403 | for idx, qa in enumerate(batch_qa_list): 404 | imdb_key = qa.imdb_key 405 | interval = int(math.floor((len(stories[imdb_key])-1)/(story_shape[0]-1))) 406 | 407 | if interval != 0: 408 | for k in xrange(story_shape[0]): 409 | # if(k=timesteps): 451 | interval = int(math.floor((len(clips_features)-1)/(timesteps-1))) 452 | input_video[idx] = clips_features[0::interval][0:timesteps] 453 | else: 454 | input_video[idx][:len(clips_features)] = clips_features 455 | for last_idx in xrange(len(clips_features),timesteps): 456 | input_video[idx][last_idx]=clips_features[-1] 457 | 458 | false_clips_features = np.random.permutation(false_clips_features) 459 | 460 | false_frame_pos = np.random.permutation(range(0,timesteps))[:false_frame_num] 461 | for _,ffp in enumerate(false_frame_pos): 462 | input_video[idx][ffp] = false_clips_features[ffp] 463 | rfr_lables[idx,ffp,0] = 1 464 | rfr_lables[idx,ffp,1] = 0 465 | 466 | return input_video, rfr_lables 467 | 468 | 469 | def split_stories(full_stories,train_movies,val_movies): 470 | train_stories = {} 471 | val_stories = {} 472 | for tm in train_movies: 473 | train_stories[tm] = full_stories[tm] 474 | for vm in val_movies: 475 | val_stories[vm] = full_stories[vm] 476 | 477 | print('num of train stories:',len(train_stories)) 478 | print('num of val stories:',len(val_stories)) 479 | return train_stories,val_stories 480 | 481 | def getBatchIndexedQAs_return(batch_qas_list,v2i, nql=16, nqa=10, numOfChoices=2): 482 | ''' 483 | batch_qas_list: list of qas 484 | QA_words: all the QAs, contains question words and answer words 485 | v2i: vocabulary to index 486 | nql: length of question 487 | nqa: length of answer 488 | numOfChoices: number of Choices utilized per QA, default set to 2 ==> right/wrong 489 | 490 | return: questions, answers, ground_truth 491 | both of them are 
numeric indexed 492 | ground_truth is one hot vector 493 | ''' 494 | 495 | batch_size = len(batch_qas_list) 496 | questions = np.zeros((batch_size,nql),dtype='int32') 497 | answers = np.zeros((batch_size,numOfChoices,nqa),dtype='int32') 498 | ground_truth = np.zeros((batch_size,numOfChoices),dtype='int32') 499 | 500 | for idx, qa in enumerate(batch_qas_list): 501 | # set question 502 | 503 | questions[idx][:]=qa.question 504 | 505 | if numOfChoices==5: 506 | 507 | # set correct answer 508 | #correct_index = qa.correct_index 509 | ground_truth[idx]=qa.correct_index 510 | for ans_idx, ans in enumerate(qa.answers): 511 | answers[idx][ans_idx][:]=ans 512 | 513 | 514 | else: 515 | raise ValueError('Invalid numOfChoices: ' + numOfChoices) 516 | 517 | return questions,answers,ground_truth 518 | 519 | def getTestBatchIndexedQAs_return(batch_qas_list,v2i, nql=16, nqa=10, numOfChoices=2): 520 | 521 | batch_size = len(batch_qas_list) 522 | questions = np.zeros((batch_size,nql),dtype='int32') 523 | answers = np.zeros((batch_size,numOfChoices,nqa),dtype='int32') 524 | 525 | for idx, qa in enumerate(batch_qas_list): 526 | 527 | questions[idx][:]=qa.question 528 | 529 | if numOfChoices==5: 530 | 531 | for ans_idx, ans in enumerate(qa.answers): 532 | answers[idx][ans_idx][:]=ans 533 | else: 534 | raise ValueError('Invalid numOfChoices: ' + numOfChoices) 535 | 536 | return questions,answers 537 | def main(): 538 | 539 | task = 'video-based' # video-based or subtitle-based 540 | 541 | mqa = MovieQA.DataLoader() 542 | 543 | 544 | # get 'subtitile-based' QA task dataset 545 | stories, subtitle_QAs = mqa.get_story_qa_data('train', 'subtitle') 546 | 547 | # Create vocabulary 548 | QA_words, v2i = create_vocabulary(subtitle_QAs, stories, word_thresh=2, v2i={'': 0, 'UNK':1}) 549 | 550 | # get 'video-based' QA task training set 551 | vl_qa, video_QAs = mqa.get_video_list('train', 'qa_clips') # key: 'train:', value: list of related clips 552 | # vl_qa, _ = mqa.get_video_list('train', 'all_clips') # key:moive vid, value:list of related movid all_clips 553 | 554 | 555 | 556 | all_video_train_list = video_QAs 557 | 558 | batch_size = 20 559 | total_train_qa = len(all_video_train_list) 560 | num_batch = int(round(total_train_qa*1.0/batch_size)) 561 | 562 | total_epoch = 100 563 | 564 | hf = h5py.File('/home/wb/movie_feature.hdf5','r') 565 | feature_shape = (10,1024) 566 | for epoch in xrange(total_epoch): 567 | #shuffle 568 | np.random.shuffle(all_video_train_list) 569 | for batch_idx in xrange(num_batch): 570 | batch_qa = all_video_train_list[batch_idx*batch_size:min((batch_idx+1)*batch_size,total_train_qa)] 571 | questions,answers,ground_truth = getBatchIndexedQAs(batch_qa,QA_words,v2i, nql=16, nqa=10, numOfChoices=2) 572 | input_video = getBatchVideoFeature(batch_qa, QA_words, hf, feature_shape) 573 | print(input_video) 574 | print(ground_truth) 575 | break 576 | break 577 | 578 | 579 | if __name__=='__main__': 580 | main() -------------------------------------------------------------------------------- /model/InitUtil.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.environ["CUDA_VISIBLE_DEVICES"]="0" 4 | 5 | import tensorflow as tf 6 | import numpy as np 7 | 8 | def get_fans(shape): 9 | if len(shape) == 2: 10 | fan_in = shape[0] 11 | fan_out = shape[1] 12 | elif len(shape) == 4 or len(shape) == 5: 13 | receptive_field_size = np.prod(shape[2:]) 14 | fan_in = shape[1] * receptive_field_size 15 | fan_out = shape[0] * receptive_field_size 16 | 17 | else: 18 
| # No specific assumptions. 19 | fan_in = np.sqrt(np.prod(shape)) 20 | fan_out = np.sqrt(np.prod(shape)) 21 | return fan_in, fan_out 22 | 23 | 24 | def uniform(shape, scale=0.05, name=None, seed=None): #tf.float32 25 | if seed is None: 26 | # ensure that randomness is conditioned by the Numpy RNG 27 | seed = np.random.randint(10e8) 28 | 29 | value = tf.random_uniform_initializer( 30 | -scale, scale, dtype=tf.float32, seed=seed)(shape) 31 | 32 | return tf.Variable(value,name=name) 33 | 34 | 35 | 36 | def glorot_uniform(shape, name=None): 37 | fan_in, fan_out = get_fans(shape) 38 | s = np.sqrt(6. / (fan_in + fan_out)) 39 | return uniform(shape, s, name=name) 40 | 41 | 42 | def orthogonal(shape, scale=1.1, name=None): 43 | """Orthogonal initializer. 44 | 45 | # References 46 | Saxe et al., http://arxiv.org/abs/1312.6120 47 | """ 48 | flat_shape = (shape[0], np.prod(shape[1:])) 49 | a = np.random.normal(0.0, 1.0, flat_shape) 50 | u, _, v = np.linalg.svd(a, full_matrices=False) 51 | # Pick the one with the correct shape. 52 | q = u if u.shape == flat_shape else v 53 | q = q.reshape(shape) 54 | return tf.Variable(scale * q[:shape[0], :shape[1]], dtype=tf.float32, name=name) 55 | 56 | def init_weight_variable(shape, init_method='glorot_uniform', name=None): 57 | # initial = tf.truncated_normal(shape, stddev=0.1, name=name) 58 | if init_method == 'uniform': 59 | return uniform(shape, scale=0.05, name=name, seed=None) 60 | elif init_method == 'glorot_uniform': 61 | return glorot_uniform(shape, name=name) 62 | elif init_method == 'orthogonal': 63 | return orthogonal(shape, scale=1.1, name=name) 64 | else: 65 | raise ValueError('Invalid init_method: ' + init_method) 66 | 67 | def init_bias_variable(shape,name=None): 68 | initial = tf.constant(0.1,shape=shape, name=name) 69 | return tf.Variable(initial, name=name) -------------------------------------------------------------------------------- /model/ModelUtil.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.environ["CUDA_VISIBLE_DEVICES"]="0" 4 | 5 | import tensorflow as tf 6 | 7 | import numpy as np 8 | from sklearn.decomposition import PCA 9 | import cPickle as pickle 10 | 11 | def get_fans(shape): 12 | if len(shape) == 2: 13 | fan_in = shape[0] 14 | fan_out = shape[1] 15 | elif len(shape) == 4 or len(shape) == 5: 16 | receptive_field_size = np.prod(shape[2:]) 17 | fan_in = shape[1] * receptive_field_size 18 | fan_out = shape[0] * receptive_field_size 19 | 20 | else: 21 | # No specific assumptions. 22 | fan_in = np.sqrt(np.prod(shape)) 23 | fan_out = np.sqrt(np.prod(shape)) 24 | return fan_in, fan_out 25 | 26 | 27 | def uniform(shape, scale=0.05, name=None, seed=None): #tf.float32 28 | if seed is None: 29 | # ensure that randomness is conditioned by the Numpy RNG 30 | seed = np.random.randint(10e8) 31 | 32 | value = tf.random_uniform_initializer( 33 | -scale, scale, dtype=tf.float32, seed=seed)(shape) 34 | 35 | return tf.Variable(value) 36 | 37 | 38 | 39 | def glorot_uniform(shape, name=None): 40 | fan_in, fan_out = get_fans(shape) 41 | s = np.sqrt(6. / (fan_in + fan_out)) 42 | return uniform(shape, s, name=name) 43 | 44 | 45 | def orthogonal(shape, scale=1.1, name=None): 46 | """Orthogonal initializer. 47 | 48 | # References 49 | Saxe et al., http://arxiv.org/abs/1312.6120 50 | """ 51 | flat_shape = (shape[0], np.prod(shape[1:])) 52 | a = np.random.normal(0.0, 1.0, flat_shape) 53 | u, _, v = np.linalg.svd(a, full_matrices=False) 54 | # Pick the one with the correct shape. 
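    # np.linalg.svd(a, full_matrices=False) returns u of shape (nrows, k) and
    # v of shape (k, ncols) with k = min(nrows, ncols); both factors have
    # orthonormal rows/columns, and exactly one of them has shape flat_shape
    # (both do when the matrix is square), so that one is kept below.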
55 | q = u if u.shape == flat_shape else v 56 | q = q.reshape(shape) 57 | return tf.Variable(scale * q[:shape[0], :shape[1]], dtype=tf.float32, name=name) 58 | 59 | def init_weight_variable(shape, init_method='glorot_uniform', name=None): 60 | # initial = tf.truncated_normal(shape, stddev=0.1, name=name) 61 | if init_method == 'uniform': 62 | return uniform(shape, scale=0.05, name=name, seed=None) 63 | elif init_method == 'glorot_uniform': 64 | return glorot_uniform(shape, name=name) 65 | elif init_method == 'orthogonal': 66 | return orthogonal(shape, scale=1.1, name=name) 67 | else: 68 | raise ValueError('Invalid init_method: ' + init_method) 69 | 70 | def init_bias_variable(shape,name=None): 71 | initial = tf.constant(0.1,shape=shape, name=name) 72 | return tf.Variable(initial) 73 | 74 | 75 | def matmul_wx(x, w, b, output_dims): 76 | 77 | return tf.matmul(x, w)+tf.reshape(b,(1,output_dims)) 78 | 79 | 80 | def matmul_uh(u,h_tm1): 81 | return tf.matmul(h_tm1,u) 82 | 83 | 84 | 85 | def get_init_state(x, output_dims): 86 | initial_state = tf.zeros_like(x) 87 | initial_state = tf.reduce_sum(initial_state,axis=[1,2]) 88 | initial_state = tf.expand_dims(initial_state,dim=-1) 89 | initial_state = tf.tile(initial_state,[1,output_dims]) 90 | return initial_state 91 | 92 | 93 | def getVideoEncoder(x, output_dims, return_sequences=False): 94 | ''' 95 | function: getVideoEncoder 96 | parameters: 97 | 98 | x: batch_size, timesteps , dims 99 | output_dims: the output of the GRU dimensions 100 | num_class: number of class : ucf-101: 101 101 | return: 102 | the last GRU state, 103 | or 104 | the sequences of the hidden states 105 | 106 | ''' 107 | input_shape = x.get_shape().as_list() 108 | assert len(input_shape)==3 109 | timesteps = input_shape[1] 110 | input_dims = input_shape[2] 111 | 112 | # get initial state 113 | initial_state = get_init_state(x, output_dims) 114 | 115 | # initialize the parameters 116 | # W_r,U_r,b_r; W_z, U_z, b_z; W_h, U_h, b_h 117 | W_r = init_weight_variable((input_dims,output_dims),init_method='glorot_uniform',name="W_r") 118 | W_z = init_weight_variable((input_dims,output_dims),init_method='glorot_uniform',name="W_z") 119 | W_h = init_weight_variable((input_dims,output_dims),init_method='glorot_uniform',name="W_h") 120 | 121 | U_r = init_weight_variable((output_dims,output_dims),init_method='orthogonal',name="U_r") 122 | U_z = init_weight_variable((output_dims,output_dims),init_method='orthogonal',name="U_z") 123 | U_h = init_weight_variable((output_dims,output_dims),init_method='orthogonal',name="U_h") 124 | 125 | b_r = init_bias_variable((output_dims,),name="b_r") 126 | b_z = init_bias_variable((output_dims,),name="b_z") 127 | b_h = init_bias_variable((output_dims,),name="b_h") 128 | 129 | 130 | # batch_size x timesteps x dim -> timesteps x batch_size x dim 131 | axis = [1,0]+list(range(2,3)) # axis = [1,0,2] 132 | x = tf.transpose(x, perm=axis) # permutate the input_x --> timestemp, batch_size, input_dims 133 | 134 | input_x = tf.TensorArray( 135 | dtype=x.dtype, 136 | size=timesteps, 137 | tensor_array_name='input_x') 138 | 139 | if hasattr(input_x, 'unstack'): 140 | input_x = input_x.unstack(x) 141 | else: 142 | input_x = input_x.unpack(x) 143 | 144 | 145 | hidden_state = tf.TensorArray( 146 | dtype=tf.float32, 147 | size=timesteps, 148 | tensor_array_name='hidden_state') 149 | 150 | 151 | 152 | 153 | def step(time, hidden_state, h_tm1): 154 | x_t = input_x.read(time) # batch_size * dim 155 | 156 | preprocess_x_r = matmul_wx(x_t, W_r, b_r, output_dims) 157 | 
preprocess_x_z = matmul_wx(x_t, W_z, b_z, output_dims) 158 | preprocess_x_h = matmul_wx(x_t, W_h, b_h, output_dims) 159 | 160 | r = tf.nn.sigmoid(preprocess_x_r+ matmul_uh(U_r,h_tm1)) 161 | z = tf.nn.sigmoid(preprocess_x_z+ matmul_uh(U_z,h_tm1)) 162 | hh = tf.nn.tanh(preprocess_x_h+ matmul_uh(U_h,h_tm1)) 163 | 164 | h = (1-z)*hh + z*h_tm1 165 | 166 | hidden_state = hidden_state.write(time, h) 167 | 168 | return (time+1,hidden_state,h) 169 | 170 | 171 | 172 | 173 | time = tf.constant(0, dtype='int32', name='time') 174 | 175 | 176 | ret_out = tf.while_loop( 177 | cond=lambda time, *_: time < timesteps, 178 | body=step, 179 | loop_vars=(time, hidden_state, initial_state), 180 | parallel_iterations=32, 181 | swap_memory=True) 182 | 183 | output = ret_out[1] 184 | last_output = ret_out[-1] 185 | 186 | if hasattr(hidden_state, 'stack'): 187 | hidden_state = hidden_state.stack() 188 | 189 | axis = [1,0] + list(range(2,3)) 190 | outputs = tf.transpose(hidden_state,perm=axis) 191 | 192 | 193 | if return_sequences: 194 | return outputs 195 | else: 196 | return last_output 197 | 198 | 199 | def getEmbedding(words, size_voc, word_embedding_size): 200 | ''' 201 | function: getEmbedding 202 | parameters: 203 | words: int, word index ; or a np.int32 list ## sample(null) * input_words_sequential 204 | size_voc: size of vocabulary 205 | embedding_size: the dimension after embedding 206 | return: 207 | embeded_words:the embeded words with shape (sample * timesteps * embedding dims) 208 | mask: each element in mask vector is 0 or 1, indicate there is a word or a padding zero 209 | ''' 210 | 211 | W_e = tf.get_variable('W_e',(size_voc,word_embedding_size),initializer=tf.random_uniform_initializer(-0.05,0.05)) # share the embedding matrix 212 | embeded_words = tf.gather(W_e, words) 213 | mask = tf.not_equal(words,0) 214 | return embeded_words, mask 215 | 216 | 217 | 218 | def getQuestionEncoder(embeded_words, output_dims, mask, return_sequences=False): 219 | 220 | ''' 221 | function: getQuestionEncoder 222 | parameters: 223 | embeded_words: sample*timestep*dim 224 | output_dims: the GRU hidden dim 225 | mask: bool type , samples * timestep 226 | return: 227 | the last GRU state, 228 | or 229 | the sequences of the hidden states 230 | ''' 231 | input_shape = embeded_words.get_shape().as_list() 232 | assert len(input_shape)==3 233 | 234 | timesteps = input_shape[1] 235 | input_dims = input_shape[2] 236 | # get initial state 237 | initial_state = get_init_state(embeded_words, output_dims) 238 | 239 | 240 | # initialize the parameters 241 | # W_r,U_r,b_r; W_z, U_z, b_z; W_h, U_h, b_h 242 | W_r = init_weight_variable((input_dims,output_dims),init_method='glorot_uniform',name="W_q_r") 243 | W_z = init_weight_variable((input_dims,output_dims),init_method='glorot_uniform',name="W_q_z") 244 | W_h = init_weight_variable((input_dims,output_dims),init_method='glorot_uniform',name="W_q_h") 245 | 246 | U_r = init_weight_variable((output_dims,output_dims),init_method='orthogonal',name="U_q_r") 247 | U_z = init_weight_variable((output_dims,output_dims),init_method='orthogonal',name="U_q_z") 248 | U_h = init_weight_variable((output_dims,output_dims),init_method='orthogonal',name="U_q_h") 249 | 250 | b_r = init_bias_variable((output_dims,),name="b_q_r") 251 | b_z = init_bias_variable((output_dims,),name="b_q_z") 252 | b_h = init_bias_variable((output_dims,),name="b_q_h") 253 | 254 | 255 | # batch_size x timesteps x dim -> timesteps x batch_size x dim 256 | axis = [1,0]+list(range(2,3)) # axis = [1,0,2] 257 | embeded_words = 
tf.transpose(embeded_words, perm=axis) # permutate the input_x --> timestemp, batch_size, input_dims 258 | 259 | 260 | 261 | input_embeded_words = tf.TensorArray( 262 | dtype=embeded_words.dtype, 263 | size=timesteps, 264 | tensor_array_name='input_embeded_words_q') 265 | 266 | 267 | if hasattr(input_embeded_words, 'unstack'): 268 | input_embeded_words = input_embeded_words.unstack(embeded_words) 269 | else: 270 | input_embeded_words = input_embeded_words.unpack(embeded_words) 271 | 272 | 273 | # preprocess mask 274 | if len(mask.get_shape()) == len(input_shape)-1: 275 | mask = tf.expand_dims(mask,dim=-1) 276 | 277 | mask = tf.transpose(mask,perm=axis) 278 | 279 | input_mask = tf.TensorArray( 280 | dtype=mask.dtype, 281 | size=timesteps, 282 | tensor_array_name='input_mask_q' 283 | ) 284 | 285 | if hasattr(input_mask, 'unstack'): 286 | input_mask = input_mask.unstack(mask) 287 | else: 288 | input_mask = input_mask.unpack(mask) 289 | 290 | 291 | hidden_state_q = tf.TensorArray( 292 | dtype=tf.float32, 293 | size=timesteps, 294 | tensor_array_name='hidden_state_q') 295 | 296 | 297 | 298 | def step(time, hidden_state_q, h_tm1): 299 | x_t = input_embeded_words.read(time) # batch_size * dim 300 | mask_t = input_mask.read(time) 301 | 302 | preprocess_x_r = matmul_wx(x_t, W_r, b_r, output_dims) 303 | preprocess_x_z = matmul_wx(x_t, W_z, b_z, output_dims) 304 | preprocess_x_h = matmul_wx(x_t, W_h, b_h, output_dims) 305 | 306 | r = tf.nn.sigmoid(preprocess_x_r+ matmul_uh(U_r,h_tm1)) 307 | z = tf.nn.sigmoid(preprocess_x_z+ matmul_uh(U_z,h_tm1)) 308 | hh = tf.nn.tanh(preprocess_x_h+ matmul_uh(U_h,h_tm1)) 309 | 310 | 311 | h = (1-z)*hh + z*h_tm1 312 | tiled_mask_t = tf.tile(mask_t, tf.stack([1, h.get_shape().as_list()[1]])) 313 | 314 | h = tf.where(tiled_mask_t, h, h_tm1) 315 | 316 | hidden_state_q = hidden_state_q.write(time, h) 317 | 318 | return (time+1,hidden_state_q,h) 319 | 320 | 321 | 322 | 323 | time = tf.constant(0, dtype='int32', name='time') 324 | 325 | 326 | ret_out = tf.while_loop( 327 | cond=lambda time, *_: time < timesteps, 328 | body=step, 329 | loop_vars=(time, hidden_state_q, initial_state), 330 | parallel_iterations=32, 331 | swap_memory=True) 332 | 333 | 334 | hidden_state_q = ret_out[1] 335 | last_output = ret_out[-1] 336 | 337 | if hasattr(hidden_state_q, 'stack'): 338 | outputs = hidden_state_q.stack() 339 | print('stack') 340 | else: 341 | outputs = hidden_state_q.pack() 342 | 343 | axis = [1,0] + list(range(2,3)) 344 | outputs = tf.transpose(outputs,perm=axis) 345 | 346 | if return_sequences: 347 | return outputs 348 | else: 349 | return last_output 350 | 351 | 352 | 353 | 354 | def getAnswerEmbedding(words, size_voc, word_embedding_size): 355 | ''' 356 | function: getAnswerEmbedding 357 | parameters: 358 | words: int, word index ; or a np.int32 list ## sample(null) * numebrOfChoice * timesteps 359 | size_voc: size of vocabulary 360 | embedding_size: the dimension after embedding 361 | return: 362 | the embeded answers with shape(batch_size, numberOfChoices, timesteps, word_embedding_size) 363 | ''' 364 | assert len(words.get_shape().as_list())==3 # 365 | input_shape = words.get_shape().as_list() 366 | numberOfChoices = input_shape[1] 367 | timesteps = input_shape[2] 368 | 369 | mask = tf.not_equal(words,0) 370 | 371 | words = tf.reshape(words, (-1,timesteps)) 372 | W_e = tf.get_variable('W_e',(size_voc,word_embedding_size),initializer=tf.random_uniform_initializer(-0.05,0.05)) # share the embedding matrix 373 | embeded_words = tf.gather(W_e, words) 374 | 375 | 376 | 
embeded_words = tf.reshape(embeded_words,(-1,numberOfChoices,timesteps,word_embedding_size)) 377 | 378 | return embeded_words, mask 379 | 380 | 381 | 382 | def getAnswerEncoder(embeded_words, output_dims, mask, return_sequences=False): 383 | ''' 384 | function: getAnswerEncoder 385 | parameters: 386 | embeded_words: samples * numberOfChoices * timesteps * dim 387 | output_dim: output of GRU, the dimension of answering vector 388 | mask : bool type, mask the embeded_words 389 | num_class: number of classifier 390 | return: 391 | the last encoded answers with shape(batch_size, numberOfChoices, output_dims) 392 | or 393 | the sequences.... with shape(batch_size, numberOfChoices, numberOfChoices, output_dims) 394 | ''' 395 | input_shape = embeded_words.get_shape().as_list() 396 | assert len(input_shape)==4 397 | 398 | 399 | numberOfChoices = input_shape[1] 400 | timesteps = input_shape[2] 401 | input_dims = input_shape[3] 402 | 403 | # get initial state 404 | embeded_words = tf.reshape(embeded_words,(-1,timesteps,input_dims)) 405 | initial_state = get_init_state(embeded_words, output_dims) 406 | 407 | axis = [1,0,2] 408 | embeded_words = tf.transpose(embeded_words, perm=axis) # permutate the 'embeded_words' --> timesteps x batch_size x numberOfChoices x dim 409 | # embeded_words = tf.reshape(embeded_words,(timesteps,-1,input_dims)) # reshape the 'embeded_words' --> timesteps x (batch x numberOfChoices) x dim 410 | 411 | # initialize the parameters 412 | # W_r,U_r,b_r; W_z, U_z, b_z; W_h, U_h, b_h 413 | W_r = init_weight_variable((input_dims,output_dims),init_method='glorot_uniform',name="W_a_r") 414 | W_z = init_weight_variable((input_dims,output_dims),init_method='glorot_uniform',name="W_a_z") 415 | W_h = init_weight_variable((input_dims,output_dims),init_method='glorot_uniform',name="W_a_h") 416 | 417 | U_r = init_weight_variable((output_dims,output_dims),init_method='orthogonal',name="U_a_r") 418 | U_z = init_weight_variable((output_dims,output_dims),init_method='orthogonal',name="U_a_z") 419 | U_h = init_weight_variable((output_dims,output_dims),init_method='orthogonal',name="U_a_h") 420 | 421 | b_r = init_bias_variable((output_dims,),name="b_a_r") 422 | b_z = init_bias_variable((output_dims,),name="b_a_z") 423 | b_h = init_bias_variable((output_dims,),name="b_a_h") 424 | 425 | 426 | 427 | input_embeded_words = tf.TensorArray( 428 | dtype=embeded_words.dtype, 429 | size=timesteps, 430 | tensor_array_name='input_embeded_words_a') 431 | 432 | 433 | if hasattr(input_embeded_words, 'unstack'): 434 | input_embeded_words = input_embeded_words.unstack(embeded_words) 435 | else: 436 | input_embeded_words = input_embeded_words.unpack(embeded_words) 437 | 438 | 439 | # preprocess mask 440 | if len(mask.get_shape()) == len(input_shape)-1: 441 | mask = tf.expand_dims(mask,dim=-1) 442 | 443 | axis = [2,0,1,3] 444 | mask = tf.transpose(mask,perm=axis) 445 | mask = tf.reshape(mask, (timesteps,-1,1)) 446 | 447 | input_mask = tf.TensorArray( 448 | dtype=mask.dtype, 449 | size=timesteps, 450 | tensor_array_name='input_mask_q' 451 | ) 452 | 453 | if hasattr(input_mask, 'unstack'): 454 | input_mask = input_mask.unstack(mask) 455 | else: 456 | input_mask = input_mask.unpack(mask) 457 | 458 | 459 | hidden_state_q = tf.TensorArray( 460 | dtype=tf.float32, 461 | size=timesteps, 462 | tensor_array_name='hidden_state_a') 463 | 464 | # if hasattr(hidden_state, 'unstack'): 465 | # hidden_state = hidden_state.unstack(hidden_state) 466 | # else: 467 | # hidden_state = hidden_state.unpack(hidden_state) 468 | 469 | 470 | 
def step(time, hidden_state_q, h_tm1): 471 | x_t = input_embeded_words.read(time) # batch_size * dim 472 | mask_t = input_mask.read(time) 473 | 474 | preprocess_x_r = matmul_wx(x_t, W_r, b_r, output_dims) 475 | preprocess_x_z = matmul_wx(x_t, W_z, b_z, output_dims) 476 | preprocess_x_h = matmul_wx(x_t, W_h, b_h, output_dims) 477 | 478 | r = tf.nn.sigmoid(preprocess_x_r+ matmul_uh(U_r,h_tm1)) 479 | z = tf.nn.sigmoid(preprocess_x_z+ matmul_uh(U_z,h_tm1)) 480 | hh = tf.nn.tanh(preprocess_x_h+ matmul_uh(U_h,h_tm1)) 481 | 482 | 483 | h = (1-z)*hh + z*h_tm1 484 | tiled_mask_t = tf.tile(mask_t, tf.stack([1, h.get_shape().as_list()[1]])) 485 | 486 | h = tf.where(tiled_mask_t, h, h_tm1) 487 | 488 | hidden_state_q = hidden_state_q.write(time, h) 489 | 490 | return (time+1,hidden_state_q,h) 491 | 492 | 493 | 494 | 495 | time = tf.constant(0, dtype='int32', name='time') 496 | 497 | 498 | ret_out = tf.while_loop( 499 | cond=lambda time, *_: time < timesteps, 500 | body=step, 501 | loop_vars=(time, hidden_state_q, initial_state), 502 | parallel_iterations=32, 503 | swap_memory=True) 504 | 505 | 506 | hidden_state_q = ret_out[1] 507 | last_output = ret_out[-1] 508 | 509 | 510 | 511 | if hasattr(hidden_state_q, 'stack'): 512 | outputs = hidden_state_q.stack() 513 | print('stack') 514 | else: 515 | outputs = hidden_state_q.pack() 516 | 517 | outputs = tf.reshape(outputs,(timesteps,-1,numberOfChoices,output_dims)) 518 | axis = [1,2,0]+list(range(3,4)) 519 | outputs = tf.transpose(outputs,perm=axis) 520 | 521 | last_output = tf.reshape(last_output,(-1,numberOfChoices,output_dims)) 522 | print('outputs:....',outputs.get_shape().as_list()) 523 | if return_sequences: 524 | return outputs 525 | else: 526 | return last_output 527 | 528 | 529 | 530 | 531 | def getMemoryNetworks(embeded_stories, embeded_question, d_lproj, T_B=None, return_sequences=False): 532 | 533 | ''' 534 | embeded_stories: (batch_size, num_of_sentence, num_of_words, embeded_words_dims) 535 | embeded_question:(batch_size, embeded_words_dims) 536 | output_dims: the dimension of stories 537 | ''' 538 | stories_shape = embeded_stories.get_shape().as_list() 539 | embeded_question_shape = embeded_question.get_shape().as_list() 540 | num_of_sentence = stories_shape[-3] 541 | input_dims = stories_shape[-1] 542 | output_dims = embeded_question_shape[-1] 543 | 544 | 545 | embeded_stories = getAverageRepresentation(embeded_stories, T_B, d_lproj) 546 | 547 | 548 | embeded_question = tf.tile(tf.expand_dims(embeded_question,dim=1),[1,num_of_sentence,1]) 549 | 550 | sen_weight = tf.reduce_sum(embeded_question*embeded_stories,reduction_indices=-1,keep_dims=True) 551 | 552 | sen_weight = tf.nn.softmax(sen_weight,dim=1) 553 | sen_weight = tf.tile(sen_weight,[1,1,output_dims]) 554 | if return_sequences: 555 | embeded_stories = embeded_stories*sen_weight 556 | else: 557 | embeded_stories = tf.reduce_sum(embeded_stories*sen_weight,reduction_indices=1) # (batch_size, output_dims) 558 | 559 | return embeded_stories 560 | 561 | def getMemoryNetworksMaxPooling(embeded_stories, embeded_question, d_lproj, T_B=None): 562 | 563 | ''' 564 | embeded_stories: (batch_size, num_of_sentence, num_of_words, embeded_words_dims) 565 | embeded_question:(batch_size, embeded_words_dims) 566 | output_dims: the dimension of stories 567 | ''' 568 | stories_shape = embeded_stories.get_shape().as_list() 569 | embeded_question_shape = embeded_question.get_shape().as_list() 570 | num_of_sentence = stories_shape[-3] 571 | input_dims = stories_shape[-1] 572 | output_dims = 
embeded_question_shape[-1] 573 | 574 | 575 | embeded_stories = getAverageRepresentation(embeded_stories, T_B, d_lproj) 576 | 577 | 578 | embeded_question = tf.tile(tf.expand_dims(embeded_question,dim=1),[1,num_of_sentence,1]) 579 | 580 | sen_weight = tf.reduce_sum(embeded_question*embeded_stories,reduction_indices=-1,keep_dims=True) 581 | 582 | sen_weight = tf.nn.softmax(sen_weight,dim=1) 583 | sen_weight = tf.tile(sen_weight,[1,1,output_dims]) 584 | 585 | embeded_stories = tf.reduce_max(embeded_stories*sen_weight,reduction_indices=1) # (batch_size, output_dims) 586 | 587 | return embeded_stories 588 | 589 | rng = np.random 590 | rng.seed(1234) 591 | 592 | def init_linear_projection(rng, nrows, ncols, pca_mat=None): 593 | """ Linear projection (for example when using fixed w2v as LUT """ 594 | if nrows == ncols: 595 | P = np.eye(nrows) 596 | print "Linear projection: initialized as identity matrix" 597 | else: 598 | assert([nrows, ncols] == pca_mat.shape, 'PCA matrix not of same size as RxC') 599 | P = 0.1 * pca_mat 600 | print "Linear projection: initialized with 0.1 PCA" 601 | 602 | return P.astype('float32') 603 | 604 | def setWord2VecModelConfiguration(v2i,w2v,d_w2v,d_lproj): 605 | ''' 606 | v2i: vocab(word) to int(index) 607 | w2v: word to vector 608 | d_w2v:dimension of w2v 609 | d_lproj: dimension of projection 610 | ''' 611 | voc_size = len(v2i) 612 | np_mask = np.vstack((np.zeros(d_w2v),np.ones((voc_size-1,d_w2v)))) 613 | T_mask = tf.constant(np_mask, tf.float32, name='LUT_mask') 614 | 615 | pca_mat = None 616 | print "Initialize LUTs as word2vec and use linear projection layer" 617 | 618 | 619 | LUT = np.zeros((voc_size, d_w2v), dtype='float32') 620 | found_words = 0 621 | 622 | for w, v in v2i.iteritems(): 623 | if w in w2v.vocab: 624 | LUT[v] = w2v.get_vector(w) 625 | found_words +=1 626 | else: 627 | LUT[v] = rng.randn(d_w2v) 628 | LUT[v] = LUT[v] / (np.linalg.norm(LUT[v]) + 1e-6) 629 | 630 | print "Found %d / %d words" %(found_words, len(v2i)) 631 | 632 | 633 | # word 0 is blanked out, word 1 is 'UNK' 634 | LUT[0] = np.zeros((d_w2v)) 635 | 636 | # if linear projection layer is not the same shape as LUT, then initialize with PCA 637 | if d_lproj != LUT.shape[1]: 638 | pca = PCA(n_components=d_lproj, whiten=True) 639 | pca_mat = pca.fit_transform(LUT.T) # 300 x 100? 640 | 641 | # setup LUT! 
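    # The word2vec LUT is frozen as a TF constant; only the linear projection
    # T_B below is trainable.  T_mask (row 0 all zeros, built above) is used in
    # getEmbeddingWithWord2Vec() to cancel the contribution of the padding index.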
642 | T_w2v = tf.constant(LUT.astype('float32')) 643 | 644 | T_B = tf.Variable(init_linear_projection(rng, d_w2v, d_lproj, pca_mat), name='B') 645 | 646 | 647 | 648 | return T_B, T_w2v, T_mask, pca_mat 649 | 650 | 651 | def getEmbeddingWithWord2Vec(words, T_w2v, T_mask): 652 | input_shape = words.get_shape().as_list() 653 | 654 | mask = tf.not_equal(words,0) 655 | 656 | embeded_words = tf.gather(T_w2v,words)*tf.gather(T_mask,words) 657 | 658 | return embeded_words, mask 659 | 660 | def getAverageRepresentation(sentence, T_B, d_lproj): 661 | sentence = tf.reduce_sum(sentence,reduction_indices=-2) 662 | 663 | 664 | sentence_shape = sentence.get_shape().as_list() 665 | if len(sentence_shape)==2: 666 | sentence = tf.matmul(sentence,T_B) 667 | elif len(sentence_shape)==3: 668 | sentence = tf.reshape(sentence,(-1,sentence_shape[-1])) 669 | sentence = tf.matmul(sentence,T_B) 670 | sentence = tf.reshape(sentence,(-1,sentence_shape[1],d_lproj)) 671 | else: 672 | raise ValueError('Invalid sentence_shape:'+sentence_shape) 673 | 674 | sentence = tf.nn.l2_normalize(sentence,-1) 675 | return sentence 676 | 677 | 678 | def getMultiModel(visual_feature, question_feature, answer_feature, common_space_dim): 679 | ''' 680 | fucntion: getMultiModel 681 | parameters: 682 | visual_feature: batch_size * visual_encoded_dim 683 | question_feature: batch_size * question_encoded_dim 684 | answer_feature: batch_zize * numberOfChoices * answer_encoded_dim 685 | common_space_dim: embedding the visual,question,answer to the common space 686 | return: the embeded vectors(v,q,a) 687 | ''' 688 | visual_shape = visual_feature.get_shape().as_list() 689 | question_shape = question_feature.get_shape().as_list() 690 | answer_shape = answer_feature.get_shape().as_list() 691 | 692 | # build the transformed matrix 693 | W_v = init_weight_variable((visual_shape[1],common_space_dim),init_method='glorot_uniform',name="W_v") 694 | W_q = init_weight_variable((question_shape[1],common_space_dim),init_method='glorot_uniform',name="W_q") 695 | W_a = init_weight_variable((answer_shape[2],common_space_dim),init_method='glorot_uniform',name="W_a") 696 | 697 | 698 | 699 | answer_feature = tf.reshape(answer_feature,(-1,answer_shape[2])) 700 | 701 | # encoder the features into common space 702 | T_v = tf.matmul(visual_feature,W_v) 703 | T_q = tf.matmul(question_feature,W_q) 704 | T_a = tf.matmul(answer_feature,W_a) 705 | 706 | T_a = tf.reshape(T_a,(-1,answer_shape[1],common_space_dim)) 707 | 708 | return T_v,T_q,T_a 709 | 710 | def getRankingLoss(T_v, T_q, T_a, answer_index=None, alpha = 0.2 ,isTest=False): 711 | 712 | ''' 713 | function: getRankingLoss 714 | parameters: 715 | answer_index: the ground truth index, one hot vector 716 | return: 717 | loss: tf.float32 718 | ''' 719 | 720 | T_v_shape = T_v.get_shape().as_list() 721 | T_q_shape = T_q.get_shape().as_list() 722 | T_a_shape = T_a.get_shape().as_list() 723 | 724 | numOfChoices = T_a_shape[1] 725 | common_space_dim = T_a_shape[2] 726 | 727 | assert T_q_shape == T_v_shape 728 | 729 | T_v = tf.nn.l2_normalize(T_v,1) 730 | T_q = tf.nn.l2_normalize(T_q,1) 731 | T_a = tf.nn.l2_normalize(T_a,2) 732 | 733 | T_p = tf.nn.l2_normalize(T_v+T_q,1) 734 | 735 | 736 | 737 | # answer_index = tf.tile(tf.expand_dims(answer_index,dim=-1),[1,1,T_q_shape[-1]]) # sample * numOfChoices * common_space_dim 738 | 739 | 740 | T_p = tf.tile(tf.expand_dims(T_p,dim=1),[1,numOfChoices,1]) 741 | 742 | # T_p = tf.nn.l2_normalize(T_p*T_a,2) 743 | T_p = T_p*T_a 744 | T_p = tf.reduce_sum(T_p, reduction_indices=-1) 745 | 
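    # At this point T_p is the dot product (equivalently the cosine similarity,
    # since all three embeddings are l2-normalized) between the fused
    # video+question vector and each candidate answer, i.e. a
    # (batch_size, numOfChoices) score matrix.  In training mode a max-margin
    # hinge loss with margin `alpha` is built below.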
746 | scores = T_p 747 | 748 | if not isTest: 749 | assert answer_index is not None 750 | positive = tf.reduce_sum(T_p*answer_index, reduction_indices=1, keep_dims=True) # sample , get the positive score 751 | positive = tf.tile(positive,[1,numOfChoices]) 752 | 753 | loss = (alpha - positive + T_p)*(1-answer_index) 754 | 755 | loss = tf.maximum(0.,loss) 756 | 757 | loss = tf.reduce_sum(loss,reduction_indices=-1) 758 | 759 | return loss,scores 760 | else: 761 | return scores 762 | 763 | 764 | def getClassifierLoss(T_s, T_q, T_a, answer_index=None, isTest=False): 765 | 766 | ''' 767 | function: getRankingLoss 768 | parameters: 769 | answer_index: the ground truth index, one hot vector 770 | return: 771 | loss: tf.float32 772 | ''' 773 | 774 | T_s_shape = T_s.get_shape().as_list() 775 | T_q_shape = T_q.get_shape().as_list() 776 | T_a_shape = T_a.get_shape().as_list() 777 | 778 | numOfChoices = T_a_shape[1] 779 | common_space_dim = T_a_shape[2] 780 | 781 | assert T_q_shape == T_s_shape 782 | 783 | T_s = tf.nn.l2_normalize(T_s+T_q,1) 784 | T_a = tf.nn.l2_normalize(T_a,2) 785 | 786 | T_s = tf.tile(tf.expand_dims(T_s,dim=1),[1,numOfChoices,1]) 787 | 788 | # T_s = tf.nn.l2_normalize(T_s*T_a,2) 789 | T_h = T_s*T_a 790 | T_h = tf.reduce_sum(T_h, reduction_indices=-1) 791 | 792 | scores = T_h 793 | 794 | if not isTest: 795 | assert answer_index is not None 796 | loss = tf.nn.softmax_cross_entropy_with_logits(labels = answer_index, logits = scores) 797 | # acc_value = tf.metrics.accuracy(scores, answer_index) 798 | return loss,scores 799 | else: 800 | return scores 801 | 802 | 803 | 804 | def getVideoSemanticEmbedding(x,w2v,T_B,pca_mat=None): 805 | ''' 806 | x: input video cnn feature with size of (batch_size, timesteps, channels, height, width) 807 | w2v: word 2 vec (|v|,dim) 808 | ''' 809 | input_shape = x.get_shape().as_list() 810 | w2v_shape = w2v.get_shape().as_list() 811 | assert(len(input_shape)==5) 812 | axis = [0,1,3,4,2] 813 | x = tf.transpose(x,perm=axis) 814 | x = tf.reshape(x,(-1,input_shape[2])) 815 | # x = tf.nn.l2_normalize(x,-1) 816 | 817 | if pca_mat is not None: 818 | linear_proj = tf.Variable(0.1*pca_mat,dtype='float32',name='visual_linear_proj') 819 | else: 820 | linear_proj = init_weight_variable((input_shape[2],w2v_shape[-1]), init_method='uniform', name='visual_linear_proj') 821 | 822 | x = tf.matmul(x,linear_proj) 823 | x = tf.nn.l2_normalize(x,-1) 824 | 825 | w2v_cov = tf.matmul(tf.transpose(w2v,perm=[1,0]),w2v) 826 | 827 | x = tf.matmul(x,w2v_cov) # (batch_size*timesteps*height*width, |V|) 828 | 829 | x = tf.reshape(x,(-1,input_shape[1],input_shape[3],input_shape[4],w2v_shape[-1])) 830 | axis = [0,1,4,2,3] 831 | x = tf.transpose(x,perm=axis) 832 | 833 | # can be extended to different architecture 834 | x = tf.reduce_sum(x,reduction_indices=[1,3,4]) 835 | x = tf.nn.l2_normalize(x,-1) 836 | 837 | x = tf.matmul(x,T_B) 838 | 839 | 840 | 841 | return x 842 | 843 | 844 | 845 | if __name__=='__main__': 846 | print('video question answering model module!') 847 | 848 | 849 | 850 | 851 | 852 | -------------------------------------------------------------------------------- /model/SEModelUtil.py: -------------------------------------------------------------------------------- 1 | import os 2 | os.environ["CUDA_VISIBLE_DEVICES"]="0" 3 | import tensorflow as tf 4 | import numpy as np 5 | from sklearn.decomposition import PCA 6 | import ModelUtil 7 | import InitUtil 8 | 9 | 10 | def ndim(x): 11 | """Returns the number of axes in a tensor, as an integer. 
12 | 13 | # Arguments 14 | x: Tensor or variable. 15 | 16 | # Returns 17 | Integer (scalar), number of axes. 18 | 19 | # Examples 20 | ```python 21 | >>> from keras import backend as K 22 | >>> input = K.placeholder(shape=(2, 4, 5)) 23 | >>> val = np.array([[1, 2], [3, 4]]) 24 | >>> kvar = K.variable(value=val) 25 | >>> K.ndim(input) 26 | 3 27 | >>> K.ndim(kvar) 28 | 2 29 | ``` 30 | """ 31 | if isinstance(x, tf.SparseTensor): 32 | return x._dims 33 | 34 | dims = x.get_shape()._dims 35 | if dims is not None: 36 | return len(dims) 37 | return None 38 | def batch_dot(x, y, axes=None): 39 | """Batchwise dot product. 40 | 41 | `batch_dot` is used to compute dot product of `x` and `y` when 42 | `x` and `y` are data in batch, i.e. in a shape of 43 | `(batch_size, :)`. 44 | `batch_dot` results in a tensor or variable with less dimensions 45 | than the input. If the number of dimensions is reduced to 1, 46 | we use `expand_dims` to make sure that ndim is at least 2. 47 | 48 | # Arguments 49 | x, y: Keras tensors or variables with `ndim >= 2` 50 | axes: list of (or single) int with target dimensions. 51 | The lengths of `axes[0]` and `axes[1]` should be the same. 52 | 53 | # Returns 54 | A tensor with shape equal to the concatenation of `x`'s shape 55 | (less the dimension that was summed over) and `y`'s shape 56 | (less the batch dimension and the dimension that was summed over). 57 | If the final rank is 1, we reshape it to `(batch_size, 1)`. 58 | 59 | # Examples 60 | Assume `x = [[1, 2], [3, 4]]` and `y = [[5, 6], [7, 8]]` 61 | `batch_dot(x, y, axes=1) = [[17, 53]]` which is the main diagonal 62 | of `x.dot(y.T)`, although we never have to calculate the off-diagonal 63 | elements. 64 | 65 | Shape inference: 66 | Let `x`'s shape be `(100, 20)` and `y`'s shape be `(100, 30, 20)`. 67 | If `axes` is (1, 2), to find the output shape of resultant tensor, 68 | loop through each dimension in `x`'s shape and `y`'s shape: 69 | 70 | * `x.shape[0]` : 100 : append to output shape 71 | * `x.shape[1]` : 20 : do not append to output shape, 72 | dimension 1 of `x` has been summed over. (`dot_axes[0]` = 1) 73 | * `y.shape[0]` : 100 : do not append to output shape, 74 | always ignore first dimension of `y` 75 | * `y.shape[1]` : 30 : append to output shape 76 | * `y.shape[2]` : 20 : do not append to output shape, 77 | dimension 2 of `y` has been summed over. (`dot_axes[1]` = 2) 78 | `output_shape` = `(100, 30)` 79 | 80 | ```python 81 | >>> x_batch = K.ones(shape=(32, 20, 1)) 82 | >>> y_batch = K.ones(shape=(32, 30, 20)) 83 | >>> xy_batch_dot = K.batch_dot(x_batch, y_batch, axes=[1, 2]) 84 | >>> K.int_shape(xy_batch_dot) 85 | (32, 1, 30) 86 | ``` 87 | """ 88 | if isinstance(axes, int): 89 | axes = (axes, axes) 90 | #print('1') 91 | if ndim(x) == 2 and ndim(y) == 2: 92 | if tf_major_version >= 1: 93 | if axes[0] == axes[1]: 94 | out = tf.reduce_sum(tf.multiply(x, y), axes[0]) 95 | else: 96 | out = tf.reduce_sum(tf.multiply(tf.transpose(x, [1, 0]), y), axes[1]) 97 | else: 98 | if axes[0] == axes[1]: 99 | out = tf.reduce_sum(tf.mul(x, y), axes[0]) 100 | else: 101 | out = tf.reduce_sum(tf.mul(tf.transpose(x, [1, 0]), y), axes[1]) 102 | else: 103 | if axes is not None: 104 | #print('2') 105 | adj_x = None if axes[0] == ndim(x) - 1 else True 106 | adj_y = True if axes[1] == ndim(y) - 1 else None 107 | else: 108 | #print('3') 109 | adj_x = None 110 | adj_y = None 111 | # TODO: remove later. 
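        # Compatibility shim, apparently adapted from the Keras backend: very old
        # TensorFlow exposed tf.batch_matmul (whose keyword names changed between
        # releases, hence the adj_a/adj_b vs adj_x/adj_y fallback), whereas
        # TF >= 1.0 folds the batched case into tf.matmul with adjoint_a/adjoint_b.
        # Note that the 2-D branch above reads tf_major_version, which is assumed
        # to be defined elsewhere, e.g. as int(tf.__version__.split('.')[0]).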
112 | if hasattr(tf, 'batch_matmul'): 113 | try: 114 | out = tf.batch_matmul(x, y, adj_a=adj_x, adj_b=adj_y) 115 | #print('4') 116 | except TypeError: 117 | out = tf.batch_matmul(x, y, adj_x=adj_x, adj_y=adj_y) 118 | else: 119 | out = tf.matmul(x, y, adjoint_a=adj_x, adjoint_b=adj_y) 120 | if ndim(out) == 1: 121 | out = expand_dims(out, 1) 122 | return out 123 | 124 | 125 | def getVideoDualSemanticEmbeddingWithQuestionAttention(x,w2v,embedded_stories_words,embedded_question,T_B,pca_mat=None): 126 | ''' 127 | x: input video cnn feature with size of (batch_size, timesteps, channels, height, width) 128 | w2v: word 2 vec (|v|,dim) 129 | ''' 130 | input_shape = x.get_shape().as_list() 131 | w2v_shape = w2v.get_shape().as_list() 132 | assert(len(input_shape)==5) 133 | axis = [0,1,3,4,2] 134 | x = tf.transpose(x,perm=axis) 135 | x = tf.reshape(x,(-1,input_shape[2])) 136 | 137 | if pca_mat is not None: 138 | linear_proj = tf.Variable(0.1*pca_mat,dtype='float32',name='visual_linear_proj') 139 | else: 140 | linear_proj = InitUtil.init_weight_variable((input_shape[2],w2v_shape[-1]), init_method='uniform', name='visual_linear_proj') 141 | 142 | x = tf.matmul(x,linear_proj) 143 | x = tf.nn.l2_normalize(x,-1) 144 | 145 | w2v_cov = tf.matmul(tf.transpose(w2v,perm=[1,0]),w2v) 146 | x = tf.matmul(x,w2v_cov) # (batch_size*timesteps*height*width, |V|) 147 | 148 | 149 | #----------------------- 150 | 151 | x = tf.reshape(x,(-1,input_shape[1],input_shape[3],input_shape[4],w2v_shape[-1])) 152 | axis = [0,1,4,2,3] 153 | x = tf.transpose(x,perm=axis) 154 | 155 | # can be extended to different architecture 156 | x = tf.reduce_sum(x,reduction_indices=[3,4]) 157 | x = tf.nn.l2_normalize(x,-1) 158 | 159 | #----------------------- 160 | stories_cov = batch_dot(tf.transpose(embedded_stories_words,perm=[0,2,1]),embedded_stories_words) 161 | x = batch_dot(x,stories_cov) 162 | #----------------------- 163 | x = tf.nn.l2_normalize(x,-1) 164 | 165 | embedded_question = tf.tile(tf.expand_dims(embedded_question,dim=1),[1,input_shape[1],1]) 166 | 167 | 168 | frame_weight = tf.reduce_sum(x*embedded_question,reduction_indices=-1,keep_dims=True) 169 | frame_weight = tf.nn.softmax(frame_weight,dim=1) 170 | 171 | frame_weight =tf.tile(frame_weight,[1,1,w2v_shape[-1]]) 172 | 173 | x = tf.reduce_sum(x*frame_weight,reduction_indices=1) 174 | 175 | x = tf.matmul(x,T_B) 176 | 177 | x = tf.nn.l2_normalize(x,-1) 178 | return x 179 | 180 | 181 | 182 | def getVideoDualSemanticEmbeddingWithQuestionAttention_up(x,w2v,embedded_stories_words,embedded_question,T_B,pca_mat=None): 183 | ''' 184 | x: input video cnn feature with size of (batch_size, timesteps, channels, height, width) 185 | w2v: word 2 vec (|v|,dim) 186 | ''' 187 | input_shape = x.get_shape().as_list() 188 | w2v_shape = w2v.get_shape().as_list() 189 | assert(len(input_shape)==5) 190 | axis = [0,1,3,4,2] 191 | x = tf.transpose(x,perm=axis) 192 | x = tf.reshape(x,(-1,input_shape[2])) 193 | 194 | 195 | if pca_mat is not None: 196 | linear_proj = tf.Variable(0.1*pca_mat,dtype='float32',name='visual_linear_proj') 197 | else: 198 | linear_proj = InitUtil.init_weight_variable((input_shape[2],w2v_shape[-1]), init_method='uniform', name='visual_linear_proj') 199 | 200 | x = tf.matmul(x,linear_proj) 201 | x = tf.nn.l2_normalize(x,-1) 202 | 203 | #----------------------- 204 | w2v_cov = tf.matmul(tf.transpose(w2v,perm=[1,0]),w2v) 205 | x = tf.matmul(x,w2v_cov) # (batch_size*timesteps*height*width, |V|) 206 | 207 | #----------------------- 208 | 209 | x = 
tf.reshape(x,(-1,input_shape[1],input_shape[3],input_shape[4],w2v_shape[-1])) 210 | axis = [0,1,4,2,3] 211 | x = tf.transpose(x,perm=axis) 212 | 213 | # can be extended to different architecture 214 | x = tf.reduce_sum(x,reduction_indices=[3,4]) 215 | x = tf.nn.l2_normalize(x,-1) 216 | 217 | 218 | 219 | #----------------------- 220 | 221 | 222 | stories_cov = batch_dot(tf.transpose(embedded_stories_words,perm=[0,2,1]),embedded_stories_words) 223 | x_out = batch_dot(x,stories_cov) 224 | 225 | 226 | 227 | #----------------------- 228 | x = tf.nn.l2_normalize(x_out,-1) 229 | 230 | embedded_question_use = tf.tile(tf.expand_dims(embedded_question,dim=1),[1,input_shape[1],1]) 231 | 232 | 233 | frame_weight = tf.reduce_sum(x*embedded_question_use,reduction_indices=-1,keep_dims=True) 234 | 235 | frame_weight = tf.nn.softmax(frame_weight,dim=1) 236 | 237 | 238 | 239 | frame_weight =tf.tile(frame_weight,[1,1,w2v_shape[-1]]) 240 | 241 | x_weight_new = tf.reduce_sum(x*frame_weight,reduction_indices=1) 242 | 243 | 244 | x_weight_use = tf.expand_dims(x_weight_new, dim = 1) 245 | 246 | story_weight = tf.matmul(x_weight_use,tf.transpose(embedded_stories_words,perm=[0,2,1])) 247 | 248 | story_weight = tf.nn.relu(story_weight) 249 | 250 | embedded_stories_words = tf.multiply(tf.transpose(story_weight,perm=[0,2,1]), embedded_stories_words) 251 | stories_cov = batch_dot(tf.transpose(embedded_stories_words,perm=[0,2,1]),embedded_stories_words) 252 | 253 | x = batch_dot(x,stories_cov) 254 | 255 | 256 | 257 | 258 | x = tf.nn.l2_normalize(x,-1) 259 | 260 | 261 | frame_weight = tf.reduce_sum(x*embedded_question_use,reduction_indices=-1,keep_dims=True) 262 | 263 | frame_weight = tf.nn.softmax(frame_weight,dim=1) 264 | 265 | frame_weight =tf.tile(frame_weight,[1,1,w2v_shape[-1]]) 266 | 267 | x = tf.reduce_sum(x*frame_weight,reduction_indices=1) 268 | 269 | 270 | x = tf.matmul(x,T_B) 271 | 272 | x = tf.nn.l2_normalize(x,-1) 273 | return x 274 | 275 | 276 | def getAverageRepresentation(sentence, T_B, d_lproj): 277 | sentence = tf.reduce_sum(sentence,reduction_indices=-2) 278 | 279 | sentence_shape = sentence.get_shape().as_list() 280 | if len(sentence_shape)==2: 281 | sentence = tf.matmul(sentence,T_B) 282 | elif len(sentence_shape)==3: 283 | sentence = tf.reshape(sentence,(-1,sentence_shape[-1])) 284 | sentence = tf.matmul(sentence,T_B) 285 | sentence = tf.reshape(sentence,(-1,sentence_shape[1],d_lproj)) 286 | else: 287 | raise ValueError('Invalid sentence_shape:'+sentence_shape) 288 | 289 | sentence = tf.nn.l2_normalize(sentence,-1) 290 | return sentence 291 | 292 | 293 | 294 | 295 | 296 | def getMemoryNetworks(embeded_stories, embeded_question, d_lproj, return_sequences=False): 297 | 298 | ''' 299 | embeded_stories: (batch_size, num_of_sentence, num_of_words, embeded_words_dims) 300 | embeded_question:(batch_size, embeded_words_dims) 301 | output_dims: the dimension of stories 302 | ''' 303 | stories_shape = embeded_stories.get_shape().as_list() 304 | embeded_question_shape = embeded_question.get_shape().as_list() 305 | num_of_sentence = stories_shape[-3] 306 | input_dims = stories_shape[-1] 307 | output_dims = embeded_question_shape[-1] 308 | 309 | 310 | embeded_stories = tf.reduce_sum(embeded_stories,reduction_indices=-2) 311 | embeded_stories = tf.nn.l2_normalize(embeded_stories,-2) 312 | 313 | 314 | embeded_question = tf.tile(tf.expand_dims(embeded_question,dim=1),[1,num_of_sentence,1]) 315 | 316 | sen_weight = tf.reduce_sum(embeded_question*embeded_stories,reduction_indices=-1,keep_dims=True) 317 | 318 | 
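# Question-guided sentence attention: the dot-product scores computed above are
# normalized with a softmax over the sentence axis and then used either to re-weight the
# sentence embeddings (return_sequences=True) or to pool them into a single story vector.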
sen_weight = tf.nn.softmax(sen_weight,dim=1) 319 | sen_weight = tf.tile(sen_weight,[1,1,output_dims]) 320 | if return_sequences: 321 | embeded_stories = embeded_stories*sen_weight 322 | else: 323 | embeded_stories = tf.reduce_sum(embeded_stories*sen_weight,reduction_indices=1) # (batch_size, output_dims) 324 | 325 | return embeded_stories 326 | 327 | 328 | 329 | 330 | def getVideoDualSemanticEmbeddingWithQuestionAttention_question_guid(embeded_stories, d_lproj, x,w2v,embedded_stories_words,embedded_question,T_B,pca_mat=None,return_sequences=True): 331 | ''' 332 | x: input video cnn feature with size of (batch_size, timesteps, channels, height, width) 333 | w2v: word 2 vec (|v|,dim) 334 | ''' 335 | input_shape = x.get_shape().as_list() 336 | w2v_shape = w2v.get_shape().as_list() 337 | assert(len(input_shape)==5) 338 | axis = [0,1,3,4,2] 339 | x = tf.transpose(x,perm=axis) 340 | x = tf.reshape(x,(-1,input_shape[2])) 341 | 342 | if pca_mat is not None: 343 | linear_proj = tf.Variable(0.1*pca_mat,dtype='float32',name='visual_linear_proj') 344 | else: 345 | linear_proj = InitUtil.init_weight_variable((input_shape[2],w2v_shape[-1]), init_method='uniform', name='visual_linear_proj') 346 | 347 | x = tf.matmul(x,linear_proj) 348 | x = tf.nn.l2_normalize(x,-1) 349 | 350 | 351 | 352 | #----------------------- 353 | w2v_cov = tf.matmul(tf.transpose(w2v,perm=[1,0]),w2v) 354 | x = tf.matmul(x,w2v_cov) # (batch_size*timesteps*height*width, |V|) 355 | 356 | #----------------------- 357 | 358 | x = tf.reshape(x,(-1,input_shape[1],input_shape[3],input_shape[4],w2v_shape[-1])) 359 | axis = [0,1,4,2,3] 360 | x = tf.transpose(x,perm=axis) 361 | 362 | # can be extended to different architecture 363 | x = tf.reduce_sum(x,reduction_indices=[3,4]) 364 | x = tf.nn.l2_normalize(x,-1) 365 | 366 | 367 | 368 | #----------------------- 369 | 370 | stories_cov = batch_dot(tf.transpose(embedded_stories_words,perm=[0,2,1]),embedded_stories_words) 371 | x_out = batch_dot(x,stories_cov) 372 | 373 | 374 | 375 | #----------------------- 376 | x = tf.nn.l2_normalize(x_out,-1) 377 | 378 | embedded_question_use = tf.tile(tf.expand_dims(embedded_question,dim=1),[1,input_shape[1],1]) 379 | 380 | 381 | frame_weight = tf.reduce_sum(x*embedded_question_use,reduction_indices=-1,keep_dims=True) 382 | 383 | frame_weight = tf.nn.softmax(frame_weight,dim=1) 384 | 385 | 386 | 387 | frame_weight =tf.tile(frame_weight,[1,1,w2v_shape[-1]]) 388 | 389 | x_weight_new = tf.reduce_sum(x*frame_weight,reduction_indices=1) 390 | 391 | 392 | x_weight_use = tf.expand_dims(x_weight_new, dim = 1) 393 | 394 | story_weight = tf.matmul(x_weight_use,tf.transpose(embedded_stories_words,perm=[0,2,1])) 395 | 396 | story_weight = tf.nn.relu(story_weight) 397 | 398 | embedded_stories_words = tf.multiply(tf.transpose(story_weight,perm=[0,2,1]), embedded_stories_words) 399 | 400 | stories_cov = batch_dot(tf.transpose(embedded_stories_words,perm=[0,2,1]),embedded_stories_words) 401 | 402 | x = batch_dot(x,stories_cov) 403 | 404 | x = tf.nn.l2_normalize(x_out,-1) 405 | 406 | #------------------------------------------------------------------------------------------------------------- 407 | 408 | stories_shape = embeded_stories.get_shape().as_list() 409 | embeded_question_shape = embedded_question.get_shape().as_list() 410 | num_of_sentence = stories_shape[-3] 411 | input_dims = stories_shape[-1] 412 | output_dims = embeded_question_shape[-1] 413 | 414 | print('embeded_question_shape', embeded_question_shape) 415 | print('num_of_sentence', num_of_sentence) 416 | 417 | 
print('output_dims', output_dims) 418 | print('stories_shape', stories_shape) 419 | 420 | 421 | embeded_question = tf.tile(tf.expand_dims(embedded_question,dim=1),[1,num_of_sentence,1]) 422 | 423 | sen_weight = tf.reduce_sum(embeded_question*embedded_stories_words,reduction_indices=-1,keep_dims=True) 424 | 425 | 426 | sen_weight = tf.nn.relu(sen_weight) 427 | sen_weight = tf.tile(sen_weight,[1,1,output_dims]) 428 | if return_sequences: 429 | embeded_stories_used = embedded_stories_words*sen_weight 430 | else: 431 | embeded_stories_used = tf.reduce_sum(embedded_stories_words*sen_weight,reduction_indices=1) 432 | 433 | 434 | #------------------------------------------------------------------------------------------------------------- 435 | stories_cov = batch_dot(tf.transpose(embeded_stories_used,perm=[0,2,1]),embeded_stories_used) 436 | 437 | x = batch_dot(x,stories_cov) 438 | 439 | 440 | 441 | 442 | #----------------------- 443 | x = tf.nn.l2_normalize(x,-1) 444 | 445 | 446 | 447 | 448 | frame_weight = tf.reduce_sum(x*embedded_question_use,reduction_indices=-1,keep_dims=True) 449 | 450 | frame_weight = tf.nn.softmax(frame_weight,dim=1) 451 | 452 | frame_weight =tf.tile(frame_weight,[1,1,w2v_shape[-1]]) 453 | 454 | x = tf.reduce_sum(x*frame_weight,reduction_indices=1) 455 | 456 | 457 | #----------------------------------------------- 458 | 459 | x = tf.matmul(x,T_B) 460 | 461 | x = tf.nn.l2_normalize(x,-1) 462 | 463 | return x 464 | 465 | 466 | -------------------------------------------------------------------------------- /model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bowong/Layered-Memory-Network/86364077c40de7674088248b81ef805d7bfa7f4d/model/__init__.py -------------------------------------------------------------------------------- /mqa_video+subtitle+question.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import h5py 4 | import math 5 | #import MovieQA_benchmark as MovieQA 6 | from model import DataUtil 7 | from model import ModelUtil 8 | from model import SEModelUtil 9 | import word2vec as w2v 10 | 11 | os.environ["CUDA_VISIBLE_DEVICES"]="1" 12 | 13 | import tensorflow as tf 14 | from sklearn.decomposition import PCA 15 | import cPickle as pickle 16 | import time 17 | import json 18 | from collections import namedtuple 19 | 20 | def build_model(input_video, input_stories, input_question, input_answer, 21 | v2i,w2v_model,pca_mat=None,d_w2v=300,d_lproj=300, 22 | answer_index = None, lr=0.01, question_guided=False): 23 | 24 | with tf.variable_scope('video_subtitle_hierarchical_frame_clip_question') as scope: 25 | 26 | T_B, T_w2v, T_mask, pca_mat_ = ModelUtil.setWord2VecModelConfiguration(v2i,w2v_model,d_w2v,d_lproj) 27 | # encode question 28 | embedded_question_words, mask_q = ModelUtil.getEmbeddingWithWord2Vec(input_question, T_w2v, T_mask) 29 | embedded_question = SEModelUtil.getAverageRepresentation(embedded_question_words,T_B,d_lproj) 30 | 31 | # encode stories 32 | embedded_stories_words, mask_s = ModelUtil.getEmbeddingWithWord2Vec(input_stories, T_w2v, T_mask) 33 | embedded_stories = ModelUtil.getMemoryNetworks(embedded_stories_words, embedded_question, d_lproj, T_B=T_B, return_sequences=True) 34 | 35 | # encode video 36 | 37 | embedded_video = SEModelUtil.getVideoDualSemanticEmbeddingWithQuestionAttention(input_video, T_w2v, embedded_stories, embedded_question, T_B, pca_mat=pca_mat) # batch x timesteps x d_w2v 38 | 
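# embedded_video: frame CNN features are projected into the word2vec space (with the
# PCA-initialized linear_proj), correlated with the subtitle memory, and pooled over
# frames by question-guided attention, yielding one answer-space embedding per example.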
39 | 40 | # encode answers 41 | embedded_answer_words, mask_a = ModelUtil.getEmbeddingWithWord2Vec(input_answer, T_w2v, T_mask) 42 | embedded_answer = SEModelUtil.getAverageRepresentation(embedded_answer_words,T_B,d_lproj) 43 | 44 | # get video loss 45 | video_loss,video_scores = ModelUtil.getClassifierLoss(embedded_video, embedded_question, embedded_answer, answer_index=answer_index) 46 | 47 | 48 | # train module 49 | loss = tf.reduce_mean(video_loss) 50 | 51 | optimizer = tf.train.GradientDescentOptimizer(lr) 52 | train = optimizer.minimize(loss) 53 | return train,loss,video_scores 54 | 55 | def linear_project_pca_initialization(hf, feature_shape, d_w2v=300, output_path=None): 56 | 57 | print('--utilize PCA to initialize the embedding matrix of feature to d_w2v') 58 | samples = [] 59 | for imdb_key in hf.keys(): 60 | feature = hf[imdb_key][:] 61 | axis = [0,2,3,1] 62 | feature = np.transpose(feature, tuple(axis)) 63 | feature = np.reshape(feature,(-1,feature_shape[1])) 64 | feature = np.random.permutation(feature) 65 | samples.extend(feature[:50]) 66 | print('samples:',len(samples)) 67 | 68 | pca = PCA(n_components=d_w2v, whiten=True) 69 | pca_mat = pca.fit_transform(np.asarray(samples).T) # 1024 x 300 70 | 71 | pickle.dump(pca_mat,open(output_path,'w')) 72 | print('pca_amt dump to file:',output_path) 73 | return pca_mat 74 | 75 | 76 | def exe_model(sess, data, batch_size, v2i, hf, feature_shape, stories, story_shape, 77 | loss, scores, input_video, input_question, input_stories, input_answer, y, numberOfChoices=5, train=None, nql=25, nqa=32): 78 | if train is not None: 79 | np.random.shuffle(data) 80 | 81 | total_data = len(data) 82 | num_batch = int(round(total_data*1.0/batch_size)) 83 | 84 | total_correct_num = 0 85 | total_loss = 0.0 86 | for batch_idx in xrange(num_batch): 87 | batch_qa = data[batch_idx*batch_size:min((batch_idx+1)*batch_size,total_data)] 88 | 89 | data_q,data_a,data_y = DataUtil.getBatchIndexedQAs_return(batch_qa,v2i, nql=nql, nqa=nqa, numOfChoices=numberOfChoices) 90 | data_s = DataUtil.getBatchIndexedStories(batch_qa,stories,v2i,story_shape) 91 | data_v = DataUtil.getBatchVideoFeatureFromQid(batch_qa, hf, feature_shape) 92 | if train is not None: 93 | _, l, s = sess.run([train,loss,scores],feed_dict={input_video:data_v, input_stories:data_s, input_question:data_q, input_answer:data_a, y:data_y}) 94 | else: 95 | l, s = sess.run([loss,scores],feed_dict={input_video:data_v, input_stories:data_s, input_question:data_q, input_answer:data_a, y:data_y}) 96 | 97 | 98 | num_correct = np.sum(np.where(np.argmax(s,axis=-1)==np.argmax(data_y,axis=-1),1,0)) 99 | total_correct_num += num_correct 100 | total_loss += l 101 | 102 | total_acc = total_correct_num*1.0/total_data 103 | total_loss = total_loss/num_batch 104 | return total_acc, total_loss 105 | 106 | 107 | 108 | def train_model(train_stories,val_stories,v2i,trained_video_QAs,val_video_QAs,hf,f_type,nql=25,nqa=32,numberOfChoices=5, 109 | feature_shape=(16,1024,7,7), 110 | batch_size=8,total_epoch=100, 111 | lr=0.01,pretrained_model=False,pca_mat_init_file=None): 112 | 113 | 114 | w2v_mqa_model_filename = '/data1/wb/movie_plots_1364.d-300.mc1.w2v' 115 | w2v_model = w2v.load(w2v_mqa_model_filename, kind='bin') 116 | 117 | ''' 118 | model parameters 119 | ''' 120 | size_voc = len(v2i) 121 | max_sentences = 3660 122 | max_words = 40 123 | 124 | story_shape = (max_sentences,max_words) 125 | 126 | size_voc = len(v2i) 127 | 128 | 129 | print('building model ...') 130 | 131 | if os.path.exists(pca_mat_init_file): 132 | pca_mat = 
pickle.load(open(pca_mat_init_file,'r')) 133 | else: 134 | pca_mat = linear_project_pca_initialization(hf, feature_shape, d_w2v=300, output_path=pca_mat_init_file) 135 | 136 | print('pca_mat.shape:',pca_mat.shape) 137 | 138 | input_video = tf.placeholder(tf.float32, shape=(None,)+feature_shape,name='input_video') 139 | input_stories = tf.placeholder(tf.int32, shape=(None, max_sentences, max_words),name='input_stories') 140 | input_question = tf.placeholder(tf.int32, shape=(None,nql), name='input_question') 141 | input_answer = tf.placeholder(tf.int32, shape=(None,numberOfChoices,nqa), name='input_answer') 142 | 143 | y = tf.placeholder(tf.float32,shape=(None, numberOfChoices)) 144 | 145 | train,loss,scores = build_model(input_video, input_stories, input_question, input_answer, v2i,w2v_model, 146 | pca_mat=pca_mat, 147 | d_w2v=300,d_lproj=300, 148 | answer_index=y, lr=lr) 149 | 150 | ''' 151 | configure && runtime environment 152 | ''' 153 | config = tf.ConfigProto() 154 | config.gpu_options.per_process_gpu_memory_fraction = 0.5 155 | config.log_device_placement=False 156 | 157 | sess = tf.Session(config=config) 158 | 159 | init = tf.global_variables_initializer() 160 | sess.run(init) 161 | 162 | ''' 163 | training parameters 164 | ''' 165 | 166 | with open('train_split.json') as fid: 167 | trdev = json.load(fid) 168 | 169 | 170 | def getTrainDevSplit(trained_video_QAs,trdev): 171 | train_data = [] 172 | dev_data = [] 173 | for k, qa in enumerate(trained_video_QAs): 174 | 175 | if qa.imdb_key in trdev['train']: 176 | train_data.append(qa) 177 | else: 178 | dev_data.append(qa) 179 | return train_data,dev_data 180 | 181 | train_data,dev_data = getTrainDevSplit(trained_video_QAs,trdev) 182 | 183 | 184 | with sess.as_default(): 185 | saver = tf.train.Saver(sharded=True,max_to_keep=total_epoch) 186 | if pretrained_model is not None: 187 | saver.restore(sess, pretrained_model) 188 | print('restore pre trained file:' + pretrained_model) 189 | for epoch in xrange(total_epoch): 190 | 191 | # # shuffle 192 | print('Epoch: %d/%d, Batch_size: %d' %(epoch+1,total_epoch,batch_size)) 193 | # train phase 194 | tic = time.time() 195 | total_acc, total_loss = exe_model(sess, train_data, batch_size, v2i, hf, feature_shape, train_stories, story_shape, 196 | loss, scores, input_video, input_question, input_stories, input_answer, y, numberOfChoices=5, train=train, nql=25, nqa=32) 197 | print(' --Train--, Loss: %.5f, Acc: %.5f.......Time:%.3f' %(total_loss,total_acc,time.time()-tic)) 198 | 199 | # dev phase 200 | tic = time.time() 201 | total_acc, total_loss = exe_model(sess, dev_data, batch_size, v2i, hf, feature_shape, train_stories, story_shape, 202 | loss, scores, input_video, input_question, input_stories, input_answer, y, numberOfChoices=5, train=None, nql=25, nqa=32) 203 | print(' --Train-val--, Loss: %.5f, Acc: %.5f.......Time:%.3f' %(total_loss,total_acc,time.time()-tic)) 204 | # eval phase 205 | 206 | tic = time.time() 207 | total_acc, total_loss = exe_model(sess, val_video_QAs, batch_size, v2i, hf, feature_shape, val_stories, story_shape, 208 | loss, scores, input_video, input_question, input_stories, input_answer, y, numberOfChoices=5, train=None, nql=25, nqa=32) 209 | print(' --Val--, Loss: %.5f, Acc: %.5f.......Time:%.3f' %(total_loss,total_acc,time.time()-tic)) 210 | 211 | 212 | 213 | #save model 214 | export_path = '/data1/wb/saved_model/vqa_baseline/video+subtitle'+'/'+f_type+'_b'+str(batch_size)+'/'+'lr'+str(lr)+'_f'+str(feature_shape[0]) 215 | if not os.path.exists(export_path): 216 | 
os.makedirs(export_path) 217 | print('mkdir %s' %export_path) 218 | save_path = saver.save(sess, export_path+'/'+'E'+str(epoch+1)+'_A'+str(total_acc)+'.ckpt') 219 | print("Model saved in file: %s" % save_path) 220 | 221 | 222 | def trans(all): 223 | 224 | qa_list = [] 225 | for dicts in all: 226 | 227 | qa_list.append( 228 | QAInfo(dicts['qid'], dicts['questions'], dicts['answers'] , dicts['ground_truth'], 229 | dicts['imdb_key'], dicts['video_clips'])) 230 | return qa_list 231 | 232 | 233 | if __name__ == '__main__': 234 | 235 | # 'video+subtitle task' 236 | 237 | nql=25 # sequences length for question 238 | nqa=32 # sequences length for anwser 239 | numberOfChoices = 5 # for input choices, one for correct, one for wrong answer 240 | QAInfo = namedtuple('QAInfo','qid question answers correct_index imdb_key video_clips') 241 | 242 | 243 | v2i = pickle.load(open("/data1/wb/movieQA_v2i.pkl","rb")) 244 | qa_train = trans(pickle.load(open("/data1/wb/process_train.pkl","rb"))) 245 | qa_val = trans(pickle.load(open("/data1/wb/process_val.pkl","rb"))) 246 | train_stories = pickle.load(open("/data1/wb/train_stories.pkl","rb")) 247 | val_stories = pickle.load(open("/data1/wb/val_stories.pkl","rb")) 248 | 249 | lr = 0.01 250 | 251 | ''' 252 | --------------------------------- 253 | 224x224 vgg all clips feature 254 | ''' 255 | 256 | video_feature_dims=512 257 | timesteps_v=32 # sequences length for video 258 | hight = 7 259 | width = 7 260 | feature_shape = (timesteps_v,video_feature_dims,hight,width) 261 | 262 | f_type = '224x224_VGG' 263 | feature_path = '/data1/wb/224x224_movie_all_clips_vgg_'+str(timesteps_v)+'f.h5' 264 | pca_mat_init_file = '/data1/wb/224x224_vgg_pca_mat.pkl' 265 | hf = h5py.File(feature_path,'r') 266 | 267 | 268 | pretrained_model = None 269 | train_model(train_stories,val_stories,v2i,qa_train,qa_val,hf,f_type,nql=25,nqa=32,numberOfChoices=5, 270 | feature_shape=feature_shape,lr=lr, 271 | batch_size=8,total_epoch=20, 272 | pretrained_model=pretrained_model,pca_mat_init_file=pca_mat_init_file) 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | -------------------------------------------------------------------------------- /mqa_video+subtitle+update+question.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import h5py 4 | import math 5 | #import MovieQA_benchmark as MovieQA 6 | from model import DataUtil 7 | from model import ModelUtil 8 | from model import SEModelUtil 9 | import word2vec as w2v 10 | 11 | os.environ["CUDA_VISIBLE_DEVICES"]="1" 12 | 13 | import tensorflow as tf 14 | from sklearn.decomposition import PCA 15 | import cPickle as pickle 16 | import time 17 | import json 18 | from collections import namedtuple 19 | 20 | def build_model(input_video, input_stories, input_question, input_answer, 21 | v2i,w2v_model,pca_mat=None,d_w2v=300,d_lproj=300, 22 | answer_index = None, lr=0.01, question_guided=False): 23 | 24 | 25 | with tf.variable_scope('video_subtitle_hierarchical_frame_clip') as scope: 26 | 27 | 28 | T_B, T_w2v, T_mask, pca_mat_ = ModelUtil.setWord2VecModelConfiguration(v2i,w2v_model,d_w2v,d_lproj) 29 | # encode question 30 | embedded_question_words, mask_q = ModelUtil.getEmbeddingWithWord2Vec(input_question, T_w2v, T_mask) 31 | embedded_question = SEModelUtil.getAverageRepresentation(embedded_question_words,T_B,d_lproj) 32 | 33 | # encode stories 34 | embedded_stories_words, mask_s = ModelUtil.getEmbeddingWithWord2Vec(input_stories, T_w2v, T_mask) 35 | 36 | embeded_stories = 
SEModelUtil.getAverageRepresentation(embedded_stories_words, T_B, d_lproj) 37 | 38 | embedded_video = SEModelUtil.getVideoDualSemanticEmbeddingWithQuestionAttention_question_guid(embedded_stories_words, d_lproj,input_video, T_w2v, embeded_stories, embedded_question, T_B, pca_mat=pca_mat, return_sequences=True) 39 | 40 | # encode answers 41 | embedded_answer_words, mask_a = ModelUtil.getEmbeddingWithWord2Vec(input_answer, T_w2v, T_mask) 42 | embedded_answer = SEModelUtil.getAverageRepresentation(embedded_answer_words,T_B,d_lproj) 43 | 44 | # get video loss 45 | video_loss,video_scores = ModelUtil.getClassifierLoss(embedded_video, embedded_question, embedded_answer, answer_index=answer_index) 46 | 47 | # train module 48 | loss = tf.reduce_mean(video_loss) 49 | 50 | optimizer = tf.train.GradientDescentOptimizer(lr) 51 | 52 | train = optimizer.minimize(loss) 53 | return train,loss,video_scores 54 | 55 | def linear_project_pca_initialization(hf, feature_shape, d_w2v=300, output_path=None): 56 | 57 | print('--utilize PCA to initialize the embedding matrix of feature to d_w2v') 58 | samples = [] 59 | for imdb_key in hf.keys(): 60 | feature = hf[imdb_key][:] 61 | axis = [0,2,3,1] 62 | feature = np.transpose(feature, tuple(axis)) 63 | feature = np.reshape(feature,(-1,feature_shape[1])) 64 | feature = np.random.permutation(feature) 65 | samples.extend(feature[:50]) 66 | print('samples:',len(samples)) 67 | 68 | pca = PCA(n_components=d_w2v, whiten=True) 69 | pca_mat = pca.fit_transform(np.asarray(samples).T) # 1024 x 300 70 | 71 | pickle.dump(pca_mat,open(output_path,'w')) 72 | print('pca_amt dump to file:',output_path) 73 | return pca_mat 74 | 75 | 76 | def exe_model(sess, data, batch_size, v2i, hf, feature_shape, stories, story_shape, 77 | loss, scores, input_video, input_question, input_stories, input_answer, y, numberOfChoices=5, train=None, nql=25, nqa=32): 78 | if train is not None: 79 | np.random.shuffle(data) 80 | 81 | total_data = len(data) 82 | num_batch = int(round(total_data*1.0/batch_size)) 83 | 84 | total_correct_num = 0 85 | total_loss = 0.0 86 | for batch_idx in xrange(num_batch): 87 | batch_qa = data[batch_idx*batch_size:min((batch_idx+1)*batch_size,total_data)] 88 | 89 | data_q,data_a,data_y = DataUtil.getBatchIndexedQAs_return(batch_qa,v2i, nql=nql, nqa=nqa, numOfChoices=numberOfChoices) 90 | data_s = DataUtil.getBatchIndexedStories(batch_qa,stories,v2i,story_shape) 91 | data_v = DataUtil.getBatchVideoFeatureFromQid(batch_qa, hf, feature_shape) 92 | if train is not None: 93 | _, l, s = sess.run([train,loss,scores],feed_dict={input_video:data_v, input_stories:data_s, input_question:data_q, input_answer:data_a, y:data_y}) 94 | else: 95 | l, s = sess.run([loss,scores],feed_dict={input_video:data_v, input_stories:data_s, input_question:data_q, input_answer:data_a, y:data_y}) 96 | 97 | num_correct = np.sum(np.where(np.argmax(s,axis=-1)==np.argmax(data_y,axis=-1),1,0)) 98 | total_correct_num += num_correct 99 | total_loss += l 100 | total_acc = total_correct_num*1.0/total_data 101 | total_loss = total_loss/num_batch 102 | return total_acc, total_loss 103 | 104 | 105 | 106 | def train_model(train_stories,val_stories,v2i,trained_video_QAs,val_video_QAs,hf,f_type,nql=25,nqa=32,numberOfChoices=5, 107 | feature_shape=(16,1024,7,7), 108 | batch_size=8,total_epoch=100, 109 | lr=0.01,pretrained_model=False,pca_mat_init_file=None): 110 | 111 | 112 | w2v_mqa_model_filename = '/home/wb/movie_plots_1364.d-300.mc1.w2v' 113 | w2v_model = w2v.load(w2v_mqa_model_filename, kind='bin') 114 | 115 | 116 | 
''' 117 | model parameters 118 | ''' 119 | size_voc = len(v2i) 120 | 121 | max_sentences = 3660 122 | 123 | max_words = 40 124 | 125 | story_shape = (max_sentences,max_words) 126 | 127 | size_voc = len(v2i) 128 | 129 | 130 | print('building model ...') 131 | 132 | if os.path.exists(pca_mat_init_file): 133 | pca_mat = pickle.load(open(pca_mat_init_file,'r')) 134 | else: 135 | pca_mat = linear_project_pca_initialization(hf, feature_shape, d_w2v=300, output_path=pca_mat_init_file) 136 | 137 | print('pca_mat.shape:',pca_mat.shape) 138 | 139 | input_video = tf.placeholder(tf.float32, shape=(None,)+feature_shape,name='input_video') 140 | input_stories = tf.placeholder(tf.int32, shape=(None, max_sentences, max_words),name='input_stories') 141 | input_question = tf.placeholder(tf.int32, shape=(None,nql), name='input_question') 142 | input_answer = tf.placeholder(tf.int32, shape=(None,numberOfChoices,nqa), name='input_answer') 143 | 144 | y = tf.placeholder(tf.float32,shape=(None, numberOfChoices)) 145 | 146 | train,loss,scores = build_model(input_video, input_stories, input_question, input_answer, v2i,w2v_model, 147 | pca_mat=pca_mat, 148 | d_w2v=300,d_lproj=300, 149 | answer_index=y, lr=lr) 150 | 151 | ''' 152 | configure && runtime environment 153 | ''' 154 | config = tf.ConfigProto() 155 | config.gpu_options.per_process_gpu_memory_fraction = 0.5 156 | config.log_device_placement=False 157 | sess = tf.Session(config=config) 158 | init = tf.global_variables_initializer() 159 | sess.run(init) 160 | 161 | ''' 162 | training parameters 163 | ''' 164 | 165 | with open('train_split.json') as fid: 166 | trdev = json.load(fid) 167 | 168 | 169 | def getTrainDevSplit(trained_video_QAs,trdev): 170 | train_data = [] 171 | dev_data = [] 172 | for k, qa in enumerate(trained_video_QAs): 173 | 174 | if qa.imdb_key in trdev['train']: 175 | train_data.append(qa) 176 | else: 177 | dev_data.append(qa) 178 | return train_data,dev_data 179 | 180 | train_data,dev_data = getTrainDevSplit(trained_video_QAs,trdev) 181 | 182 | 183 | with sess.as_default(): 184 | saver = tf.train.Saver(sharded=True,max_to_keep=total_epoch) 185 | if pretrained_model is not None: 186 | saver.restore(sess, pretrained_model) 187 | print('restore pre trained file:' + pretrained_model) 188 | for epoch in xrange(total_epoch): 189 | 190 | # # shuffle 191 | print('Epoch: %d/%d, Batch_size: %d' %(epoch+1,total_epoch,batch_size)) 192 | # train phase 193 | tic = time.time() 194 | total_acc, total_loss = exe_model(sess, train_data, batch_size, v2i, hf, feature_shape, train_stories, story_shape, 195 | loss, scores, input_video, input_question, input_stories, input_answer, y, numberOfChoices=5, train=train, nql=25, nqa=32) 196 | print(' --Train--, Loss: %.5f, Acc: %.5f.......Time:%.3f' %(total_loss,total_acc,time.time()-tic)) 197 | 198 | # dev phase 199 | tic = time.time() 200 | total_acc, total_loss = exe_model(sess, dev_data, batch_size, v2i, hf, feature_shape, train_stories, story_shape, 201 | loss, scores, input_video, input_question, input_stories, input_answer, y, numberOfChoices=5, train=None, nql=25, nqa=32) 202 | print(' --Train-val--, Loss: %.5f, Acc: %.5f.......Time:%.3f' %(total_loss,total_acc,time.time()-tic)) 203 | # eval phase 204 | tic = time.time() 205 | total_acc, total_loss = exe_model(sess, val_video_QAs, batch_size, v2i, hf, feature_shape, val_stories, story_shape, 206 | loss, scores, input_video, input_question, input_stories, input_answer, y, numberOfChoices=5, train=None, nql=25, nqa=32) 207 | print(' --Val--, Loss: %.5f, Acc: 
%.5f.......Time:%.3f' %(total_loss,total_acc,time.time()-tic)) 208 | 209 | #save model 210 | export_path = '/data1/wb/saved_model/vqa_baseline/video+subtitle'+'/'+f_type+'_b'+str(batch_size)+'/'+'lr'+str(lr)+'_f'+str(feature_shape[0]) 211 | if not os.path.exists(export_path): 212 | os.makedirs(export_path) 213 | print('mkdir %s' %export_path) 214 | save_path = saver.save(sess, export_path+'/'+'E'+str(epoch+1)+'_A'+str(total_acc)+'.ckpt') 215 | print("Model saved in file: %s" % save_path) 216 | 217 | 218 | def trans(all): 219 | 220 | qa_list = [] 221 | for dicts in all: 222 | 223 | qa_list.append( 224 | QAInfo(dicts['qid'], dicts['questions'], dicts['answers'] , dicts['ground_truth'], 225 | dicts['imdb_key'], dicts['video_clips'])) 226 | return qa_list 227 | 228 | 229 | if __name__ == '__main__': 230 | 231 | # 'video+subtitle task' 232 | 233 | nql=25 # sequences length for question 234 | nqa=32 # sequences length for anwser 235 | numberOfChoices = 5 # for input choices, one for correct, one for wrong answer 236 | QAInfo = namedtuple('QAInfo','qid question answers correct_index imdb_key video_clips') 237 | 238 | 239 | v2i = pickle.load(open("/data1/wb/movieQA_v2i.pkl","rb")) 240 | qa_train = trans(pickle.load(open("/data1/wb/process_train.pkl","rb"))) 241 | qa_val = trans(pickle.load(open("/data1/wb/process_val.pkl","rb"))) 242 | train_stories = pickle.load(open("/data1/wb/train_stories.pkl","rb")) 243 | val_stories = pickle.load(open("/data1/wb/val_stories.pkl","rb")) 244 | 245 | lr = 0.01 246 | 247 | 248 | ''' 249 | --------------------------------- 250 | 224x224 vgg all clips feature 251 | ''' 252 | 253 | video_feature_dims=512 254 | timesteps_v=32 # sequences length for video 255 | hight = 7 256 | width = 7 257 | feature_shape = (timesteps_v,video_feature_dims,hight,width) 258 | 259 | f_type = '224x224_VGG' 260 | feature_path = '/data1/wb/224x224_movie_all_clips_vgg_'+str(timesteps_v)+'f.h5' 261 | pca_mat_init_file = '/data1/wb/224x224_vgg_pca_mat.pkl' 262 | 263 | 264 | hf = h5py.File(feature_path,'r') 265 | 266 | pretrained_model = None 267 | train_model(train_stories,val_stories,v2i,qa_train,qa_val,hf,f_type,nql=25,nqa=32,numberOfChoices=5, 268 | feature_shape=feature_shape,lr=lr, 269 | batch_size=8,total_epoch=40, 270 | pretrained_model=pretrained_model,pca_mat_init_file=pca_mat_init_file) 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | -------------------------------------------------------------------------------- /mqa_video+subtitle.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import h5py 4 | import math 5 | #import MovieQA_benchmark as MovieQA 6 | from model import DataUtil 7 | from model import ModelUtil 8 | from model import SEModelUtil 9 | import word2vec as w2v 10 | 11 | os.environ["CUDA_VISIBLE_DEVICES"]="1" 12 | 13 | import tensorflow as tf 14 | from sklearn.decomposition import PCA 15 | import cPickle as pickle 16 | import time 17 | import json 18 | from collections import namedtuple 19 | 20 | def build_model(input_video, input_stories, input_question, input_answer, 21 | v2i,w2v_model,pca_mat=None,d_w2v=300,d_lproj=300, 22 | answer_index = None, lr=0.01, question_guided=False): 23 | 24 | 25 | with tf.variable_scope('video_subtitle_hierarchical_frame_clip') as scope: 26 | 27 | 28 | T_B, T_w2v, T_mask, pca_mat_ = ModelUtil.setWord2VecModelConfiguration(v2i,w2v_model,d_w2v,d_lproj) 29 | # encode question 30 | embedded_question_words, mask_q = 
ModelUtil.getEmbeddingWithWord2Vec(input_question, T_w2v, T_mask) 31 | embedded_question = SEModelUtil.getAverageRepresentation(embedded_question_words,T_B,d_lproj) 32 | 33 | # encode stories 34 | embedded_stories_words, mask_s = ModelUtil.getEmbeddingWithWord2Vec(input_stories, T_w2v, T_mask) 35 | embeded_stories = SEModelUtil.getAverageRepresentation(embedded_stories_words, T_B, d_lproj) 36 | # encode video 37 | embedded_video = SEModelUtil.getVideoDualSemanticEmbeddingWithQuestionAttention(input_video, T_w2v, embeded_stories, embedded_question, T_B, pca_mat=pca_mat) # batch x timesteps x d_w2v 38 | 39 | # encode answers 40 | embedded_answer_words, mask_a = ModelUtil.getEmbeddingWithWord2Vec(input_answer, T_w2v, T_mask) 41 | embedded_answer = SEModelUtil.getAverageRepresentation(embedded_answer_words,T_B,d_lproj) 42 | 43 | # get video loss 44 | video_loss,video_scores = ModelUtil.getClassifierLoss(embedded_video, embedded_question, embedded_answer, answer_index=answer_index) 45 | 46 | # train module 47 | loss = tf.reduce_mean(video_loss) 48 | optimizer = tf.train.GradientDescentOptimizer(lr) 49 | train = optimizer.minimize(loss) 50 | return train,loss,video_scores 51 | 52 | def linear_project_pca_initialization(hf, feature_shape, d_w2v=300, output_path=None): 53 | 54 | print('--utilize PCA to initialize the embedding matrix of feature to d_w2v') 55 | samples = [] 56 | for imdb_key in hf.keys(): 57 | feature = hf[imdb_key][:] 58 | axis = [0,2,3,1] 59 | feature = np.transpose(feature, tuple(axis)) 60 | feature = np.reshape(feature,(-1,feature_shape[1])) 61 | feature = np.random.permutation(feature) 62 | samples.extend(feature[:50]) 63 | print('samples:',len(samples)) 64 | 65 | pca = PCA(n_components=d_w2v, whiten=True) 66 | pca_mat = pca.fit_transform(np.asarray(samples).T) # 1024 x 300 67 | 68 | pickle.dump(pca_mat,open(output_path,'w')) 69 | print('pca_amt dump to file:',output_path) 70 | return pca_mat 71 | 72 | 73 | def exe_model(sess, data, batch_size, v2i, hf, feature_shape, stories, story_shape, 74 | loss, scores, input_video, input_question, input_stories, input_answer, y, numberOfChoices=5, train=None, nql=25, nqa=32): 75 | if train is not None: 76 | np.random.shuffle(data) 77 | 78 | total_data = len(data) 79 | num_batch = int(round(total_data*1.0/batch_size)) 80 | 81 | total_correct_num = 0 82 | total_loss = 0.0 83 | for batch_idx in xrange(num_batch): 84 | batch_qa = data[batch_idx*batch_size:min((batch_idx+1)*batch_size,total_data)] 85 | 86 | data_q,data_a,data_y = DataUtil.getBatchIndexedQAs_return(batch_qa,v2i, nql=nql, nqa=nqa, numOfChoices=numberOfChoices) 87 | data_s = DataUtil.getBatchIndexedStories(batch_qa,stories,v2i,story_shape) 88 | data_v = DataUtil.getBatchVideoFeatureFromQid(batch_qa, hf, feature_shape) 89 | if train is not None: 90 | _, l, s = sess.run([train,loss,scores],feed_dict={input_video:data_v, input_stories:data_s, input_question:data_q, input_answer:data_a, y:data_y}) 91 | else: 92 | l, s = sess.run([loss,scores],feed_dict={input_video:data_v, input_stories:data_s, input_question:data_q, input_answer:data_a, y:data_y}) 93 | 94 | 95 | num_correct = np.sum(np.where(np.argmax(s,axis=-1)==np.argmax(data_y,axis=-1),1,0)) 96 | total_correct_num += num_correct 97 | total_loss += l 98 | 99 | total_acc = total_correct_num*1.0/total_data 100 | total_loss = total_loss/num_batch 101 | return total_acc, total_loss 102 | 103 | 104 | def train_model(train_stories,val_stories,v2i,trained_video_QAs,val_video_QAs,hf,f_type,nql=25,nqa=32,numberOfChoices=5, 105 | 
feature_shape=(16,1024,7,7), 106 | batch_size=8,total_epoch=100, 107 | lr=0.01,pretrained_model=False,pca_mat_init_file=None): 108 | 109 | 110 | w2v_mqa_model_filename = '/data1/wb/movie_plots_1364.d-300.mc1.w2v' 111 | w2v_model = w2v.load(w2v_mqa_model_filename, kind='bin') 112 | 113 | ''' 114 | model parameters 115 | ''' 116 | size_voc = len(v2i) 117 | max_sentences = 3660 118 | max_words = 40 119 | story_shape = (max_sentences,max_words) 120 | size_voc = len(v2i) 121 | 122 | 123 | print('building model ...') 124 | 125 | if os.path.exists(pca_mat_init_file): 126 | pca_mat = pickle.load(open(pca_mat_init_file,'r')) 127 | else: 128 | pca_mat = linear_project_pca_initialization(hf, feature_shape, d_w2v=300, output_path=pca_mat_init_file) 129 | 130 | print('pca_mat.shape:',pca_mat.shape) 131 | 132 | input_video = tf.placeholder(tf.float32, shape=(None,)+feature_shape,name='input_video') 133 | input_stories = tf.placeholder(tf.int32, shape=(None, max_sentences, max_words),name='input_stories') 134 | input_question = tf.placeholder(tf.int32, shape=(None,nql), name='input_question') 135 | input_answer = tf.placeholder(tf.int32, shape=(None,numberOfChoices,nqa), name='input_answer') 136 | 137 | y = tf.placeholder(tf.float32,shape=(None, numberOfChoices)) 138 | 139 | train,loss,scores = build_model(input_video, input_stories, input_question, input_answer, v2i,w2v_model, 140 | pca_mat=pca_mat, 141 | d_w2v=300,d_lproj=300, 142 | answer_index=y, lr=lr) 143 | 144 | ''' 145 | configure && runtime environment 146 | ''' 147 | config = tf.ConfigProto() 148 | config.gpu_options.per_process_gpu_memory_fraction = 0.4 149 | config.log_device_placement=False 150 | 151 | sess = tf.Session(config=config) 152 | 153 | init = tf.global_variables_initializer() 154 | sess.run(init) 155 | 156 | ''' 157 | training parameters 158 | ''' 159 | 160 | with open('train_split.json') as fid: 161 | trdev = json.load(fid) 162 | 163 | 164 | def getTrainDevSplit(trained_video_QAs,trdev): 165 | train_data = [] 166 | dev_data = [] 167 | for k, qa in enumerate(trained_video_QAs): 168 | 169 | if qa.imdb_key in trdev['train']: 170 | train_data.append(qa) 171 | else: 172 | dev_data.append(qa) 173 | return train_data,dev_data 174 | 175 | train_data,dev_data = getTrainDevSplit(trained_video_QAs,trdev) 176 | 177 | 178 | 179 | with sess.as_default(): 180 | saver = tf.train.Saver(sharded=True,max_to_keep=total_epoch) 181 | if pretrained_model is not None: 182 | saver.restore(sess, pretrained_model) 183 | print('restore pre trained file:' + pretrained_model) 184 | for epoch in xrange(total_epoch): 185 | 186 | # # shuffle 187 | print('Epoch: %d/%d, Batch_size: %d' %(epoch+1,total_epoch,batch_size)) 188 | # train phase 189 | tic = time.time() 190 | total_acc, total_loss = exe_model(sess, train_data, batch_size, v2i, hf, feature_shape, train_stories, story_shape, 191 | loss, scores, input_video, input_question, input_stories, input_answer, y, numberOfChoices=5, train=train, nql=25, nqa=32) 192 | print(' --Train--, Loss: %.5f, Acc: %.5f.......Time:%.3f' %(total_loss,total_acc,time.time()-tic)) 193 | 194 | # dev phase 195 | tic = time.time() 196 | total_acc, total_loss = exe_model(sess, dev_data, batch_size, v2i, hf, feature_shape, train_stories, story_shape, 197 | loss, scores, input_video, input_question, input_stories, input_answer, y, numberOfChoices=5, train=None, nql=25, nqa=32) 198 | print(' --Train-val--, Loss: %.5f, Acc: %.5f.......Time:%.3f' %(total_loss,total_acc,time.time()-tic)) 199 | # eval phase 200 | tic = time.time() 201 | 
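# Validation pass: the same exe_model helper is reused with train=None, so the data is
# not shuffled and no optimizer step is run; only loss and accuracy are reported.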
total_acc, total_loss = exe_model(sess, val_video_QAs, batch_size, v2i, hf, feature_shape, val_stories, story_shape, 202 | loss, scores, input_video, input_question, input_stories, input_answer, y, numberOfChoices=5, train=None, nql=25, nqa=32) 203 | print(' --Val--, Loss: %.5f, Acc: %.5f.......Time:%.3f' %(total_loss,total_acc,time.time()-tic)) 204 | 205 | #save model 206 | export_path = '/data1/wb/saved_model/vqa_baseline/video+subtitle'+'/'+f_type+'_b'+str(batch_size)+'/'+'lr'+str(lr)+'_f'+str(feature_shape[0]) 207 | if not os.path.exists(export_path): 208 | os.makedirs(export_path) 209 | print('mkdir %s' %export_path) 210 | save_path = saver.save(sess, export_path+'/'+'E'+str(epoch+1)+'_A'+str(total_acc)+'.ckpt') 211 | print("Model saved in file: %s" % save_path) 212 | 213 | 214 | def trans(all): 215 | 216 | qa_list = [] 217 | for dicts in all: 218 | 219 | qa_list.append( 220 | QAInfo(dicts['qid'], dicts['questions'], dicts['answers'] , dicts['ground_truth'], 221 | dicts['imdb_key'], dicts['video_clips'])) 222 | return qa_list 223 | 224 | 225 | if __name__ == '__main__': 226 | 227 | # 'video+subtitle task' 228 | 229 | nql=25 # sequences length for question 230 | nqa=32 # sequences length for anwser 231 | numberOfChoices = 5 # for input choices, one for correct, one for wrong answer 232 | QAInfo = namedtuple('QAInfo','qid question answers correct_index imdb_key video_clips') 233 | 234 | v2i = pickle.load(open("/data1/wb/movieQA_v2i.pkl","rb")) 235 | qa_train = trans(pickle.load(open("/data1/wb/process_train.pkl","rb"))) 236 | qa_val = trans(pickle.load(open("/data1/wb/process_val.pkl","rb"))) 237 | train_stories = pickle.load(open("/data1/wb/train_stories.pkl","rb")) 238 | val_stories = pickle.load(open("/data1/wb/val_stories.pkl","rb")) 239 | 240 | lr = 0.01 241 | 242 | ''' 243 | --------------------------------- 244 | 224x224 vgg all clips feature 245 | ''' 246 | 247 | video_feature_dims=512 248 | timesteps_v=32 # sequences length for video 249 | hight = 7 250 | width = 7 251 | feature_shape = (timesteps_v,video_feature_dims,hight,width) 252 | 253 | f_type = '224x224_VGG' 254 | feature_path = '/data1/wb/224x224_movie_all_clips_vgg_'+str(timesteps_v)+'f.h5' 255 | pca_mat_init_file = '/data1/wb/224x224_vgg_pca_mat.pkl' 256 | 257 | hf = h5py.File(feature_path,'r') 258 | 259 | pretrained_model = None 260 | train_model(train_stories,val_stories,v2i,qa_train,qa_val,hf,f_type,nql=25,nqa=32,numberOfChoices=5, 261 | feature_shape=feature_shape,lr=lr, 262 | batch_size=8,total_epoch=20, 263 | pretrained_model=pretrained_model,pca_mat_init_file=pca_mat_init_file) 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | -------------------------------------------------------------------------------- /mqa_video+subtitlel+update.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import h5py 4 | import math 5 | #import MovieQA_benchmark as MovieQA 6 | from model import DataUtil 7 | from model import ModelUtil 8 | from model import SEModelUtil 9 | import word2vec as w2v 10 | 11 | os.environ["CUDA_VISIBLE_DEVICES"]="1" 12 | 13 | import tensorflow as tf 14 | from sklearn.decomposition import PCA 15 | import cPickle as pickle 16 | import time 17 | import json 18 | from collections import namedtuple 19 | 20 | def build_model(input_video, input_stories, input_question, input_answer, 21 | v2i,w2v_model,pca_mat=None,d_w2v=300,d_lproj=300, 22 | answer_index = None, lr=0.01, question_guided=False): 23 | 24 | 25 | with 
tf.variable_scope('video_subtitle_hierarchical_frame_clip') as scope: 26 | 27 | 28 | T_B, T_w2v, T_mask, pca_mat_ = ModelUtil.setWord2VecModelConfiguration(v2i,w2v_model,d_w2v,d_lproj) 29 | # encode question 30 | embedded_question_words, mask_q = ModelUtil.getEmbeddingWithWord2Vec(input_question, T_w2v, T_mask) 31 | embedded_question = SEModelUtil.getAverageRepresentation(embedded_question_words,T_B,d_lproj) 32 | 33 | # encode stories 34 | embedded_stories_words, mask_s = ModelUtil.getEmbeddingWithWord2Vec(input_stories, T_w2v, T_mask) 35 | embeded_stories = SEModelUtil.getAverageRepresentation(embedded_stories_words, T_B, d_lproj) 36 | # encode video 37 | embedded_video = SEModelUtil.getVideoDualSemanticEmbeddingWithQuestionAttention_up(input_video, T_w2v, embeded_stories, embedded_question, T_B, pca_mat=pca_mat) # batch x timesteps x d_w2v 38 | 39 | 40 | # encode answers 41 | embedded_answer_words, mask_a = ModelUtil.getEmbeddingWithWord2Vec(input_answer, T_w2v, T_mask) 42 | embedded_answer = SEModelUtil.getAverageRepresentation(embedded_answer_words,T_B,d_lproj) 43 | 44 | # get video loss 45 | video_loss,video_scores = ModelUtil.getClassifierLoss(embedded_video, embedded_question, embedded_answer, answer_index=answer_index) 46 | 47 | # train module 48 | loss = tf.reduce_mean(video_loss) 49 | 50 | optimizer = tf.train.GradientDescentOptimizer(lr) 51 | 52 | 53 | train = optimizer.minimize(loss) 54 | return train,loss,video_scores 55 | 56 | def linear_project_pca_initialization(hf, feature_shape, d_w2v=300, output_path=None): 57 | 58 | print('--utilize PCA to initialize the embedding matrix of feature to d_w2v') 59 | samples = [] 60 | for imdb_key in hf.keys(): 61 | feature = hf[imdb_key][:] 62 | axis = [0,2,3,1] 63 | feature = np.transpose(feature, tuple(axis)) 64 | feature = np.reshape(feature,(-1,feature_shape[1])) 65 | feature = np.random.permutation(feature) 66 | samples.extend(feature[:50]) 67 | print('samples:',len(samples)) 68 | 69 | pca = PCA(n_components=d_w2v, whiten=True) 70 | pca_mat = pca.fit_transform(np.asarray(samples).T) # 1024 x 300 71 | 72 | pickle.dump(pca_mat,open(output_path,'w')) 73 | print('pca_amt dump to file:',output_path) 74 | return pca_mat 75 | 76 | 77 | def exe_model(sess, data, batch_size, v2i, hf, feature_shape, stories, story_shape, 78 | loss, scores, input_video, input_question, input_stories, input_answer, y, numberOfChoices=5, train=None, nql=25, nqa=32): 79 | if train is not None: 80 | np.random.shuffle(data) 81 | 82 | total_data = len(data) 83 | num_batch = int(round(total_data*1.0/batch_size)) 84 | 85 | total_correct_num = 0 86 | total_loss = 0.0 87 | for batch_idx in xrange(num_batch): 88 | batch_qa = data[batch_idx*batch_size:min((batch_idx+1)*batch_size,total_data)] 89 | 90 | data_q,data_a,data_y = DataUtil.getBatchIndexedQAs_return(batch_qa,v2i, nql=nql, nqa=nqa, numOfChoices=numberOfChoices) 91 | data_s = DataUtil.getBatchIndexedStories(batch_qa,stories,v2i,story_shape) 92 | data_v = DataUtil.getBatchVideoFeatureFromQid(batch_qa, hf, feature_shape) 93 | if train is not None: 94 | _, l, s = sess.run([train,loss,scores],feed_dict={input_video:data_v, input_stories:data_s, input_question:data_q, input_answer:data_a, y:data_y}) 95 | else: 96 | l, s = sess.run([loss,scores],feed_dict={input_video:data_v, input_stories:data_s, input_question:data_q, input_answer:data_a, y:data_y}) 97 | 98 | num_correct = np.sum(np.where(np.argmax(s,axis=-1)==np.argmax(data_y,axis=-1),1,0)) 99 | total_correct_num += num_correct 100 | total_loss += l 101 | 102 | 
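# Epoch-level metrics: accuracy is computed over all QA examples, while the loss is
# averaged over the number of mini-batches.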
total_acc = total_correct_num*1.0/total_data 103 | total_loss = total_loss/num_batch 104 | return total_acc, total_loss 105 | 106 | 107 | 108 | def train_model(train_stories,val_stories,v2i,trained_video_QAs,val_video_QAs,hf,f_type,nql=25,nqa=32,numberOfChoices=5, 109 | feature_shape=(32,1024,7,7), 110 | batch_size=8,total_epoch=100, 111 | lr=0.01,pretrained_model=False,pca_mat_init_file=None): 112 | 113 | 114 | w2v_mqa_model_filename = '/home/wb/movie_plots_1364.d-300.mc1.w2v' 115 | w2v_model = w2v.load(w2v_mqa_model_filename, kind='bin') 116 | 117 | 118 | ''' 119 | model parameters 120 | ''' 121 | size_voc = len(v2i) 122 | 123 | 124 | max_sentences = 3660 125 | 126 | max_words = 40 127 | 128 | story_shape = (max_sentences,max_words) 129 | 130 | size_voc = len(v2i) 131 | 132 | 133 | 134 | print('building model ...') 135 | 136 | if os.path.exists(pca_mat_init_file): 137 | pca_mat = pickle.load(open(pca_mat_init_file,'r')) 138 | else: 139 | pca_mat = linear_project_pca_initialization(hf, feature_shape, d_w2v=300, output_path=pca_mat_init_file) 140 | 141 | print('pca_mat.shape:',pca_mat.shape) 142 | 143 | input_video = tf.placeholder(tf.float32, shape=(None,)+feature_shape,name='input_video') 144 | input_stories = tf.placeholder(tf.int32, shape=(None, max_sentences, max_words),name='input_stories') 145 | input_question = tf.placeholder(tf.int32, shape=(None,nql), name='input_question') 146 | input_answer = tf.placeholder(tf.int32, shape=(None,numberOfChoices,nqa), name='input_answer') 147 | 148 | y = tf.placeholder(tf.float32,shape=(None, numberOfChoices)) 149 | 150 | train,loss,scores = build_model(input_video, input_stories, input_question, input_answer, v2i,w2v_model, 151 | pca_mat=pca_mat, 152 | d_w2v=300,d_lproj=300, 153 | answer_index=y, lr=lr) 154 | 155 | ''' 156 | configure && runtime environment 157 | ''' 158 | config = tf.ConfigProto() 159 | config.gpu_options.per_process_gpu_memory_fraction = 0.5 160 | config.log_device_placement=False 161 | 162 | sess = tf.Session(config=config) 163 | 164 | init = tf.global_variables_initializer() 165 | sess.run(init) 166 | 167 | ''' 168 | training parameters 169 | ''' 170 | 171 | with open('train_split.json') as fid: 172 | trdev = json.load(fid) 173 | 174 | 175 | def getTrainDevSplit(trained_video_QAs,trdev): 176 | train_data = [] 177 | dev_data = [] 178 | for k, qa in enumerate(trained_video_QAs): 179 | 180 | if qa.imdb_key in trdev['train']: 181 | train_data.append(qa) 182 | else: 183 | dev_data.append(qa) 184 | return train_data,dev_data 185 | 186 | train_data,dev_data = getTrainDevSplit(trained_video_QAs,trdev) 187 | 188 | 189 | with sess.as_default(): 190 | saver = tf.train.Saver(sharded=True,max_to_keep=total_epoch) 191 | if pretrained_model is not None: 192 | saver.restore(sess, pretrained_model) 193 | print('restore pre trained file:' + pretrained_model) 194 | for epoch in xrange(total_epoch): 195 | 196 | # # shuffle 197 | print('Epoch: %d/%d, Batch_size: %d' %(epoch+1,total_epoch,batch_size)) 198 | # train phase 199 | tic = time.time() 200 | total_acc, total_loss = exe_model(sess, train_data, batch_size, v2i, hf, feature_shape, train_stories, story_shape, 201 | loss, scores, input_video, input_question, input_stories, input_answer, y, numberOfChoices=5, train=train, nql=25, nqa=32) 202 | print(' --Train--, Loss: %.5f, Acc: %.5f.......Time:%.3f' %(total_loss,total_acc,time.time()-tic)) 203 | 204 | # dev phase 205 | tic = time.time() 206 | total_acc, total_loss = exe_model(sess, dev_data, batch_size, v2i, hf, feature_shape, train_stories, 
story_shape, 207 | loss, scores, input_video, input_question, input_stories, input_answer, y, numberOfChoices=5, train=None, nql=25, nqa=32) 208 | print(' --Train-val--, Loss: %.5f, Acc: %.5f.......Time:%.3f' %(total_loss,total_acc,time.time()-tic)) 209 | # eval phase 210 | tic = time.time() 211 | total_acc, total_loss = exe_model(sess, val_video_QAs, batch_size, v2i, hf, feature_shape, val_stories, story_shape, 212 | loss, scores, input_video, input_question, input_stories, input_answer, y, numberOfChoices=5, train=None, nql=25, nqa=32) 213 | print(' --Val--, Loss: %.5f, Acc: %.5f.......Time:%.3f' %(total_loss,total_acc,time.time()-tic)) 214 | 215 | 216 | 217 | #save model 218 | export_path = '/data1/wb/saved_model/vqa_baseline/video+subtitle'+'/'+f_type+'_b'+str(batch_size)+'/'+'lr'+str(lr)+'_f'+str(feature_shape[0]) 219 | if not os.path.exists(export_path): 220 | os.makedirs(export_path) 221 | print('mkdir %s' %export_path) 222 | save_path = saver.save(sess, export_path+'/'+'E'+str(epoch+1)+'_A'+str(total_acc)+'.ckpt') 223 | print("Model saved in file: %s" % save_path) 224 | 225 | 226 | def trans(all): 227 | 228 | qa_list = [] 229 | for dicts in all: 230 | 231 | qa_list.append( 232 | QAInfo(dicts['qid'], dicts['questions'], dicts['answers'] , dicts['ground_truth'], 233 | dicts['imdb_key'], dicts['video_clips'])) 234 | return qa_list 235 | 236 | 237 | if __name__ == '__main__': 238 | # 'video+subtitle task' 239 | 240 | 241 | 242 | nql=25 # sequences length for question 243 | nqa=32 # sequences length for anwser 244 | numberOfChoices = 5 # for input choices, one for correct, one for wrong answer 245 | QAInfo = namedtuple('QAInfo','qid question answers correct_index imdb_key video_clips') 246 | 247 | 248 | v2i = pickle.load(open("/data1/wb/movieQA_v2i.pkl","rb")) 249 | qa_train = trans(pickle.load(open("/data1/wb/process_train.pkl","rb"))) 250 | qa_val = trans(pickle.load(open("/data1/wb/process_val.pkl","rb"))) 251 | train_stories = pickle.load(open("/data1/wb/train_stories.pkl","rb")) 252 | val_stories = pickle.load(open("/data1/wb/val_stories.pkl","rb")) 253 | 254 | lr = 0.01 255 | 256 | 257 | ''' 258 | --------------------------------- 259 | 224x224 vgg all clips feature 260 | ''' 261 | 262 | video_feature_dims=512 263 | timesteps_v=32 # sequences length for video 264 | hight = 7 265 | width = 7 266 | feature_shape = (timesteps_v,video_feature_dims,hight,width) 267 | 268 | f_type = '224x224_VGG' 269 | feature_path = '/data1/wb/224x224_movie_all_clips_vgg_'+str(timesteps_v)+'f.h5' 270 | pca_mat_init_file = '/data1/wb/224x224_vgg_pca_mat.pkl' 271 | 272 | 273 | hf = h5py.File(feature_path,'r') 274 | 275 | pretrained_model = None 276 | train_model(train_stories,val_stories,v2i,qa_train,qa_val,hf,f_type,nql=25,nqa=32,numberOfChoices=5, 277 | feature_shape=feature_shape,lr=lr, 278 | batch_size=8,total_epoch=30, 279 | pretrained_model=pretrained_model,pca_mat_init_file=pca_mat_init_file) 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | -------------------------------------------------------------------------------- /train_split.json: -------------------------------------------------------------------------------- 1 | { 2 | "train": [ 3 | "tt0118842", 4 | "tt0417741", 5 | "tt0119488", 6 | "tt0362269", 7 | "tt0480025", 8 | "tt0114924", 9 | "tt0125439", 10 | "tt0256415", 11 | "tt0970416", 12 | "tt1193138", 13 | "tt2194499", 14 | "tt0140352", 15 | "tt0074285", 16 | "tt0780504", 17 | "tt1675434", 18 | "tt1068680", 19 | "tt0234215", 20 | "tt1197624", 21 | "tt0486822", 22 | "tt0307987", 23 | 
"tt0322259", 24 | "tt1174732", 25 | "tt0804522", 26 | "tt0119396", 27 | "tt1232829", 28 | "tt1454468", 29 | "tt0800039", 30 | "tt0455824", 31 | "tt0119141", 32 | "tt0120888", 33 | "tt0120611", 34 | "tt0443453", 35 | "tt0276751", 36 | "tt0383574", 37 | "tt2614684", 38 | "tt2719848", 39 | "tt0270980", 40 | "tt1800241", 41 | "tt2334873", 42 | "tt0425210", 43 | "tt0096874", 44 | "tt0475394", 45 | "tt0241527", 46 | "tt0147800", 47 | "tt0343818", 48 | "tt0268978", 49 | "tt0120815", 50 | "tt1399103", 51 | "tt0458352", 52 | "tt0319061", 53 | "tt0120915", 54 | "tt1284575", 55 | "tt2402927", 56 | "tt0218967", 57 | "tt0830570", 58 | "tt0163025", 59 | "tt0099685", 60 | "tt0375679", 61 | "tt0476964", 62 | "tt0324554", 63 | "tt0331811", 64 | "tt0083866", 65 | "tt0112681", 66 | "tt1535970", 67 | "tt0169547", 68 | "tt0111161", 69 | "tt0144084", 70 | "tt1099212", 71 | "tt0343660", 72 | "tt0091042", 73 | "tt0790636", 74 | "tt1401152", 75 | "tt0068646", 76 | "tt0780571", 77 | "tt0104036", 78 | "tt0133152", 79 | "tt2294629", 80 | "tt2278388", 81 | "tt1276104", 82 | "tt1131734", 83 | "tt0167261", 84 | "tt1092026", 85 | "tt0095953", 86 | "tt0332452", 87 | "tt0240890", 88 | "tt0947798", 89 | "tt1133985", 90 | "tt0458525", 91 | "tt0903624", 92 | "tt0988595", 93 | "tt1178663", 94 | "tt0457939", 95 | "tt1228705", 96 | "tt0119282", 97 | "tt0367882", 98 | "tt1632708", 99 | "tt0086190", 100 | "tt0325980", 101 | "tt0433400", 102 | "tt0107614", 103 | "tt0120890", 104 | "tt1499658", 105 | "tt0119822", 106 | "tt1229822", 107 | "tt0499549", 108 | "tt0416320", 109 | "tt1201607", 110 | "tt1033643", 111 | "tt1877832", 112 | "tt1285016", 113 | "tt0212720", 114 | "tt1189340", 115 | "tt1385826", 116 | "tt0452623", 117 | "tt0421715", 118 | "tt1205489", 119 | "tt0452625", 120 | "tt1024648", 121 | "tt1646971", 122 | "tt0114709", 123 | "tt1598822", 124 | "tt0087469", 125 | "tt0116282", 126 | "tt0221027", 127 | "tt0295297", 128 | "tt0091867", 129 | "tt0206634", 130 | "tt0242653", 131 | "tt0910970", 132 | "tt0337978", 133 | "tt0335119", 134 | "tt1045658", 135 | "tt0328107", 136 | "tt1981115", 137 | "tt0313542", 138 | "tt0166924", 139 | "tt0440963", 140 | "tt1790885", 141 | "tt0120737", 142 | "tt1261945", 143 | "tt0139134", 144 | "tt1375666", 145 | "tt0290334", 146 | "tt0167404", 147 | "tt1446714", 148 | "tt1343092", 149 | "tt1951264", 150 | "tt0866439", 151 | "tt1637725", 152 | "tt1046173", 153 | "tt0478311", 154 | "tt1659337", 155 | "tt0316654", 156 | "tt0365907", 157 | "tt1104001", 158 | "tt1979320", 159 | "tt0109831", 160 | "tt2294449", 161 | "tt0103064", 162 | "tt0388795", 163 | "tt0114814", 164 | "tt0409847", 165 | "tt0120689", 166 | "tt0120828", 167 | "tt0106918", 168 | "tt2923316", 169 | "tt2310332", 170 | "tt1692486", 171 | "tt0119643", 172 | "tt0317198", 173 | "tt1837562", 174 | "tt2382396", 175 | "tt0145734", 176 | "tt0480242", 177 | "tt0381061", 178 | "tt0449088", 179 | "tt0086879", 180 | "tt1570728", 181 | "tt1727770", 182 | "tt0119654", 183 | "tt0146882", 184 | "tt0993846", 185 | "tt1726592", 186 | "tt0108185", 187 | "tt0240772", 188 | "tt0408306", 189 | "tt1000774", 190 | "tt0118556", 191 | "tt0385752", 192 | "tt0108160", 193 | "tt0245238", 194 | "tt1504320", 195 | "tt0244353", 196 | "tt0118571", 197 | "tt0418279", 198 | "tt0133093", 199 | "tt0372237", 200 | "tt1840309", 201 | "tt0281358", 202 | "tt0404203", 203 | "tt1542344", 204 | "tt1229340", 205 | "tt0800320", 206 | "tt0108052", 207 | "tt0112384", 208 | "tt0163187", 209 | "tt0071315", 210 | "tt0112697", 211 | "tt0433416", 212 | "tt0441773", 213 | "tt0107290", 214 | 
"tt1058017", 215 | "tt0800369", 216 | "tt0083658", 217 | "tt0172495", 218 | "tt0100405", 219 | "tt0113161", 220 | "tt0097165", 221 | "tt0907657", 222 | "tt0118564", 223 | "tt0314331", 224 | "tt0335266", 225 | "tt0120338", 226 | "tt0097576", 227 | "tt0080684", 228 | "tt0414055", 229 | "tt0411061", 230 | "tt0171433", 231 | "tt0359950", 232 | "tt0120586", 233 | "tt0305711", 234 | "tt0208092", 235 | "tt0104257", 236 | "tt0200550", 237 | "tt0298203", 238 | "tt1010048", 239 | "tt1798709", 240 | "tt0213149", 241 | "tt0217630", 242 | "tt0075314", 243 | "tt0349903" 244 | ], 245 | "dev": [ 246 | "tt0258463", 247 | "tt0410400", 248 | "tt1853728", 249 | "tt3346224", 250 | "tt1454029", 251 | "tt0379786", 252 | "tt0959337", 253 | "tt0450259", 254 | "tt0118715", 255 | "tt0089218", 256 | "tt0333780", 257 | "tt0467406", 258 | "tt0376994", 259 | "tt0120735", 260 | "tt1001508", 261 | "tt1725986", 262 | "tt0246772", 263 | "tt0493464", 264 | "tt0083987", 265 | "tt0209475", 266 | "tt0397078", 267 | "tt2024544", 268 | "tt1065073", 269 | "tt1120985", 270 | "tt0161860", 271 | "tt0294357", 272 | "tt1170358", 273 | "tt0443706" 274 | ] 275 | } --------------------------------------------------------------------------------