├── .gitignore ├── Poster.pdf ├── README ├── README.md ├── keras_lstm.py ├── keras_util.py ├── mctest_baseline.py ├── mctest_dataset_parser.py ├── mctest_dataset_parser_v2.py ├── mctest_lstm.py ├── memnn_numpy.py ├── memnn_theano.py ├── memnn_theano_v2.py ├── memnn_theano_v3.py ├── nltk_utils.py ├── pararth_final_report.pdf ├── pararth_milestone.pdf ├── pos_pruning.py ├── qa_dataset_parser.py ├── theano_util.py ├── wmemnn.py ├── wmemnnmc.py └── wordvec_pruning.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | -------------------------------------------------------------------------------- /Poster.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pararthshah/qa-memnn/29f087c1d8f9720c02caa715ff6ff0793fde98b8/Poster.pdf -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pararthshah/qa-memnn/29f087c1d8f9720c02caa715ff6ff0793fde98b8/README -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Question Answering Using Memory Networks 2 | CS224D Project 3 | -------------------------------------------------------------------------------- /keras_lstm.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import print_function 3 | import numpy as np 4 | import sys, re 5 | 6 | from keras.preprocessing import sequence 7 | from keras.initializations import uniform 8 | from keras.optimizers import SGD, RMSprop, Adagrad 9 | from keras.utils import np_utils 10 | from keras.models import Sequential 11 | from keras.layers.core import Dense, Dropout, Activation 12 | from keras.layers.embeddings import Embedding 13 | from keras.layers.recurrent import LSTM, GRU 14 | 15 | # mode can be 'baseline' or 'memnn' 16 | def load_dataset(input_file, word_id=0, word_to_id={}, update_word_ids=True, mode='memnn'): 17 | #dataset = [] 18 | dataset_ids = [] 19 | #labels = [] 20 | label_ids = [] 21 | with open(input_file) as f: 22 | article = {} 23 | article_no = 0 24 | for line in f: 25 | line = line.strip() 26 | if len(line) > 0 and line[:2] == '1 ' and len(dataset_ids) > 0: # new article 27 | article = {} 28 | article_no += 1 29 | if '\t' in line: # question 30 | question_parts = line.split('\t') 31 | tokens = re.sub(r'([\.\?])$', r' \1', question_parts[0].strip()).split() 32 | if update_word_ids: 33 | for token in tokens[1:]: 34 | if token not in word_to_id: 35 | word_to_id[token] = word_id 36 | word_id += 1 37 | if question_parts[1] not in word_to_id: 38 | word_to_id[question_parts[1]] = word_id 39 | word_id += 1 40 | 41 | stmt_ids = map(int, question_parts[2].strip().split()) 42 | sequence = [] 43 | if mode == 'baseline': 44 | for s in range(int(tokens[0])): 45 | if s in article: 46 | sequence += article[s] 47 | else: 48 | for s in stmt_ids: 49 | sequence += article[s] 50 | 51 | for token in tokens[1:]: 52 | sequence.append(token) 53 | 54 | if article_no == 0: 55 | print("seq: %s | label: %s" % (' '.join(sequence).ljust(70), question_parts[1])) 56 | 57 | dataset_ids.append(map(lambda t: word_to_id[t], sequence)) 58 | label_ids.append(word_to_id[question_parts[1]]) 59 | 60 | else: # statement 61 | tokens = re.sub(r'([\.\?])$', 
r' \1', line).split() 62 | if update_word_ids: 63 | for token in tokens[1:]: 64 | if token not in word_to_id: 65 | word_to_id[token] = word_id 66 | word_id += 1 67 | 68 | line_no = int(tokens[0]) 69 | article[line_no] = [] 70 | for token in tokens[1:]: 71 | article[line_no].append(token) 72 | 73 | return dataset_ids, label_ids, word_to_id, word_id 74 | 75 | if __name__ == "__main__": 76 | train_file = sys.argv[1] 77 | test_file = train_file.replace('train', 'test') 78 | 79 | mode = 'memnn' 80 | if len(sys.argv) > 2: 81 | mode = sys.argv[2] # should be 'baseline' or 'memnn' 82 | 83 | nb_epoch = 10 84 | if len(sys.argv) > 3: 85 | nb_epoch = int(sys.argv[3]) 86 | 87 | print("Loading train data...") 88 | X_train, y_train, word_to_id, num_words = load_dataset(train_file, mode=mode) 89 | print("Loading test data...") 90 | X_test, y_test, _, _ = load_dataset(test_file, word_id=num_words, word_to_id=word_to_id, update_word_ids=False, mode=mode) 91 | 92 | id_to_word = dict([(v, k) for k, v in word_to_id.iteritems()]) 93 | 94 | y_train_cat = np_utils.to_categorical(y_train, nb_classes=num_words) 95 | y_test_cat = np_utils.to_categorical(y_test, nb_classes=num_words) 96 | 97 | print(len(X_train), 'train sequences') 98 | print(len(X_test), 'test sequences') 99 | 100 | print("Pad sequences (samples x time)") 101 | X_train = sequence.pad_sequences(X_train) 102 | X_test = sequence.pad_sequences(X_test) 103 | print('X_train shape:', X_train.shape) 104 | print('X_test shape:', X_test.shape) 105 | 106 | print('Build model...') 107 | batch_size = 1 108 | in_embedding_size = 100 109 | out_embedding_size = 100 110 | 111 | model = Sequential() 112 | model.add(Embedding(num_words, in_embedding_size)) 113 | model.add(LSTM(in_embedding_size, out_embedding_size)) 114 | model.add(Dropout(0.5)) 115 | model.add(Dense(out_embedding_size, num_words)) 116 | model.add(Activation('softmax')) 117 | 118 | sgd_optimizer = SGD(lr=0.006, momentum=0.9, decay=0.99, nesterov=True) 119 | adg_optimizer = Adagrad() 120 | rms_optimizer = RMSprop() 121 | model.compile(loss='categorical_crossentropy', optimizer=rms_optimizer, class_mode="categorical", theano_mode='FAST_COMPILE') 122 | 123 | print("Train...") 124 | model.fit(X_train, y_train_cat, batch_size=batch_size, nb_epoch=nb_epoch, validation_split=0.1, show_accuracy=True) 125 | score = model.evaluate(X_test, y_test_cat, batch_size=batch_size) 126 | print('Test score:', score) 127 | 128 | classes_proba = model.predict_proba(X_test, batch_size=batch_size) 129 | for i in range(5): 130 | probs = sorted(zip(range(len(classes_proba)), classes_proba[i].tolist()), key=lambda x: x[1], reverse=True) 131 | print('Test sample %d (Correct label: %s)' % (i, id_to_word[y_test[i]])) 132 | for j, p in probs[:5]: 133 | print(id_to_word[j].ljust(20) + ': ' + str(p)) 134 | 135 | classes = np_utils.probas_to_classes(classes_proba) 136 | acc = np_utils.accuracy(classes, y_test) 137 | print('Test accuracy:', acc) 138 | 139 | # print(classes.shape) 140 | # print(classes[0]) 141 | # print(y_test[0]) 142 | 143 | # classes_list = classes.tolist() 144 | # print(map(lambda x: id_to_word[x], classes_list[:25])) 145 | -------------------------------------------------------------------------------- /keras_util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import re 3 | 4 | def parse_dataset(input_file, word_id=0, word_to_id={}, update_word_ids=True): 5 | dataset = [] 6 | labels = [] 7 | with open(input_file) as f: 8 | words = [] 9 | for line in f: 
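# bAbI-format input: every line begins with a sentence number; a line that
# contains tab characters is a question ("question<TAB>answer<TAB>supporting
# statement ids"), and any other line is a plain statement whose tokens are
# appended to the running context of the current article.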
10 | line = line.strip() 11 | if len(line) > 0 and line[:2] == '1 ' and len(statements) > 0: # new article 12 | words = [] 13 | if '\t' in line: 14 | question_parts = line.split('\t') 15 | tokens = re.sub(r'([\.\?])$', r' \1', question_parts[0].strip()).split() 16 | if update_word_ids: 17 | for token in tokens[1:]: 18 | if token not in word_to_id: 19 | word_to_id[token] = word_id 20 | word_id += 1 21 | 22 | dataset.append(words) 23 | labels.append(word_to_id[question_parts[1]]) 24 | else: 25 | tokens = re.sub(r'([\.\?])$', r' \1', line).split() 26 | if update_word_ids: 27 | for token in tokens[1:]: 28 | if token not in word_to_id: 29 | word_to_id[token] = word_id 30 | word_id += 1 31 | 32 | for token in tokens[1:]: 33 | words.append(word_to_id[token]) 34 | 35 | return dataset, labels, word_to_id 36 | -------------------------------------------------------------------------------- /mctest_baseline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys, random, pprint 3 | import cPickle 4 | import math, os 5 | 6 | class MCTestBaseline: 7 | def __init__(self, n_words=20, word_to_id=None, null_word_id=-1): 8 | self.n_words = n_words 9 | self.word_to_id = word_to_id 10 | self.id_to_word = dict((v, k) for k, v in word_to_id.iteritems()) 11 | self.null_word_id = null_word_id 12 | 13 | def remove_nulls(self, stmt): 14 | return filter(lambda x: x != self.null_word_id, stmt) 15 | 16 | def compute_inverse_count(self, stmt_list): 17 | counts = {} 18 | for word in stmt_list: 19 | if word not in counts: 20 | counts[word] = 0 21 | counts[word] += 1 22 | 23 | ic = {} 24 | for k, v in counts.iteritems(): 25 | ic[k] = math.log10(1 + 1.0/float(v)) 26 | return ic 27 | 28 | def compute_scores(self, statements, question, answers, stop_words): 29 | stmt_list = [word for stmt in statements for word in self.remove_nulls(stmt)] 30 | stmt_set = set(stmt_list) 31 | ques_set = set(self.remove_nulls(question)) 32 | ans_set = map(lambda x: set(self.remove_nulls(x)), answers) 33 | ic = self.compute_inverse_count(stmt_list) 34 | scores = [] 35 | for i in range(4): 36 | sw_score = -1 37 | S = ans_set[i] | ques_set 38 | S_list = list(S) 39 | for j in range(len(stmt_list)): 40 | curr_score = 0 41 | for w in range(len(S_list)): 42 | if j+w < len(stmt_list) and stmt_list[j+w] in S: 43 | if stmt_list[j+w] in stmt_set: 44 | curr_score += ic[stmt_list[j+w]] 45 | if sw_score == -1 or curr_score > sw_score: 46 | sw_score = curr_score 47 | 48 | d_score = -1 49 | S_Q = (ques_set & stmt_set) - stop_words 50 | S_A = (ans_set[i] & stmt_set) - stop_words 51 | if len(S_Q) == 0 or len(S_A) == 0: 52 | d_score = 1 53 | else: 54 | min_dist = len(stmt_list) 55 | last_q = -1 56 | last_a = -1 57 | for i in range(len(stmt_list)): 58 | if stmt_list[i] in S_Q and stmt_list[i] in S_A: 59 | min_dist = 0 60 | break 61 | if stmt_list[i] in S_Q: 62 | last_q = i 63 | if last_a >= 0 and i - last_a < min_dist: 64 | min_dist = i - last_a 65 | elif stmt_list[i] in S_A: 66 | last_a = i 67 | if last_q >= 0 and i - last_q < min_dist: 68 | min_dist = i - last_q 69 | d_score = float(min_dist + 1) / float(len(stmt_list) + 1) 70 | scores.append(sw_score - d_score) 71 | 72 | return scores 73 | 74 | 75 | def train(self): 76 | pass 77 | 78 | def predict(self, dataset, questions, stop_words=set(), max_words=20, print_errors=True): 79 | correct_answers = 0 80 | wrong_answers = 0 81 | 82 | for i, question in enumerate(questions): 83 | statements_seq = question[2] 84 | question_seq = question[3] 85 | 
answers = question[5] 86 | correct = question[4] 87 | 88 | # print statements_seq 89 | # print question_seq 90 | # print answers 91 | # print correct 92 | 93 | 94 | scores = self.compute_scores(statements_seq, question_seq, answers, stop_words) 95 | predicted = np.argmax(scores) 96 | 97 | if predicted == correct: 98 | correct_answers += 1 99 | else: 100 | if print_errors and np.random.rand() < 0.1: 101 | correct_words = map(lambda x: self.id_to_word[x], self.remove_nulls(question[5][correct])) 102 | predicted_words = map(lambda x: self.id_to_word[x], self.remove_nulls(question[5][predicted])) 103 | print 'Correct: %s (%d %.3f), Guess: %s (%d %.3f)' % (correct_words, correct, scores[correct], predicted_words, predicted, scores[predicted]) 104 | wrong_answers += 1 105 | 106 | #if len(questions) > 1000: 107 | # print '(%d/%d) %d correct, %d wrong' % (i+1, len(questions), correct_answers, wrong_answers) 108 | 109 | accuracy = 100.0 * float(correct_answers) / (correct_answers + wrong_answers) 110 | print '%d correct, %d wrong, %.2f%% acc' % (correct_answers, wrong_answers, accuracy) 111 | 112 | 113 | if __name__ == "__main__": 114 | train_file = sys.argv[1] 115 | test_file = train_file.replace('train', 'test') 116 | stop_file = os.path.join(os.path.dirname(train_file), 'stopwords.pickle') 117 | 118 | print("Loading pickled train dataset") 119 | f = file(train_file, 'rb') 120 | obj = cPickle.load(f) 121 | train_dataset, train_questions, word_to_id, num_words, null_word_id, train_max_stmts, train_max_words = obj 122 | f.close() 123 | 124 | print("Loading pickled test dataset") 125 | f = file(test_file, 'rb') 126 | obj = cPickle.load(f) 127 | test_dataset, test_questions, _, _, _, test_max_stmts, test_max_words = obj 128 | f.close() 129 | 130 | print("Loading pickled stop words") 131 | f = file(stop_file, 'rb') 132 | obj = cPickle.load(f) 133 | stop_words = obj 134 | f.close() 135 | 136 | print "Dataset has %d words" % num_words 137 | 138 | baseline = MCTestBaseline(n_words=num_words, word_to_id=word_to_id, null_word_id=null_word_id) 139 | baseline.predict(train_dataset, train_questions, stop_words, train_max_words) 140 | baseline.predict(test_dataset, test_questions, stop_words, test_max_words) 141 | -------------------------------------------------------------------------------- /mctest_dataset_parser.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from theano_util import * 4 | 5 | def only_words(line): 6 | ps = re.sub(r'[^a-zA-Z0-9]', r' ', line) 7 | ws = re.sub(r'(\W)', r' \1 ', ps) # Put spaces around punctuations 8 | ns = re.sub(r'(\d+)', r' ', ws) # Put spaces around numbers 9 | hs = re.sub(r'-', r' ', ns) # Replace hyphens with space 10 | rs = re.sub(r' +', r' ', hs) # Reduce multiple spaces into 1 11 | rs = rs.lower().strip() 12 | return rs 13 | 14 | def clean_sentence(line): 15 | ps = re.sub(r'[^a-zA-Z0-9\.\?\!]', ' ', line) # Split on punctuations and hex characters 16 | ws = re.sub(r'(\W)', r' \1 ', ps) # Put spaces around punctuations 17 | ns = re.sub(r'(\d+)', r' ', ws) # Put spaces around numbers 18 | hs = re.sub(r'-', r' ', ns) # Replace hyphens with space 19 | rs = re.sub(r' +', r' ', hs) # Reduce multiple spaces into 1 20 | rs = rs.lower().strip() 21 | return rs 22 | 23 | def get_sentences(line): 24 | ps = re.sub(r'[^a-zA-Z0-9\.\?\!]', ' ', line) # Split on punctuations and hex characters 25 | s = re.sub(r'(? 
', ws) # Put spaces around numbers 28 | hs = re.sub(r'-', r' ', ns) # Replace hyphens with space 29 | rs = re.sub(r' +', r' ', hs) # Reduce multiple spaces into 1 30 | rs = rs.lower().strip() 31 | return rs.split('\t') 32 | 33 | def get_answer_index(a): 34 | answer_to_index = { 35 | 'A': 0, 36 | 'B': 1, 37 | 'C': 2, 38 | 'D': 3, 39 | } 40 | return answer_to_index[a] 41 | 42 | def parse_mc_test_dataset(questions_file, answers_file, word_id=0, word_to_id={}, update_word_ids=True, max_stmts=20, max_words=20, pad=True): 43 | dataset = [] 44 | questions = [] 45 | 46 | null_word = '' 47 | if null_word not in word_to_id: 48 | if update_word_ids == True: 49 | word_to_id[null_word] = word_id 50 | word_id += 1 51 | else: 52 | print "Null word not found!! AAAAA" 53 | sys.exit(1) 54 | null_word_id = word_to_id[null_word] 55 | 56 | article_files = set() 57 | print("Parsing questions %s %s" % (questions_file, answers_file)) 58 | q_file = open(questions_file, 'r') 59 | a_file = open(answers_file, 'r') 60 | 61 | questions_data = q_file.readlines() 62 | answers_data = a_file.readlines() 63 | 64 | assert(len(questions_data) == len(answers_data)) 65 | 66 | more_than_1_word_answers = 0 67 | answer_word_unknown = 0 68 | 69 | for i in xrange(len(questions_data)): 70 | question_line = questions_data[i] 71 | answer_line = answers_data[i] 72 | 73 | question_pieces = question_line.strip().split('\t') 74 | assert(len(question_pieces) == 23) 75 | 76 | answer_pieces = answer_line.strip().split('\t') 77 | assert(len(answer_pieces) == 4) 78 | 79 | text = question_pieces[2] 80 | text = text.replace('\\newline', ' ') 81 | sentences = get_sentences(text) 82 | 83 | statements = [] 84 | for s in sentences: 85 | tokens = s.strip().split() 86 | 87 | if update_word_ids: 88 | for token in tokens: 89 | if token not in word_to_id: 90 | word_to_id[token] = word_id 91 | word_id += 1 92 | else: 93 | tokens = filter(lambda x: x in word_to_id, tokens) 94 | 95 | if pad: 96 | tokens = pad_statement(tokens, null_word, max_words) 97 | 98 | statements.append(tokens) 99 | dataset.append(tokens) 100 | 101 | if pad: 102 | statements = pad_memories(statements, null_word, max_stmts, max_words) 103 | 104 | # 4 questions 105 | for j in range(4): 106 | q_index = (j * 5) + 3 107 | q_words = question_pieces[q_index] 108 | q_words = clean_sentence(q_words).split() 109 | 110 | options = [ 111 | only_words(question_pieces[q_index + 1]), 112 | only_words(question_pieces[q_index + 2]), 113 | only_words(question_pieces[q_index + 3]), 114 | only_words(question_pieces[q_index + 4]), 115 | ] 116 | correct = get_answer_index(answer_pieces[j]) 117 | answer = options[correct] 118 | 119 | if update_word_ids: 120 | for token in (q_words + options): 121 | if token not in word_to_id: 122 | word_to_id[token] = word_id 123 | word_id += 1 124 | else: 125 | q_words = filter(lambda x: x in word_to_id, q_words) 126 | 127 | if pad: 128 | q_words = pad_statement(q_words, null_word, max_words) 129 | 130 | # Ignore more than 1 word answers 131 | if len(answer.split(' ')) > 1: 132 | more_than_1_word_answers += 1 133 | continue 134 | elif len(filter(lambda x: x not in word_to_id, options)) > 0: 135 | answer_word_unknown += 1 136 | continue 137 | 138 | option_word_ids = map(lambda x: word_to_id[x], options) 139 | 140 | article_no = len(questions) 141 | questions.append([article_no, -1, statements, q_words, answer, option_word_ids]) 142 | 143 | print "There are %d questions" % len(questions) 144 | print "There are %d statements" % len(dataset) 145 | print "There are %d words" % 
len(word_to_id) 146 | print "Ignored %d questions which had more than 1 word answers" % more_than_1_word_answers 147 | print "Ignored %d questions which had an unknown answer word" % answer_word_unknown 148 | 149 | print("Final processing...") 150 | questions_seq = map(lambda x: transform_ques_weak(x, word_to_id, word_id), questions) 151 | return dataset, questions_seq, word_to_id, word_id, null_word_id 152 | 153 | import cPickle 154 | 155 | if __name__ == "__main__": 156 | ADD_PADDING = True 157 | 158 | train_file = 'mc500.train.tsv' 159 | train_answers = train_file.replace('tsv', 'ans') 160 | 161 | test_file = train_file.replace('train', 'test') 162 | test_answers = test_file.replace('tsv', 'ans') 163 | 164 | data_dir = sys.argv[1] 165 | 166 | train_dataset, train_questions, word_to_id, num_words, null_word_id = parse_mc_test_dataset(data_dir + '/' + train_file, data_dir + '/' + train_answers, pad=ADD_PADDING) 167 | test_dataset, test_questions, word_to_id, num_words, null_word_id = parse_mc_test_dataset(data_dir + '/' + test_file, data_dir + '/' + test_answers, word_id=num_words, word_to_id=word_to_id, update_word_ids=False, pad=ADD_PADDING) 168 | 169 | # Add dev to test 170 | test2_file = train_file.replace('train', 'dev') 171 | test2_answers = test2_file.replace('tsv', 'ans') 172 | test2_dataset, test2_questions, word_to_id, num_words, null_word_id = parse_mc_test_dataset(data_dir + '/' + test2_file, data_dir + '/' + test2_answers, word_id=num_words, word_to_id=word_to_id, update_word_ids=False, pad=ADD_PADDING) 173 | 174 | test_dataset += test2_dataset 175 | test_questions += test2_questions 176 | 177 | # Pickle!!!! 178 | print("Pickling train...") 179 | train_pickle = train_file.replace('tsv', 'pickle') 180 | f = file(data_dir + '/' + train_pickle, 'wb') 181 | cPickle.dump((train_dataset, train_questions, word_to_id, num_words, null_word_id), f, protocol=cPickle.HIGHEST_PROTOCOL) 182 | f.close() 183 | 184 | print("Pickling test...") 185 | test_pickle = test_file.replace('tsv', 'pickle') 186 | f = file(data_dir + '/' + test_pickle, 'wb') 187 | cPickle.dump((test_dataset, test_questions, word_to_id, num_words, null_word_id), f, protocol=cPickle.HIGHEST_PROTOCOL) 188 | f.close() 189 | -------------------------------------------------------------------------------- /mctest_dataset_parser_v2.py: -------------------------------------------------------------------------------- 1 | import re 2 | import sys, os 3 | import cPickle 4 | 5 | from theano_util import ( 6 | pad_memories, 7 | pad_statement, 8 | ) 9 | 10 | from pos_pruning import prune_statements 11 | 12 | def only_words(line): 13 | ps = re.sub(r'[^a-zA-Z0-9\']', r' ', line) 14 | ws = re.sub(r'(\W)', r' \1 ', ps) # Put spaces around punctuations 15 | ws = re.sub(r" ' ", r"'", ws) # Remove spaces around ' 16 | # ns = re.sub(r'(\d+)', r' ', ws) # Put spaces around numbers 17 | hs = re.sub(r'-', r' ', ws) # Replace hyphens with space 18 | rs = re.sub(r' +', r' ', hs) # Reduce multiple spaces into 1 19 | rs = rs.lower().strip().split(' ') 20 | return rs 21 | 22 | def clean_sentence(line): 23 | ps = re.sub(r'[^a-zA-Z0-9\.\?\!\']', ' ', line) # Split on punctuations and hex characters 24 | ws = re.sub(r'(\W)', r' \1 ', ps) # Put spaces around punctuations 25 | ws = re.sub(r" ' ", r"'", ws) # Remove spaces around ' 26 | # ns = re.sub(r'(\d+)', r' ', ws) # Put spaces around numbers 27 | hs = re.sub(r'-', r' ', ws) # Replace hyphens with space 28 | rs = re.sub(r' +', r' ', hs) # Reduce multiple spaces into 1 29 | rs = rs.lower().strip() 30 | 
return rs 31 | 32 | def get_sentences(line): 33 | ps = re.sub(r'[^a-zA-Z0-9\.\?\!\']', ' ', line) # Split on punctuations and hex characters 34 | s = re.sub(r'(? ', ws) # Put spaces around numbers 38 | hs = re.sub(r'-', r' ', ws) # Replace hyphens with space 39 | rs = re.sub(r' +', r' ', hs) # Reduce multiple spaces into 1 40 | rs = rs.lower().strip() 41 | return rs.split('\t') 42 | 43 | def get_answer_index(a): 44 | answer_to_index = { 45 | 'A': 0, 46 | 'B': 1, 47 | 'C': 2, 48 | 'D': 3, 49 | } 50 | return answer_to_index[a] 51 | 52 | def transform_ques_weak(question, word_to_id, num_words): 53 | indices = [] 54 | for stmt in question[2]: 55 | index_stmt = map(lambda x: word_to_id[x], stmt) 56 | indices.append(index_stmt) 57 | question[2] = indices 58 | question[3] = map(lambda x: word_to_id[x], question[3]) 59 | question[5] = map(lambda l: map(lambda x: word_to_id[x], l), question[5]) 60 | return question 61 | 62 | def parse_mc_test_dataset(questions_file, answers_file, word_id=0, word_to_id={}, update_word_ids=True, pad=True, add_pruning=False): 63 | dataset = [] 64 | questions = [] 65 | 66 | null_word = '' 67 | if null_word not in word_to_id: 68 | if update_word_ids == True: 69 | word_to_id[null_word] = word_id 70 | word_id += 1 71 | else: 72 | print "Null word not found!! AAAAA" 73 | sys.exit(1) 74 | null_word_id = word_to_id[null_word] 75 | 76 | article_files = set() 77 | print("Parsing questions %s %s" % (questions_file, answers_file)) 78 | q_file = open(questions_file, 'r') 79 | a_file = open(answers_file, 'r') 80 | 81 | questions_data = q_file.readlines() 82 | answers_data = a_file.readlines() 83 | 84 | assert(len(questions_data) == len(answers_data)) 85 | 86 | more_than_1_word_answers = 0 87 | answer_word_unknown = 0 88 | 89 | for i in xrange(len(questions_data)): 90 | question_line = questions_data[i] 91 | answer_line = answers_data[i] 92 | 93 | question_pieces = question_line.strip().split('\t') 94 | assert(len(question_pieces) == 23) 95 | 96 | answer_pieces = answer_line.strip().split('\t') 97 | assert(len(answer_pieces) == 4) 98 | 99 | text = question_pieces[2] 100 | text = text.replace('\\newline', ' ') 101 | sentences = get_sentences(text) 102 | 103 | statements = [] 104 | for s in sentences: 105 | tokens = s.strip().split() 106 | 107 | if update_word_ids: 108 | for token in tokens: 109 | if token not in word_to_id: 110 | word_to_id[token] = word_id 111 | word_id += 1 112 | else: 113 | tokens = filter(lambda x: x in word_to_id, tokens) 114 | 115 | statements.append(tokens) 116 | dataset.append(tokens) 117 | 118 | # 4 questions 119 | for j in range(4): 120 | q_index = (j * 5) + 3 121 | q_words = question_pieces[q_index] 122 | q_words = clean_sentence(q_words).split() 123 | 124 | options = [ 125 | only_words(question_pieces[q_index + 1]), 126 | only_words(question_pieces[q_index + 2]), 127 | only_words(question_pieces[q_index + 3]), 128 | only_words(question_pieces[q_index + 4]), 129 | ] 130 | correct = get_answer_index(answer_pieces[j]) 131 | answer = options[correct] 132 | 133 | # if len(answer) > 1: 134 | # more_than_1_word_answers += 1 135 | # continue 136 | 137 | if update_word_ids: 138 | for token in q_words: 139 | if token not in word_to_id: 140 | word_to_id[token] = word_id 141 | word_id += 1 142 | for o in options: 143 | for token in o: 144 | if token not in word_to_id: 145 | word_to_id[token] = word_id 146 | word_id += 1 147 | else: 148 | q_words = filter(lambda x: x in word_to_id, q_words) 149 | 150 | if q_words[0] == 'multiple' or q_words[0] == 'one': 151 | del 
q_words[0] 152 | 153 | # Ignore questions with unknown words in the answer 154 | options_word_ids = [] 155 | skip = False 156 | for o in options: 157 | option_word_ids = [] 158 | for w in o: 159 | if w not in word_to_id: 160 | if update_word_ids: 161 | word_to_id[w] = word_id 162 | word_id += 1 163 | option_word_ids.append(w) 164 | else: 165 | skip = True 166 | break 167 | else: 168 | option_word_ids.append(w) 169 | if skip: 170 | break 171 | else: 172 | #if len(option_word_ids) > 1: 173 | # skip = True 174 | # more_than_1_word_answers += 1 175 | # break 176 | options_word_ids.append(option_word_ids) 177 | 178 | if skip: 179 | answer_word_unknown += 1 180 | continue 181 | 182 | article_no = len(questions) 183 | questions.append([article_no, -1, statements, q_words, correct, options_word_ids]) 184 | 185 | print "There are %d questions" % len(questions) 186 | print "There are %d statements" % len(dataset) 187 | print "There are %d words" % len(word_to_id) 188 | print "Ignored %d questions which had more than 1 word answers" % more_than_1_word_answers 189 | print "Ignored %d questions which had an unknown answer word" % answer_word_unknown 190 | 191 | if add_pruning: 192 | print("Trying to prune extraneaous statements...") 193 | questions = prune_statements(dataset, questions) 194 | before_prune = len(questions) 195 | questions = filter(lambda x: len(x[2]) > 1, questions) 196 | after_prune = len(questions) 197 | print("Pruning invalidated %d questions" % (before_prune - after_prune)) 198 | 199 | max_stmts = None 200 | max_words = None 201 | if pad: 202 | s_lens = [] 203 | q_lens = [] 204 | for i in xrange(len(questions)): 205 | q = questions[i] 206 | s_lens.append(len(q[2])) 207 | for j in xrange(len(q[2])): 208 | q_lens.append(len(q[2][j])) 209 | 210 | max_stmts = max(s_lens) 211 | max_words = max(q_lens) 212 | print "Max statement length: ", max_words 213 | print "Max number of statements: ", max_stmts 214 | 215 | for i in xrange(len(questions)): 216 | q = questions[i] 217 | # Statements 218 | 219 | for j in xrange(len(q[2])): 220 | q[2][j] = pad_statement(q[2][j], null_word, max_words) 221 | 222 | q[2] = pad_memories(q[2], null_word, max_stmts, max_words) 223 | q[3] = pad_statement(q[3], null_word, max_words) 224 | 225 | for j in xrange(len(q[5])): 226 | q[5][j] = pad_statement(q[5][j], null_word, max_words) 227 | 228 | 229 | print("Final processing...") 230 | questions_seq = map(lambda x: transform_ques_weak(x, word_to_id, word_id), questions) 231 | return dataset, questions_seq, word_to_id, word_id, null_word_id, max_stmts, max_words 232 | 233 | def parse_stop_words(stop_file, word_id=0, word_to_id={}, update_word_ids=False): 234 | stop_words = set() 235 | with open(stop_file) as f: 236 | for line in f: 237 | token = line.strip() 238 | if not token in word_to_id: 239 | if update_word_ids: 240 | word_to_id[token] = word_id 241 | word_id += 1 242 | else: 243 | continue 244 | stop_words.add(word_to_id[token]) 245 | return stop_words 246 | 247 | if __name__ == "__main__": 248 | ADD_PADDING = True 249 | ADD_PRUNING = False 250 | # Consider padding from the other side 251 | 252 | if len(sys.argv) > 2: 253 | dataset = sys.argv[2] 254 | else: 255 | dataset = 'mc160' 256 | 257 | train_file = dataset + '.train.tsv' 258 | print "Train file:", train_file 259 | 260 | train_answers = train_file.replace('tsv', 'ans') 261 | 262 | test_file = train_file.replace('train', 'test') 263 | test_answers = test_file.replace('tsv', 'ans') 264 | 265 | data_dir = sys.argv[1] 266 | 267 | train_obj = 
parse_mc_test_dataset(os.path.join(data_dir, train_file), os.path.join(data_dir, train_answers), pad=ADD_PADDING, add_pruning=ADD_PRUNING) 268 | num_words = train_obj[3] 269 | word_to_id = train_obj[2] 270 | test_obj = parse_mc_test_dataset(os.path.join(data_dir, test_file), os.path.join(data_dir, test_answers), word_id=num_words, word_to_id=word_to_id, update_word_ids=True, pad=ADD_PADDING, add_pruning=ADD_PRUNING) 271 | num_words = test_obj[3] 272 | word_to_id = test_obj[2] 273 | 274 | # Add dev to test 275 | # test2_file = train_file.replace('train', 'dev') 276 | # test2_answers = test2_file.replace('tsv', 'ans') 277 | # test2_obj = parse_mc_test_dataset(os.path.join(data_dir, test2_file), os.path.join(data_dir, test2_answers), word_id=num_words, word_to_id=word_to_id, update_word_ids=True, pad=ADD_PADDING, add_pruning=ADD_PRUNING) 278 | 279 | #test_obj[0] += test2_obj[0] 280 | #test_obj[1] += test2_obj[1] 281 | 282 | stop_file = 'stopwords.txt' 283 | stop_obj = parse_stop_words(os.path.join(data_dir, stop_file), word_id=num_words, word_to_id=word_to_id) 284 | 285 | # Pickle!!!! 286 | train_pickle = train_file.replace('tsv', 'pickle') 287 | print("Pickling train... " + train_pickle) 288 | f = file(os.path.join(data_dir, train_pickle), 'wb') 289 | cPickle.dump(train_obj, f, protocol=cPickle.HIGHEST_PROTOCOL) 290 | f.close() 291 | 292 | test_pickle = test_file.replace('tsv', 'pickle') 293 | print("Pickling test... " + test_pickle) 294 | f = file(os.path.join(data_dir, test_pickle), 'wb') 295 | cPickle.dump(test_obj, f, protocol=cPickle.HIGHEST_PROTOCOL) 296 | f.close() 297 | 298 | stop_pickle = stop_file.replace('txt', 'pickle') 299 | print("Pickling stop words... " + stop_pickle) 300 | f = file(os.path.join(data_dir, stop_pickle), 'wb') 301 | cPickle.dump(stop_obj, f, protocol=cPickle.HIGHEST_PROTOCOL) 302 | f.close() 303 | -------------------------------------------------------------------------------- /mctest_lstm.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import print_function 3 | import numpy as np 4 | import sys, re 5 | 6 | import cPickle 7 | 8 | from keras.preprocessing import sequence 9 | from keras.initializations import uniform 10 | from keras.optimizers import SGD, RMSprop, Adagrad 11 | from keras.utils import np_utils 12 | from keras.models import Sequential 13 | from keras.layers.core import Dense, Dropout, Activation 14 | from keras.layers.embeddings import Embedding 15 | from keras.layers.recurrent import LSTM, GRU 16 | 17 | def flatten(a): 18 | ret = [] 19 | for item in a: 20 | if type(item) == list: 21 | for k in item: 22 | ret.append(k) 23 | else: 24 | ret.append(k) 25 | return ret 26 | 27 | def get_dataset(questions): 28 | X = [] 29 | y = [] 30 | for question in questions: 31 | statements = [] 32 | statements += flatten(question[2]) 33 | statements += question[3] 34 | X.append(statements) 35 | y.append(question[4]) 36 | return X,y 37 | 38 | if __name__ == "__main__": 39 | train_file = sys.argv[1] 40 | test_file = train_file.replace('train', 'test') 41 | 42 | print("Loading pickled train dataset") 43 | f = file(train_file, 'rb') 44 | obj = cPickle.load(f) 45 | train_dataset, train_questions, word_to_id, num_words, null_word_id = obj 46 | 47 | print("Loading pickled test dataset") 48 | f = file(test_file, 'rb') 49 | obj = cPickle.load(f) 50 | test_dataset, test_questions, _, _, _ = obj 51 | 52 | nb_epoch = 10 53 | if len(sys.argv) > 2: 54 | nb_epoch = int(sys.argv[2]) 
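# Illustrative note (not part of the original script): each pickled question is a
# list of the form [article_no, line_no, statement_token_ids, question_token_ids,
# answer_word_id, option_word_ids], and get_dataset() above concatenates the
# statement ids with the question ids to form one input sequence, keeping the
# answer word id as the class label.  On a toy question (hypothetical ids):
#
#   toy_q = [0, -1, [[3, 4, 5], [6, 7, 5]], [8, 9, 10], 4, [4, 6, 11, 12]]
#   x     = [tok for stmt in toy_q[2] for tok in stmt] + toy_q[3]
#         # -> [3, 4, 5, 6, 7, 5, 8, 9, 10]
#   y     = toy_q[4]   # -> 4, later one-hot encoded with np_utils.to_categorical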
55 | 56 | X_train, y_train = get_dataset(train_questions) 57 | X_test, y_test = get_dataset(test_questions) 58 | 59 | id_to_word = dict([(v, k) for k, v in word_to_id.iteritems()]) 60 | 61 | y_train_cat = np_utils.to_categorical(y_train, nb_classes=num_words) 62 | y_test_cat = np_utils.to_categorical(y_test, nb_classes=num_words) 63 | 64 | print(len(X_train), 'train sequences') 65 | print(len(X_test), 'test sequences') 66 | 67 | print("Pad sequences (samples x time)") 68 | X_train = sequence.pad_sequences(X_train) 69 | X_test = sequence.pad_sequences(X_test) 70 | print('X_train shape:', X_train.shape) 71 | print('X_test shape:', X_test.shape) 72 | 73 | print('Build model...') 74 | batch_size = 1 75 | in_embedding_size = 100 76 | out_embedding_size = 100 77 | 78 | model = Sequential() 79 | model.add(Embedding(num_words, in_embedding_size)) 80 | model.add(LSTM(in_embedding_size, out_embedding_size)) 81 | model.add(Dropout(0.5)) 82 | model.add(Dense(out_embedding_size, num_words)) 83 | model.add(Activation('softmax')) 84 | 85 | sgd_optimizer = SGD(lr=0.006, momentum=0.9, decay=0.99, nesterov=True) 86 | adg_optimizer = Adagrad() 87 | rms_optimizer = RMSprop() 88 | model.compile(loss='categorical_crossentropy', optimizer=rms_optimizer, class_mode="categorical", theano_mode='FAST_COMPILE') 89 | 90 | print("Train...") 91 | model.fit(X_train, y_train_cat, batch_size=batch_size, nb_epoch=nb_epoch, validation_split=0.1, show_accuracy=True) 92 | score = model.evaluate(X_test, y_test_cat, batch_size=batch_size) 93 | print('Test score:', score) 94 | 95 | classes_proba = model.predict_proba(X_test, batch_size=batch_size) 96 | for i in range(5): 97 | probs = sorted(zip(range(len(classes_proba)), classes_proba[i].tolist()), key=lambda x: x[1], reverse=True) 98 | print('Test sample %d (Correct label: %s)' % (i, id_to_word[y_test[i]])) 99 | for j, p in probs[:5]: 100 | print(id_to_word[j].ljust(20) + ': ' + str(p)) 101 | 102 | classes = np_utils.probas_to_classes(classes_proba) 103 | 104 | correct, wrong = 0, 0 105 | for (i,q) in enumerate(test_questions): 106 | options = q[5] 107 | options_probs = classes_proba[i][options] 108 | best_idx = np.argmax(options_probs) 109 | predicted = options[best_idx] 110 | print('Test sample %d (Correct label: %s)' % (i, id_to_word[y_test[i]])) 111 | for k in range(len(options)): 112 | print(id_to_word[options[k]].ljust(20) + ': ' + str(options_probs[k])) 113 | 114 | if predicted == y_test[i]: 115 | correct += 1 116 | else: 117 | wrong += 1 118 | 119 | print('%d correct, %d wrong' % (correct, wrong)) 120 | -------------------------------------------------------------------------------- /memnn_numpy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys, re, random 3 | 4 | def init_shared_normal(num_rows, num_cols, scale=1): 5 | '''Initialize a matrix shared variable with normally distributed 6 | elements.''' 7 | return np.random.normal(scale=scale, size=(num_rows, num_cols)) 8 | 9 | def init_shared_zeros(*shape): 10 | '''Initialize a vector shared variable with zero elements.''' 11 | return np.zeros(shape, dtype=dtype) 12 | 13 | class MemNN: 14 | def __init__(self, n_words=1000, n_embedding=100, lr=0.01, margin=0.1, n_epochs=100): 15 | self.n_embedding = n_embedding 16 | self.lr = lr 17 | self.margin = margin 18 | self.n_epochs = n_epochs 19 | self.n_words = n_words 20 | self.n_D = 2 * self.n_words 21 | 22 | self.U_O = init_shared_normal(n_embedding, self.n_D, 0.01) 23 | 24 | def calc_score(self, phi_x, 
phi_y): 25 | return phi_x.T.dot(self.U_O.T).dot(self.U_O).dot(phi_y) 26 | 27 | def calc_grad_S_O_U_O(self, phi_x, phi_y): 28 | return self.U_O.dot(np.outer(phi_x, phi_y) + np.outer(phi_y, phi_x)) 29 | 30 | def calc_cost_and_grad(self, phi_x, phi_f1, phi_f1bar): 31 | correct_score = self.calc_score(phi_x, phi_f1) 32 | false_score = self.calc_score(phi_x, phi_f1bar) 33 | cost = max(0, self.margin - correct_score + false_score) 34 | grad = {} 35 | grad['U_O'] = 0 36 | if cost > 0: 37 | grad['U_O'] = -1*self.calc_grad_S_O_U_O(phi_x, phi_f1) + self.calc_grad_S_O_U_O(phi_x, phi_f1bar) 38 | return cost, grad 39 | 40 | def train(self, dataset_bow, questions, num_words): 41 | for epoch in xrange(self.n_epochs): 42 | costs = [] 43 | 44 | random.shuffle(questions) 45 | for i, question in enumerate(questions): 46 | article_no = question[0] 47 | line_no = question[1] 48 | question_phi = question[2] 49 | correct_stmt = question[4] 50 | seq = [i for i in range(line_no)] 51 | del seq[correct_stmt] 52 | false_stmt = random.choice(seq) 53 | #print article_no, line_no, correct_stmt, false_stmt 54 | phi_x = np.zeros((self.n_D,)) 55 | phi_x[:num_words] = question_phi 56 | phi_f1 = np.zeros((self.n_D,)) 57 | phi_f1[num_words:2*num_words] = dataset_bow[article_no][correct_stmt] 58 | phi_f1bar = np.zeros((self.n_D,)) 59 | phi_f1bar[num_words:2*num_words] = dataset_bow[article_no][false_stmt] 60 | 61 | # if article_no == 0 and line_no == 2: 62 | # corr_score = self.calc_score(phi_x, phi_f1) 63 | # fals_score = self.calc_score(phi_x, phi_f1bar) 64 | # print "[BEFORE] corr score: %f, false score: %f" % (corr_score, fals_score) 65 | 66 | cost, grad = self.calc_cost_and_grad(phi_x, phi_f1, phi_f1bar) 67 | costs.append(cost) 68 | self.U_O -= self.lr * grad['U_O'] 69 | 70 | # if article_no == 0 and line_no == 2: 71 | # corr_score = self.calc_score(phi_x, phi_f1) 72 | # fals_score = self.calc_score(phi_x, phi_f1bar) 73 | # print "[ AFTER] corr score: %f, false score: %f" % (corr_score, fals_score) 74 | 75 | # if epoch % 100 == 0: 76 | # print 'Epoch %i/%i' % (epoch + 1, self.n_epochs), np.mean(costs) 77 | # sys.stdout.flush() 78 | 79 | # print np.mean(costs), np.mean(self.U_O), np.max(self.U_O), np.min(self.U_O) 80 | 81 | def predict(self, dataset, questions): 82 | correct_answers = 0 83 | wrong_answers = 0 84 | for i, question in enumerate(questions): 85 | article_no = question[0] 86 | line_no = question[1] 87 | question_phi = question[2] 88 | correct_stmt = question[4] 89 | 90 | phi_x = np.zeros((self.n_D,)) 91 | phi_x[:num_words] = question_phi 92 | 93 | answer = -1 94 | max_score = -99999 95 | for l in range(line_no): 96 | phi_f = np.zeros((self.n_D,)) 97 | phi_f[num_words:2*num_words] = dataset[article_no][l] 98 | 99 | #print phi_x, phi_f 100 | score = self.calc_score(phi_x, phi_f) 101 | if answer == -1 or score > max_score: 102 | max_score = score 103 | answer = l 104 | 105 | if article_no == 0: 106 | print "%d: corr stmt: %d, answer: %d" % (i, correct_stmt, answer) 107 | 108 | if answer == correct_stmt: 109 | correct_answers += 1 110 | else: 111 | wrong_answers += 1 112 | 113 | print '%d correct, %d wrong' % (correct_answers, wrong_answers) 114 | 115 | 116 | def parse_dataset(input_file): 117 | dataset = [] 118 | questions = [] 119 | word_to_id = {} 120 | word_id = 0 121 | with open(input_file) as f: 122 | statements = [] 123 | article_no = 0 124 | line_no = 0 125 | stmt_to_line = {} 126 | for line in f: 127 | line = line.strip() 128 | if len(line) > 0 and line[:2] == '1 ' and len(statements) > 0: # new article 
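# sentence numbering restarts at "1 " for each article, so (once at least one
# statement has been collected) this marks the start of a new article: flush
# the accumulated statements and reset the per-article line counters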
129 | dataset.append(statements) 130 | statements = [] 131 | line_no = 0 132 | stmt_to_line = {} 133 | article_no += 1 134 | if '\t' in line: 135 | question_parts = line.split('\t') 136 | tokens = re.sub(r'([\.\?])$', r' \1', question_parts[0]).split() 137 | for token in tokens[1:]: 138 | if not token in word_to_id: 139 | word_to_id[token] = word_id 140 | word_id += 1 141 | questions.append([article_no, line_no, ' '.join(tokens[1:]), question_parts[1], stmt_to_line[question_parts[2]]]) 142 | else: 143 | tokens = re.sub(r'([\.\?])$', r' \1', line).split() 144 | stmt_to_line[tokens[0]] = line_no 145 | for token in tokens[1:]: 146 | if not token in word_to_id: 147 | word_to_id[token] = word_id 148 | word_id += 1 149 | statements.append(' '.join(tokens[1:])) 150 | line_no += 1 151 | if len(statements) > 0: 152 | dataset.append(statements) 153 | return dataset, questions, word_to_id, word_id 154 | 155 | def compute_phi(input_str, word_to_id, num_words): 156 | phi = np.zeros((num_words,)) 157 | for token in input_str.split(): 158 | phi[word_to_id[token]] += 1 159 | return phi 160 | 161 | def transform_ques(question, word_to_id, num_words): 162 | question[2] = compute_phi(question[2], word_to_id, num_words) 163 | return question 164 | 165 | if __name__ == "__main__": 166 | training_dataset = sys.argv[1] 167 | test_dataset = training_dataset.replace('train', 'test') 168 | 169 | dataset, questions, word_to_id, num_words = parse_dataset(training_dataset) 170 | dataset_bow = map(lambda y: map(lambda x: compute_phi(x, word_to_id, num_words), y), dataset) 171 | questions_bow = map(lambda x: transform_ques(x, word_to_id, num_words), questions) 172 | # print dataset[0], dataset_bow[0], questions_bow[0] 173 | #print len(dataset_bow) 174 | memNN = MemNN(n_words=num_words, n_epochs=100, margin=1.0) 175 | memNN.train(dataset_bow, questions_bow, num_words) 176 | 177 | test_dataset, test_questions, _, _ = parse_dataset(test_dataset) 178 | test_dataset_bow = map(lambda y: map(lambda x: compute_phi(x, word_to_id, num_words), y), test_dataset) 179 | test_questions_bow = map(lambda x: transform_ques(x, word_to_id, num_words), test_questions) 180 | memNN.predict(test_dataset_bow, test_questions_bow) 181 | -------------------------------------------------------------------------------- /memnn_theano.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano 3 | import theano.tensor as T 4 | import sys, random 5 | 6 | from theano_util import * 7 | 8 | class MemNN: 9 | def __init__(self, n_words=1000, n_embedding=100, lr=0.01, margin=0.1, n_epochs=100): 10 | self.n_embedding = n_embedding 11 | self.lr = lr 12 | self.margin = margin 13 | self.n_epochs = n_epochs 14 | self.n_words = n_words 15 | self.n_D = 2 * self.n_words 16 | self.n_embedding = n_embedding 17 | 18 | phi_x = T.vector('phi_x') 19 | phi_f1 = T.vector('phi_f1') 20 | phi_f1bar = T.vector('phi_f1bar') 21 | 22 | # Supporting memories 23 | phi_m0 = T.vector('phi_m0') 24 | 25 | # True word 26 | phi_r = T.vector('phi_r') 27 | 28 | # False words 29 | phi_rbar = T.vector('phi_rbar') 30 | 31 | self.U_O = init_shared_normal(self.n_embedding, self.n_D, 0.01) 32 | self.U_R = init_shared_normal(self.n_embedding, self.n_D, 0.01) 33 | 34 | cost = self.calc_cost(phi_x, phi_f1, phi_f1bar, phi_m0, phi_r, phi_rbar) 35 | params = [self.U_O, self.U_R] 36 | gradient = T.grad(cost, params) 37 | 38 | updates=[] 39 | for param, gparam in zip(params, gradient): 40 | updates.append((param, param - gparam * self.lr)) 
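# Plain SGD: each call to train_function below evaluates `cost` and then applies
# param <- param - lr * dcost/dparam via the `updates` list.  The quantity being
# ranked is the bilinear score s_U(x, y) = (U x) . (U y) from calc_score, i.e.
# both bag-of-words vectors are projected into the embedding space by U and
# compared with a dot product; the two hinge terms in calc_cost push the true
# supporting statement (under U_O) and the true answer word (under U_R) above
# their sampled negatives by at least `margin`.  Minimal numpy sketch of the
# score, with shapes assumed from init_shared_normal above:
#
#   U = np.random.normal(scale=0.01, size=(n_embedding, 2 * n_words))
#   score = U.dot(phi_x).dot(U.dot(phi_y))   # scalar s_U(phi_x, phi_y)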
41 | 42 | self.train_function = theano.function(inputs = [phi_x, phi_f1, phi_f1bar, phi_m0, phi_r, phi_rbar], 43 | outputs = cost, 44 | updates = updates) 45 | 46 | phi_f = T.vector('phi_f') 47 | 48 | score_o = self.calc_score_o(phi_x, phi_f) 49 | self.predict_function_o = theano.function(inputs = [phi_x, phi_f], outputs = score_o) 50 | 51 | score_r = self.calc_score_r(phi_x, phi_f) 52 | self.predict_function_r = theano.function(inputs = [phi_x, phi_f], outputs = score_r) 53 | 54 | def calc_score(self, phi_x, phi_y, U): 55 | #return T.dot(T.dot(phi_x.T, self.U_O.T), T.dot(self.U_O, phi_y)) 56 | return T.dot(U.dot(phi_x), U.dot(phi_y)) 57 | 58 | def calc_score_o(self, phi_x, phi_y): 59 | return self.calc_score(phi_x, phi_y, self.U_O) 60 | 61 | def calc_score_r(self, phi_x, phi_y): 62 | return self.calc_score(phi_x, phi_y, self.U_R) 63 | 64 | def calc_cost(self, phi_x, phi_f1, phi_f1bar, phi_m0, phi_r, phi_rbar): 65 | correct_score1 = self.calc_score_o(phi_x, phi_f1) 66 | false_score1 = self.calc_score_o(phi_x, phi_f1bar) 67 | 68 | correct_score2 = self.calc_score_r(phi_x + phi_m0, phi_r) 69 | false_score2 = self.calc_score_r(phi_x + phi_m0, phi_rbar) 70 | 71 | cost = ( 72 | T.maximum(0, self.margin - correct_score1 + false_score1) + 73 | T.maximum(0, self.margin - correct_score2 + false_score2) 74 | ) 75 | return cost 76 | 77 | def train(self, dataset_bow, questions, num_words): 78 | for epoch in xrange(self.n_epochs): 79 | costs = [] 80 | 81 | random.shuffle(questions) 82 | for i, question in enumerate(questions): 83 | article_no = question[0] 84 | line_no = question[1] 85 | question_phi = question[2] 86 | correct_stmt = question[4] 87 | seq = [i for i in range(line_no)] 88 | del seq[correct_stmt] 89 | false_stmt = random.choice(seq) 90 | #print article_no, line_no, correct_stmt, false_stmt 91 | phi_x = np.zeros((self.n_D,)) 92 | phi_x[:num_words] = question_phi 93 | phi_f1 = np.zeros((self.n_D,)) 94 | phi_f1[num_words:2*num_words] = dataset_bow[article_no][correct_stmt] 95 | phi_f1bar = np.zeros((self.n_D,)) 96 | phi_f1bar[num_words:2*num_words] = dataset_bow[article_no][false_stmt] 97 | 98 | if article_no == 0 and line_no == 2: 99 | corr_score = self.predict_function(phi_x, phi_f1) 100 | fals_score = self.predict_function(phi_x, phi_f1bar) 101 | print "[BEFORE] corr score: %f, false score: %f" % (corr_score, fals_score) 102 | 103 | cost = self.train_function(phi_x, phi_f1, phi_f1bar) 104 | costs.append(cost) 105 | 106 | if article_no == 0 and line_no == 2: 107 | corr_score = self.predict_function(phi_x, phi_f1) 108 | fals_score = self.predict_function(phi_x, phi_f1bar) 109 | print "[ AFTER] corr score: %f, false score: %f" % (corr_score, fals_score) 110 | 111 | if epoch % 100 == 0: 112 | # print 'Epoch %i/%i' % (epoch + 1, self.n_epochs), np.mean(costs) 113 | sys.stdout.flush() 114 | 115 | # print np.mean(costs), np.mean(self.U_O.get_value()), np.max(self.U_O.get_value()), np.min(self.U_O.get_value()) 116 | 117 | def predict(self, dataset, questions): 118 | correct_answers = 0 119 | wrong_answers = 0 120 | for i, question in enumerate(questions): 121 | article_no = question[0] 122 | line_no = question[1] 123 | question_phi = question[2] 124 | correct_stmt = question[4] 125 | 126 | phi_x = np.zeros((self.n_D,)) 127 | phi_x[:num_words] = question_phi 128 | 129 | answer = -1 130 | max_score = -99999 131 | for i in range(line_no): 132 | phi_f = np.zeros((self.n_D,)) 133 | phi_f[num_words:2*num_words] = dataset[article_no][i] 134 | 135 | #print phi_x, phi_f 136 | score = 
self.predict_function(phi_x, phi_f) 137 | if answer == -1 or score > max_score: 138 | max_score = score 139 | answer = i 140 | 141 | if answer == correct_stmt: 142 | correct_answers += 1 143 | else: 144 | wrong_answers += 1 145 | 146 | print '%d correct, %d wrong' % (correct_answers, wrong_answers) 147 | 148 | if __name__ == "__main__": 149 | training_dataset = sys.argv[1] 150 | test_dataset = training_dataset.replace('train', 'test') 151 | 152 | dataset, questions, word_to_id, num_words = parse_dataset(training_dataset) 153 | memNN = MemNN(n_words=num_words, n_embedding=100, lr=0.01, n_epochs=10, margin=1.0, word_to_id=word_to_id) 154 | memNN.train(dataset, questions) 155 | 156 | test_dataset, test_questions, _, _ = parse_dataset(test_dataset, word_id=num_words, word_to_id=word_to_id, update_word_ids=False) 157 | memNN.predict(test_dataset, test_questions) 158 | -------------------------------------------------------------------------------- /memnn_theano_v2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano 3 | import theano.tensor as T 4 | import sys, random, pprint 5 | 6 | from theano_util import * 7 | 8 | class MemNN: 9 | def __init__(self, n_words=1000, n_embedding=100, lr=0.01, margin=0.1, n_epochs=100, momentum=0.9, word_to_id=None): 10 | self.n_embedding = n_embedding 11 | self.lr = lr 12 | self.momentum = momentum 13 | self.margin = margin 14 | self.n_epochs = n_epochs 15 | self.n_words = n_words 16 | self.n_D = 3 * self.n_words + 3 17 | 18 | self.word_to_id = word_to_id 19 | self.id_to_word = dict((v, k) for k, v in word_to_id.iteritems()) 20 | 21 | # Question 22 | phi_x = T.vector('phi_x') 23 | 24 | # True statements 25 | phi_f1_1 = T.vector('phi_f1_1') 26 | phi_f2_1 = T.vector('phi_f2_1') 27 | 28 | # False statements 29 | phi_f1_2 = T.vector('phi_f1_2') 30 | phi_f2_2 = T.vector('phi_f2_2') 31 | 32 | # Supporting memories 33 | phi_m0 = T.vector('phi_m0') 34 | phi_m1 = T.vector('phi_m1') 35 | 36 | # True word 37 | phi_r = T.vector('phi_r') 38 | 39 | # False words 40 | phi_rbars = T.matrix('phi_rbars') 41 | 42 | self.U_O = init_shared_normal(n_embedding, self.n_D, 0.01) 43 | self.U_R = init_shared_normal(n_embedding, self.n_D, 0.01) 44 | 45 | # Total S_R cost for all sampled words 46 | tot_sr_cost = T.scalar('sr_cost') 47 | 48 | cost = self.calc_cost(phi_x, phi_f1_1, phi_f1_2, phi_f2_1, phi_f2_2, phi_m0, phi_m1, phi_r, phi_rbars, tot_sr_cost) 49 | params = [self.U_O, self.U_R] 50 | gradient = T.grad(cost, params) 51 | 52 | l_rate = T.scalar('l_rate') 53 | 54 | updates=[] 55 | for param, gparam in zip(params, gradient): 56 | param_update = theano.shared(param.get_value()*0., broadcastable=param.broadcastable) 57 | updates.append((param, param - param_update * l_rate)) 58 | updates.append((param_update, self.momentum*param_update + (1. 
- self.momentum)*gparam)) 59 | 60 | self.train_function = theano.function( 61 | inputs = [phi_x, phi_f1_1, phi_f1_2, phi_f2_1, phi_f2_2, \ 62 | phi_m0, phi_m1, phi_r, phi_rbars, \ 63 | theano.Param(l_rate, default=self.lr), \ 64 | theano.Param(tot_sr_cost, default=0.0)], 65 | outputs = cost, 66 | updates = updates) 67 | 68 | # Candidate statement for prediction 69 | phi_f = T.vector('phi_f') 70 | 71 | score_o = self.calc_score_o(phi_x, phi_f) 72 | self.predict_function_o = theano.function(inputs = [phi_x, phi_f], outputs = score_o) 73 | 74 | score_r = self.calc_score_r(phi_x, phi_f) 75 | self.predict_function_r = theano.function(inputs = [phi_x, phi_f], outputs = score_r) 76 | 77 | def calc_score_o(self, phi_x, phi_y_yp_t): 78 | return T.dot(self.U_O.dot(phi_x), self.U_O.dot(phi_y_yp_t)) 79 | 80 | def calc_score_r(self, phi_x, phi_y): 81 | return T.dot(self.U_R.dot(phi_x), self.U_R.dot(phi_y)) 82 | 83 | # phi_f1_1 = phi_f1 - phi_f1bar + phi_t1_1 84 | # phi_f1_2 = phi_f1bar - phi_f1 + phi_t1_2 85 | def calc_cost(self, phi_x, phi_f1_1, phi_f1_2, phi_f2_1, phi_f2_2, phi_m0, phi_m1, phi_r, phi_rbars, tot_sr_cost): 86 | score1_1 = self.calc_score_o(phi_x, phi_f1_1) 87 | score1_2 = self.calc_score_o(phi_x, phi_f1_2) 88 | 89 | score2_1 = self.calc_score_o(phi_x + phi_m0, phi_f2_1) 90 | score2_2 = self.calc_score_o(phi_x + phi_m0, phi_f2_2) 91 | 92 | s_o_cost = ( 93 | T.maximum(0, self.margin - score1_1) + T.maximum(0, self.margin + score1_2) + 94 | T.maximum(0, self.margin - score2_1) + T.maximum(0, self.margin + score2_2) 95 | ) 96 | 97 | def compute_sr_cost(phi_rbar, correct_score): 98 | false_score = self.calc_score_r(phi_x + phi_m0 + phi_m1, phi_rbar) 99 | return T.maximum(0, self.margin - correct_score + false_score) 100 | 101 | correct_score3 = self.calc_score_r(phi_x + phi_m0 + phi_m1, phi_r) 102 | sr_costs, sr_updates = theano.reduce(lambda phi_rbar, tot_sr_cost: tot_sr_cost + compute_sr_cost(phi_rbar, correct_score3), 103 | sequences=phi_rbars, outputs_info=[{'initial': tot_sr_cost}]) 104 | 105 | cost = s_o_cost + sr_costs 106 | return cost 107 | 108 | def construct_phi(self, phi_type, bow=None, word_id=None, ids=None): 109 | # type 0: question (phi_x) 110 | # type 1: supporting memory (phi_m*) 111 | # type 2: candidate memory (phi_y) 112 | # type 3: word vector 113 | # type 4: write-time features 114 | assert(phi_type >= 0 and phi_type < 5) 115 | phi = np.zeros((3*self.n_words + 3,)) 116 | if phi_type < 3: 117 | assert(bow is not None) 118 | phi[phi_type*self.n_words:(phi_type+1)*self.n_words] = bow 119 | elif phi_type == 3: 120 | assert(word_id != None and word_id < self.n_words) 121 | phi[2*self.n_words + word_id] = 1 122 | else: 123 | assert(ids != None and len(ids) == 3) 124 | if ids[0] > ids[1]: phi[3*self.n_words] = 1 125 | if ids[0] > ids[2]: phi[3*self.n_words+1] = 1 126 | if ids[1] > ids[2]: phi[3*self.n_words+2] = 1 127 | return phi 128 | 129 | # returns (phi_y - phi_yp + phi_t) 130 | def construct_wt_phi(self, index_x, index_y, index_yp, y, yp): 131 | phi_y = self.construct_phi(2, bow=y) 132 | phi_yp = self.construct_phi(2, bow=yp) 133 | phi_t = self.construct_phi(4, ids=[index_x, index_y, index_yp]) 134 | return phi_y - phi_yp + phi_t 135 | 136 | def neg_sample(self, c, num): 137 | assert(c < num) 138 | assert(num > 1) 139 | f = random.randint(0, num-2) 140 | if f == c: 141 | f = num-1 142 | return f 143 | 144 | def find_m0(self, index_x, phi_x, statements, ignore=None): 145 | max_score = float("-inf") 146 | index_m0 = 0 147 | m0 = statements[0] 148 | for i in 
xrange(1,len(statements)): 149 | if ignore and i == ignore: 150 | continue 151 | 152 | s = statements[i] 153 | phi_s = self.construct_wt_phi(index_x, i, index_m0, s, m0) 154 | 155 | if self.predict_function_o(phi_x, phi_s) >= 0: 156 | index_m0 = i 157 | m0 = s 158 | 159 | return index_m0, m0 160 | 161 | def train(self, dataset_bow, questions, lr_schedule=None): 162 | l_rate = self.lr 163 | for epoch in xrange(self.n_epochs): 164 | costs = [] 165 | 166 | if lr_schedule != None and epoch in lr_schedule: 167 | l_rate = lr_schedule[epoch] 168 | 169 | random.shuffle(questions) 170 | for i, question in enumerate(questions): 171 | article_no = question[0] 172 | article = dataset_bow[article_no] 173 | line_no = question[1] 174 | question_phi = question[2] 175 | correct_stmts = question[4].split(' ') 176 | correct_stmt1 = int(correct_stmts[0]) 177 | correct_stmt2 = int(correct_stmts[1]) 178 | 179 | if line_no <= 1: 180 | continue 181 | 182 | # The question 183 | phi_x = self.construct_phi(0, bow=question_phi) 184 | 185 | # Find m0 186 | index_m0, m0 = self.find_m0(line_no, phi_x, article[:line_no]) 187 | phi_m0 = self.construct_phi(1, bow=m0) 188 | 189 | # Find m1 190 | index_m1, m1 = self.find_m0(index_m0, phi_x + phi_m0, article[:line_no], ignore=index_m0) 191 | phi_m1 = self.construct_phi(1, bow=m1) 192 | 193 | # False statement 1 194 | false_stmt1 = index_m0 195 | if false_stmt1 == correct_stmt1: 196 | false_stmt1 = self.neg_sample(correct_stmt1, line_no) 197 | phi_f1_1 = self.construct_wt_phi(line_no, correct_stmt1, false_stmt1, article[correct_stmt1], article[false_stmt1]) 198 | phi_f1_2 = self.construct_wt_phi(line_no, false_stmt1, correct_stmt1, article[false_stmt1], article[correct_stmt1]) 199 | 200 | # False statement 2 201 | false_stmt2 = index_m1 202 | if false_stmt2 == correct_stmt2: 203 | false_stmt2 = self.neg_sample(correct_stmt2, line_no) 204 | phi_f2_1 = self.construct_wt_phi(line_no, correct_stmt2, false_stmt2, article[correct_stmt2], article[false_stmt2]) 205 | phi_f2_2 = self.construct_wt_phi(line_no, false_stmt2, correct_stmt2, article[false_stmt2], article[correct_stmt2]) 206 | 207 | # Correct word 208 | correct_word = question[3] 209 | phi_r = self.construct_phi(3, word_id=correct_word) 210 | 211 | # False word 212 | false_word_ids = [i for i in range(self.n_words)] 213 | del false_word_ids[correct_word] 214 | # Find the highest ranking word, if it isnt the correct word, add it to list 215 | # Possible that this word will be added twice, but that is okay 216 | false_word1, score = self.find_word(phi_x + phi_m0 + phi_m1, verbose=False) 217 | if false_word1 != correct_word: 218 | false_word_ids.insert(0, false_word1) 219 | # Clip no. 
of samples to 20 220 | false_word_ids = false_word_ids[:min(20,len(false_word_ids))] 221 | phi_rbars = np.vstack(tuple(map(lambda word_id: self.construct_phi(3, word_id=word_id), false_word_ids))) 222 | 223 | if article_no == 1 and line_no == 12: 224 | print '[SAMPLE] %s\t%s' % (self.id_to_word[correct_word], self.id_to_word[false_word1]) 225 | w, score = self.find_word(phi_x + phi_m0 + phi_m1, verbose=False) 226 | print "[BEFORE] %.3f\t%.3f\t%.3f\t%.3f\tm0:%d\tm1:%d\ta:%s\ts:%.3f\tc:%s" % ( 227 | self.predict_function_o(phi_x, phi_f1_1), 228 | self.predict_function_o(phi_x, phi_f1_2), 229 | self.predict_function_o(phi_x + phi_m0, phi_f2_1), 230 | self.predict_function_o(phi_x + phi_m0, phi_f2_2), 231 | index_m0, index_m1, 232 | self.id_to_word[w], score, self.id_to_word[correct_word] 233 | ) 234 | 235 | cost = self.train_function(phi_x, phi_f1_1, phi_f1_2, phi_f2_1, phi_f2_2, \ 236 | phi_m0, phi_m1, phi_r, phi_rbars, \ 237 | l_rate) 238 | costs.append(cost) 239 | 240 | if article_no == 1 and line_no == 12: 241 | index_m0, m0 = self.find_m0(line_no, phi_x, article[:line_no]) 242 | phi_m0 = self.construct_phi(1, bow=m0) 243 | index_m1, m1 = self.find_m0(index_m0, phi_x + phi_m0, article[:line_no], ignore=index_m0) 244 | phi_m1 = self.construct_phi(1, bow=m1) 245 | w, score = self.find_word(phi_x + phi_m0 + phi_m1, verbose=False) 246 | print "[ AFTER] %.3f\t%.3f\t%.3f\t%.3f\tm0:%d\tm1:%d\ta:%s\ts:%.3f\tc:%s" % ( 247 | self.predict_function_o(phi_x, phi_f1_1), 248 | self.predict_function_o(phi_x, phi_f1_2), 249 | self.predict_function_o(phi_x + phi_m0, phi_f2_1), 250 | self.predict_function_o(phi_x + phi_m0, phi_f2_2), 251 | index_m0, index_m1, 252 | self.id_to_word[w], score, self.id_to_word[correct_word] 253 | ) 254 | 255 | print "Epoch %d: %f" % (epoch, np.mean(costs)) 256 | 257 | def find_word(self, phi_x, verbose=False): 258 | max_score = float("-inf") 259 | best_word = -1 260 | for i in xrange(self.n_words): 261 | phi_r = self.construct_phi(3, word_id=i) 262 | score = self.predict_function_r(phi_x, phi_r) 263 | if verbose: 264 | print '[ FIND] w:%s\ts:%.3f' % ( 265 | self.id_to_word[i], 266 | score 267 | ) 268 | if score > max_score: 269 | max_score = score 270 | best_word = i 271 | 272 | assert(best_word >= 0) 273 | return best_word, score 274 | 275 | def predict(self, dataset, questions): 276 | correct_answers = 0 277 | wrong_answers = 0 278 | fake_correct_answers = 0 279 | for i, question in enumerate(questions): 280 | article_no = question[0] 281 | line_no = question[1] 282 | question_phi = question[2] 283 | correct = question[3] 284 | 285 | phi_x = self.construct_phi(0, bow=question_phi) 286 | 287 | statements = dataset[article_no] 288 | 289 | phi_m0 = None 290 | phi_m1 = None 291 | if len(statements) == 0: 292 | print "Stupid question" 293 | continue 294 | elif len(statements) == 1: 295 | print "Stupid question?" 
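# degenerate article with a single statement: reuse that statement as both
# supporting memories m0 and m1 rather than running the two hop-selection passes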
296 | phi_m0 = self.construct_phi(1, statements[0]) 297 | phi_m1 = self.construct_phi(1, statements[0]) 298 | else: 299 | index_m0, m0 = self.find_m0(line_no, phi_x, statements[:line_no]) 300 | phi_m0 = self.construct_phi(1, m0) 301 | index_m1, m1 = self.find_m0(index_m0, phi_x + phi_m0, statements[:line_no], ignore=index_m0) 302 | phi_m1 = self.construct_phi(1, m1) 303 | 304 | c1 = int(question[4].split(' ')[0]) 305 | c2 = int(question[4].split(' ')[1]) 306 | if (index_m0 == c1 or index_m0 == c2) and (index_m1 == c1 or index_m1 == c2): 307 | fake_correct_answers += 1 308 | 309 | if article_no <= 2: 310 | predicted, _ = self.find_word(phi_x + phi_m0 + phi_m1, verbose=False) 311 | print "%d, %d, %d: predicted: %s, correct: %s" % (i, article_no, line_no, self.id_to_word[predicted], self.id_to_word[correct]) 312 | else: 313 | predicted, _ = self.find_word(phi_x + phi_m0 + phi_m1) 314 | if predicted == correct: 315 | correct_answers += 1 316 | else: 317 | wrong_answers += 1 318 | 319 | print '%d correct, %d wrong, %d fake_correct' % (correct_answers, wrong_answers, fake_correct_answers) 320 | 321 | if __name__ == "__main__": 322 | train_file = sys.argv[1] 323 | test_file = train_file.replace('train', 'test') 324 | 325 | train_dataset, train_questions, word_to_id, num_words = parse_dataset(train_file) 326 | test_dataset, test_questions, _, _ = parse_dataset(test_file, word_id=num_words, word_to_id=word_to_id, update_word_ids=False) 327 | 328 | if len(sys.argv) > 2: 329 | n_epochs = int(sys.argv[2]) 330 | else: 331 | n_epochs = 10 332 | 333 | memNN = MemNN(n_words=num_words, n_embedding=100, lr=0.01, n_epochs=n_epochs, margin=0.1, word_to_id=word_to_id) 334 | # memNN.train(train_dataset, train_questions, lr_schedule=dict([(0, 0.01), (20, 0.005), (50, 0.001)])) 335 | memNN.train(train_dataset, train_questions) 336 | memNN.predict(test_dataset, test_questions) 337 | -------------------------------------------------------------------------------- /memnn_theano_v3.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano 3 | import theano.tensor as T 4 | import sys, random, pprint 5 | 6 | from theano_util import * 7 | from keras.activations import tanh, hard_sigmoid 8 | from keras.initializations import glorot_uniform, orthogonal 9 | from keras.utils.theano_utils import shared_zeros, alloc_zeros_matrix 10 | 11 | def inspect_inputs(i, node, fn): 12 | print i, node, "inputs:", [input[0] for input in fn.inputs], 13 | 14 | def inspect_outputs(i, node, fn): 15 | print i, node, "outputs:", [output[0] for output in fn.outputs] 16 | 17 | class MemNN: 18 | def __init__(self, n_words=1000, n_embedding=100, lr=0.01, margin=0.1, momentum=0.9, word_to_id=None): 19 | self.n_embedding = n_embedding 20 | self.n_lstm_embed = n_embedding 21 | self.word_embed = n_embedding 22 | self.lr = lr 23 | self.momentum = momentum 24 | self.margin = margin 25 | self.n_words = n_words 26 | self.n_D = 3 * self.n_words + 3 27 | 28 | self.word_to_id = word_to_id 29 | self.id_to_word = dict((v, k) for k, v in word_to_id.iteritems()) 30 | 31 | # Question 32 | x = T.vector('x') 33 | phi_x = T.vector('phi_x') 34 | 35 | # True statements 36 | phi_f1_1 = T.vector('phi_f1_1') 37 | phi_f2_1 = T.vector('phi_f2_1') 38 | 39 | # False statements 40 | phi_f1_2 = T.vector('phi_f1_2') 41 | phi_f2_2 = T.vector('phi_f2_2') 42 | 43 | # Supporting memories 44 | m0 = T.vector('m0') 45 | m1 = T.vector('m1') 46 | phi_m0 = T.vector('phi_m0') 47 | phi_m1 = T.vector('phi_m1') 48 | 49 | # 
True word 50 | r = T.vector('r') 51 | 52 | # Word sequence 53 | words = T.ivector('words') 54 | 55 | # Scoring function 56 | self.U_O = init_shared_normal(n_embedding, self.n_D, 0.01) 57 | 58 | # Word embedding 59 | self.L = glorot_uniform((self.n_words, self.word_embed)) 60 | self.Lprime = glorot_uniform((self.n_words, self.n_lstm_embed)) 61 | 62 | # LSTM 63 | self.W_i = glorot_uniform((self.word_embed, self.n_lstm_embed)) 64 | self.U_i = orthogonal((self.n_lstm_embed, self.n_lstm_embed)) 65 | self.b_i = shared_zeros((self.n_lstm_embed)) 66 | 67 | self.W_f = glorot_uniform((self.word_embed, self.n_lstm_embed)) 68 | self.U_f = orthogonal((self.n_lstm_embed, self.n_lstm_embed)) 69 | self.b_f = shared_zeros((self.n_lstm_embed)) 70 | 71 | self.W_c = glorot_uniform((self.word_embed, self.n_lstm_embed)) 72 | self.U_c = orthogonal((self.n_lstm_embed, self.n_lstm_embed)) 73 | self.b_c = shared_zeros((self.n_lstm_embed)) 74 | 75 | self.W_o = glorot_uniform((self.word_embed, self.n_lstm_embed)) 76 | self.U_o = orthogonal((self.n_lstm_embed, self.n_lstm_embed)) 77 | self.b_o = shared_zeros((self.n_lstm_embed)) 78 | 79 | mem_cost = self.calc_cost(phi_x, phi_f1_1, phi_f1_2, phi_f2_1, phi_f2_2, phi_m0) 80 | 81 | lstm_output = self.lstm_cost(words) 82 | self.predict_function_r = theano.function(inputs = [words], outputs = lstm_output, allow_input_downcast=True) 83 | 84 | lstm_cost = -T.sum(T.mul(r, T.log(lstm_output))) 85 | 86 | cost = mem_cost + lstm_cost 87 | 88 | params = [ 89 | self.U_O, 90 | self.W_i, self.U_i, self.b_i, 91 | self.W_f, self.U_f, self.b_f, 92 | self.W_c, self.U_c, self.b_c, 93 | self.W_o, self.U_o, self.b_o, 94 | self.L, self.Lprime 95 | ] 96 | 97 | grads = T.grad(cost, params) 98 | 99 | # Parameter updates 100 | updates = self.get_updates(params, grads, method='adagrad') 101 | 102 | l_rate = T.scalar('l_rate') 103 | 104 | # Theano functions 105 | self.train_function = theano.function( 106 | inputs = [phi_x, phi_f1_1, phi_f1_2, phi_f2_1, phi_f2_2, 107 | phi_m0, r, words, 108 | theano.Param(l_rate, default=self.lr)], 109 | outputs = cost, 110 | updates = updates, 111 | on_unused_input='warn', 112 | allow_input_downcast=True, 113 | ) 114 | #mode='FAST_COMPILE') 115 | #mode='DebugMode') 116 | #mode=theano.compile.MonitorMode(pre_func=inspect_inputs,post_func=inspect_outputs)) 117 | 118 | # Candidate statement for prediction 119 | phi_f = T.vector('phi_f') 120 | 121 | score_o = self.calc_score_o(phi_x, phi_f) 122 | self.predict_function_o = theano.function(inputs = [phi_x, phi_f], outputs = score_o) 123 | 124 | def get_updates(self, params, grads, method=None, **kwargs): 125 | self.rho = 0.95 126 | self.epsilon = 1e-6 127 | 128 | accumulators = [shared_zeros(p.get_value().shape) for p in params] 129 | updates=[] 130 | 131 | if method == 'adadelta': 132 | print "Using ADADELTA" 133 | delta_accumulators = [shared_zeros(p.get_value().shape) for p in params] 134 | for p, g, a, d_a in zip(params, grads, accumulators, delta_accumulators): 135 | new_a = self.rho * a + (1 - self.rho) * g ** 2 # update accumulator 136 | updates.append((a, new_a)) 137 | 138 | # use the new accumulator and the *old* delta_accumulator 139 | update = g * T.sqrt(d_a + self.epsilon) / T.sqrt(new_a + self.epsilon) 140 | 141 | new_p = p - self.lr * update 142 | updates.append((p, new_p)) # apply constraints 143 | 144 | # update delta_accumulator 145 | new_d_a = self.rho * d_a + (1 - self.rho) * update ** 2 146 | updates.append((d_a, new_d_a)) 147 | 148 | 149 | elif method == 'adam': 150 | # unimplemented 151 | print 
"Using ADAM" 152 | 153 | elif method == 'adagrad': 154 | print "Using ADAGRAD" 155 | for p, g, a in zip(params, grads, accumulators): 156 | new_a = a + g ** 2 # update accumulator 157 | updates.append((a, new_a)) 158 | 159 | new_p = p - self.lr * g / T.sqrt(new_a + self.epsilon) 160 | updates.append((p, new_p)) # apply constraints 161 | 162 | else: # Default 163 | print "Using MOMENTUM" 164 | l_rate = kwargs['l_rate'] 165 | for param, gparam in zip(params, gradient): 166 | param_update = theano.shared(param.get_value()*0., broadcastable=param.broadcastable) 167 | updates.append((param, param - param_update * l_rate)) 168 | updates.append((param_update, self.momentum*param_update + (1. - self.momentum)*gparam)) 169 | 170 | return updates 171 | 172 | def _step(self, 173 | xi_t, xf_t, xc_t, xo_t, 174 | h_tm1, c_tm1, 175 | u_i, u_f, u_o, u_c): 176 | 177 | i_t = hard_sigmoid(xi_t + T.dot(h_tm1, u_i)) 178 | f_t = hard_sigmoid(xf_t + T.dot(h_tm1, u_f)) 179 | c_t = f_t * c_tm1 + i_t * tanh(xc_t + T.dot(h_tm1, u_c)) 180 | o_t = hard_sigmoid(xo_t + T.dot(h_tm1, u_o)) 181 | h_t = o_t * tanh(c_t) 182 | return h_t, c_t 183 | 184 | # words: word index in n_words 185 | def lstm_cost(self, words): 186 | x = self.L[words] 187 | 188 | # Each element of x is (word_embed,) shape 189 | xi = T.dot(x, self.W_i) + self.b_i 190 | xf = T.dot(x, self.W_f) + self.b_f 191 | xc = T.dot(x, self.W_c) + self.b_c 192 | xo = T.dot(x, self.W_o) + self.b_o 193 | 194 | [outputs, memories], updates = theano.scan( 195 | self._step, 196 | sequences=[xi, xf, xc, xo], 197 | outputs_info=[ 198 | alloc_zeros_matrix(self.n_lstm_embed), 199 | alloc_zeros_matrix(self.n_lstm_embed), 200 | ], 201 | non_sequences=[ 202 | self.U_i, self.U_f, self.U_o, self.U_c, 203 | ], 204 | truncate_gradient=-1 205 | ) 206 | 207 | r = T.dot(self.Lprime, outputs[-1]) 208 | 209 | return T.nnet.softmax(r) 210 | 211 | def calc_score_o(self, phi_x, phi_y_yp_t): 212 | return T.dot(self.U_O.dot(phi_x), self.U_O.dot(phi_y_yp_t)) 213 | 214 | # phi_f1_1 = phi_f1 - phi_f1bar + phi_t1_1 215 | # phi_f1_2 = phi_f1bar - phi_f1 + phi_t1_2 216 | def calc_cost(self, phi_x, phi_f1_1, phi_f1_2, phi_f2_1, phi_f2_2, phi_m0): 217 | score1_1 = self.calc_score_o(phi_x, phi_f1_1) 218 | score1_2 = self.calc_score_o(phi_x, phi_f1_2) 219 | 220 | score2_1 = self.calc_score_o(phi_x + phi_m0, phi_f2_1) 221 | score2_2 = self.calc_score_o(phi_x + phi_m0, phi_f2_2) 222 | 223 | s_o_cost = ( 224 | T.maximum(0, self.margin - score1_1) + T.maximum(0, self.margin + score1_2) + 225 | T.maximum(0, self.margin - score2_1) + T.maximum(0, self.margin + score2_2) 226 | ) 227 | 228 | return s_o_cost 229 | 230 | def construct_phi(self, phi_type, bow=None, word_id=None, ids=None): 231 | # type 0: question (phi_x) 232 | # type 1: supporting memory (phi_m*) 233 | # type 2: candidate memory (phi_y) 234 | # type 3: word vector 235 | # type 4: write-time features 236 | # type 5: 0s 237 | assert(phi_type >= 0 and phi_type < 6) 238 | phi = np.zeros((3*self.n_words + 3,)) 239 | if phi_type < 3: 240 | assert(bow is not None) 241 | phi[phi_type*self.n_words:(phi_type+1)*self.n_words] = bow 242 | elif phi_type == 3: 243 | assert(word_id != None and word_id < self.n_words) 244 | phi[2*self.n_words + word_id] = 1 245 | elif phi_type == 5: 246 | pass 247 | else: 248 | assert(ids != None and len(ids) == 3) 249 | if ids[0] > ids[1]: phi[3*self.n_words] = 1 250 | if ids[0] > ids[2]: phi[3*self.n_words+1] = 1 251 | if ids[1] > ids[2]: phi[3*self.n_words+2] = 1 252 | return phi 253 | 254 | def make_one_hot(self, index): 
255 | v = np.zeros((self.n_words)) 256 | v[index] = 1.0 257 | return v 258 | 259 | # returns (phi_y - phi_yp + phi_t) 260 | def construct_wt_phi(self, index_x, index_y, index_yp, y, yp): 261 | phi_y = self.construct_phi(2, bow=y) 262 | phi_yp = self.construct_phi(2, bow=yp) 263 | phi_t = self.construct_phi(4, ids=[index_x, index_y, index_yp]) 264 | return phi_y - phi_yp + phi_t 265 | 266 | def neg_sample(self, c, num): 267 | assert(c < num) 268 | assert(num > 1) 269 | f = random.randint(0, num-2) 270 | if f == c: 271 | f = num-1 272 | return f 273 | 274 | def find_m0(self, index_x, phi_x, statements, ignore=None): 275 | max_score = float("-inf") 276 | index_m0 = 0 277 | m0 = statements[0] 278 | for i in xrange(1,len(statements)): 279 | if ignore and i == ignore: 280 | continue 281 | 282 | s = statements[i] 283 | phi_s = self.construct_wt_phi(index_x, i, index_m0, s, m0) 284 | 285 | if self.predict_function_o(phi_x, phi_s) >= 0: 286 | index_m0 = i 287 | m0 = s 288 | 289 | return index_m0, m0 290 | 291 | def train(self, dataset_seq, dataset_bow, questions, n_epochs=100, lr_schedule=None): 292 | l_rate = self.lr 293 | for epoch in xrange(n_epochs): 294 | costs = [] 295 | 296 | if lr_schedule != None and epoch in lr_schedule: 297 | l_rate = lr_schedule[epoch] 298 | 299 | random.shuffle(questions) 300 | for i, question in enumerate(questions): 301 | article_no = question[0] 302 | article = dataset_bow[article_no] 303 | line_no = question[1] 304 | question_phi = question[2] 305 | correct_stmts = question[4].split(' ') 306 | correct_stmt1 = int(correct_stmts[0]) 307 | is_single_statement = len(correct_stmts) == 1 308 | correct_stmt2 = None 309 | if not is_single_statement: 310 | correct_stmt2 = int(correct_stmts[1]) 311 | question_seq = question[-1] 312 | 313 | if line_no <= 1: 314 | continue 315 | 316 | # The question 317 | phi_x = self.construct_phi(0, bow=question_phi) 318 | 319 | # Find m0 320 | index_m0, m0 = self.find_m0(line_no, phi_x, article[:line_no]) 321 | phi_m0 = self.construct_phi(1, bow=m0) 322 | 323 | # Find m1 324 | index_m1, m1 = self.find_m0(index_m0, phi_x + phi_m0, article[:line_no], ignore=index_m0) 325 | phi_m1 = self.construct_phi(1, bow=m1) 326 | 327 | # False statement 1 328 | false_stmt1 = index_m0 329 | if false_stmt1 == correct_stmt1: 330 | false_stmt1 = self.neg_sample(correct_stmt1, line_no) 331 | phi_f1_1 = self.construct_wt_phi(line_no, correct_stmt1, false_stmt1, article[correct_stmt1], article[false_stmt1]) 332 | phi_f1_2 = self.construct_wt_phi(line_no, false_stmt1, correct_stmt1, article[false_stmt1], article[correct_stmt1]) 333 | 334 | # False statement 2 335 | phi_f2_1 = None 336 | phi_f2_2 = None 337 | if not is_single_statement: 338 | false_stmt2 = index_m1 339 | if false_stmt2 == correct_stmt2: 340 | false_stmt2 = self.neg_sample(correct_stmt2, line_no) 341 | phi_f2_1 = self.construct_wt_phi(line_no, correct_stmt2, false_stmt2, article[correct_stmt2], article[false_stmt2]) 342 | phi_f2_2 = self.construct_wt_phi(line_no, false_stmt2, correct_stmt2, article[false_stmt2], article[correct_stmt2]) 343 | else: 344 | phi_f2_1 = self.construct_phi(5) 345 | phi_f2_2 = self.construct_phi(5) 346 | 347 | # Correct word 348 | correct_word = question[3] 349 | r = self.make_one_hot(correct_word) 350 | 351 | words = np.asarray(dataset_seq[article_no][index_m0] + dataset_seq[article_no][index_m1] + question_seq) 352 | 353 | cost = self.train_function(phi_x, phi_f1_1, phi_f1_2, phi_f2_1, phi_f2_2, 354 | phi_m0, r, words) 355 | #print "%d: %f" % (i, cost) 356 | 
costs.append(cost) 357 | 358 | print "Epoch %d: %f" % (epoch, np.mean(costs)) 359 | 360 | def find_word(self, words): 361 | probs = self.predict_function_r(words) 362 | return np.argmax(probs) 363 | 364 | def predict(self, dataset_seq, dataset_bow, questions): 365 | correct_answers = 0 366 | wrong_answers = 0 367 | fake_correct_answers = 0 368 | for i, question in enumerate(questions): 369 | article_no = question[0] 370 | line_no = question[1] 371 | question_phi = question[2] 372 | correct = question[3] 373 | question_seq = question[-1] 374 | 375 | x = question_phi 376 | phi_x = self.construct_phi(0, bow=question_phi) 377 | 378 | statements = dataset_bow[article_no] 379 | 380 | phi_m0 = None 381 | phi_m1 = None 382 | if len(statements) == 0: 383 | print "Stupid question" 384 | continue 385 | elif len(statements) == 1: 386 | print "Stupid question?" 387 | phi_m0 = self.construct_phi(1, statements[0]) 388 | phi_m1 = self.construct_phi(1, statements[0]) 389 | else: 390 | index_m0, m0 = self.find_m0(line_no, phi_x, statements[:line_no]) 391 | phi_m0 = self.construct_phi(1, m0) 392 | index_m1, m1 = self.find_m0(index_m0, phi_x + phi_m0, statements[:line_no], ignore=index_m0) 393 | 394 | correct_stmts = question[4].split(' ') 395 | is_single_statement = len(correct_stmts) == 1 396 | c1 = int(correct_stmts[0]) 397 | c2 = int(question[4].split(' ')[1]) if not is_single_statement else None 398 | if (index_m0 == c1 or index_m0 == c2) and (index_m1 == c1 or index_m1 == c2): 399 | fake_correct_answers += 1 400 | 401 | predicted = self.find_word( 402 | np.asarray(dataset_seq[article_no][index_m0] + dataset_seq[article_no][index_m1] + question_seq) 403 | ) 404 | # print 'Correct: %s (%d), Guess: %s (%d)' % (self.id_to_word[correct], correct, self.id_to_word[predicted], predicted) 405 | if predicted == correct: 406 | correct_answers += 1 407 | else: 408 | wrong_answers += 1 409 | 410 | print '%d correct, %d wrong, %d fake_correct' % (correct_answers, wrong_answers, fake_correct_answers) 411 | 412 | def train_weak(self, dataset, questions, n_epochs=100, lr_schedule=None): 413 | l_rate = self.lr 414 | for epoch in xrange(n_epochs): 415 | costs = [] 416 | 417 | if lr_schedule != None and epoch in lr_schedule: 418 | l_rate = lr_schedule[epoch] 419 | 420 | random.shuffle(questions) 421 | for i, question in enumerate(questions): 422 | article_no = question[0] 423 | article = dataset[article_no] 424 | line_no = question[1] 425 | statements_seq = question[2][:-1] 426 | question_seq = question[2][-1] 427 | 428 | if line_no <= 1: 429 | continue 430 | 431 | # Correct word 432 | correct_word = question[3] 433 | 434 | cost = self.train_function(statements_seq, question_seq, correct_word) 435 | 436 | #print "%d: %f" % (i, cost) 437 | costs.append(cost) 438 | 439 | print "Epoch %d: %f" % (epoch, np.mean(costs)) 440 | 441 | def predict_weak(self, dataset, questions): 442 | correct_answers = 0 443 | wrong_answers = 0 444 | for i, question in enumerate(questions): 445 | article_no = question[0] 446 | article = dataset[article_no] 447 | line_no = question[1] 448 | statements_seq = question[2][:-1] 449 | question_seq = question[2][-1] 450 | correct = question[3] 451 | 452 | predicted = self.predict_function( 453 | np.asarray(statements_seq), np.asarray(question_seq) 454 | ) 455 | # print 'Correct: %s (%d), Guess: %s (%d)' % (self.id_to_word[correct], correct, self.id_to_word[predicted], predicted) 456 | if predicted == correct: 457 | correct_answers += 1 458 | else: 459 | wrong_answers += 1 460 | 461 | print '%d correct, %d 
wrong' % (correct_answers, wrong_answers) 462 | 463 | if __name__ == "__main__": 464 | train_file = sys.argv[1] 465 | test_file = train_file.replace('train', 'test') 466 | 467 | train_dataset_seq, train_dataset_bow, train_questions, word_to_id, num_words = parse_dataset(train_file) 468 | test_dataset_seq, test_dataset_bow, test_questions, _, _ = parse_dataset(test_file, word_id=num_words, word_to_id=word_to_id, update_word_ids=False) 469 | 470 | if len(sys.argv) > 2: 471 | n_epochs = int(sys.argv[2]) 472 | else: 473 | n_epochs = 10 474 | 475 | memNN = MemNN(n_words=num_words, n_embedding=100, lr=0.01, margin=0.1, word_to_id=word_to_id) 476 | #memNN.train(train_dataset_seq, train_dataset_bow, train_questions, n_epochs=n_epochs, lr_schedule=dict([(0, 0.02), (20, 0.01), (50, 0.005), (80, 0.002)])) 477 | #memNN.train(train_dataset_seq, train_dataset_bow, train_questions, lr_schedule=dict([(0, 0.01), (15, 0.009), (30, 0.007), (50, 0.005), (60, 0.003), (85, 0.001)])) 478 | #memNN.train(train_dataset_seq, train_dataset_bow, train_questions) 479 | #memNN.predict(train_dataset, train_questions) 480 | #memNN.predict(test_dataset_seq, test_dataset_bow, test_questions) 481 | 482 | for i in xrange(20): 483 | memNN.train(train_dataset_seq, train_dataset_bow, train_questions, n_epochs=5) 484 | memNN.predict(test_dataset_seq, test_dataset_bow, test_questions) 485 | -------------------------------------------------------------------------------- /nltk_utils.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.stem.wordnet import WordNetLemmatizer 3 | from nltk.tokenize import word_tokenize 4 | from nltk.corpus import wordnet as wn 5 | 6 | def is_noun(tag): 7 | return tag in ['NN', 'NNS', 'NNP', 'NNPS'] 8 | 9 | def is_verb(tag): 10 | return tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'] 11 | 12 | def is_adverb(tag): 13 | return tag in ['RB', 'RBR', 'RBS'] 14 | 15 | def is_adjective(tag): 16 | return tag in ['JJ', 'JJR', 'JJS'] 17 | 18 | def penn_to_wn(tag): 19 | if is_adjective(tag): 20 | return wn.ADJ 21 | elif is_noun(tag): 22 | return wn.NOUN 23 | elif is_adverb(tag): 24 | return wn.ADV 25 | elif is_verb(tag): 26 | return wn.VERB 27 | return wn.NOUN 28 | 29 | def memoize1(f): 30 | memo = {} 31 | def helper(x): 32 | if x not in memo: 33 | memo[x] = f(x) 34 | return memo[x] 35 | return helper 36 | 37 | def memoize2(f): 38 | memo = {} 39 | def helper(x,y): 40 | if (x,y) not in memo: 41 | memo[(x,y)] = f(x, y) 42 | return memo[(x,y)] 43 | return helper 44 | 45 | def stem_word(word): 46 | return nltk.stem.snowball.EnglishStemmer().stem(word) 47 | 48 | stem_word = memoize1(stem_word) 49 | 50 | def get_lemma(word, tag): 51 | return WordNetLemmatizer().lemmatize(word, tag) 52 | 53 | get_lemma = memoize2(get_lemma) 54 | 55 | def canonicalize_tokens(tokens): 56 | canonical_tokens = [] 57 | tags = nltk.pos_tag(tokens) 58 | for tag in tags: 59 | wn_tag = penn_to_wn(tag[1]) 60 | t = get_lemma(tag[0], wn_tag) 61 | t = stem_word(t) 62 | canonical_tokens.append(t) 63 | return canonical_tokens 64 | -------------------------------------------------------------------------------- /pararth_final_report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pararthshah/qa-memnn/29f087c1d8f9720c02caa715ff6ff0793fde98b8/pararth_final_report.pdf -------------------------------------------------------------------------------- /pararth_milestone.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/pararthshah/qa-memnn/29f087c1d8f9720c02caa715ff6ff0793fde98b8/pararth_milestone.pdf -------------------------------------------------------------------------------- /pos_pruning.py: -------------------------------------------------------------------------------- 1 | from nltk_utils import * 2 | 3 | import nltk 4 | from nltk.corpus import wordnet as wn 5 | 6 | def memoizefirst(f): 7 | memo = {} 8 | def helper(x, y): 9 | if x not in memo: 10 | memo[x] = f(x, y) 11 | return memo[x] 12 | return helper 13 | 14 | def get_noun_set(article, tokens): 15 | tags = nltk.pos_tag(tokens) 16 | nouns = set( 17 | map( 18 | lambda x: x[0], 19 | filter( 20 | lambda x: x[1] == wn.NOUN, 21 | map(lambda x: (x[0], penn_to_wn(x[1])), tags), 22 | ) 23 | ) 24 | ) 25 | return nouns 26 | 27 | get_noun_set = memoizefirst(get_noun_set) 28 | 29 | def prune_statements(dataset, questions, debug=True): 30 | total_old = 0 31 | total_new = 0 32 | 33 | for i in range(len(questions)): 34 | question = questions[i] 35 | new_statements = [] 36 | old_statements = question[2] 37 | 38 | # Keep only statements which have at least 1 common noun 39 | q = question[3] 40 | q_nouns = get_noun_set('|'.join(q), q) 41 | 42 | for s in old_statements: 43 | s_nouns = get_noun_set('|'.join(s), s) 44 | if len(s_nouns.intersection(q_nouns)) > 0: 45 | new_statements.append(s) 46 | 47 | questions[i][2] = new_statements 48 | total_old += len(old_statements) 49 | total_new += len(new_statements) 50 | 51 | if debug and i < 3: 52 | print "Question: ", q, "Statements:\n", old_statements, "\n", new_statements, "\nbefore %d after %d" % (len(old_statements), len(new_statements)) 53 | 54 | #print("Before %d After %d" % (total_old, total_new)) 55 | return questions 56 | -------------------------------------------------------------------------------- /qa_dataset_parser.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from theano_util import * 4 | #from wordvec_pruning import prune_statements 5 | from pos_pruning import prune_statements 6 | 7 | from nltk_utils import * 8 | 9 | def only_words(line): 10 | ps = re.sub(r'[^a-zA-Z0-9]', r' ', line) 11 | ws = re.sub(r'(\W)', r' \1 ', ps) # Put spaces around punctuations 12 | ns = re.sub(r'(\d+)', r' ', ws) # Put spaces around numbers 13 | hs = re.sub(r'-', r' ', ns) # Replace hyphens with space 14 | rs = re.sub(r' +', r' ', hs) # Reduce multiple spaces into 1 15 | return rs 16 | 17 | def clean_sentence(line): 18 | ps = re.sub(r'[^a-zA-Z0-9\.\?\!]', ' ', line) # Split on punctuations and hex characters 19 | ws = re.sub(r'(\W)', r' \1 ', ps) # Put spaces around punctuations 20 | ns = re.sub(r'(\d+)', r' ', ws) # Put spaces around numbers 21 | hs = re.sub(r'-', r' ', ns) # Replace hyphens with space 22 | rs = re.sub(r' +', r' ', hs) # Reduce multiple spaces into 1 23 | return rs 24 | 25 | def get_sentences(line): 26 | ps = re.sub(r'[^a-zA-Z0-9\.\?\!]', ' ', line) # Split on punctuations and hex characters 27 | s = re.sub(r'(? 
', ws) # Put spaces around numbers 30 | hs = re.sub(r'-', r' ', ns) # Replace hyphens with space 31 | rs = re.sub(r' +', r' ', hs) # Reduce multiple spaces into 1 32 | 33 | return rs.split('\t') 34 | 35 | def parse_qa_dataset(input_dir, word_id=0, word_to_id={}, update_word_ids=True): 36 | dataset = [] 37 | questions = [] 38 | 39 | article_files = set() 40 | print("Parsing questions...") 41 | with open(input_dir + '/question_answer_pairs.txt') as f: 42 | for line in f: 43 | # Skip first line 44 | if 'ArticleFile' in line: 45 | continue 46 | 47 | line = line.strip() 48 | 49 | # Skip empty lines 50 | if len(line) == 0: 51 | continue 52 | 53 | parts = line.split('\t') 54 | if len(parts) != 6: 55 | print("Malformed line: " + line) 56 | continue 57 | 58 | question = parts[1] 59 | answer = parts[2] 60 | answer = canonicalize_tokens([only_words(answer).strip().lower()]) 61 | assert(len(answer) == 1) 62 | answer = answer[0] 63 | 64 | article_name = parts[5] 65 | 66 | # There are other fields in the dataset, use them later if you want 67 | 68 | # This dataset has repeated questions. What to do? 69 | 70 | # Don't answer questions with more than 1 word answers 71 | if len(answer) == 0 or len(answer.split(' ')) > 1: 72 | # Skip for now 73 | continue 74 | 75 | if not update_word_ids and answer not in word_to_id: 76 | continue 77 | 78 | question_parts = question.split('\t') 79 | tokens = clean_sentence(question_parts[0]).strip().split() 80 | tokens = filter(lambda x: len(x.strip()) > 0, tokens) 81 | tokens = map(lambda x: x.lower(), tokens) 82 | tokens = canonicalize_tokens(tokens) 83 | 84 | if not update_word_ids: 85 | tokens = filter(lambda x: x in word_to_id, tokens) 86 | 87 | question_tokens = tokens 88 | if update_word_ids: 89 | for token in (tokens + [answer]): 90 | if token not in word_to_id: 91 | word_to_id[token] = word_id 92 | word_id += 1 93 | 94 | article_no = len(questions) 95 | 96 | article_file = input_dir + '/' + article_name + '.txt.clean' 97 | article_files.add(article_file) 98 | dataset.append(question_tokens) 99 | questions.append([article_no, article_file, None, question_tokens, answer]) 100 | 101 | article_data = {} 102 | print("Parsing articles...") 103 | for article_file in article_files: 104 | # Get all statements in the dataset for this question 105 | 106 | print("Parsing: " + article_file) 107 | s_file = open(article_file) 108 | statements = [] 109 | for statement in s_file: 110 | if len(statement.strip()) == 0: 111 | continue 112 | 113 | sentences = get_sentences(statement.strip()) 114 | 115 | for sentence in sentences: 116 | tokens = sentence.strip().split() 117 | tokens = filter(lambda x: len(x.strip()) > 0, tokens) 118 | tokens = map(lambda x: x.lower(), tokens) 119 | tokens = canonicalize_tokens(tokens) 120 | 121 | if not update_word_ids: 122 | tokens = filter(lambda x: x in word_to_id, tokens) 123 | 124 | article = tokens 125 | statements.append(article) 126 | dataset.append(article) 127 | if update_word_ids: 128 | for token in tokens: 129 | if token not in word_to_id: 130 | word_to_id[token] = word_id 131 | word_id += 1 132 | 133 | article_data[article_file] = statements 134 | 135 | print("Mapping articles to statements...") 136 | print("There are %d questions before deduplication" % len(questions)) 137 | question_set = set() 138 | for i in xrange(len(questions)): 139 | question = questions[i] 140 | question_tuple = tuple(question[3]) 141 | if question_tuple in question_set: 142 | question[0] = None 143 | continue 144 | 145 | question_set.add(question_tuple) 146 | 
question[2] = article_data[question[1]] 147 | 148 | questions = filter(lambda x: x[0] is not None, questions) 149 | print("There are %d questions after deduplication" % len(questions)) 150 | 151 | print("Trying to prune extraneaous statements...") 152 | questions = prune_statements(dataset, questions) 153 | before_prune = len(questions) 154 | questions = filter(lambda x: len(x[2]) > 1, questions) 155 | after_prune = len(questions) 156 | print("Pruning invalidated %d questions", (before_prune - after_prune)) 157 | 158 | print("Final processing...") 159 | questions_seq = map(lambda x: transform_ques_weak(x, word_to_id, word_id), questions) 160 | return dataset, questions_seq, word_to_id, word_id 161 | 162 | import cPickle 163 | import random 164 | 165 | if __name__ == "__main__": 166 | train_file = sys.argv[1] 167 | test_file = sys.argv[2] 168 | 169 | train_dataset, train_questions, word_to_id, num_words = parse_qa_dataset(train_file) 170 | test_dataset, test_questions, word_to_id, num_words = parse_qa_dataset(test_file, word_id=num_words, word_to_id=word_to_id, update_word_ids=False) 171 | 172 | #test_dataset, test_questions, _, _ = parse_dataset_weak(test_file, word_id=num_words, word_to_id=word_to_id, update_word_ids=False) 173 | 174 | # each element of train_questions contains: [article_no, line_no, [lists of indices of statements and question], index of answer word] 175 | #import pprint 176 | #pprint.pprint(word_to_id) 177 | print num_words 178 | 179 | # Pickle!!!! 180 | print("Pickling train...") 181 | f = file(train_file + '/dataset.train.pickle', 'wb') 182 | cPickle.dump((train_dataset, train_questions, word_to_id, num_words), f, protocol=cPickle.HIGHEST_PROTOCOL) 183 | f.close() 184 | 185 | print("Pickling test...") 186 | f = file(test_file + '/dataset.test.pickle', 'wb') 187 | cPickle.dump((test_dataset, test_questions, word_to_id, num_words), f, protocol=cPickle.HIGHEST_PROTOCOL) 188 | f.close() 189 | -------------------------------------------------------------------------------- /theano_util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import re, sys 3 | import theano 4 | import theano.tensor as T 5 | from keras.utils.theano_utils import shared_zeros 6 | 7 | dtype=theano.config.floatX 8 | 9 | def init_shared_normal(num_rows, num_cols, scale=1): 10 | '''Initialize a matrix shared variable with normally distributed 11 | elements.''' 12 | return theano.shared(np.random.normal( 13 | scale=scale, size=(num_rows, num_cols)).astype(dtype)) 14 | 15 | def init_shared_normal_tensor(num_slices, num_rows, num_cols, scale=1): 16 | '''Initialize a matrix shared variable with normally distributed 17 | elements.''' 18 | return theano.shared(np.random.normal( 19 | scale=scale, size=(num_slices, num_rows, num_cols)).astype(dtype)) 20 | 21 | def init_shared_zeros(*shape): 22 | '''Initialize a vector shared variable with zero elements.''' 23 | return theano.shared(np.zeros(shape, dtype=dtype)) 24 | 25 | def make_batches(size, batch_size): 26 | nb_batch = int(np.ceil(size/float(batch_size))) 27 | return [(i*batch_size, min(size, (i+1)*batch_size)) for i in range(0, nb_batch)] 28 | 29 | def maxnorm_constraint(p, m=40): 30 | norms = T.sqrt(T.sum(T.sqr(p))) 31 | desired = T.clip(norms, 0, m) 32 | p = p * (desired / (1e-7 + norms)) 33 | return p 34 | 35 | def get_param_updates(params, grads, lr, method=None, **kwargs): 36 | rho = 0.95 37 | epsilon = 1e-6 38 | 39 | accumulators = [shared_zeros(p.get_value().shape) for p in params] 40 | 
updates=[] 41 | 42 | if 'constraint' in kwargs: 43 | constraint = kwargs['constraint'] 44 | else: 45 | constraint = None 46 | 47 | if method == 'adadelta': 48 | print "Using ADADELTA" 49 | delta_accumulators = [shared_zeros(p.get_value().shape) for p in params] 50 | for p, g, a, d_a in zip(params, grads, accumulators, delta_accumulators): 51 | new_a = rho * a + (1 - rho) * g ** 2 # update accumulator 52 | 53 | # use the new accumulator and the *old* delta_accumulator 54 | update = g * T.sqrt(d_a + epsilon) / T.sqrt(new_a + epsilon) 55 | new_p = p - lr * update 56 | 57 | # update delta_accumulator 58 | new_d_a = rho * d_a + (1 - rho) * update ** 2 59 | 60 | updates.append((p, new_p)) 61 | updates.append((a, new_a)) 62 | updates.append((d_a, new_d_a)) 63 | 64 | elif method == 'adagrad': 65 | print "Using ADAGRAD" 66 | for p, g, a in zip(params, grads, accumulators): 67 | new_a = a + g ** 2 # update accumulator 68 | 69 | new_p = p - lr * g / T.sqrt(new_a + epsilon) 70 | updates.append((p, new_p)) # apply constraints 71 | updates.append((a, new_a)) 72 | 73 | elif method == 'momentum': # Default 74 | print "Using MOMENTUM" 75 | momentum = kwargs['momentum'] 76 | for param, gparam in zip(params, grads): 77 | param_update = theano.shared(param.get_value()*0., broadcastable=param.broadcastable) 78 | gparam_constrained = maxnorm_constraint(gparam) 79 | param_update_update = momentum*param_update + (1. - momentum)*gparam_constrained 80 | updates.append((param, param - param_update * lr)) 81 | updates.append((param_update, param_update_update)) 82 | 83 | else: # Default 84 | print "Using DEFAULT" 85 | for param, gparam in zip(params, grads): 86 | param_update = maxnorm_constraint(gparam) 87 | updates.append((param, param - param_update * lr)) 88 | 89 | # apply constraints on self.weights update 90 | # assumes that updates[0] corresponds to self.weights param 91 | if constraint != None: 92 | updates[0] = (updates[0][0], constraint(updates[0][1])) 93 | 94 | return updates 95 | 96 | 97 | def compute_bow(input_str, word_to_id, num_words): 98 | bow = np.zeros((num_words,)) 99 | for token in input_str.split(): 100 | bow[word_to_id[token]] += 1 101 | return bow 102 | 103 | def compute_seq(input_str, word_to_id, num_words): 104 | seq = [] 105 | for token in input_str.split(): 106 | seq.append(word_to_id[token]) 107 | return seq 108 | 109 | def transform_ques(question, word_to_id, num_words): 110 | question.append(compute_seq(question[2], word_to_id, num_words)) 111 | question[2] = compute_bow(question[2], word_to_id, num_words) 112 | return question 113 | 114 | def parse_dataset(input_file, word_id=0, word_to_id={}, update_word_ids=True): 115 | dataset = [] 116 | questions = [] 117 | with open(input_file) as f: 118 | statements = [] 119 | article_no = 0 120 | line_no = 0 121 | stmt_to_line = {} 122 | for line in f: 123 | line = line.strip() 124 | if len(line) > 0 and line[:2] == '1 ' and len(statements) > 0: # new article 125 | dataset.append(statements) 126 | statements = [] 127 | line_no = 0 128 | stmt_to_line = {} 129 | article_no += 1 130 | if '\t' in line: 131 | question_parts = line.split('\t') 132 | tokens = re.sub(r'([\.\?])$', r' \1', question_parts[0].strip()).split() 133 | if update_word_ids: 134 | for token in tokens[1:]: 135 | if token not in word_to_id: 136 | word_to_id[token] = word_id 137 | word_id += 1 138 | 139 | # To handle the case of "3 6" 140 | lines = None 141 | if ' ' in question_parts[2]: 142 | stmts = question_parts[2].split(' ') 143 | lines = '' 144 | for stmt in stmts: 145 | lines 
+= str(stmt_to_line[stmt]) + ' ' 146 | lines = lines.strip() 147 | else: 148 | lines = str(stmt_to_line[question_parts[2]]) 149 | 150 | questions.append([article_no, line_no, ' '.join(tokens[1:]), word_to_id[question_parts[1]], lines]) 151 | else: 152 | tokens = re.sub(r'([\.\?])$', r' \1', line).split() 153 | stmt_to_line[tokens[0]] = line_no 154 | if update_word_ids: 155 | for token in tokens[1:]: 156 | if token not in word_to_id: 157 | word_to_id[token] = word_id 158 | word_id += 1 159 | statements.append(' '.join(tokens[1:])) 160 | line_no += 1 161 | if len(statements) > 0: 162 | dataset.append(statements) 163 | dataset_bow = map(lambda y: map(lambda x: compute_bow(x, word_to_id, word_id), y), dataset) 164 | dataset_seq = map(lambda y: map(lambda x: compute_seq(x, word_to_id, word_id), y), dataset) 165 | questions_bow = map(lambda x: transform_ques(x, word_to_id, word_id), questions) 166 | return dataset_seq, dataset_bow, questions_bow, word_to_id, word_id 167 | 168 | def pad_statement(stmt, null_word, max_words=20): 169 | if len(stmt) >= max_words: 170 | return stmt[-max_words:] 171 | else: 172 | return stmt + [null_word for i in range(max_words - len(stmt))] 173 | 174 | def pad_memories(stmts, null_word, max_stmts=20, max_words=20): 175 | if len(stmts) >= max_stmts: # truncate by statement count 176 | return stmts[-max_stmts:] 177 | else: 178 | 179 | return stmts + [[null_word for j in range(max_words)] for i in range(max_stmts - len(stmts))] 180 | 181 | def parse_dataset_weak(input_file, word_id=0, word_to_id={}, update_word_ids=True, max_stmts=20, max_words=20): 182 | dataset = [] 183 | questions = [] 184 | null_word = '' 185 | if null_word not in word_to_id: 186 | if update_word_ids == True: 187 | word_to_id[null_word] = word_id 188 | word_id += 1 189 | else: 190 | print "Null word not found!!
AAAAA" 191 | sys.exit(1) 192 | null_word_id = word_to_id[null_word] 193 | 194 | with open(input_file) as f: 195 | statements = [] 196 | article_no = 0 197 | line_no = 0 198 | stmt_to_line = {} 199 | for line in f: 200 | line = line.strip() 201 | if len(line) > 0 and line[:2] == '1 ' and len(statements) > 0: # new article 202 | dataset.append(statements) 203 | statements = [] 204 | line_no = 0 205 | stmt_to_line = {} 206 | article_no += 1 207 | if '\t' in line: 208 | question_parts = line.split('\t') 209 | tokens = re.sub(r'([\.\?])$', r' \1', question_parts[0].strip()).split() 210 | if update_word_ids: 211 | for token in tokens[1:]: 212 | if token not in word_to_id: 213 | word_to_id[token] = word_id 214 | word_id += 1 215 | 216 | padded_stmts = pad_memories(statements[:line_no], null_word, max_stmts, max_words) 217 | padded_ques = pad_statement(tokens[1:], null_word, max_words) 218 | questions.append([article_no, line_no, padded_stmts, padded_ques, question_parts[1]]) 219 | else: 220 | tokens = re.sub(r'([\.\?])$', r' \1', line).split() 221 | stmt_to_line[tokens[0]] = line_no 222 | if update_word_ids: 223 | for token in tokens[1:]: 224 | if token not in word_to_id: 225 | word_to_id[token] = word_id 226 | word_id += 1 227 | statements.append(pad_statement(tokens[1:], null_word, max_words)) 228 | line_no += 1 229 | if len(statements) > 0: 230 | dataset.append(statements) 231 | questions_seq = map(lambda x: transform_ques_weak(x, word_to_id, word_id), questions) 232 | return dataset, questions_seq, word_to_id, word_id, null_word_id 233 | 234 | def transform_ques_weak(question, word_to_id, num_words): 235 | indices = [] 236 | for stmt in question[2]: 237 | index_stmt = map(lambda x: word_to_id[x], stmt) 238 | indices.append(index_stmt) 239 | question[2] = indices 240 | question[3] = map(lambda x: word_to_id[x], question[3]) 241 | question[4] = word_to_id[question[4]] 242 | return question 243 | 244 | if __name__ == "__main__": 245 | train_file = sys.argv[1] 246 | test_file = train_file.replace('train', 'test') 247 | 248 | train_dataset, train_questions, word_to_id, num_words = parse_dataset_weak(train_file) 249 | test_dataset, test_questions, _, _ = parse_dataset_weak(test_file, word_id=num_words, word_to_id=word_to_id, update_word_ids=False) 250 | 251 | # each element of train_questions contains: [article_no, line_no, [lists of indices of statements and question], index of answer word] 252 | print train_questions[0] 253 | -------------------------------------------------------------------------------- /wmemnn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano 3 | import theano.tensor as T 4 | import sys, random, pprint 5 | 6 | from theano_util import * 7 | from keras.activations import tanh, hard_sigmoid 8 | from keras.initializations import glorot_uniform, orthogonal 9 | from keras.utils.theano_utils import shared_zeros, alloc_zeros_matrix 10 | from keras.preprocessing import sequence 11 | 12 | from qa_dataset_parser import parse_qa_dataset 13 | 14 | import cPickle 15 | 16 | # theano.config.exception_verbosity = 'high' 17 | # theano.config.allow_gc = False 18 | #theano.config.profile = True 19 | 20 | def inspect_inputs(i, node, fn): 21 | print i, node, "inputs:", [input[0] for input in fn.inputs], 22 | 23 | def inspect_outputs(i, node, fn): 24 | print i, node, "outputs:", [output[0] for output in fn.outputs] 25 | 26 | class WMemNN: 27 | def __init__(self, n_words=20, n_embedding=100, lr=0.01, 28 | momentum=0.9, 
word_to_id=None, null_word_id=-1, 29 | max_stmts=20, max_words=20, load_from_file=None): 30 | if load_from_file: 31 | self.load_model(load_from_file) 32 | else: 33 | self.regularization = 0.001 34 | self.n_embedding = n_embedding 35 | self.lr = lr 36 | self.momentum = momentum 37 | self.n_words = n_words 38 | self.batch_size = 4 39 | self.max_stmts = max_stmts 40 | self.max_words = max_words 41 | 42 | self.word_to_id = word_to_id 43 | self.id_to_word = dict((v, k) for k, v in word_to_id.iteritems()) 44 | self.null_word_id = null_word_id 45 | 46 | # Question embedding 47 | # self.B = init_shared_normal(self.n_words, self.n_embedding, 0.1) 48 | 49 | # Statement input, output embeddings 50 | self.weights = init_shared_normal_tensor(4, self.n_words, self.n_embedding, 0.1) 51 | 52 | # Linear mapping between layers 53 | self.H = init_shared_normal(self.n_embedding, self.n_embedding, 0.1) 54 | 55 | # Final outut weight matrix 56 | # self.W = init_shared_normal(self.n_embedding, self.n_words, 0.1) 57 | 58 | 59 | zero_vector = T.vector('zv', dtype=theano.config.floatX) 60 | 61 | # Statement 62 | x = T.imatrix('x') 63 | xbatch = T.tensor3('xb', dtype='int32') 64 | 65 | # Positional encoding matrix 66 | pe = T.tensor3('pe') 67 | 68 | # Question 69 | q = T.ivector('q') 70 | qbatch = T.imatrix('qb') 71 | 72 | # True word 73 | r = T.iscalar('r') 74 | rbatch = T.ivector('rb') 75 | 76 | memory_cost = self.memnn_cost(x, q, pe) 77 | # memory_loss = -T.log(memory_cost[r]) # cross entropy on softmax 78 | memory_loss = self.memnn_batch_cost(xbatch, qbatch, rbatch, pe) 79 | 80 | params = [ 81 | self.weights, 82 | # self.B, 83 | # self.W, 84 | self.H 85 | ] 86 | 87 | regularization_cost = reduce( 88 | lambda x,y: x + y, 89 | map(lambda x: self.regularization * T.sum(x ** 2), params) 90 | ) 91 | 92 | cost = memory_loss + regularization_cost 93 | 94 | grads = T.grad(cost, params) 95 | 96 | l_rate = T.scalar('l_rate') 97 | 98 | # Parameter updates 99 | updates = get_param_updates(params, grads, lr=l_rate, method='momentum', momentum=0.9, 100 | constraint=self._constrain_embedding(self.null_word_id, zero_vector)) 101 | 102 | self.train_function = theano.function( 103 | inputs = [ 104 | xbatch, qbatch, rbatch, pe, 105 | theano.Param(l_rate, default=self.lr), 106 | theano.Param(zero_vector, default=np.zeros((self.n_embedding,), theano.config.floatX)) 107 | ], 108 | outputs = cost, 109 | updates = updates, 110 | allow_input_downcast=True, 111 | # mode='FAST_COMPILE', 112 | #mode='DebugMode' 113 | #mode=theano.compile.MonitorMode(pre_func=inspect_inputs,post_func=inspect_outputs) 114 | on_unused_input='warn' 115 | ) 116 | 117 | self.predict_function = theano.function( 118 | inputs = [ 119 | x, q, pe 120 | ], 121 | outputs = memory_cost, 122 | allow_input_downcast=True, 123 | # mode='FAST_COMPILE', 124 | on_unused_input='warn' 125 | ) 126 | 127 | def _constrain_embedding(self, null_id, zero_vector): 128 | def wrapper(p): 129 | for i in range(4): 130 | p = T.set_subtensor(p[i,null_id], zero_vector) 131 | return p 132 | return wrapper 133 | 134 | def _compute_memories(self, statement, previous, weights, pe_matrix): 135 | pe_weights = pe_matrix * weights[statement] 136 | memories = T.sum(pe_weights, axis=0) 137 | return memories 138 | 139 | def _get_PE_matrix(self, num_words, embedding_size): 140 | pe_matrix = np.ones((num_words, 4, embedding_size), theano.config.floatX) 141 | # for j in range(num_words): 142 | # for k in range(embedding_size): 143 | # value = (1 - float(j+1)/num_words) - (float(k+1)/embedding_size) * (1 - 
2*float(j+1)/num_words) 144 | # for i in range(4): 145 | # pe_matrix[j,i,k] = value 146 | return pe_matrix 147 | 148 | def save_model(self, filename): 149 | f = file(filename, 'wb') 150 | for obj in [self.regularization, self.n_embedding, self.lr, 151 | self.momentum, self.n_words, self.batch_size, 152 | self.word_to_id, self.id_to_word, self.null_word_id, 153 | self.max_stmts, self.max_words, self.weights, self.H]: 154 | cPickle.dump(obj, f, protocol=cPickle.HIGHEST_PROTOCOL) 155 | f.close() 156 | 157 | def load_model(self, filename): 158 | f = file(filename, 'rb') 159 | self.regularization = cPickle.load(f) 160 | self.n_embedding = cPickle.load(f) 161 | self.lr = cPickle.load(f) 162 | self.momentum = cPickle.load(f) 163 | self.n_words = cPickle.load(f) 164 | self.batch_size = cPickle.load(f) 165 | self.word_to_id = cPickle.load(f) 166 | self.id_to_word = cPickle.load(f) 167 | self.null_word_id = cPickle.load(f) 168 | self.max_stmts = cPickle.load(f) 169 | self.max_words = cPickle.load(f) 170 | self.weights = cPickle.load(f) 171 | self.H = cPickle.load(f) 172 | f.close() 173 | 174 | 175 | def memnn_batch_cost(self, statements_batch, question_batch, r_batch, pe_matrix): 176 | l = statements_batch.shape[0] 177 | s, _ = theano.scan(fn=lambda i, c, xb, qb, rb, pe: c - T.log(self.memnn_cost(xb[i], qb[i], pe)[rb[i]]), 178 | outputs_info=T.as_tensor_variable(np.asarray(0, theano.config.floatX)), 179 | non_sequences=[statements_batch, question_batch, r_batch, pe_matrix], 180 | sequences=[theano.tensor.arange(l, dtype='int64')]) 181 | return s[-1] 182 | 183 | def memnn_cost(self, statements, question, pe_matrix): 184 | # statements: list of list of word indices 185 | # question: list of word indices 186 | 187 | computed_memories, updates = theano.scan( 188 | self._compute_memories, 189 | sequences = [statements], 190 | outputs_info = [ 191 | alloc_zeros_matrix(self.weights.shape[0], self.n_embedding) 192 | ], 193 | non_sequences = [ 194 | self.weights.dimshuffle(1, 0, 2), 195 | pe_matrix 196 | ], 197 | truncate_gradient = -1, 198 | ) 199 | 200 | memories = T.stacklists(computed_memories).dimshuffle(1, 0, 2) 201 | 202 | # Embed question 203 | u1 = T.sum(self.weights[0][question], axis=0) 204 | 205 | # Layer 1 206 | p = T.nnet.softmax(T.dot(u1, memories[0].T)) 207 | o1 = T.dot(p, memories[1]) 208 | 209 | # Layer 2 210 | u2 = o1 + T.dot(u1, self.H) 211 | p = T.nnet.softmax(T.dot(u2, memories[1].T)) 212 | o2 = T.dot(p, memories[2]) 213 | 214 | # Layer 3 215 | u3 = o2 + T.dot(u2, self.H) 216 | p = T.nnet.softmax(T.dot(u3, memories[2].T)) 217 | o3 = T.dot(p, memories[3]) 218 | 219 | # Final 220 | output = T.nnet.softmax(T.dot(o3 + u3, self.weights[3].T)) 221 | 222 | return output[0] 223 | 224 | def train(self, dataset, questions, n_epochs=100, lr_schedule=None, start_epoch=0, max_words=20): 225 | l_rate = self.lr 226 | index_array = np.arange(len(questions)) 227 | 228 | # (max_words, ) 229 | pe_matrix = self._get_PE_matrix(max_words, self.n_embedding) 230 | 231 | for epoch in xrange(start_epoch, start_epoch + n_epochs): 232 | costs = [] 233 | 234 | if lr_schedule != None and epoch in lr_schedule: 235 | l_rate = lr_schedule[epoch] 236 | 237 | np.random.shuffle(index_array) 238 | seen = 0 239 | 240 | batches = make_batches(len(questions), self.batch_size) 241 | for batch_index, (batch_start, batch_end) in enumerate(batches): 242 | batch_ids = index_array[batch_start:batch_end] 243 | seen += len(batch_ids) 244 | questions_batch = [] 245 | for index in batch_ids: 246 | 
questions_batch.append(questions[index]) 247 | 248 | # (batch_size * max_stmts * max_words) 249 | statements_seq_batch = np.asarray(map(lambda x: x[2], questions_batch), theano.config.floatX) 250 | # (batch_size * max_words) 251 | question_seq_batch = np.asarray(map(lambda x: x[3], questions_batch), theano.config.floatX) 252 | # (batch_size) 253 | correct_word_batch = np.asarray(map(lambda x: x[4], questions_batch), theano.config.floatX) 254 | 255 | cost = self.train_function( 256 | statements_seq_batch, 257 | question_seq_batch, 258 | correct_word_batch, 259 | pe_matrix, 260 | l_rate 261 | ) 262 | 263 | # print "Epoch %d, sample %d: %f" % (epoch, i, cost) 264 | costs.append(cost) 265 | 266 | print "Epoch %d: %f" % (epoch, np.mean(costs)) 267 | 268 | def predict(self, dataset, questions, max_words=20, print_errors=False): 269 | correct_answers = 0 270 | wrong_answers = 0 271 | pe_matrix = self._get_PE_matrix(max_words, self.n_embedding) 272 | 273 | for i, question in enumerate(questions): 274 | statements_seq = np.asarray(question[2], theano.config.floatX) 275 | question_seq = np.asarray(question[3], theano.config.floatX) 276 | correct = question[4] 277 | 278 | probs = self.predict_function( 279 | statements_seq, question_seq, pe_matrix 280 | ) 281 | predicted = np.argmax(probs) 282 | 283 | if len(question) == 6: 284 | ## For mc_test 285 | options = question[5] 286 | options_probs = probs[options] 287 | best_idx = np.argmax(options_probs) 288 | predicted = options[best_idx] 289 | ## 290 | 291 | if predicted == correct: 292 | correct_answers += 1 293 | else: 294 | if print_errors and np.random.rand() < 0.02: 295 | print 'Correct: %s (%d %.3f), Guess: %s (%d %.3f)' % (self.id_to_word[correct], correct, probs[correct], self.id_to_word[predicted], predicted, probs[predicted]) 296 | wrong_answers += 1 297 | 298 | if len(questions) > 1000: 299 | print '(%d/%d) %d correct, %d wrong' % (i+1, len(questions), correct_answers, wrong_answers) 300 | 301 | print '%d correct, %d wrong' % (correct_answers, wrong_answers) 302 | 303 | if __name__ == "__main__": 304 | train_file = sys.argv[1] 305 | test_file = train_file.replace('train', 'test') 306 | 307 | if len(sys.argv) > 2: 308 | n_epochs = int(sys.argv[2]) 309 | else: 310 | n_epochs = 10 311 | 312 | if len(sys.argv) > 3: 313 | n_embedding = int(sys.argv[3]) 314 | else: 315 | n_embedding = 20 316 | 317 | mode = 'babi' # babi or wiki 318 | 319 | if '.pickle' in train_file: 320 | mode = 'wiki' 321 | 322 | max_stmts = 20 323 | max_words = 20 324 | 325 | if mode == 'babi': 326 | train_dataset, train_questions, word_to_id, num_words, null_word_id = parse_dataset_weak(train_file, max_stmts=max_stmts, max_words=max_words) 327 | test_dataset, test_questions, _, _, _ = parse_dataset_weak(test_file, word_id=num_words, word_to_id=word_to_id, update_word_ids=False, max_stmts=max_stmts, max_words=max_words) 328 | elif mode == 'wiki': 329 | # Check for pickled dataset 330 | print("Loading pickled train dataset") 331 | f = file(train_file, 'rb') 332 | import cPickle 333 | obj = cPickle.load(f) 334 | train_dataset, train_questions, word_to_id, num_words, null_word_id = obj 335 | 336 | print("Loading pickled test dataset") 337 | f = file(test_file, 'rb') 338 | obj = cPickle.load(f) 339 | test_dataset, test_questions, _, _, _ = obj 340 | elif mode == 'debug': 341 | train_dataset = [] 342 | train_questions = [[0, 2, [[0, 1, 2, 3, 4, 5], [6, 7, 2, 3, 8, 5], [9, 10, 0, 11]], 4]] 343 | num_words = 12 344 | word_to_id = {} 345 | 346 | print "Dataset has %d words" % num_words 
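
For readers skimming wmemnn.py, the scan/dimshuffle plumbing can obscure what memnn_cost actually computes. Below is a plain-NumPy sketch (toy shapes, not the repository's API) of the same three-hop attention; `weights` stands in for self.weights (four embedding banks), `H` for the inter-hop linear map, and the positional-encoding matrix is left out because _get_PE_matrix currently returns all ones.

import numpy as np

def softmax(z):
    e = np.exp(z - z.max())
    return e / e.sum()

rng = np.random.RandomState(0)
n_words, n_embed, n_stmts, n_tokens = 30, 10, 5, 6
weights = rng.normal(scale=0.1, size=(4, n_words, n_embed))   # four embedding banks
H = rng.normal(scale=0.1, size=(n_embed, n_embed))            # inter-hop linear map

stmts = rng.randint(0, n_words, size=(n_stmts, n_tokens))     # word ids per statement
question = rng.randint(0, n_words, size=(n_tokens,))

# One memory bank per embedding matrix: sum of word embeddings per statement
memories = np.stack([weights[k][stmts].sum(axis=1) for k in range(4)])  # (4, n_stmts, n_embed)

u = weights[0][question].sum(axis=0)              # embed the question with bank 0
for hop in range(3):
    p = softmax(memories[hop].dot(u))             # attention over statements
    o = p.dot(memories[hop + 1])                  # weighted read from the next bank
    u = o + (u.dot(H) if hop < 2 else u)          # u_{k+1} = o_k + u_k H; last hop adds o_3 + u_3
answer_scores = softmax(weights[3].dot(u))        # rank every vocabulary word
print(answer_scores.argmax())

Each hop attends over the statement memories of one bank, reads from the next bank, and mixes the read vector back into the query state; the final state is scored against the last bank to rank answer words, matching layers 1-3 and the output step in memnn_cost.
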
347 | # print train_questions[0] 348 | 349 | model_file = "mctest500_dim100_wmemnn.pickle" 350 | train_my_model = False 351 | save_my_model = True 352 | 353 | if train_my_model: 354 | wmemNN = WMemNN(n_words=num_words, n_embedding=100, lr=0.01, word_to_id=word_to_id, null_word_id=null_word_id, 355 | max_stmts=max_stmts, max_words=max_words) 356 | 357 | lr_schedule = dict([(0, 0.01), (25, 0.01/2), (50, 0.01/4), (75, 0.01/8)]) 358 | 359 | for i in xrange(n_epochs/5): 360 | wmemNN.train(train_dataset, train_questions, 5, lr_schedule, 5*i, max_words) 361 | wmemNN.predict(train_dataset, train_questions, max_words) 362 | wmemNN.predict(test_dataset, test_questions, max_words) 363 | 364 | if save_my_model: 365 | print "Saving model to", model_file 366 | wmemNN.save_model(model_file) 367 | else: 368 | wmemNN = WMemNN(load_from_file=model_file) 369 | wmemNN.predict(train_dataset, train_questions, max_words) 370 | wmemNN.predict(test_dataset, test_questions, max_words) 371 | 372 | 373 | -------------------------------------------------------------------------------- /wmemnnmc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano 3 | import theano.tensor as T 4 | import sys, random, pprint 5 | 6 | from theano_util import * 7 | from keras.activations import tanh, hard_sigmoid 8 | from keras.initializations import glorot_uniform, orthogonal 9 | from keras.utils.theano_utils import shared_zeros, alloc_zeros_matrix 10 | from keras.preprocessing import sequence 11 | 12 | import cPickle 13 | 14 | # theano.config.exception_verbosity = 'high' 15 | # theano.config.allow_gc = False 16 | #theano.config.profile = True 17 | 18 | class WMemNN: 19 | def __init__(self, n_words=20, n_embedding=100, lr=0.01, 20 | momentum=0.9, word_to_id=None, null_word_id=-1, 21 | load_from_file=None): 22 | if load_from_file: 23 | self.load_model(load_from_file) 24 | else: 25 | self.regularization = 0.01 26 | self.n_embedding = n_embedding 27 | self.lr = lr 28 | self.momentum = momentum 29 | self.n_words = n_words 30 | self.batch_size = 4 31 | 32 | self.word_to_id = word_to_id 33 | self.id_to_word = dict((v, k) for k, v in word_to_id.iteritems()) 34 | self.null_word_id = null_word_id 35 | 36 | # Question embedding 37 | # self.B = init_shared_normal(self.n_words, self.n_embedding, 0.1) 38 | 39 | # Statement input, output embeddings 40 | self.weights = init_shared_normal_tensor(4, self.n_words, self.n_embedding, 0.1) 41 | 42 | # Linear mapping between layers 43 | self.H = init_shared_normal(self.n_embedding, self.n_embedding, 0.1) 44 | 45 | # Final outut weight matrix 46 | # self.W = init_shared_normal(self.n_embedding, self.n_words, 0.1) 47 | 48 | # Answer embedding matrix 49 | self.A = init_shared_normal(self.n_words, self.n_embedding, 0.1) 50 | 51 | # Final scoring matrix 52 | self.U = init_shared_normal(self.n_embedding, self.n_embedding, 0.1) 53 | 54 | zero_vector = T.vector('zv', dtype=theano.config.floatX) 55 | 56 | # Statement 57 | x = T.imatrix('x') 58 | xbatch = T.tensor3('xb', dtype='int32') 59 | 60 | # Positional encoding matrix 61 | pe = T.tensor3('pe') 62 | 63 | # Question 64 | q = T.ivector('q') 65 | qbatch = T.imatrix('qb') 66 | 67 | # True word 68 | r = T.iscalar('r') 69 | rbatch = T.ivector('rb') 70 | 71 | # Stacked answer vectors 72 | a = T.imatrix('a') 73 | abatch = T.tensor3('ab', dtype='int32') 74 | 75 | memory_cost = self.memnn_cost(x, q, a, pe) 76 | # memory_loss = -T.log(memory_cost[r]) # cross entropy on softmax 77 | memory_loss = 
self.memnn_batch_cost(xbatch, qbatch, rbatch, abatch, pe) 78 | 79 | params = [ 80 | self.weights, 81 | # self.B, 82 | # self.W, 83 | self.H, 84 | self.A, 85 | self.U, 86 | ] 87 | 88 | regularization_cost = reduce( 89 | lambda x,y: x + y, 90 | map(lambda x: self.regularization * T.sum(x ** 2), params) 91 | ) 92 | 93 | cost = memory_loss + regularization_cost 94 | 95 | grads = T.grad(cost, params) 96 | 97 | l_rate = T.scalar('l_rate') 98 | 99 | # Parameter updates 100 | updates = get_param_updates(params, grads, lr=l_rate, method='adagrad', momentum=0.9, 101 | constraint=self._constrain_embedding(self.null_word_id, zero_vector)) 102 | 103 | self.train_function = theano.function( 104 | inputs = [ 105 | xbatch, qbatch, rbatch, abatch, pe, 106 | theano.Param(l_rate, default=self.lr), 107 | theano.Param(zero_vector, default=np.zeros((self.n_embedding,), theano.config.floatX)) 108 | ], 109 | outputs = cost, 110 | updates = updates, 111 | allow_input_downcast=True, 112 | # mode='FAST_COMPILE', 113 | #mode='DebugMode' 114 | #mode=theano.compile.MonitorMode(pre_func=inspect_inputs,post_func=inspect_outputs) 115 | on_unused_input='warn' 116 | ) 117 | 118 | self.predict_function = theano.function( 119 | inputs = [ 120 | x, q, a, pe 121 | ], 122 | outputs = memory_cost, 123 | allow_input_downcast=True, 124 | # mode='FAST_COMPILE', 125 | on_unused_input='warn' 126 | ) 127 | 128 | def _constrain_embedding(self, null_id, zero_vector): 129 | def wrapper(p): 130 | for i in range(4): 131 | p = T.set_subtensor(p[i,null_id], zero_vector) 132 | return p 133 | return wrapper 134 | 135 | def _compute_memories(self, statement, previous, weights, pe_matrix): 136 | pe_weights = pe_matrix * weights[statement] 137 | memories = T.sum(pe_weights, axis=0) 138 | return memories 139 | 140 | def _get_PE_matrix(self, num_words, embedding_size): 141 | pe_matrix = np.ones((num_words, 4, embedding_size), theano.config.floatX) 142 | # for j in range(num_words): 143 | # for k in range(embedding_size): 144 | # value = (1 - float(j+1)/num_words) - (float(k+1)/embedding_size) * (1 - 2*float(j+1)/num_words) 145 | # for i in range(4): 146 | # pe_matrix[j,i,k] = value 147 | return pe_matrix 148 | 149 | def save_model(self, filename): 150 | f = file(filename, 'wb') 151 | for obj in [self.regularization, self.n_embedding, self.lr, 152 | self.momentum, self.n_words, self.batch_size, 153 | self.word_to_id, self.id_to_word, self.null_word_id, 154 | self.weights, self.H, self.A, self.U]: 155 | cPickle.dump(obj, f, protocol=cPickle.HIGHEST_PROTOCOL) 156 | f.close() 157 | 158 | def load_model(self, filename): 159 | f = file(filename, 'rb') 160 | self.regularization = cPickle.load(f) 161 | self.n_embedding = cPickle.load(f) 162 | self.lr = cPickle.load(f) 163 | self.momentum = cPickle.load(f) 164 | self.n_words = cPickle.load(f) 165 | self.batch_size = cPickle.load(f) 166 | self.word_to_id = cPickle.load(f) 167 | self.id_to_word = cPickle.load(f) 168 | self.null_word_id = cPickle.load(f) 169 | self.weights = cPickle.load(f) 170 | self.H = cPickle.load(f) 171 | self.A = cPickle.load(f) 172 | self.U = cPickle.load(f) 173 | f.close() 174 | 175 | 176 | def memnn_batch_cost(self, statements_batch, question_batch, r_batch, ans_batch, pe_matrix): 177 | l = statements_batch.shape[0] 178 | s, _ = theano.scan(fn=lambda i, c, xb, qb, rb, ab, pe: c - T.log(self.memnn_cost(xb[i], qb[i], ab[i], pe)[rb[i]]), 179 | outputs_info=T.as_tensor_variable(np.asarray(0, theano.config.floatX)), 180 | non_sequences=[statements_batch, question_batch, r_batch, ans_batch, 
181 |                            sequences=[theano.tensor.arange(l, dtype='int64')])
182 |         return s[-1]
183 | 
184 |     def memnn_cost(self, statements, question, ans, pe_matrix):
185 |         # statements: list of lists of word indices
186 |         # question: list of word indices
187 | 
188 |         computed_memories, updates = theano.scan(
189 |             self._compute_memories,
190 |             sequences = [statements],
191 |             outputs_info = [
192 |                 alloc_zeros_matrix(self.weights.shape[0], self.n_embedding)
193 |             ],
194 |             non_sequences = [
195 |                 self.weights.dimshuffle(1, 0, 2),
196 |                 pe_matrix
197 |             ],
198 |             truncate_gradient = -1,
199 |         )
200 | 
201 |         memories = T.stacklists(computed_memories).dimshuffle(1, 0, 2)
202 | 
203 |         # Embed question
204 |         u1 = T.sum(self.weights[0][question], axis=0)
205 | 
206 |         # Layer 1
207 |         p = T.nnet.softmax(T.dot(u1, memories[0].T))
208 |         o1 = T.dot(p, memories[1])
209 | 
210 |         # Layer 2
211 |         u2 = o1 + T.dot(u1, self.H)
212 |         p = T.nnet.softmax(T.dot(u2, memories[1].T))
213 |         o2 = T.dot(p, memories[2])
214 | 
215 |         # Layer 3
216 |         u3 = o2 + T.dot(u2, self.H)
217 |         p = T.nnet.softmax(T.dot(u3, memories[2].T))
218 |         o3 = T.dot(p, memories[3])
219 | 
220 |         # Score answers
221 |         u4 = o3 + T.dot(u3, self.H)
222 | 
223 |         # Embed the four candidate answers
224 |         a1 = T.sum(self.A[ans[0]], axis=0)
225 |         a2 = T.sum(self.A[ans[1]], axis=0)
226 |         a3 = T.sum(self.A[ans[2]], axis=0)
227 |         a4 = T.sum(self.A[ans[3]], axis=0)
228 |         a = T.stack(a1, a2, a3, a4)
229 |         scores = T.dot(T.dot(u4, self.U.T), T.dot(self.U, a.T))
230 |         #scores = T.dot(T.dot(u4, self.U.T), T.dot(self.U, a.T))
231 |         output = T.nnet.softmax(scores)
232 | 
233 |         return output[0]
234 | 
235 |     def train(self, dataset, questions, n_epochs=100, lr_schedule=None, start_epoch=0, max_words=20):
236 |         l_rate = self.lr
237 |         index_array = np.arange(len(questions))
238 | 
239 |         # (max_words, 4, n_embedding)
240 |         pe_matrix = self._get_PE_matrix(max_words, self.n_embedding)
241 | 
242 |         for epoch in xrange(start_epoch, start_epoch + n_epochs):
243 |             costs = []
244 | 
245 |             if lr_schedule is not None and epoch in lr_schedule:
246 |                 l_rate = lr_schedule[epoch]
247 | 
248 |             np.random.shuffle(index_array)
249 |             seen = 0
250 | 
251 |             batches = make_batches(len(questions), self.batch_size)
252 |             for batch_index, (batch_start, batch_end) in enumerate(batches):
253 |                 batch_ids = index_array[batch_start:batch_end]
254 |                 seen += len(batch_ids)
255 |                 questions_batch = []
256 |                 for index in batch_ids:
257 |                     questions_batch.append(questions[index])
258 | 
259 |                 # pprint.pprint(questions_batch)
260 | 
261 |                 # (batch_size * max_stmts * max_words)
262 |                 statements_seq_batch = np.asarray(map(lambda x: x[2], questions_batch), theano.config.floatX)
263 |                 # (batch_size * max_words)
264 |                 question_seq_batch = np.asarray(map(lambda x: x[3], questions_batch), theano.config.floatX)
265 |                 # (batch_size)
266 |                 correct_word_batch = np.asarray(map(lambda x: x[4], questions_batch), theano.config.floatX)
267 |                 # (batch_size * 4 * max_words)
268 |                 ans_batch = np.asarray(map(lambda x: x[5], questions_batch), theano.config.floatX)
269 | 
270 |                 cost = self.train_function(
271 |                     statements_seq_batch,
272 |                     question_seq_batch,
273 |                     correct_word_batch,
274 |                     ans_batch,
275 |                     pe_matrix,
276 |                     l_rate
277 |                 )
278 | 
279 |                 # print "Epoch %d, sample %d: %f" % (epoch, i, cost)
280 |                 costs.append(cost)
281 | 
282 |             print "Epoch %d: %f" % (epoch, np.mean(costs))
283 | 
284 |     def predict(self, dataset, questions, max_words=20, print_errors=False):
285 |         correct_answers = 0
286 |         wrong_answers = 0
287 |         pe_matrix = self._get_PE_matrix(max_words, self.n_embedding)
288 | 
289 |         for i, question in enumerate(questions):
290 |             statements_seq = np.asarray(question[2], theano.config.floatX)
291 |             question_seq = np.asarray(question[3], theano.config.floatX)
292 |             answers = np.asarray(question[5], theano.config.floatX)
293 |             correct = question[4]
294 | 
295 |             probs = self.predict_function(
296 |                 statements_seq, question_seq, answers, pe_matrix
297 |             )
298 |             predicted = np.argmax(probs)
299 | 
300 |             if predicted == correct:
301 |                 correct_answers += 1
302 |             else:
303 |                 if print_errors and np.random.rand() < 0.1:
304 |                     correct_words = map(lambda x: self.id_to_word[x], question[5][correct])
305 |                     predicted_words = map(lambda x: self.id_to_word[x], question[5][predicted])
306 |                     print 'Correct: %s (%d %.3f), Guess: %s (%d %.3f)' % (correct_words, correct, probs[correct], predicted_words, predicted, probs[predicted])
307 |                 wrong_answers += 1
308 | 
309 |             # if len(questions) > 1000:
310 |             #     print '(%d/%d) %d correct, %d wrong' % (i+1, len(questions), correct_answers, wrong_answers)
311 | 
312 |         accuracy = 100.0 * float(correct_answers) / (correct_answers + wrong_answers)
313 |         print '%d correct, %d wrong, %.2f%% acc' % (correct_answers, wrong_answers, accuracy)
314 | 
315 | if __name__ == "__main__":
316 |     train_file = sys.argv[1]
317 |     test_file = train_file.replace('train', 'test')
318 | 
319 |     if len(sys.argv) > 2:
320 |         n_epochs = int(sys.argv[2])
321 |     else:
322 |         n_epochs = 10
323 | 
324 |     if len(sys.argv) > 3:
325 |         n_embedding = int(sys.argv[3])
326 |     else:
327 |         n_embedding = 20
328 | 
329 |     print("Loading pickled train dataset")
330 |     f = file(train_file, 'rb')
331 |     obj = cPickle.load(f)
332 |     train_dataset, train_questions, word_to_id, num_words, null_word_id, train_max_stmts, train_max_words = obj
333 | 
334 |     print("Loading pickled test dataset")
335 |     f = file(test_file, 'rb')
336 |     obj = cPickle.load(f)
337 |     test_dataset, test_questions, _, _, _, test_max_stmts, test_max_words = obj
338 | 
339 |     print "Dataset has %d words" % num_words
340 | 
341 |     model_file = train_file.replace("train", "model")
342 |     train_my_model = True
343 |     save_my_model = True
344 | 
345 |     if train_my_model:
346 |         wmemNN = WMemNN(n_words=num_words, n_embedding=n_embedding, lr=0.01, word_to_id=word_to_id, null_word_id=null_word_id)
347 | 
348 |         lr_schedule = dict([(0, 0.01), (25, 0.01/2), (50, 0.01/4), (75, 0.01/8)])
349 | 
350 |         for i in xrange(n_epochs/5):
351 |             wmemNN.train(train_dataset, train_questions, 5, lr_schedule, 5*i, train_max_words)
352 |             wmemNN.predict(train_dataset, train_questions, train_max_words)
353 |             wmemNN.predict(test_dataset, test_questions, test_max_words)
354 | 
355 |         if save_my_model:
356 |             print "Saving model to", model_file
357 |             wmemNN.save_model(model_file)
358 |     else:
359 |         wmemNN = WMemNN(load_from_file=model_file)
360 |         wmemNN.predict(train_dataset, train_questions, train_max_words)
361 |         wmemNN.predict(test_dataset, test_questions, test_max_words)
362 | 
363 | 
--------------------------------------------------------------------------------
/wordvec_pruning.py:
--------------------------------------------------------------------------------
 1 | from gensim.models import Word2Vec
 2 | import numpy
 3 | 
 4 | def prune_statements(dataset, questions):
 5 |     total_old = 0
 6 |     total_new = 0
 7 | 
 8 |     wvs = Word2Vec(dataset, min_count=0)
 9 | 
10 |     for i in range(len(questions)):
11 |         question = questions[i]
12 |         new_statements = []
13 |         old_statements = question[2][:-1]
14 | 
15 |         # Use word vectors and keep only the top 5
16 | 
17 |         sims = []
18 |         q = question[2][-1]
19 |         for s in old_statements:
20 |             sims.append(wvs.n_similarity(q,s))
21 | 
22 |         sims2 = map(lambda x: x if type(x) is numpy.float64 else 0.0, sims)
23 |         top = sorted(range(len(sims2)), key=sims2.__getitem__, reverse=True)
24 |         new_statements = map(lambda x: old_statements[x], top[:5])
25 | 
26 |         questions[i][2] = new_statements
27 |         total_old += len(old_statements)
28 |         total_new += len(new_statements)
29 |         # print("Question: ", questions[i][2][-1], " before %d after %d" % (len(old_statements), len(new_statements)))
30 | 
31 |     print("Before %d After %d" % (total_old, total_new))
32 |     return questions
33 | 
--------------------------------------------------------------------------------
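A minimal usage sketch for prune_statements, assuming each question's element [2] holds its supporting statements with the tokenized question appended last (as the indexing above implies); the toy corpus and placeholder question fields below are illustrative only and not part of the repository.

from wordvec_pruning import prune_statements

# Toy tokenized corpus; in the project this comes from the dataset parsers.
dataset = [
    ['mary', 'moved', 'to', 'the', 'kitchen'],
    ['john', 'took', 'the', 'apple'],
    ['mary', 'went', 'back', 'to', 'the', 'garden'],
    ['where', 'is', 'the', 'apple'],
]

# question[2] = supporting statements, with the question itself as the last element.
# The first two fields are placeholders for whatever metadata the parser stores.
questions = [
    [0, 0, [dataset[0], dataset[1], dataset[2], dataset[3]]],
]

pruned = prune_statements(dataset, questions)
# After pruning, question[2] holds at most the 5 statements most similar to the question.
print(pruned[0][2])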