├── .gitignore ├── Poster.pdf ├── README ├── README.md ├── keras_lstm.py ├── keras_util.py ├── mctest_baseline.py ├── mctest_dataset_parser.py ├── mctest_dataset_parser_v2.py ├── mctest_lstm.py ├── memnn_numpy.py ├── memnn_theano.py ├── memnn_theano_v2.py ├── memnn_theano_v3.py ├── nltk_utils.py ├── pararth_final_report.pdf ├── pararth_milestone.pdf ├── pos_pruning.py ├── qa_dataset_parser.py ├── theano_util.py ├── wmemnn.py ├── wmemnnmc.py └── wordvec_pruning.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | -------------------------------------------------------------------------------- /Poster.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pararthshah/qa-memnn/29f087c1d8f9720c02caa715ff6ff0793fde98b8/Poster.pdf -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pararthshah/qa-memnn/29f087c1d8f9720c02caa715ff6ff0793fde98b8/README -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Question Answering Using Memory Networks 2 | CS224D Project 3 | -------------------------------------------------------------------------------- /keras_lstm.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import print_function 3 | import numpy as np 4 | import sys, re 5 | 6 | from keras.preprocessing import sequence 7 | from keras.initializations import uniform 8 | from keras.optimizers import SGD, RMSprop, Adagrad 9 | from keras.utils import np_utils 10 | from keras.models import Sequential 11 | from keras.layers.core import Dense, Dropout, Activation 12 | from keras.layers.embeddings import Embedding 13 | from keras.layers.recurrent import LSTM, GRU 14 | 15 | # mode can be 'baseline' or 'memnn' 16 | def load_dataset(input_file, word_id=0, word_to_id={}, update_word_ids=True, mode='memnn'): 17 | #dataset = [] 18 | dataset_ids = [] 19 | #labels = [] 20 | label_ids = [] 21 | with open(input_file) as f: 22 | article = {} 23 | article_no = 0 24 | for line in f: 25 | line = line.strip() 26 | if len(line) > 0 and line[:2] == '1 ' and len(dataset_ids) > 0: # new article 27 | article = {} 28 | article_no += 1 29 | if '\t' in line: # question 30 | question_parts = line.split('\t') 31 | tokens = re.sub(r'([\.\?])$', r' \1', question_parts[0].strip()).split() 32 | if update_word_ids: 33 | for token in tokens[1:]: 34 | if token not in word_to_id: 35 | word_to_id[token] = word_id 36 | word_id += 1 37 | if question_parts[1] not in word_to_id: 38 | word_to_id[question_parts[1]] = word_id 39 | word_id += 1 40 | 41 | stmt_ids = map(int, question_parts[2].strip().split()) 42 | sequence = [] 43 | if mode == 'baseline': 44 | for s in range(int(tokens[0])): 45 | if s in article: 46 | sequence += article[s] 47 | else: 48 | for s in stmt_ids: 49 | sequence += article[s] 50 | 51 | for token in tokens[1:]: 52 | sequence.append(token) 53 | 54 | if article_no == 0: 55 | print("seq: %s | label: %s" % (' '.join(sequence).ljust(70), question_parts[1])) 56 | 57 | dataset_ids.append(map(lambda t: word_to_id[t], sequence)) 58 | label_ids.append(word_to_id[question_parts[1]]) 59 | 60 | else: # statement 61 | tokens = re.sub(r'([\.\?])$', 
r' \1', line).split() 62 | if update_word_ids: 63 | for token in tokens[1:]: 64 | if token not in word_to_id: 65 | word_to_id[token] = word_id 66 | word_id += 1 67 | 68 | line_no = int(tokens[0]) 69 | article[line_no] = [] 70 | for token in tokens[1:]: 71 | article[line_no].append(token) 72 | 73 | return dataset_ids, label_ids, word_to_id, word_id 74 | 75 | if __name__ == "__main__": 76 | train_file = sys.argv[1] 77 | test_file = train_file.replace('train', 'test') 78 | 79 | mode = 'memnn' 80 | if len(sys.argv) > 2: 81 | mode = sys.argv[2] # should be 'baseline' or 'memnn' 82 | 83 | nb_epoch = 10 84 | if len(sys.argv) > 3: 85 | nb_epoch = int(sys.argv[3]) 86 | 87 | print("Loading train data...") 88 | X_train, y_train, word_to_id, num_words = load_dataset(train_file, mode=mode) 89 | print("Loading test data...") 90 | X_test, y_test, _, _ = load_dataset(test_file, word_id=num_words, word_to_id=word_to_id, update_word_ids=False, mode=mode) 91 | 92 | id_to_word = dict([(v, k) for k, v in word_to_id.iteritems()]) 93 | 94 | y_train_cat = np_utils.to_categorical(y_train, nb_classes=num_words) 95 | y_test_cat = np_utils.to_categorical(y_test, nb_classes=num_words) 96 | 97 | print(len(X_train), 'train sequences') 98 | print(len(X_test), 'test sequences') 99 | 100 | print("Pad sequences (samples x time)") 101 | X_train = sequence.pad_sequences(X_train) 102 | X_test = sequence.pad_sequences(X_test) 103 | print('X_train shape:', X_train.shape) 104 | print('X_test shape:', X_test.shape) 105 | 106 | print('Build model...') 107 | batch_size = 1 108 | in_embedding_size = 100 109 | out_embedding_size = 100 110 | 111 | model = Sequential() 112 | model.add(Embedding(num_words, in_embedding_size)) 113 | model.add(LSTM(in_embedding_size, out_embedding_size)) 114 | model.add(Dropout(0.5)) 115 | model.add(Dense(out_embedding_size, num_words)) 116 | model.add(Activation('softmax')) 117 | 118 | sgd_optimizer = SGD(lr=0.006, momentum=0.9, decay=0.99, nesterov=True) 119 | adg_optimizer = Adagrad() 120 | rms_optimizer = RMSprop() 121 | model.compile(loss='categorical_crossentropy', optimizer=rms_optimizer, class_mode="categorical", theano_mode='FAST_COMPILE') 122 | 123 | print("Train...") 124 | model.fit(X_train, y_train_cat, batch_size=batch_size, nb_epoch=nb_epoch, validation_split=0.1, show_accuracy=True) 125 | score = model.evaluate(X_test, y_test_cat, batch_size=batch_size) 126 | print('Test score:', score) 127 | 128 | classes_proba = model.predict_proba(X_test, batch_size=batch_size) 129 | for i in range(5): 130 | probs = sorted(zip(range(len(classes_proba)), classes_proba[i].tolist()), key=lambda x: x[1], reverse=True) 131 | print('Test sample %d (Correct label: %s)' % (i, id_to_word[y_test[i]])) 132 | for j, p in probs[:5]: 133 | print(id_to_word[j].ljust(20) + ': ' + str(p)) 134 | 135 | classes = np_utils.probas_to_classes(classes_proba) 136 | acc = np_utils.accuracy(classes, y_test) 137 | print('Test accuracy:', acc) 138 | 139 | # print(classes.shape) 140 | # print(classes[0]) 141 | # print(y_test[0]) 142 | 143 | # classes_list = classes.tolist() 144 | # print(map(lambda x: id_to_word[x], classes_list[:25])) 145 | -------------------------------------------------------------------------------- /keras_util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import re 3 | 4 | def parse_dataset(input_file, word_id=0, word_to_id={}, update_word_ids=True): 5 | dataset = [] 6 | labels = [] 7 | with open(input_file) as f: 8 | words = [] 9 | for line in f: 
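# bAbI-format input: every line begins with a sentence number; a line that
# contains tab characters is a question ("question<TAB>answer<TAB>supporting
# statement ids"), and any other line is a plain statement whose tokens are
# appended to the running context of the current article.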
10 | line = line.strip() 11 | if len(line) > 0 and line[:2] == '1 ' and len(statements) > 0: # new article 12 | words = [] 13 | if '\t' in line: 14 | question_parts = line.split('\t') 15 | tokens = re.sub(r'([\.\?])$', r' \1', question_parts[0].strip()).split() 16 | if update_word_ids: 17 | for token in tokens[1:]: 18 | if token not in word_to_id: 19 | word_to_id[token] = word_id 20 | word_id += 1 21 | 22 | dataset.append(words) 23 | labels.append(word_to_id[question_parts[1]]) 24 | else: 25 | tokens = re.sub(r'([\.\?])$', r' \1', line).split() 26 | if update_word_ids: 27 | for token in tokens[1:]: 28 | if token not in word_to_id: 29 | word_to_id[token] = word_id 30 | word_id += 1 31 | 32 | for token in tokens[1:]: 33 | words.append(word_to_id[token]) 34 | 35 | return dataset, labels, word_to_id 36 | -------------------------------------------------------------------------------- /mctest_baseline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys, random, pprint 3 | import cPickle 4 | import math, os 5 | 6 | class MCTestBaseline: 7 | def __init__(self, n_words=20, word_to_id=None, null_word_id=-1): 8 | self.n_words = n_words 9 | self.word_to_id = word_to_id 10 | self.id_to_word = dict((v, k) for k, v in word_to_id.iteritems()) 11 | self.null_word_id = null_word_id 12 | 13 | def remove_nulls(self, stmt): 14 | return filter(lambda x: x != self.null_word_id, stmt) 15 | 16 | def compute_inverse_count(self, stmt_list): 17 | counts = {} 18 | for word in stmt_list: 19 | if word not in counts: 20 | counts[word] = 0 21 | counts[word] += 1 22 | 23 | ic = {} 24 | for k, v in counts.iteritems(): 25 | ic[k] = math.log10(1 + 1.0/float(v)) 26 | return ic 27 | 28 | def compute_scores(self, statements, question, answers, stop_words): 29 | stmt_list = [word for stmt in statements for word in self.remove_nulls(stmt)] 30 | stmt_set = set(stmt_list) 31 | ques_set = set(self.remove_nulls(question)) 32 | ans_set = map(lambda x: set(self.remove_nulls(x)), answers) 33 | ic = self.compute_inverse_count(stmt_list) 34 | scores = [] 35 | for i in range(4): 36 | sw_score = -1 37 | S = ans_set[i] | ques_set 38 | S_list = list(S) 39 | for j in range(len(stmt_list)): 40 | curr_score = 0 41 | for w in range(len(S_list)): 42 | if j+w < len(stmt_list) and stmt_list[j+w] in S: 43 | if stmt_list[j+w] in stmt_set: 44 | curr_score += ic[stmt_list[j+w]] 45 | if sw_score == -1 or curr_score > sw_score: 46 | sw_score = curr_score 47 | 48 | d_score = -1 49 | S_Q = (ques_set & stmt_set) - stop_words 50 | S_A = (ans_set[i] & stmt_set) - stop_words 51 | if len(S_Q) == 0 or len(S_A) == 0: 52 | d_score = 1 53 | else: 54 | min_dist = len(stmt_list) 55 | last_q = -1 56 | last_a = -1 57 | for i in range(len(stmt_list)): 58 | if stmt_list[i] in S_Q and stmt_list[i] in S_A: 59 | min_dist = 0 60 | break 61 | if stmt_list[i] in S_Q: 62 | last_q = i 63 | if last_a >= 0 and i - last_a < min_dist: 64 | min_dist = i - last_a 65 | elif stmt_list[i] in S_A: 66 | last_a = i 67 | if last_q >= 0 and i - last_q < min_dist: 68 | min_dist = i - last_q 69 | d_score = float(min_dist + 1) / float(len(stmt_list) + 1) 70 | scores.append(sw_score - d_score) 71 | 72 | return scores 73 | 74 | 75 | def train(self): 76 | pass 77 | 78 | def predict(self, dataset, questions, stop_words=set(), max_words=20, print_errors=True): 79 | correct_answers = 0 80 | wrong_answers = 0 81 | 82 | for i, question in enumerate(questions): 83 | statements_seq = question[2] 84 | question_seq = question[3] 85 | 
answers = question[5] 86 | correct = question[4] 87 | 88 | # print statements_seq 89 | # print question_seq 90 | # print answers 91 | # print correct 92 | 93 | 94 | scores = self.compute_scores(statements_seq, question_seq, answers, stop_words) 95 | predicted = np.argmax(scores) 96 | 97 | if predicted == correct: 98 | correct_answers += 1 99 | else: 100 | if print_errors and np.random.rand() < 0.1: 101 | correct_words = map(lambda x: self.id_to_word[x], self.remove_nulls(question[5][correct])) 102 | predicted_words = map(lambda x: self.id_to_word[x], self.remove_nulls(question[5][predicted])) 103 | print 'Correct: %s (%d %.3f), Guess: %s (%d %.3f)' % (correct_words, correct, scores[correct], predicted_words, predicted, scores[predicted]) 104 | wrong_answers += 1 105 | 106 | #if len(questions) > 1000: 107 | # print '(%d/%d) %d correct, %d wrong' % (i+1, len(questions), correct_answers, wrong_answers) 108 | 109 | accuracy = 100.0 * float(correct_answers) / (correct_answers + wrong_answers) 110 | print '%d correct, %d wrong, %.2f%% acc' % (correct_answers, wrong_answers, accuracy) 111 | 112 | 113 | if __name__ == "__main__": 114 | train_file = sys.argv[1] 115 | test_file = train_file.replace('train', 'test') 116 | stop_file = os.path.join(os.path.dirname(train_file), 'stopwords.pickle') 117 | 118 | print("Loading pickled train dataset") 119 | f = file(train_file, 'rb') 120 | obj = cPickle.load(f) 121 | train_dataset, train_questions, word_to_id, num_words, null_word_id, train_max_stmts, train_max_words = obj 122 | f.close() 123 | 124 | print("Loading pickled test dataset") 125 | f = file(test_file, 'rb') 126 | obj = cPickle.load(f) 127 | test_dataset, test_questions, _, _, _, test_max_stmts, test_max_words = obj 128 | f.close() 129 | 130 | print("Loading pickled stop words") 131 | f = file(stop_file, 'rb') 132 | obj = cPickle.load(f) 133 | stop_words = obj 134 | f.close() 135 | 136 | print "Dataset has %d words" % num_words 137 | 138 | baseline = MCTestBaseline(n_words=num_words, word_to_id=word_to_id, null_word_id=null_word_id) 139 | baseline.predict(train_dataset, train_questions, stop_words, train_max_words) 140 | baseline.predict(test_dataset, test_questions, stop_words, test_max_words) 141 | -------------------------------------------------------------------------------- /mctest_dataset_parser.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from theano_util import * 4 | 5 | def only_words(line): 6 | ps = re.sub(r'[^a-zA-Z0-9]', r' ', line) 7 | ws = re.sub(r'(\W)', r' \1 ', ps) # Put spaces around punctuations 8 | ns = re.sub(r'(\d+)', r' ', ws) # Put spaces around numbers 9 | hs = re.sub(r'-', r' ', ns) # Replace hyphens with space 10 | rs = re.sub(r' +', r' ', hs) # Reduce multiple spaces into 1 11 | rs = rs.lower().strip() 12 | return rs 13 | 14 | def clean_sentence(line): 15 | ps = re.sub(r'[^a-zA-Z0-9\.\?\!]', ' ', line) # Split on punctuations and hex characters 16 | ws = re.sub(r'(\W)', r' \1 ', ps) # Put spaces around punctuations 17 | ns = re.sub(r'(\d+)', r' ', ws) # Put spaces around numbers 18 | hs = re.sub(r'-', r' ', ns) # Replace hyphens with space 19 | rs = re.sub(r' +', r' ', hs) # Reduce multiple spaces into 1 20 | rs = rs.lower().strip() 21 | return rs 22 | 23 | def get_sentences(line): 24 | ps = re.sub(r'[^a-zA-Z0-9\.\?\!]', ' ', line) # Split on punctuations and hex characters 25 | s = re.sub(r'(? 
', ws) # Put spaces around numbers 28 | hs = re.sub(r'-', r' ', ns) # Replace hyphens with space 29 | rs = re.sub(r' +', r' ', hs) # Reduce multiple spaces into 1 30 | rs = rs.lower().strip() 31 | return rs.split('\t') 32 | 33 | def get_answer_index(a): 34 | answer_to_index = { 35 | 'A': 0, 36 | 'B': 1, 37 | 'C': 2, 38 | 'D': 3, 39 | } 40 | return answer_to_index[a] 41 | 42 | def parse_mc_test_dataset(questions_file, answers_file, word_id=0, word_to_id={}, update_word_ids=True, max_stmts=20, max_words=20, pad=True): 43 | dataset = [] 44 | questions = [] 45 | 46 | null_word = '' 47 | if null_word not in word_to_id: 48 | if update_word_ids == True: 49 | word_to_id[null_word] = word_id 50 | word_id += 1 51 | else: 52 | print "Null word not found!! AAAAA" 53 | sys.exit(1) 54 | null_word_id = word_to_id[null_word] 55 | 56 | article_files = set() 57 | print("Parsing questions %s %s" % (questions_file, answers_file)) 58 | q_file = open(questions_file, 'r') 59 | a_file = open(answers_file, 'r') 60 | 61 | questions_data = q_file.readlines() 62 | answers_data = a_file.readlines() 63 | 64 | assert(len(questions_data) == len(answers_data)) 65 | 66 | more_than_1_word_answers = 0 67 | answer_word_unknown = 0 68 | 69 | for i in xrange(len(questions_data)): 70 | question_line = questions_data[i] 71 | answer_line = answers_data[i] 72 | 73 | question_pieces = question_line.strip().split('\t') 74 | assert(len(question_pieces) == 23) 75 | 76 | answer_pieces = answer_line.strip().split('\t') 77 | assert(len(answer_pieces) == 4) 78 | 79 | text = question_pieces[2] 80 | text = text.replace('\\newline', ' ') 81 | sentences = get_sentences(text) 82 | 83 | statements = [] 84 | for s in sentences: 85 | tokens = s.strip().split() 86 | 87 | if update_word_ids: 88 | for token in tokens: 89 | if token not in word_to_id: 90 | word_to_id[token] = word_id 91 | word_id += 1 92 | else: 93 | tokens = filter(lambda x: x in word_to_id, tokens) 94 | 95 | if pad: 96 | tokens = pad_statement(tokens, null_word, max_words) 97 | 98 | statements.append(tokens) 99 | dataset.append(tokens) 100 | 101 | if pad: 102 | statements = pad_memories(statements, null_word, max_stmts, max_words) 103 | 104 | # 4 questions 105 | for j in range(4): 106 | q_index = (j * 5) + 3 107 | q_words = question_pieces[q_index] 108 | q_words = clean_sentence(q_words).split() 109 | 110 | options = [ 111 | only_words(question_pieces[q_index + 1]), 112 | only_words(question_pieces[q_index + 2]), 113 | only_words(question_pieces[q_index + 3]), 114 | only_words(question_pieces[q_index + 4]), 115 | ] 116 | correct = get_answer_index(answer_pieces[j]) 117 | answer = options[correct] 118 | 119 | if update_word_ids: 120 | for token in (q_words + options): 121 | if token not in word_to_id: 122 | word_to_id[token] = word_id 123 | word_id += 1 124 | else: 125 | q_words = filter(lambda x: x in word_to_id, q_words) 126 | 127 | if pad: 128 | q_words = pad_statement(q_words, null_word, max_words) 129 | 130 | # Ignore more than 1 word answers 131 | if len(answer.split(' ')) > 1: 132 | more_than_1_word_answers += 1 133 | continue 134 | elif len(filter(lambda x: x not in word_to_id, options)) > 0: 135 | answer_word_unknown += 1 136 | continue 137 | 138 | option_word_ids = map(lambda x: word_to_id[x], options) 139 | 140 | article_no = len(questions) 141 | questions.append([article_no, -1, statements, q_words, answer, option_word_ids]) 142 | 143 | print "There are %d questions" % len(questions) 144 | print "There are %d statements" % len(dataset) 145 | print "There are %d words" % 
len(word_to_id) 146 | print "Ignored %d questions which had more than 1 word answers" % more_than_1_word_answers 147 | print "Ignored %d questions which had an unknown answer word" % answer_word_unknown 148 | 149 | print("Final processing...") 150 | questions_seq = map(lambda x: transform_ques_weak(x, word_to_id, word_id), questions) 151 | return dataset, questions_seq, word_to_id, word_id, null_word_id 152 | 153 | import cPickle 154 | 155 | if __name__ == "__main__": 156 | ADD_PADDING = True 157 | 158 | train_file = 'mc500.train.tsv' 159 | train_answers = train_file.replace('tsv', 'ans') 160 | 161 | test_file = train_file.replace('train', 'test') 162 | test_answers = test_file.replace('tsv', 'ans') 163 | 164 | data_dir = sys.argv[1] 165 | 166 | train_dataset, train_questions, word_to_id, num_words, null_word_id = parse_mc_test_dataset(data_dir + '/' + train_file, data_dir + '/' + train_answers, pad=ADD_PADDING) 167 | test_dataset, test_questions, word_to_id, num_words, null_word_id = parse_mc_test_dataset(data_dir + '/' + test_file, data_dir + '/' + test_answers, word_id=num_words, word_to_id=word_to_id, update_word_ids=False, pad=ADD_PADDING) 168 | 169 | # Add dev to test 170 | test2_file = train_file.replace('train', 'dev') 171 | test2_answers = test2_file.replace('tsv', 'ans') 172 | test2_dataset, test2_questions, word_to_id, num_words, null_word_id = parse_mc_test_dataset(data_dir + '/' + test2_file, data_dir + '/' + test2_answers, word_id=num_words, word_to_id=word_to_id, update_word_ids=False, pad=ADD_PADDING) 173 | 174 | test_dataset += test2_dataset 175 | test_questions += test2_questions 176 | 177 | # Pickle!!!! 178 | print("Pickling train...") 179 | train_pickle = train_file.replace('tsv', 'pickle') 180 | f = file(data_dir + '/' + train_pickle, 'wb') 181 | cPickle.dump((train_dataset, train_questions, word_to_id, num_words, null_word_id), f, protocol=cPickle.HIGHEST_PROTOCOL) 182 | f.close() 183 | 184 | print("Pickling test...") 185 | test_pickle = test_file.replace('tsv', 'pickle') 186 | f = file(data_dir + '/' + test_pickle, 'wb') 187 | cPickle.dump((test_dataset, test_questions, word_to_id, num_words, null_word_id), f, protocol=cPickle.HIGHEST_PROTOCOL) 188 | f.close() 189 | -------------------------------------------------------------------------------- /mctest_dataset_parser_v2.py: -------------------------------------------------------------------------------- 1 | import re 2 | import sys, os 3 | import cPickle 4 | 5 | from theano_util import ( 6 | pad_memories, 7 | pad_statement, 8 | ) 9 | 10 | from pos_pruning import prune_statements 11 | 12 | def only_words(line): 13 | ps = re.sub(r'[^a-zA-Z0-9\']', r' ', line) 14 | ws = re.sub(r'(\W)', r' \1 ', ps) # Put spaces around punctuations 15 | ws = re.sub(r" ' ", r"'", ws) # Remove spaces around ' 16 | # ns = re.sub(r'(\d+)', r' ', ws) # Put spaces around numbers 17 | hs = re.sub(r'-', r' ', ws) # Replace hyphens with space 18 | rs = re.sub(r' +', r' ', hs) # Reduce multiple spaces into 1 19 | rs = rs.lower().strip().split(' ') 20 | return rs 21 | 22 | def clean_sentence(line): 23 | ps = re.sub(r'[^a-zA-Z0-9\.\?\!\']', ' ', line) # Split on punctuations and hex characters 24 | ws = re.sub(r'(\W)', r' \1 ', ps) # Put spaces around punctuations 25 | ws = re.sub(r" ' ", r"'", ws) # Remove spaces around ' 26 | # ns = re.sub(r'(\d+)', r' ', ws) # Put spaces around numbers 27 | hs = re.sub(r'-', r' ', ws) # Replace hyphens with space 28 | rs = re.sub(r' +', r' ', hs) # Reduce multiple spaces into 1 29 | rs = rs.lower().strip() 30 | 
return rs 31 | 32 | def get_sentences(line): 33 | ps = re.sub(r'[^a-zA-Z0-9\.\?\!\']', ' ', line) # Split on punctuations and hex characters 34 | s = re.sub(r'(? ', ws) # Put spaces around numbers 38 | hs = re.sub(r'-', r' ', ws) # Replace hyphens with space 39 | rs = re.sub(r' +', r' ', hs) # Reduce multiple spaces into 1 40 | rs = rs.lower().strip() 41 | return rs.split('\t') 42 | 43 | def get_answer_index(a): 44 | answer_to_index = { 45 | 'A': 0, 46 | 'B': 1, 47 | 'C': 2, 48 | 'D': 3, 49 | } 50 | return answer_to_index[a] 51 | 52 | def transform_ques_weak(question, word_to_id, num_words): 53 | indices = [] 54 | for stmt in question[2]: 55 | index_stmt = map(lambda x: word_to_id[x], stmt) 56 | indices.append(index_stmt) 57 | question[2] = indices 58 | question[3] = map(lambda x: word_to_id[x], question[3]) 59 | question[5] = map(lambda l: map(lambda x: word_to_id[x], l), question[5]) 60 | return question 61 | 62 | def parse_mc_test_dataset(questions_file, answers_file, word_id=0, word_to_id={}, update_word_ids=True, pad=True, add_pruning=False): 63 | dataset = [] 64 | questions = [] 65 | 66 | null_word = '' 67 | if null_word not in word_to_id: 68 | if update_word_ids == True: 69 | word_to_id[null_word] = word_id 70 | word_id += 1 71 | else: 72 | print "Null word not found!! AAAAA" 73 | sys.exit(1) 74 | null_word_id = word_to_id[null_word] 75 | 76 | article_files = set() 77 | print("Parsing questions %s %s" % (questions_file, answers_file)) 78 | q_file = open(questions_file, 'r') 79 | a_file = open(answers_file, 'r') 80 | 81 | questions_data = q_file.readlines() 82 | answers_data = a_file.readlines() 83 | 84 | assert(len(questions_data) == len(answers_data)) 85 | 86 | more_than_1_word_answers = 0 87 | answer_word_unknown = 0 88 | 89 | for i in xrange(len(questions_data)): 90 | question_line = questions_data[i] 91 | answer_line = answers_data[i] 92 | 93 | question_pieces = question_line.strip().split('\t') 94 | assert(len(question_pieces) == 23) 95 | 96 | answer_pieces = answer_line.strip().split('\t') 97 | assert(len(answer_pieces) == 4) 98 | 99 | text = question_pieces[2] 100 | text = text.replace('\\newline', ' ') 101 | sentences = get_sentences(text) 102 | 103 | statements = [] 104 | for s in sentences: 105 | tokens = s.strip().split() 106 | 107 | if update_word_ids: 108 | for token in tokens: 109 | if token not in word_to_id: 110 | word_to_id[token] = word_id 111 | word_id += 1 112 | else: 113 | tokens = filter(lambda x: x in word_to_id, tokens) 114 | 115 | statements.append(tokens) 116 | dataset.append(tokens) 117 | 118 | # 4 questions 119 | for j in range(4): 120 | q_index = (j * 5) + 3 121 | q_words = question_pieces[q_index] 122 | q_words = clean_sentence(q_words).split() 123 | 124 | options = [ 125 | only_words(question_pieces[q_index + 1]), 126 | only_words(question_pieces[q_index + 2]), 127 | only_words(question_pieces[q_index + 3]), 128 | only_words(question_pieces[q_index + 4]), 129 | ] 130 | correct = get_answer_index(answer_pieces[j]) 131 | answer = options[correct] 132 | 133 | # if len(answer) > 1: 134 | # more_than_1_word_answers += 1 135 | # continue 136 | 137 | if update_word_ids: 138 | for token in q_words: 139 | if token not in word_to_id: 140 | word_to_id[token] = word_id 141 | word_id += 1 142 | for o in options: 143 | for token in o: 144 | if token not in word_to_id: 145 | word_to_id[token] = word_id 146 | word_id += 1 147 | else: 148 | q_words = filter(lambda x: x in word_to_id, q_words) 149 | 150 | if q_words[0] == 'multiple' or q_words[0] == 'one': 151 | del 
q_words[0] 152 | 153 | # Ignore questions with unknown words in the answer 154 | options_word_ids = [] 155 | skip = False 156 | for o in options: 157 | option_word_ids = [] 158 | for w in o: 159 | if w not in word_to_id: 160 | if update_word_ids: 161 | word_to_id[w] = word_id 162 | word_id += 1 163 | option_word_ids.append(w) 164 | else: 165 | skip = True 166 | break 167 | else: 168 | option_word_ids.append(w) 169 | if skip: 170 | break 171 | else: 172 | #if len(option_word_ids) > 1: 173 | # skip = True 174 | # more_than_1_word_answers += 1 175 | # break 176 | options_word_ids.append(option_word_ids) 177 | 178 | if skip: 179 | answer_word_unknown += 1 180 | continue 181 | 182 | article_no = len(questions) 183 | questions.append([article_no, -1, statements, q_words, correct, options_word_ids]) 184 | 185 | print "There are %d questions" % len(questions) 186 | print "There are %d statements" % len(dataset) 187 | print "There are %d words" % len(word_to_id) 188 | print "Ignored %d questions which had more than 1 word answers" % more_than_1_word_answers 189 | print "Ignored %d questions which had an unknown answer word" % answer_word_unknown 190 | 191 | if add_pruning: 192 | print("Trying to prune extraneaous statements...") 193 | questions = prune_statements(dataset, questions) 194 | before_prune = len(questions) 195 | questions = filter(lambda x: len(x[2]) > 1, questions) 196 | after_prune = len(questions) 197 | print("Pruning invalidated %d questions" % (before_prune - after_prune)) 198 | 199 | max_stmts = None 200 | max_words = None 201 | if pad: 202 | s_lens = [] 203 | q_lens = [] 204 | for i in xrange(len(questions)): 205 | q = questions[i] 206 | s_lens.append(len(q[2])) 207 | for j in xrange(len(q[2])): 208 | q_lens.append(len(q[2][j])) 209 | 210 | max_stmts = max(s_lens) 211 | max_words = max(q_lens) 212 | print "Max statement length: ", max_words 213 | print "Max number of statements: ", max_stmts 214 | 215 | for i in xrange(len(questions)): 216 | q = questions[i] 217 | # Statements 218 | 219 | for j in xrange(len(q[2])): 220 | q[2][j] = pad_statement(q[2][j], null_word, max_words) 221 | 222 | q[2] = pad_memories(q[2], null_word, max_stmts, max_words) 223 | q[3] = pad_statement(q[3], null_word, max_words) 224 | 225 | for j in xrange(len(q[5])): 226 | q[5][j] = pad_statement(q[5][j], null_word, max_words) 227 | 228 | 229 | print("Final processing...") 230 | questions_seq = map(lambda x: transform_ques_weak(x, word_to_id, word_id), questions) 231 | return dataset, questions_seq, word_to_id, word_id, null_word_id, max_stmts, max_words 232 | 233 | def parse_stop_words(stop_file, word_id=0, word_to_id={}, update_word_ids=False): 234 | stop_words = set() 235 | with open(stop_file) as f: 236 | for line in f: 237 | token = line.strip() 238 | if not token in word_to_id: 239 | if update_word_ids: 240 | word_to_id[token] = word_id 241 | word_id += 1 242 | else: 243 | continue 244 | stop_words.add(word_to_id[token]) 245 | return stop_words 246 | 247 | if __name__ == "__main__": 248 | ADD_PADDING = True 249 | ADD_PRUNING = False 250 | # Consider padding from the other side 251 | 252 | if len(sys.argv) > 2: 253 | dataset = sys.argv[2] 254 | else: 255 | dataset = 'mc160' 256 | 257 | train_file = dataset + '.train.tsv' 258 | print "Train file:", train_file 259 | 260 | train_answers = train_file.replace('tsv', 'ans') 261 | 262 | test_file = train_file.replace('train', 'test') 263 | test_answers = test_file.replace('tsv', 'ans') 264 | 265 | data_dir = sys.argv[1] 266 | 267 | train_obj = 
parse_mc_test_dataset(os.path.join(data_dir, train_file), os.path.join(data_dir, train_answers), pad=ADD_PADDING, add_pruning=ADD_PRUNING) 268 | num_words = train_obj[3] 269 | word_to_id = train_obj[2] 270 | test_obj = parse_mc_test_dataset(os.path.join(data_dir, test_file), os.path.join(data_dir, test_answers), word_id=num_words, word_to_id=word_to_id, update_word_ids=True, pad=ADD_PADDING, add_pruning=ADD_PRUNING) 271 | num_words = test_obj[3] 272 | word_to_id = test_obj[2] 273 | 274 | # Add dev to test 275 | # test2_file = train_file.replace('train', 'dev') 276 | # test2_answers = test2_file.replace('tsv', 'ans') 277 | # test2_obj = parse_mc_test_dataset(os.path.join(data_dir, test2_file), os.path.join(data_dir, test2_answers), word_id=num_words, word_to_id=word_to_id, update_word_ids=True, pad=ADD_PADDING, add_pruning=ADD_PRUNING) 278 | 279 | #test_obj[0] += test2_obj[0] 280 | #test_obj[1] += test2_obj[1] 281 | 282 | stop_file = 'stopwords.txt' 283 | stop_obj = parse_stop_words(os.path.join(data_dir, stop_file), word_id=num_words, word_to_id=word_to_id) 284 | 285 | # Pickle!!!! 286 | train_pickle = train_file.replace('tsv', 'pickle') 287 | print("Pickling train... " + train_pickle) 288 | f = file(os.path.join(data_dir, train_pickle), 'wb') 289 | cPickle.dump(train_obj, f, protocol=cPickle.HIGHEST_PROTOCOL) 290 | f.close() 291 | 292 | test_pickle = test_file.replace('tsv', 'pickle') 293 | print("Pickling test... " + test_pickle) 294 | f = file(os.path.join(data_dir, test_pickle), 'wb') 295 | cPickle.dump(test_obj, f, protocol=cPickle.HIGHEST_PROTOCOL) 296 | f.close() 297 | 298 | stop_pickle = stop_file.replace('txt', 'pickle') 299 | print("Pickling stop words... " + stop_pickle) 300 | f = file(os.path.join(data_dir, stop_pickle), 'wb') 301 | cPickle.dump(stop_obj, f, protocol=cPickle.HIGHEST_PROTOCOL) 302 | f.close() 303 | -------------------------------------------------------------------------------- /mctest_lstm.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import print_function 3 | import numpy as np 4 | import sys, re 5 | 6 | import cPickle 7 | 8 | from keras.preprocessing import sequence 9 | from keras.initializations import uniform 10 | from keras.optimizers import SGD, RMSprop, Adagrad 11 | from keras.utils import np_utils 12 | from keras.models import Sequential 13 | from keras.layers.core import Dense, Dropout, Activation 14 | from keras.layers.embeddings import Embedding 15 | from keras.layers.recurrent import LSTM, GRU 16 | 17 | def flatten(a): 18 | ret = [] 19 | for item in a: 20 | if type(item) == list: 21 | for k in item: 22 | ret.append(k) 23 | else: 24 | ret.append(k) 25 | return ret 26 | 27 | def get_dataset(questions): 28 | X = [] 29 | y = [] 30 | for question in questions: 31 | statements = [] 32 | statements += flatten(question[2]) 33 | statements += question[3] 34 | X.append(statements) 35 | y.append(question[4]) 36 | return X,y 37 | 38 | if __name__ == "__main__": 39 | train_file = sys.argv[1] 40 | test_file = train_file.replace('train', 'test') 41 | 42 | print("Loading pickled train dataset") 43 | f = file(train_file, 'rb') 44 | obj = cPickle.load(f) 45 | train_dataset, train_questions, word_to_id, num_words, null_word_id = obj 46 | 47 | print("Loading pickled test dataset") 48 | f = file(test_file, 'rb') 49 | obj = cPickle.load(f) 50 | test_dataset, test_questions, _, _, _ = obj 51 | 52 | nb_epoch = 10 53 | if len(sys.argv) > 2: 54 | nb_epoch = int(sys.argv[2]) 
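# Illustrative note (not part of the original script): each pickled question is a
# list of the form [article_no, line_no, statement_token_ids, question_token_ids,
# answer_word_id, option_word_ids], and get_dataset() above concatenates the
# statement ids with the question ids to form one input sequence, keeping the
# answer word id as the class label.  On a toy question (hypothetical ids):
#
#   toy_q = [0, -1, [[3, 4, 5], [6, 7, 5]], [8, 9, 10], 4, [4, 6, 11, 12]]
#   x     = [tok for stmt in toy_q[2] for tok in stmt] + toy_q[3]
#         # -> [3, 4, 5, 6, 7, 5, 8, 9, 10]
#   y     = toy_q[4]   # -> 4, later one-hot encoded with np_utils.to_categorical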
55 | 56 | X_train, y_train = get_dataset(train_questions) 57 | X_test, y_test = get_dataset(test_questions) 58 | 59 | id_to_word = dict([(v, k) for k, v in word_to_id.iteritems()]) 60 | 61 | y_train_cat = np_utils.to_categorical(y_train, nb_classes=num_words) 62 | y_test_cat = np_utils.to_categorical(y_test, nb_classes=num_words) 63 | 64 | print(len(X_train), 'train sequences') 65 | print(len(X_test), 'test sequences') 66 | 67 | print("Pad sequences (samples x time)") 68 | X_train = sequence.pad_sequences(X_train) 69 | X_test = sequence.pad_sequences(X_test) 70 | print('X_train shape:', X_train.shape) 71 | print('X_test shape:', X_test.shape) 72 | 73 | print('Build model...') 74 | batch_size = 1 75 | in_embedding_size = 100 76 | out_embedding_size = 100 77 | 78 | model = Sequential() 79 | model.add(Embedding(num_words, in_embedding_size)) 80 | model.add(LSTM(in_embedding_size, out_embedding_size)) 81 | model.add(Dropout(0.5)) 82 | model.add(Dense(out_embedding_size, num_words)) 83 | model.add(Activation('softmax')) 84 | 85 | sgd_optimizer = SGD(lr=0.006, momentum=0.9, decay=0.99, nesterov=True) 86 | adg_optimizer = Adagrad() 87 | rms_optimizer = RMSprop() 88 | model.compile(loss='categorical_crossentropy', optimizer=rms_optimizer, class_mode="categorical", theano_mode='FAST_COMPILE') 89 | 90 | print("Train...") 91 | model.fit(X_train, y_train_cat, batch_size=batch_size, nb_epoch=nb_epoch, validation_split=0.1, show_accuracy=True) 92 | score = model.evaluate(X_test, y_test_cat, batch_size=batch_size) 93 | print('Test score:', score) 94 | 95 | classes_proba = model.predict_proba(X_test, batch_size=batch_size) 96 | for i in range(5): 97 | probs = sorted(zip(range(len(classes_proba)), classes_proba[i].tolist()), key=lambda x: x[1], reverse=True) 98 | print('Test sample %d (Correct label: %s)' % (i, id_to_word[y_test[i]])) 99 | for j, p in probs[:5]: 100 | print(id_to_word[j].ljust(20) + ': ' + str(p)) 101 | 102 | classes = np_utils.probas_to_classes(classes_proba) 103 | 104 | correct, wrong = 0, 0 105 | for (i,q) in enumerate(test_questions): 106 | options = q[5] 107 | options_probs = classes_proba[i][options] 108 | best_idx = np.argmax(options_probs) 109 | predicted = options[best_idx] 110 | print('Test sample %d (Correct label: %s)' % (i, id_to_word[y_test[i]])) 111 | for k in range(len(options)): 112 | print(id_to_word[options[k]].ljust(20) + ': ' + str(options_probs[k])) 113 | 114 | if predicted == y_test[i]: 115 | correct += 1 116 | else: 117 | wrong += 1 118 | 119 | print('%d correct, %d wrong' % (correct, wrong)) 120 | -------------------------------------------------------------------------------- /memnn_numpy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sys, re, random 3 | 4 | def init_shared_normal(num_rows, num_cols, scale=1): 5 | '''Initialize a matrix shared variable with normally distributed 6 | elements.''' 7 | return np.random.normal(scale=scale, size=(num_rows, num_cols)) 8 | 9 | def init_shared_zeros(*shape): 10 | '''Initialize a vector shared variable with zero elements.''' 11 | return np.zeros(shape, dtype=dtype) 12 | 13 | class MemNN: 14 | def __init__(self, n_words=1000, n_embedding=100, lr=0.01, margin=0.1, n_epochs=100): 15 | self.n_embedding = n_embedding 16 | self.lr = lr 17 | self.margin = margin 18 | self.n_epochs = n_epochs 19 | self.n_words = n_words 20 | self.n_D = 2 * self.n_words 21 | 22 | self.U_O = init_shared_normal(n_embedding, self.n_D, 0.01) 23 | 24 | def calc_score(self, phi_x, 
phi_y): 25 | return phi_x.T.dot(self.U_O.T).dot(self.U_O).dot(phi_y) 26 | 27 | def calc_grad_S_O_U_O(self, phi_x, phi_y): 28 | return self.U_O.dot(np.outer(phi_x, phi_y) + np.outer(phi_y, phi_x)) 29 | 30 | def calc_cost_and_grad(self, phi_x, phi_f1, phi_f1bar): 31 | correct_score = self.calc_score(phi_x, phi_f1) 32 | false_score = self.calc_score(phi_x, phi_f1bar) 33 | cost = max(0, self.margin - correct_score + false_score) 34 | grad = {} 35 | grad['U_O'] = 0 36 | if cost > 0: 37 | grad['U_O'] = -1*self.calc_grad_S_O_U_O(phi_x, phi_f1) + self.calc_grad_S_O_U_O(phi_x, phi_f1bar) 38 | return cost, grad 39 | 40 | def train(self, dataset_bow, questions, num_words): 41 | for epoch in xrange(self.n_epochs): 42 | costs = [] 43 | 44 | random.shuffle(questions) 45 | for i, question in enumerate(questions): 46 | article_no = question[0] 47 | line_no = question[1] 48 | question_phi = question[2] 49 | correct_stmt = question[4] 50 | seq = [i for i in range(line_no)] 51 | del seq[correct_stmt] 52 | false_stmt = random.choice(seq) 53 | #print article_no, line_no, correct_stmt, false_stmt 54 | phi_x = np.zeros((self.n_D,)) 55 | phi_x[:num_words] = question_phi 56 | phi_f1 = np.zeros((self.n_D,)) 57 | phi_f1[num_words:2*num_words] = dataset_bow[article_no][correct_stmt] 58 | phi_f1bar = np.zeros((self.n_D,)) 59 | phi_f1bar[num_words:2*num_words] = dataset_bow[article_no][false_stmt] 60 | 61 | # if article_no == 0 and line_no == 2: 62 | # corr_score = self.calc_score(phi_x, phi_f1) 63 | # fals_score = self.calc_score(phi_x, phi_f1bar) 64 | # print "[BEFORE] corr score: %f, false score: %f" % (corr_score, fals_score) 65 | 66 | cost, grad = self.calc_cost_and_grad(phi_x, phi_f1, phi_f1bar) 67 | costs.append(cost) 68 | self.U_O -= self.lr * grad['U_O'] 69 | 70 | # if article_no == 0 and line_no == 2: 71 | # corr_score = self.calc_score(phi_x, phi_f1) 72 | # fals_score = self.calc_score(phi_x, phi_f1bar) 73 | # print "[ AFTER] corr score: %f, false score: %f" % (corr_score, fals_score) 74 | 75 | # if epoch % 100 == 0: 76 | # print 'Epoch %i/%i' % (epoch + 1, self.n_epochs), np.mean(costs) 77 | # sys.stdout.flush() 78 | 79 | # print np.mean(costs), np.mean(self.U_O), np.max(self.U_O), np.min(self.U_O) 80 | 81 | def predict(self, dataset, questions): 82 | correct_answers = 0 83 | wrong_answers = 0 84 | for i, question in enumerate(questions): 85 | article_no = question[0] 86 | line_no = question[1] 87 | question_phi = question[2] 88 | correct_stmt = question[4] 89 | 90 | phi_x = np.zeros((self.n_D,)) 91 | phi_x[:num_words] = question_phi 92 | 93 | answer = -1 94 | max_score = -99999 95 | for l in range(line_no): 96 | phi_f = np.zeros((self.n_D,)) 97 | phi_f[num_words:2*num_words] = dataset[article_no][l] 98 | 99 | #print phi_x, phi_f 100 | score = self.calc_score(phi_x, phi_f) 101 | if answer == -1 or score > max_score: 102 | max_score = score 103 | answer = l 104 | 105 | if article_no == 0: 106 | print "%d: corr stmt: %d, answer: %d" % (i, correct_stmt, answer) 107 | 108 | if answer == correct_stmt: 109 | correct_answers += 1 110 | else: 111 | wrong_answers += 1 112 | 113 | print '%d correct, %d wrong' % (correct_answers, wrong_answers) 114 | 115 | 116 | def parse_dataset(input_file): 117 | dataset = [] 118 | questions = [] 119 | word_to_id = {} 120 | word_id = 0 121 | with open(input_file) as f: 122 | statements = [] 123 | article_no = 0 124 | line_no = 0 125 | stmt_to_line = {} 126 | for line in f: 127 | line = line.strip() 128 | if len(line) > 0 and line[:2] == '1 ' and len(statements) > 0: # new article 
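# sentence numbering restarts at "1 " for each article, so (once at least one
# statement has been collected) this marks the start of a new article: flush
# the accumulated statements and reset the per-article line counters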
129 | dataset.append(statements) 130 | statements = [] 131 | line_no = 0 132 | stmt_to_line = {} 133 | article_no += 1 134 | if '\t' in line: 135 | question_parts = line.split('\t') 136 | tokens = re.sub(r'([\.\?])$', r' \1', question_parts[0]).split() 137 | for token in tokens[1:]: 138 | if not token in word_to_id: 139 | word_to_id[token] = word_id 140 | word_id += 1 141 | questions.append([article_no, line_no, ' '.join(tokens[1:]), question_parts[1], stmt_to_line[question_parts[2]]]) 142 | else: 143 | tokens = re.sub(r'([\.\?])$', r' \1', line).split() 144 | stmt_to_line[tokens[0]] = line_no 145 | for token in tokens[1:]: 146 | if not token in word_to_id: 147 | word_to_id[token] = word_id 148 | word_id += 1 149 | statements.append(' '.join(tokens[1:])) 150 | line_no += 1 151 | if len(statements) > 0: 152 | dataset.append(statements) 153 | return dataset, questions, word_to_id, word_id 154 | 155 | def compute_phi(input_str, word_to_id, num_words): 156 | phi = np.zeros((num_words,)) 157 | for token in input_str.split(): 158 | phi[word_to_id[token]] += 1 159 | return phi 160 | 161 | def transform_ques(question, word_to_id, num_words): 162 | question[2] = compute_phi(question[2], word_to_id, num_words) 163 | return question 164 | 165 | if __name__ == "__main__": 166 | training_dataset = sys.argv[1] 167 | test_dataset = training_dataset.replace('train', 'test') 168 | 169 | dataset, questions, word_to_id, num_words = parse_dataset(training_dataset) 170 | dataset_bow = map(lambda y: map(lambda x: compute_phi(x, word_to_id, num_words), y), dataset) 171 | questions_bow = map(lambda x: transform_ques(x, word_to_id, num_words), questions) 172 | # print dataset[0], dataset_bow[0], questions_bow[0] 173 | #print len(dataset_bow) 174 | memNN = MemNN(n_words=num_words, n_epochs=100, margin=1.0) 175 | memNN.train(dataset_bow, questions_bow, num_words) 176 | 177 | test_dataset, test_questions, _, _ = parse_dataset(test_dataset) 178 | test_dataset_bow = map(lambda y: map(lambda x: compute_phi(x, word_to_id, num_words), y), test_dataset) 179 | test_questions_bow = map(lambda x: transform_ques(x, word_to_id, num_words), test_questions) 180 | memNN.predict(test_dataset_bow, test_questions_bow) 181 | -------------------------------------------------------------------------------- /memnn_theano.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano 3 | import theano.tensor as T 4 | import sys, random 5 | 6 | from theano_util import * 7 | 8 | class MemNN: 9 | def __init__(self, n_words=1000, n_embedding=100, lr=0.01, margin=0.1, n_epochs=100): 10 | self.n_embedding = n_embedding 11 | self.lr = lr 12 | self.margin = margin 13 | self.n_epochs = n_epochs 14 | self.n_words = n_words 15 | self.n_D = 2 * self.n_words 16 | self.n_embedding = n_embedding 17 | 18 | phi_x = T.vector('phi_x') 19 | phi_f1 = T.vector('phi_f1') 20 | phi_f1bar = T.vector('phi_f1bar') 21 | 22 | # Supporting memories 23 | phi_m0 = T.vector('phi_m0') 24 | 25 | # True word 26 | phi_r = T.vector('phi_r') 27 | 28 | # False words 29 | phi_rbar = T.vector('phi_rbar') 30 | 31 | self.U_O = init_shared_normal(self.n_embedding, self.n_D, 0.01) 32 | self.U_R = init_shared_normal(self.n_embedding, self.n_D, 0.01) 33 | 34 | cost = self.calc_cost(phi_x, phi_f1, phi_f1bar, phi_m0, phi_r, phi_rbar) 35 | params = [self.U_O, self.U_R] 36 | gradient = T.grad(cost, params) 37 | 38 | updates=[] 39 | for param, gparam in zip(params, gradient): 40 | updates.append((param, param - gparam * self.lr)) 
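# Plain SGD: each call to train_function below evaluates `cost` and then applies
# param <- param - lr * dcost/dparam via the `updates` list.  The quantity being
# ranked is the bilinear score s_U(x, y) = (U x) . (U y) from calc_score, i.e.
# both bag-of-words vectors are projected into the embedding space by U and
# compared with a dot product; the two hinge terms in calc_cost push the true
# supporting statement (under U_O) and the true answer word (under U_R) above
# their sampled negatives by at least `margin`.  Minimal numpy sketch of the
# score, with shapes assumed from init_shared_normal above:
#
#   U = np.random.normal(scale=0.01, size=(n_embedding, 2 * n_words))
#   score = U.dot(phi_x).dot(U.dot(phi_y))   # scalar s_U(phi_x, phi_y)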
41 | 42 | self.train_function = theano.function(inputs = [phi_x, phi_f1, phi_f1bar, phi_m0, phi_r, phi_rbar], 43 | outputs = cost, 44 | updates = updates) 45 | 46 | phi_f = T.vector('phi_f') 47 | 48 | score_o = self.calc_score_o(phi_x, phi_f) 49 | self.predict_function_o = theano.function(inputs = [phi_x, phi_f], outputs = score_o) 50 | 51 | score_r = self.calc_score_r(phi_x, phi_f) 52 | self.predict_function_r = theano.function(inputs = [phi_x, phi_f], outputs = score_r) 53 | 54 | def calc_score(self, phi_x, phi_y, U): 55 | #return T.dot(T.dot(phi_x.T, self.U_O.T), T.dot(self.U_O, phi_y)) 56 | return T.dot(U.dot(phi_x), U.dot(phi_y)) 57 | 58 | def calc_score_o(self, phi_x, phi_y): 59 | return self.calc_score(phi_x, phi_y, self.U_O) 60 | 61 | def calc_score_r(self, phi_x, phi_y): 62 | return self.calc_score(phi_x, phi_y, self.U_R) 63 | 64 | def calc_cost(self, phi_x, phi_f1, phi_f1bar, phi_m0, phi_r, phi_rbar): 65 | correct_score1 = self.calc_score_o(phi_x, phi_f1) 66 | false_score1 = self.calc_score_o(phi_x, phi_f1bar) 67 | 68 | correct_score2 = self.calc_score_r(phi_x + phi_m0, phi_r) 69 | false_score2 = self.calc_score_r(phi_x + phi_m0, phi_rbar) 70 | 71 | cost = ( 72 | T.maximum(0, self.margin - correct_score1 + false_score1) + 73 | T.maximum(0, self.margin - correct_score2 + false_score2) 74 | ) 75 | return cost 76 | 77 | def train(self, dataset_bow, questions, num_words): 78 | for epoch in xrange(self.n_epochs): 79 | costs = [] 80 | 81 | random.shuffle(questions) 82 | for i, question in enumerate(questions): 83 | article_no = question[0] 84 | line_no = question[1] 85 | question_phi = question[2] 86 | correct_stmt = question[4] 87 | seq = [i for i in range(line_no)] 88 | del seq[correct_stmt] 89 | false_stmt = random.choice(seq) 90 | #print article_no, line_no, correct_stmt, false_stmt 91 | phi_x = np.zeros((self.n_D,)) 92 | phi_x[:num_words] = question_phi 93 | phi_f1 = np.zeros((self.n_D,)) 94 | phi_f1[num_words:2*num_words] = dataset_bow[article_no][correct_stmt] 95 | phi_f1bar = np.zeros((self.n_D,)) 96 | phi_f1bar[num_words:2*num_words] = dataset_bow[article_no][false_stmt] 97 | 98 | if article_no == 0 and line_no == 2: 99 | corr_score = self.predict_function(phi_x, phi_f1) 100 | fals_score = self.predict_function(phi_x, phi_f1bar) 101 | print "[BEFORE] corr score: %f, false score: %f" % (corr_score, fals_score) 102 | 103 | cost = self.train_function(phi_x, phi_f1, phi_f1bar) 104 | costs.append(cost) 105 | 106 | if article_no == 0 and line_no == 2: 107 | corr_score = self.predict_function(phi_x, phi_f1) 108 | fals_score = self.predict_function(phi_x, phi_f1bar) 109 | print "[ AFTER] corr score: %f, false score: %f" % (corr_score, fals_score) 110 | 111 | if epoch % 100 == 0: 112 | # print 'Epoch %i/%i' % (epoch + 1, self.n_epochs), np.mean(costs) 113 | sys.stdout.flush() 114 | 115 | # print np.mean(costs), np.mean(self.U_O.get_value()), np.max(self.U_O.get_value()), np.min(self.U_O.get_value()) 116 | 117 | def predict(self, dataset, questions): 118 | correct_answers = 0 119 | wrong_answers = 0 120 | for i, question in enumerate(questions): 121 | article_no = question[0] 122 | line_no = question[1] 123 | question_phi = question[2] 124 | correct_stmt = question[4] 125 | 126 | phi_x = np.zeros((self.n_D,)) 127 | phi_x[:num_words] = question_phi 128 | 129 | answer = -1 130 | max_score = -99999 131 | for i in range(line_no): 132 | phi_f = np.zeros((self.n_D,)) 133 | phi_f[num_words:2*num_words] = dataset[article_no][i] 134 | 135 | #print phi_x, phi_f 136 | score = 
self.predict_function(phi_x, phi_f) 137 | if answer == -1 or score > max_score: 138 | max_score = score 139 | answer = i 140 | 141 | if answer == correct_stmt: 142 | correct_answers += 1 143 | else: 144 | wrong_answers += 1 145 | 146 | print '%d correct, %d wrong' % (correct_answers, wrong_answers) 147 | 148 | if __name__ == "__main__": 149 | training_dataset = sys.argv[1] 150 | test_dataset = training_dataset.replace('train', 'test') 151 | 152 | dataset, questions, word_to_id, num_words = parse_dataset(training_dataset) 153 | memNN = MemNN(n_words=num_words, n_embedding=100, lr=0.01, n_epochs=10, margin=1.0, word_to_id=word_to_id) 154 | memNN.train(dataset, questions) 155 | 156 | test_dataset, test_questions, _, _ = parse_dataset(test_dataset, word_id=num_words, word_to_id=word_to_id, update_word_ids=False) 157 | memNN.predict(test_dataset, test_questions) 158 | -------------------------------------------------------------------------------- /memnn_theano_v2.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano 3 | import theano.tensor as T 4 | import sys, random, pprint 5 | 6 | from theano_util import * 7 | 8 | class MemNN: 9 | def __init__(self, n_words=1000, n_embedding=100, lr=0.01, margin=0.1, n_epochs=100, momentum=0.9, word_to_id=None): 10 | self.n_embedding = n_embedding 11 | self.lr = lr 12 | self.momentum = momentum 13 | self.margin = margin 14 | self.n_epochs = n_epochs 15 | self.n_words = n_words 16 | self.n_D = 3 * self.n_words + 3 17 | 18 | self.word_to_id = word_to_id 19 | self.id_to_word = dict((v, k) for k, v in word_to_id.iteritems()) 20 | 21 | # Question 22 | phi_x = T.vector('phi_x') 23 | 24 | # True statements 25 | phi_f1_1 = T.vector('phi_f1_1') 26 | phi_f2_1 = T.vector('phi_f2_1') 27 | 28 | # False statements 29 | phi_f1_2 = T.vector('phi_f1_2') 30 | phi_f2_2 = T.vector('phi_f2_2') 31 | 32 | # Supporting memories 33 | phi_m0 = T.vector('phi_m0') 34 | phi_m1 = T.vector('phi_m1') 35 | 36 | # True word 37 | phi_r = T.vector('phi_r') 38 | 39 | # False words 40 | phi_rbars = T.matrix('phi_rbars') 41 | 42 | self.U_O = init_shared_normal(n_embedding, self.n_D, 0.01) 43 | self.U_R = init_shared_normal(n_embedding, self.n_D, 0.01) 44 | 45 | # Total S_R cost for all sampled words 46 | tot_sr_cost = T.scalar('sr_cost') 47 | 48 | cost = self.calc_cost(phi_x, phi_f1_1, phi_f1_2, phi_f2_1, phi_f2_2, phi_m0, phi_m1, phi_r, phi_rbars, tot_sr_cost) 49 | params = [self.U_O, self.U_R] 50 | gradient = T.grad(cost, params) 51 | 52 | l_rate = T.scalar('l_rate') 53 | 54 | updates=[] 55 | for param, gparam in zip(params, gradient): 56 | param_update = theano.shared(param.get_value()*0., broadcastable=param.broadcastable) 57 | updates.append((param, param - param_update * l_rate)) 58 | updates.append((param_update, self.momentum*param_update + (1. 
- self.momentum)*gparam)) 59 | 60 | self.train_function = theano.function( 61 | inputs = [phi_x, phi_f1_1, phi_f1_2, phi_f2_1, phi_f2_2, \ 62 | phi_m0, phi_m1, phi_r, phi_rbars, \ 63 | theano.Param(l_rate, default=self.lr), \ 64 | theano.Param(tot_sr_cost, default=0.0)], 65 | outputs = cost, 66 | updates = updates) 67 | 68 | # Candidate statement for prediction 69 | phi_f = T.vector('phi_f') 70 | 71 | score_o = self.calc_score_o(phi_x, phi_f) 72 | self.predict_function_o = theano.function(inputs = [phi_x, phi_f], outputs = score_o) 73 | 74 | score_r = self.calc_score_r(phi_x, phi_f) 75 | self.predict_function_r = theano.function(inputs = [phi_x, phi_f], outputs = score_r) 76 | 77 | def calc_score_o(self, phi_x, phi_y_yp_t): 78 | return T.dot(self.U_O.dot(phi_x), self.U_O.dot(phi_y_yp_t)) 79 | 80 | def calc_score_r(self, phi_x, phi_y): 81 | return T.dot(self.U_R.dot(phi_x), self.U_R.dot(phi_y)) 82 | 83 | # phi_f1_1 = phi_f1 - phi_f1bar + phi_t1_1 84 | # phi_f1_2 = phi_f1bar - phi_f1 + phi_t1_2 85 | def calc_cost(self, phi_x, phi_f1_1, phi_f1_2, phi_f2_1, phi_f2_2, phi_m0, phi_m1, phi_r, phi_rbars, tot_sr_cost): 86 | score1_1 = self.calc_score_o(phi_x, phi_f1_1) 87 | score1_2 = self.calc_score_o(phi_x, phi_f1_2) 88 | 89 | score2_1 = self.calc_score_o(phi_x + phi_m0, phi_f2_1) 90 | score2_2 = self.calc_score_o(phi_x + phi_m0, phi_f2_2) 91 | 92 | s_o_cost = ( 93 | T.maximum(0, self.margin - score1_1) + T.maximum(0, self.margin + score1_2) + 94 | T.maximum(0, self.margin - score2_1) + T.maximum(0, self.margin + score2_2) 95 | ) 96 | 97 | def compute_sr_cost(phi_rbar, correct_score): 98 | false_score = self.calc_score_r(phi_x + phi_m0 + phi_m1, phi_rbar) 99 | return T.maximum(0, self.margin - correct_score + false_score) 100 | 101 | correct_score3 = self.calc_score_r(phi_x + phi_m0 + phi_m1, phi_r) 102 | sr_costs, sr_updates = theano.reduce(lambda phi_rbar, tot_sr_cost: tot_sr_cost + compute_sr_cost(phi_rbar, correct_score3), 103 | sequences=phi_rbars, outputs_info=[{'initial': tot_sr_cost}]) 104 | 105 | cost = s_o_cost + sr_costs 106 | return cost 107 | 108 | def construct_phi(self, phi_type, bow=None, word_id=None, ids=None): 109 | # type 0: question (phi_x) 110 | # type 1: supporting memory (phi_m*) 111 | # type 2: candidate memory (phi_y) 112 | # type 3: word vector 113 | # type 4: write-time features 114 | assert(phi_type >= 0 and phi_type < 5) 115 | phi = np.zeros((3*self.n_words + 3,)) 116 | if phi_type < 3: 117 | assert(bow is not None) 118 | phi[phi_type*self.n_words:(phi_type+1)*self.n_words] = bow 119 | elif phi_type == 3: 120 | assert(word_id != None and word_id < self.n_words) 121 | phi[2*self.n_words + word_id] = 1 122 | else: 123 | assert(ids != None and len(ids) == 3) 124 | if ids[0] > ids[1]: phi[3*self.n_words] = 1 125 | if ids[0] > ids[2]: phi[3*self.n_words+1] = 1 126 | if ids[1] > ids[2]: phi[3*self.n_words+2] = 1 127 | return phi 128 | 129 | # returns (phi_y - phi_yp + phi_t) 130 | def construct_wt_phi(self, index_x, index_y, index_yp, y, yp): 131 | phi_y = self.construct_phi(2, bow=y) 132 | phi_yp = self.construct_phi(2, bow=yp) 133 | phi_t = self.construct_phi(4, ids=[index_x, index_y, index_yp]) 134 | return phi_y - phi_yp + phi_t 135 | 136 | def neg_sample(self, c, num): 137 | assert(c < num) 138 | assert(num > 1) 139 | f = random.randint(0, num-2) 140 | if f == c: 141 | f = num-1 142 | return f 143 | 144 | def find_m0(self, index_x, phi_x, statements, ignore=None): 145 | max_score = float("-inf") 146 | index_m0 = 0 147 | m0 = statements[0] 148 | for i in 
xrange(1,len(statements)): 149 | if ignore and i == ignore: 150 | continue 151 | 152 | s = statements[i] 153 | phi_s = self.construct_wt_phi(index_x, i, index_m0, s, m0) 154 | 155 | if self.predict_function_o(phi_x, phi_s) >= 0: 156 | index_m0 = i 157 | m0 = s 158 | 159 | return index_m0, m0 160 | 161 | def train(self, dataset_bow, questions, lr_schedule=None): 162 | l_rate = self.lr 163 | for epoch in xrange(self.n_epochs): 164 | costs = [] 165 | 166 | if lr_schedule != None and epoch in lr_schedule: 167 | l_rate = lr_schedule[epoch] 168 | 169 | random.shuffle(questions) 170 | for i, question in enumerate(questions): 171 | article_no = question[0] 172 | article = dataset_bow[article_no] 173 | line_no = question[1] 174 | question_phi = question[2] 175 | correct_stmts = question[4].split(' ') 176 | correct_stmt1 = int(correct_stmts[0]) 177 | correct_stmt2 = int(correct_stmts[1]) 178 | 179 | if line_no <= 1: 180 | continue 181 | 182 | # The question 183 | phi_x = self.construct_phi(0, bow=question_phi) 184 | 185 | # Find m0 186 | index_m0, m0 = self.find_m0(line_no, phi_x, article[:line_no]) 187 | phi_m0 = self.construct_phi(1, bow=m0) 188 | 189 | # Find m1 190 | index_m1, m1 = self.find_m0(index_m0, phi_x + phi_m0, article[:line_no], ignore=index_m0) 191 | phi_m1 = self.construct_phi(1, bow=m1) 192 | 193 | # False statement 1 194 | false_stmt1 = index_m0 195 | if false_stmt1 == correct_stmt1: 196 | false_stmt1 = self.neg_sample(correct_stmt1, line_no) 197 | phi_f1_1 = self.construct_wt_phi(line_no, correct_stmt1, false_stmt1, article[correct_stmt1], article[false_stmt1]) 198 | phi_f1_2 = self.construct_wt_phi(line_no, false_stmt1, correct_stmt1, article[false_stmt1], article[correct_stmt1]) 199 | 200 | # False statement 2 201 | false_stmt2 = index_m1 202 | if false_stmt2 == correct_stmt2: 203 | false_stmt2 = self.neg_sample(correct_stmt2, line_no) 204 | phi_f2_1 = self.construct_wt_phi(line_no, correct_stmt2, false_stmt2, article[correct_stmt2], article[false_stmt2]) 205 | phi_f2_2 = self.construct_wt_phi(line_no, false_stmt2, correct_stmt2, article[false_stmt2], article[correct_stmt2]) 206 | 207 | # Correct word 208 | correct_word = question[3] 209 | phi_r = self.construct_phi(3, word_id=correct_word) 210 | 211 | # False word 212 | false_word_ids = [i for i in range(self.n_words)] 213 | del false_word_ids[correct_word] 214 | # Find the highest ranking word, if it isnt the correct word, add it to list 215 | # Possible that this word will be added twice, but that is okay 216 | false_word1, score = self.find_word(phi_x + phi_m0 + phi_m1, verbose=False) 217 | if false_word1 != correct_word: 218 | false_word_ids.insert(0, false_word1) 219 | # Clip no. 
of samples to 20 220 | false_word_ids = false_word_ids[:min(20,len(false_word_ids))] 221 | phi_rbars = np.vstack(tuple(map(lambda word_id: self.construct_phi(3, word_id=word_id), false_word_ids))) 222 | 223 | if article_no == 1 and line_no == 12: 224 | print '[SAMPLE] %s\t%s' % (self.id_to_word[correct_word], self.id_to_word[false_word1]) 225 | w, score = self.find_word(phi_x + phi_m0 + phi_m1, verbose=False) 226 | print "[BEFORE] %.3f\t%.3f\t%.3f\t%.3f\tm0:%d\tm1:%d\ta:%s\ts:%.3f\tc:%s" % ( 227 | self.predict_function_o(phi_x, phi_f1_1), 228 | self.predict_function_o(phi_x, phi_f1_2), 229 | self.predict_function_o(phi_x + phi_m0, phi_f2_1), 230 | self.predict_function_o(phi_x + phi_m0, phi_f2_2), 231 | index_m0, index_m1, 232 | self.id_to_word[w], score, self.id_to_word[correct_word] 233 | ) 234 | 235 | cost = self.train_function(phi_x, phi_f1_1, phi_f1_2, phi_f2_1, phi_f2_2, \ 236 | phi_m0, phi_m1, phi_r, phi_rbars, \ 237 | l_rate) 238 | costs.append(cost) 239 | 240 | if article_no == 1 and line_no == 12: 241 | index_m0, m0 = self.find_m0(line_no, phi_x, article[:line_no]) 242 | phi_m0 = self.construct_phi(1, bow=m0) 243 | index_m1, m1 = self.find_m0(index_m0, phi_x + phi_m0, article[:line_no], ignore=index_m0) 244 | phi_m1 = self.construct_phi(1, bow=m1) 245 | w, score = self.find_word(phi_x + phi_m0 + phi_m1, verbose=False) 246 | print "[ AFTER] %.3f\t%.3f\t%.3f\t%.3f\tm0:%d\tm1:%d\ta:%s\ts:%.3f\tc:%s" % ( 247 | self.predict_function_o(phi_x, phi_f1_1), 248 | self.predict_function_o(phi_x, phi_f1_2), 249 | self.predict_function_o(phi_x + phi_m0, phi_f2_1), 250 | self.predict_function_o(phi_x + phi_m0, phi_f2_2), 251 | index_m0, index_m1, 252 | self.id_to_word[w], score, self.id_to_word[correct_word] 253 | ) 254 | 255 | print "Epoch %d: %f" % (epoch, np.mean(costs)) 256 | 257 | def find_word(self, phi_x, verbose=False): 258 | max_score = float("-inf") 259 | best_word = -1 260 | for i in xrange(self.n_words): 261 | phi_r = self.construct_phi(3, word_id=i) 262 | score = self.predict_function_r(phi_x, phi_r) 263 | if verbose: 264 | print '[ FIND] w:%s\ts:%.3f' % ( 265 | self.id_to_word[i], 266 | score 267 | ) 268 | if score > max_score: 269 | max_score = score 270 | best_word = i 271 | 272 | assert(best_word >= 0) 273 | return best_word, score 274 | 275 | def predict(self, dataset, questions): 276 | correct_answers = 0 277 | wrong_answers = 0 278 | fake_correct_answers = 0 279 | for i, question in enumerate(questions): 280 | article_no = question[0] 281 | line_no = question[1] 282 | question_phi = question[2] 283 | correct = question[3] 284 | 285 | phi_x = self.construct_phi(0, bow=question_phi) 286 | 287 | statements = dataset[article_no] 288 | 289 | phi_m0 = None 290 | phi_m1 = None 291 | if len(statements) == 0: 292 | print "Stupid question" 293 | continue 294 | elif len(statements) == 1: 295 | print "Stupid question?" 
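# degenerate article with a single statement: reuse that statement as both
# supporting memories m0 and m1 rather than running the two hop-selection passes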
296 | phi_m0 = self.construct_phi(1, statements[0]) 297 | phi_m1 = self.construct_phi(1, statements[0]) 298 | else: 299 | index_m0, m0 = self.find_m0(line_no, phi_x, statements[:line_no]) 300 | phi_m0 = self.construct_phi(1, m0) 301 | index_m1, m1 = self.find_m0(index_m0, phi_x + phi_m0, statements[:line_no], ignore=index_m0) 302 | phi_m1 = self.construct_phi(1, m1) 303 | 304 | c1 = int(question[4].split(' ')[0]) 305 | c2 = int(question[4].split(' ')[1]) 306 | if (index_m0 == c1 or index_m0 == c2) and (index_m1 == c1 or index_m1 == c2): 307 | fake_correct_answers += 1 308 | 309 | if article_no <= 2: 310 | predicted, _ = self.find_word(phi_x + phi_m0 + phi_m1, verbose=False) 311 | print "%d, %d, %d: predicted: %s, correct: %s" % (i, article_no, line_no, self.id_to_word[predicted], self.id_to_word[correct]) 312 | else: 313 | predicted, _ = self.find_word(phi_x + phi_m0 + phi_m1) 314 | if predicted == correct: 315 | correct_answers += 1 316 | else: 317 | wrong_answers += 1 318 | 319 | print '%d correct, %d wrong, %d fake_correct' % (correct_answers, wrong_answers, fake_correct_answers) 320 | 321 | if __name__ == "__main__": 322 | train_file = sys.argv[1] 323 | test_file = train_file.replace('train', 'test') 324 | 325 | train_dataset, train_questions, word_to_id, num_words = parse_dataset(train_file) 326 | test_dataset, test_questions, _, _ = parse_dataset(test_file, word_id=num_words, word_to_id=word_to_id, update_word_ids=False) 327 | 328 | if len(sys.argv) > 2: 329 | n_epochs = int(sys.argv[2]) 330 | else: 331 | n_epochs = 10 332 | 333 | memNN = MemNN(n_words=num_words, n_embedding=100, lr=0.01, n_epochs=n_epochs, margin=0.1, word_to_id=word_to_id) 334 | # memNN.train(train_dataset, train_questions, lr_schedule=dict([(0, 0.01), (20, 0.005), (50, 0.001)])) 335 | memNN.train(train_dataset, train_questions) 336 | memNN.predict(test_dataset, test_questions) 337 | -------------------------------------------------------------------------------- /memnn_theano_v3.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano 3 | import theano.tensor as T 4 | import sys, random, pprint 5 | 6 | from theano_util import * 7 | from keras.activations import tanh, hard_sigmoid 8 | from keras.initializations import glorot_uniform, orthogonal 9 | from keras.utils.theano_utils import shared_zeros, alloc_zeros_matrix 10 | 11 | def inspect_inputs(i, node, fn): 12 | print i, node, "inputs:", [input[0] for input in fn.inputs], 13 | 14 | def inspect_outputs(i, node, fn): 15 | print i, node, "outputs:", [output[0] for output in fn.outputs] 16 | 17 | class MemNN: 18 | def __init__(self, n_words=1000, n_embedding=100, lr=0.01, margin=0.1, momentum=0.9, word_to_id=None): 19 | self.n_embedding = n_embedding 20 | self.n_lstm_embed = n_embedding 21 | self.word_embed = n_embedding 22 | self.lr = lr 23 | self.momentum = momentum 24 | self.margin = margin 25 | self.n_words = n_words 26 | self.n_D = 3 * self.n_words + 3 27 | 28 | self.word_to_id = word_to_id 29 | self.id_to_word = dict((v, k) for k, v in word_to_id.iteritems()) 30 | 31 | # Question 32 | x = T.vector('x') 33 | phi_x = T.vector('phi_x') 34 | 35 | # True statements 36 | phi_f1_1 = T.vector('phi_f1_1') 37 | phi_f2_1 = T.vector('phi_f2_1') 38 | 39 | # False statements 40 | phi_f1_2 = T.vector('phi_f1_2') 41 | phi_f2_2 = T.vector('phi_f2_2') 42 | 43 | # Supporting memories 44 | m0 = T.vector('m0') 45 | m1 = T.vector('m1') 46 | phi_m0 = T.vector('phi_m0') 47 | phi_m1 = T.vector('phi_m1') 48 | 49 | # 
True word 50 | r = T.vector('r') 51 | 52 | # Word sequence 53 | words = T.ivector('words') 54 | 55 | # Scoring function 56 | self.U_O = init_shared_normal(n_embedding, self.n_D, 0.01) 57 | 58 | # Word embedding 59 | self.L = glorot_uniform((self.n_words, self.word_embed)) 60 | self.Lprime = glorot_uniform((self.n_words, self.n_lstm_embed)) 61 | 62 | # LSTM 63 | self.W_i = glorot_uniform((self.word_embed, self.n_lstm_embed)) 64 | self.U_i = orthogonal((self.n_lstm_embed, self.n_lstm_embed)) 65 | self.b_i = shared_zeros((self.n_lstm_embed)) 66 | 67 | self.W_f = glorot_uniform((self.word_embed, self.n_lstm_embed)) 68 | self.U_f = orthogonal((self.n_lstm_embed, self.n_lstm_embed)) 69 | self.b_f = shared_zeros((self.n_lstm_embed)) 70 | 71 | self.W_c = glorot_uniform((self.word_embed, self.n_lstm_embed)) 72 | self.U_c = orthogonal((self.n_lstm_embed, self.n_lstm_embed)) 73 | self.b_c = shared_zeros((self.n_lstm_embed)) 74 | 75 | self.W_o = glorot_uniform((self.word_embed, self.n_lstm_embed)) 76 | self.U_o = orthogonal((self.n_lstm_embed, self.n_lstm_embed)) 77 | self.b_o = shared_zeros((self.n_lstm_embed)) 78 | 79 | mem_cost = self.calc_cost(phi_x, phi_f1_1, phi_f1_2, phi_f2_1, phi_f2_2, phi_m0) 80 | 81 | lstm_output = self.lstm_cost(words) 82 | self.predict_function_r = theano.function(inputs = [words], outputs = lstm_output, allow_input_downcast=True) 83 | 84 | lstm_cost = -T.sum(T.mul(r, T.log(lstm_output))) 85 | 86 | cost = mem_cost + lstm_cost 87 | 88 | params = [ 89 | self.U_O, 90 | self.W_i, self.U_i, self.b_i, 91 | self.W_f, self.U_f, self.b_f, 92 | self.W_c, self.U_c, self.b_c, 93 | self.W_o, self.U_o, self.b_o, 94 | self.L, self.Lprime 95 | ] 96 | 97 | grads = T.grad(cost, params) 98 | 99 | # Parameter updates 100 | updates = self.get_updates(params, grads, method='adagrad') 101 | 102 | l_rate = T.scalar('l_rate') 103 | 104 | # Theano functions 105 | self.train_function = theano.function( 106 | inputs = [phi_x, phi_f1_1, phi_f1_2, phi_f2_1, phi_f2_2, 107 | phi_m0, r, words, 108 | theano.Param(l_rate, default=self.lr)], 109 | outputs = cost, 110 | updates = updates, 111 | on_unused_input='warn', 112 | allow_input_downcast=True, 113 | ) 114 | #mode='FAST_COMPILE') 115 | #mode='DebugMode') 116 | #mode=theano.compile.MonitorMode(pre_func=inspect_inputs,post_func=inspect_outputs)) 117 | 118 | # Candidate statement for prediction 119 | phi_f = T.vector('phi_f') 120 | 121 | score_o = self.calc_score_o(phi_x, phi_f) 122 | self.predict_function_o = theano.function(inputs = [phi_x, phi_f], outputs = score_o) 123 | 124 | def get_updates(self, params, grads, method=None, **kwargs): 125 | self.rho = 0.95 126 | self.epsilon = 1e-6 127 | 128 | accumulators = [shared_zeros(p.get_value().shape) for p in params] 129 | updates=[] 130 | 131 | if method == 'adadelta': 132 | print "Using ADADELTA" 133 | delta_accumulators = [shared_zeros(p.get_value().shape) for p in params] 134 | for p, g, a, d_a in zip(params, grads, accumulators, delta_accumulators): 135 | new_a = self.rho * a + (1 - self.rho) * g ** 2 # update accumulator 136 | updates.append((a, new_a)) 137 | 138 | # use the new accumulator and the *old* delta_accumulator 139 | update = g * T.sqrt(d_a + self.epsilon) / T.sqrt(new_a + self.epsilon) 140 | 141 | new_p = p - self.lr * update 142 | updates.append((p, new_p)) # apply constraints 143 | 144 | # update delta_accumulator 145 | new_d_a = self.rho * d_a + (1 - self.rho) * update ** 2 146 | updates.append((d_a, new_d_a)) 147 | 148 | 149 | elif method == 'adam': 150 | # unimplemented 151 | print 
"Using ADAM" 152 | 153 | elif method == 'adagrad': 154 | print "Using ADAGRAD" 155 | for p, g, a in zip(params, grads, accumulators): 156 | new_a = a + g ** 2 # update accumulator 157 | updates.append((a, new_a)) 158 | 159 | new_p = p - self.lr * g / T.sqrt(new_a + self.epsilon) 160 | updates.append((p, new_p)) # apply constraints 161 | 162 | else: # Default 163 | print "Using MOMENTUM" 164 | l_rate = kwargs['l_rate'] 165 | for param, gparam in zip(params, gradient): 166 | param_update = theano.shared(param.get_value()*0., broadcastable=param.broadcastable) 167 | updates.append((param, param - param_update * l_rate)) 168 | updates.append((param_update, self.momentum*param_update + (1. - self.momentum)*gparam)) 169 | 170 | return updates 171 | 172 | def _step(self, 173 | xi_t, xf_t, xc_t, xo_t, 174 | h_tm1, c_tm1, 175 | u_i, u_f, u_o, u_c): 176 | 177 | i_t = hard_sigmoid(xi_t + T.dot(h_tm1, u_i)) 178 | f_t = hard_sigmoid(xf_t + T.dot(h_tm1, u_f)) 179 | c_t = f_t * c_tm1 + i_t * tanh(xc_t + T.dot(h_tm1, u_c)) 180 | o_t = hard_sigmoid(xo_t + T.dot(h_tm1, u_o)) 181 | h_t = o_t * tanh(c_t) 182 | return h_t, c_t 183 | 184 | # words: word index in n_words 185 | def lstm_cost(self, words): 186 | x = self.L[words] 187 | 188 | # Each element of x is (word_embed,) shape 189 | xi = T.dot(x, self.W_i) + self.b_i 190 | xf = T.dot(x, self.W_f) + self.b_f 191 | xc = T.dot(x, self.W_c) + self.b_c 192 | xo = T.dot(x, self.W_o) + self.b_o 193 | 194 | [outputs, memories], updates = theano.scan( 195 | self._step, 196 | sequences=[xi, xf, xc, xo], 197 | outputs_info=[ 198 | alloc_zeros_matrix(self.n_lstm_embed), 199 | alloc_zeros_matrix(self.n_lstm_embed), 200 | ], 201 | non_sequences=[ 202 | self.U_i, self.U_f, self.U_o, self.U_c, 203 | ], 204 | truncate_gradient=-1 205 | ) 206 | 207 | r = T.dot(self.Lprime, outputs[-1]) 208 | 209 | return T.nnet.softmax(r) 210 | 211 | def calc_score_o(self, phi_x, phi_y_yp_t): 212 | return T.dot(self.U_O.dot(phi_x), self.U_O.dot(phi_y_yp_t)) 213 | 214 | # phi_f1_1 = phi_f1 - phi_f1bar + phi_t1_1 215 | # phi_f1_2 = phi_f1bar - phi_f1 + phi_t1_2 216 | def calc_cost(self, phi_x, phi_f1_1, phi_f1_2, phi_f2_1, phi_f2_2, phi_m0): 217 | score1_1 = self.calc_score_o(phi_x, phi_f1_1) 218 | score1_2 = self.calc_score_o(phi_x, phi_f1_2) 219 | 220 | score2_1 = self.calc_score_o(phi_x + phi_m0, phi_f2_1) 221 | score2_2 = self.calc_score_o(phi_x + phi_m0, phi_f2_2) 222 | 223 | s_o_cost = ( 224 | T.maximum(0, self.margin - score1_1) + T.maximum(0, self.margin + score1_2) + 225 | T.maximum(0, self.margin - score2_1) + T.maximum(0, self.margin + score2_2) 226 | ) 227 | 228 | return s_o_cost 229 | 230 | def construct_phi(self, phi_type, bow=None, word_id=None, ids=None): 231 | # type 0: question (phi_x) 232 | # type 1: supporting memory (phi_m*) 233 | # type 2: candidate memory (phi_y) 234 | # type 3: word vector 235 | # type 4: write-time features 236 | # type 5: 0s 237 | assert(phi_type >= 0 and phi_type < 6) 238 | phi = np.zeros((3*self.n_words + 3,)) 239 | if phi_type < 3: 240 | assert(bow is not None) 241 | phi[phi_type*self.n_words:(phi_type+1)*self.n_words] = bow 242 | elif phi_type == 3: 243 | assert(word_id != None and word_id < self.n_words) 244 | phi[2*self.n_words + word_id] = 1 245 | elif phi_type == 5: 246 | pass 247 | else: 248 | assert(ids != None and len(ids) == 3) 249 | if ids[0] > ids[1]: phi[3*self.n_words] = 1 250 | if ids[0] > ids[2]: phi[3*self.n_words+1] = 1 251 | if ids[1] > ids[2]: phi[3*self.n_words+2] = 1 252 | return phi 253 | 254 | def make_one_hot(self, index): 
255 | v = np.zeros((self.n_words)) 256 | v[index] = 1.0 257 | return v 258 | 259 | # returns (phi_y - phi_yp + phi_t) 260 | def construct_wt_phi(self, index_x, index_y, index_yp, y, yp): 261 | phi_y = self.construct_phi(2, bow=y) 262 | phi_yp = self.construct_phi(2, bow=yp) 263 | phi_t = self.construct_phi(4, ids=[index_x, index_y, index_yp]) 264 | return phi_y - phi_yp + phi_t 265 | 266 | def neg_sample(self, c, num): 267 | assert(c < num) 268 | assert(num > 1) 269 | f = random.randint(0, num-2) 270 | if f == c: 271 | f = num-1 272 | return f 273 | 274 | def find_m0(self, index_x, phi_x, statements, ignore=None): 275 | max_score = float("-inf") 276 | index_m0 = 0 277 | m0 = statements[0] 278 | for i in xrange(1,len(statements)): 279 | if ignore and i == ignore: 280 | continue 281 | 282 | s = statements[i] 283 | phi_s = self.construct_wt_phi(index_x, i, index_m0, s, m0) 284 | 285 | if self.predict_function_o(phi_x, phi_s) >= 0: 286 | index_m0 = i 287 | m0 = s 288 | 289 | return index_m0, m0 290 | 291 | def train(self, dataset_seq, dataset_bow, questions, n_epochs=100, lr_schedule=None): 292 | l_rate = self.lr 293 | for epoch in xrange(n_epochs): 294 | costs = [] 295 | 296 | if lr_schedule != None and epoch in lr_schedule: 297 | l_rate = lr_schedule[epoch] 298 | 299 | random.shuffle(questions) 300 | for i, question in enumerate(questions): 301 | article_no = question[0] 302 | article = dataset_bow[article_no] 303 | line_no = question[1] 304 | question_phi = question[2] 305 | correct_stmts = question[4].split(' ') 306 | correct_stmt1 = int(correct_stmts[0]) 307 | is_single_statement = len(correct_stmts) == 1 308 | correct_stmt2 = None 309 | if not is_single_statement: 310 | correct_stmt2 = int(correct_stmts[1]) 311 | question_seq = question[-1] 312 | 313 | if line_no <= 1: 314 | continue 315 | 316 | # The question 317 | phi_x = self.construct_phi(0, bow=question_phi) 318 | 319 | # Find m0 320 | index_m0, m0 = self.find_m0(line_no, phi_x, article[:line_no]) 321 | phi_m0 = self.construct_phi(1, bow=m0) 322 | 323 | # Find m1 324 | index_m1, m1 = self.find_m0(index_m0, phi_x + phi_m0, article[:line_no], ignore=index_m0) 325 | phi_m1 = self.construct_phi(1, bow=m1) 326 | 327 | # False statement 1 328 | false_stmt1 = index_m0 329 | if false_stmt1 == correct_stmt1: 330 | false_stmt1 = self.neg_sample(correct_stmt1, line_no) 331 | phi_f1_1 = self.construct_wt_phi(line_no, correct_stmt1, false_stmt1, article[correct_stmt1], article[false_stmt1]) 332 | phi_f1_2 = self.construct_wt_phi(line_no, false_stmt1, correct_stmt1, article[false_stmt1], article[correct_stmt1]) 333 | 334 | # False statement 2 335 | phi_f2_1 = None 336 | phi_f2_2 = None 337 | if not is_single_statement: 338 | false_stmt2 = index_m1 339 | if false_stmt2 == correct_stmt2: 340 | false_stmt2 = self.neg_sample(correct_stmt2, line_no) 341 | phi_f2_1 = self.construct_wt_phi(line_no, correct_stmt2, false_stmt2, article[correct_stmt2], article[false_stmt2]) 342 | phi_f2_2 = self.construct_wt_phi(line_no, false_stmt2, correct_stmt2, article[false_stmt2], article[correct_stmt2]) 343 | else: 344 | phi_f2_1 = self.construct_phi(5) 345 | phi_f2_2 = self.construct_phi(5) 346 | 347 | # Correct word 348 | correct_word = question[3] 349 | r = self.make_one_hot(correct_word) 350 | 351 | words = np.asarray(dataset_seq[article_no][index_m0] + dataset_seq[article_no][index_m1] + question_seq) 352 | 353 | cost = self.train_function(phi_x, phi_f1_1, phi_f1_2, phi_f2_1, phi_f2_2, 354 | phi_m0, r, words) 355 | #print "%d: %f" % (i, cost) 356 | 
costs.append(cost) 357 | 358 | print "Epoch %d: %f" % (epoch, np.mean(costs)) 359 | 360 | def find_word(self, words): 361 | probs = self.predict_function_r(words) 362 | return np.argmax(probs) 363 | 364 | def predict(self, dataset_seq, dataset_bow, questions): 365 | correct_answers = 0 366 | wrong_answers = 0 367 | fake_correct_answers = 0 368 | for i, question in enumerate(questions): 369 | article_no = question[0] 370 | line_no = question[1] 371 | question_phi = question[2] 372 | correct = question[3] 373 | question_seq = question[-1] 374 | 375 | x = question_phi 376 | phi_x = self.construct_phi(0, bow=question_phi) 377 | 378 | statements = dataset_bow[article_no] 379 | 380 | phi_m0 = None 381 | phi_m1 = None 382 | if len(statements) == 0: 383 | print "Stupid question" 384 | continue 385 | elif len(statements) == 1: 386 | print "Stupid question?" 387 | phi_m0 = self.construct_phi(1, statements[0]) 388 | phi_m1 = self.construct_phi(1, statements[0]) 389 | else: 390 | index_m0, m0 = self.find_m0(line_no, phi_x, statements[:line_no]) 391 | phi_m0 = self.construct_phi(1, m0) 392 | index_m1, m1 = self.find_m0(index_m0, phi_x + phi_m0, statements[:line_no], ignore=index_m0) 393 | 394 | correct_stmts = question[4].split(' ') 395 | is_single_statement = len(correct_stmts) == 1 396 | c1 = int(correct_stmts[0]) 397 | c2 = int(question[4].split(' ')[1]) if not is_single_statement else None 398 | if (index_m0 == c1 or index_m0 == c2) and (index_m1 == c1 or index_m1 == c2): 399 | fake_correct_answers += 1 400 | 401 | predicted = self.find_word( 402 | np.asarray(dataset_seq[article_no][index_m0] + dataset_seq[article_no][index_m1] + question_seq) 403 | ) 404 | # print 'Correct: %s (%d), Guess: %s (%d)' % (self.id_to_word[correct], correct, self.id_to_word[predicted], predicted) 405 | if predicted == correct: 406 | correct_answers += 1 407 | else: 408 | wrong_answers += 1 409 | 410 | print '%d correct, %d wrong, %d fake_correct' % (correct_answers, wrong_answers, fake_correct_answers) 411 | 412 | def train_weak(self, dataset, questions, n_epochs=100, lr_schedule=None): 413 | l_rate = self.lr 414 | for epoch in xrange(n_epochs): 415 | costs = [] 416 | 417 | if lr_schedule != None and epoch in lr_schedule: 418 | l_rate = lr_schedule[epoch] 419 | 420 | random.shuffle(questions) 421 | for i, question in enumerate(questions): 422 | article_no = question[0] 423 | article = dataset[article_no] 424 | line_no = question[1] 425 | statements_seq = question[2][:-1] 426 | question_seq = question[2][-1] 427 | 428 | if line_no <= 1: 429 | continue 430 | 431 | # Correct word 432 | correct_word = question[3] 433 | 434 | cost = self.train_function(statements_seq, question_seq, correct_word) 435 | 436 | #print "%d: %f" % (i, cost) 437 | costs.append(cost) 438 | 439 | print "Epoch %d: %f" % (epoch, np.mean(costs)) 440 | 441 | def predict_weak(self, dataset, questions): 442 | correct_answers = 0 443 | wrong_answers = 0 444 | for i, question in enumerate(questions): 445 | article_no = question[0] 446 | article = dataset[article_no] 447 | line_no = question[1] 448 | statements_seq = question[2][:-1] 449 | question_seq = question[2][-1] 450 | correct = question[3] 451 | 452 | predicted = self.predict_function( 453 | np.asarray(statements_seq), np.asarray(question_seq) 454 | ) 455 | # print 'Correct: %s (%d), Guess: %s (%d)' % (self.id_to_word[correct], correct, self.id_to_word[predicted], predicted) 456 | if predicted == correct: 457 | correct_answers += 1 458 | else: 459 | wrong_answers += 1 460 | 461 | print '%d correct, %d 
wrong' % (correct_answers, wrong_answers) 462 | 463 | if __name__ == "__main__": 464 | train_file = sys.argv[1] 465 | test_file = train_file.replace('train', 'test') 466 | 467 | train_dataset_seq, train_dataset_bow, train_questions, word_to_id, num_words = parse_dataset(train_file) 468 | test_dataset_seq, test_dataset_bow, test_questions, _, _ = parse_dataset(test_file, word_id=num_words, word_to_id=word_to_id, update_word_ids=False) 469 | 470 | if len(sys.argv) > 2: 471 | n_epochs = int(sys.argv[2]) 472 | else: 473 | n_epochs = 10 474 | 475 | memNN = MemNN(n_words=num_words, n_embedding=100, lr=0.01, margin=0.1, word_to_id=word_to_id) 476 | #memNN.train(train_dataset_seq, train_dataset_bow, train_questions, n_epochs=n_epochs, lr_schedule=dict([(0, 0.02), (20, 0.01), (50, 0.005), (80, 0.002)])) 477 | #memNN.train(train_dataset_seq, train_dataset_bow, train_questions, lr_schedule=dict([(0, 0.01), (15, 0.009), (30, 0.007), (50, 0.005), (60, 0.003), (85, 0.001)])) 478 | #memNN.train(train_dataset_seq, train_dataset_bow, train_questions) 479 | #memNN.predict(train_dataset, train_questions) 480 | #memNN.predict(test_dataset_seq, test_dataset_bow, test_questions) 481 | 482 | for i in xrange(20): 483 | memNN.train(train_dataset_seq, train_dataset_bow, train_questions, n_epochs=5) 484 | memNN.predict(test_dataset_seq, test_dataset_bow, test_questions) 485 | -------------------------------------------------------------------------------- /nltk_utils.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | from nltk.stem.wordnet import WordNetLemmatizer 3 | from nltk.tokenize import word_tokenize 4 | from nltk.corpus import wordnet as wn 5 | 6 | def is_noun(tag): 7 | return tag in ['NN', 'NNS', 'NNP', 'NNPS'] 8 | 9 | def is_verb(tag): 10 | return tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'] 11 | 12 | def is_adverb(tag): 13 | return tag in ['RB', 'RBR', 'RBS'] 14 | 15 | def is_adjective(tag): 16 | return tag in ['JJ', 'JJR', 'JJS'] 17 | 18 | def penn_to_wn(tag): 19 | if is_adjective(tag): 20 | return wn.ADJ 21 | elif is_noun(tag): 22 | return wn.NOUN 23 | elif is_adverb(tag): 24 | return wn.ADV 25 | elif is_verb(tag): 26 | return wn.VERB 27 | return wn.NOUN 28 | 29 | def memoize1(f): 30 | memo = {} 31 | def helper(x): 32 | if x not in memo: 33 | memo[x] = f(x) 34 | return memo[x] 35 | return helper 36 | 37 | def memoize2(f): 38 | memo = {} 39 | def helper(x,y): 40 | if (x,y) not in memo: 41 | memo[(x,y)] = f(x, y) 42 | return memo[(x,y)] 43 | return helper 44 | 45 | def stem_word(word): 46 | return nltk.stem.snowball.EnglishStemmer().stem(word) 47 | 48 | stem_word = memoize1(stem_word) 49 | 50 | def get_lemma(word, tag): 51 | return WordNetLemmatizer().lemmatize(word, tag) 52 | 53 | get_lemma = memoize2(get_lemma) 54 | 55 | def canonicalize_tokens(tokens): 56 | canonical_tokens = [] 57 | tags = nltk.pos_tag(tokens) 58 | for tag in tags: 59 | wn_tag = penn_to_wn(tag[1]) 60 | t = get_lemma(tag[0], wn_tag) 61 | t = stem_word(t) 62 | canonical_tokens.append(t) 63 | return canonical_tokens 64 | -------------------------------------------------------------------------------- /pararth_final_report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pararthshah/qa-memnn/29f087c1d8f9720c02caa715ff6ff0793fde98b8/pararth_final_report.pdf -------------------------------------------------------------------------------- /pararth_milestone.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/pararthshah/qa-memnn/29f087c1d8f9720c02caa715ff6ff0793fde98b8/pararth_milestone.pdf -------------------------------------------------------------------------------- /pos_pruning.py: -------------------------------------------------------------------------------- 1 | from nltk_utils import * 2 | 3 | import nltk 4 | from nltk.corpus import wordnet as wn 5 | 6 | def memoizefirst(f): 7 | memo = {} 8 | def helper(x, y): 9 | if x not in memo: 10 | memo[x] = f(x, y) 11 | return memo[x] 12 | return helper 13 | 14 | def get_noun_set(article, tokens): 15 | tags = nltk.pos_tag(tokens) 16 | nouns = set( 17 | map( 18 | lambda x: x[0], 19 | filter( 20 | lambda x: x[1] == wn.NOUN, 21 | map(lambda x: (x[0], penn_to_wn(x[1])), tags), 22 | ) 23 | ) 24 | ) 25 | return nouns 26 | 27 | get_noun_set = memoizefirst(get_noun_set) 28 | 29 | def prune_statements(dataset, questions, debug=True): 30 | total_old = 0 31 | total_new = 0 32 | 33 | for i in range(len(questions)): 34 | question = questions[i] 35 | new_statements = [] 36 | old_statements = question[2] 37 | 38 | # Keep only statements which have at least 1 common noun 39 | q = question[3] 40 | q_nouns = get_noun_set('|'.join(q), q) 41 | 42 | for s in old_statements: 43 | s_nouns = get_noun_set('|'.join(s), s) 44 | if len(s_nouns.intersection(q_nouns)) > 0: 45 | new_statements.append(s) 46 | 47 | questions[i][2] = new_statements 48 | total_old += len(old_statements) 49 | total_new += len(new_statements) 50 | 51 | if debug and i < 3: 52 | print "Question: ", q, "Statements:\n", old_statements, "\n", new_statements, "\nbefore %d after %d" % (len(old_statements), len(new_statements)) 53 | 54 | #print("Before %d After %d" % (total_old, total_new)) 55 | return questions 56 | -------------------------------------------------------------------------------- /qa_dataset_parser.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from theano_util import * 4 | #from wordvec_pruning import prune_statements 5 | from pos_pruning import prune_statements 6 | 7 | from nltk_utils import * 8 | 9 | def only_words(line): 10 | ps = re.sub(r'[^a-zA-Z0-9]', r' ', line) 11 | ws = re.sub(r'(\W)', r' \1 ', ps) # Put spaces around punctuations 12 | ns = re.sub(r'(\d+)', r' ', ws) # Put spaces around numbers 13 | hs = re.sub(r'-', r' ', ns) # Replace hyphens with space 14 | rs = re.sub(r' +', r' ', hs) # Reduce multiple spaces into 1 15 | return rs 16 | 17 | def clean_sentence(line): 18 | ps = re.sub(r'[^a-zA-Z0-9\.\?\!]', ' ', line) # Split on punctuations and hex characters 19 | ws = re.sub(r'(\W)', r' \1 ', ps) # Put spaces around punctuations 20 | ns = re.sub(r'(\d+)', r' ', ws) # Put spaces around numbers 21 | hs = re.sub(r'-', r' ', ns) # Replace hyphens with space 22 | rs = re.sub(r' +', r' ', hs) # Reduce multiple spaces into 1 23 | return rs 24 | 25 | def get_sentences(line): 26 | ps = re.sub(r'[^a-zA-Z0-9\.\?\!]', ' ', line) # Split on punctuations and hex characters 27 | s = re.sub(r'(? 
', ws) # Put spaces around numbers 30 | hs = re.sub(r'-', r' ', ns) # Replace hyphens with space 31 | rs = re.sub(r' +', r' ', hs) # Reduce multiple spaces into 1 32 | 33 | return rs.split('\t') 34 | 35 | def parse_qa_dataset(input_dir, word_id=0, word_to_id={}, update_word_ids=True): 36 | dataset = [] 37 | questions = [] 38 | 39 | article_files = set() 40 | print("Parsing questions...") 41 | with open(input_dir + '/question_answer_pairs.txt') as f: 42 | for line in f: 43 | # Skip first line 44 | if 'ArticleFile' in line: 45 | continue 46 | 47 | line = line.strip() 48 | 49 | # Skip empty lines 50 | if len(line) == 0: 51 | continue 52 | 53 | parts = line.split('\t') 54 | if len(parts) != 6: 55 | print("Malformed line: " + line) 56 | continue 57 | 58 | question = parts[1] 59 | answer = parts[2] 60 | answer = canonicalize_tokens([only_words(answer).strip().lower()]) 61 | assert(len(answer) == 1) 62 | answer = answer[0] 63 | 64 | article_name = parts[5] 65 | 66 | # There are other fields in the dataset, use them later if you want 67 | 68 | # This dataset has repeated questions. What to do? 69 | 70 | # Don't answer questions with more than 1 word answers 71 | if len(answer) == 0 or len(answer.split(' ')) > 1: 72 | # Skip for now 73 | continue 74 | 75 | if not update_word_ids and answer not in word_to_id: 76 | continue 77 | 78 | question_parts = question.split('\t') 79 | tokens = clean_sentence(question_parts[0]).strip().split() 80 | tokens = filter(lambda x: len(x.strip()) > 0, tokens) 81 | tokens = map(lambda x: x.lower(), tokens) 82 | tokens = canonicalize_tokens(tokens) 83 | 84 | if not update_word_ids: 85 | tokens = filter(lambda x: x in word_to_id, tokens) 86 | 87 | question_tokens = tokens 88 | if update_word_ids: 89 | for token in (tokens + [answer]): 90 | if token not in word_to_id: 91 | word_to_id[token] = word_id 92 | word_id += 1 93 | 94 | article_no = len(questions) 95 | 96 | article_file = input_dir + '/' + article_name + '.txt.clean' 97 | article_files.add(article_file) 98 | dataset.append(question_tokens) 99 | questions.append([article_no, article_file, None, question_tokens, answer]) 100 | 101 | article_data = {} 102 | print("Parsing articles...") 103 | for article_file in article_files: 104 | # Get all statements in the dataset for this question 105 | 106 | print("Parsing: " + article_file) 107 | s_file = open(article_file) 108 | statements = [] 109 | for statement in s_file: 110 | if len(statement.strip()) == 0: 111 | continue 112 | 113 | sentences = get_sentences(statement.strip()) 114 | 115 | for sentence in sentences: 116 | tokens = sentence.strip().split() 117 | tokens = filter(lambda x: len(x.strip()) > 0, tokens) 118 | tokens = map(lambda x: x.lower(), tokens) 119 | tokens = canonicalize_tokens(tokens) 120 | 121 | if not update_word_ids: 122 | tokens = filter(lambda x: x in word_to_id, tokens) 123 | 124 | article = tokens 125 | statements.append(article) 126 | dataset.append(article) 127 | if update_word_ids: 128 | for token in tokens: 129 | if token not in word_to_id: 130 | word_to_id[token] = word_id 131 | word_id += 1 132 | 133 | article_data[article_file] = statements 134 | 135 | print("Mapping articles to statements...") 136 | print("There are %d questions before deduplication" % len(questions)) 137 | question_set = set() 138 | for i in xrange(len(questions)): 139 | question = questions[i] 140 | question_tuple = tuple(question[3]) 141 | if question_tuple in question_set: 142 | question[0] = None 143 | continue 144 | 145 | question_set.add(question_tuple) 146 | 
question[2] = article_data[question[1]] 147 | 148 | questions = filter(lambda x: x[0] is not None, questions) 149 | print("There are %d questions after deduplication" % len(questions)) 150 | 151 | print("Trying to prune extraneaous statements...") 152 | questions = prune_statements(dataset, questions) 153 | before_prune = len(questions) 154 | questions = filter(lambda x: len(x[2]) > 1, questions) 155 | after_prune = len(questions) 156 | print("Pruning invalidated %d questions", (before_prune - after_prune)) 157 | 158 | print("Final processing...") 159 | questions_seq = map(lambda x: transform_ques_weak(x, word_to_id, word_id), questions) 160 | return dataset, questions_seq, word_to_id, word_id 161 | 162 | import cPickle 163 | import random 164 | 165 | if __name__ == "__main__": 166 | train_file = sys.argv[1] 167 | test_file = sys.argv[2] 168 | 169 | train_dataset, train_questions, word_to_id, num_words = parse_qa_dataset(train_file) 170 | test_dataset, test_questions, word_to_id, num_words = parse_qa_dataset(test_file, word_id=num_words, word_to_id=word_to_id, update_word_ids=False) 171 | 172 | #test_dataset, test_questions, _, _ = parse_dataset_weak(test_file, word_id=num_words, word_to_id=word_to_id, update_word_ids=False) 173 | 174 | # each element of train_questions contains: [article_no, line_no, [lists of indices of statements and question], index of answer word] 175 | #import pprint 176 | #pprint.pprint(word_to_id) 177 | print num_words 178 | 179 | # Pickle!!!! 180 | print("Pickling train...") 181 | f = file(train_file + '/dataset.train.pickle', 'wb') 182 | cPickle.dump((train_dataset, train_questions, word_to_id, num_words), f, protocol=cPickle.HIGHEST_PROTOCOL) 183 | f.close() 184 | 185 | print("Pickling test...") 186 | f = file(test_file + '/dataset.test.pickle', 'wb') 187 | cPickle.dump((test_dataset, test_questions, word_to_id, num_words), f, protocol=cPickle.HIGHEST_PROTOCOL) 188 | f.close() 189 | -------------------------------------------------------------------------------- /theano_util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import re, sys 3 | import theano 4 | import theano.tensor as T 5 | from keras.utils.theano_utils import shared_zeros 6 | 7 | dtype=theano.config.floatX 8 | 9 | def init_shared_normal(num_rows, num_cols, scale=1): 10 | '''Initialize a matrix shared variable with normally distributed 11 | elements.''' 12 | return theano.shared(np.random.normal( 13 | scale=scale, size=(num_rows, num_cols)).astype(dtype)) 14 | 15 | def init_shared_normal_tensor(num_slices, num_rows, num_cols, scale=1): 16 | '''Initialize a matrix shared variable with normally distributed 17 | elements.''' 18 | return theano.shared(np.random.normal( 19 | scale=scale, size=(num_slices, num_rows, num_cols)).astype(dtype)) 20 | 21 | def init_shared_zeros(*shape): 22 | '''Initialize a vector shared variable with zero elements.''' 23 | return theano.shared(np.zeros(shape, dtype=dtype)) 24 | 25 | def make_batches(size, batch_size): 26 | nb_batch = int(np.ceil(size/float(batch_size))) 27 | return [(i*batch_size, min(size, (i+1)*batch_size)) for i in range(0, nb_batch)] 28 | 29 | def maxnorm_constraint(p, m=40): 30 | norms = T.sqrt(T.sum(T.sqr(p))) 31 | desired = T.clip(norms, 0, m) 32 | p = p * (desired / (1e-7 + norms)) 33 | return p 34 | 35 | def get_param_updates(params, grads, lr, method=None, **kwargs): 36 | rho = 0.95 37 | epsilon = 1e-6 38 | 39 | accumulators = [shared_zeros(p.get_value().shape) for p in params] 40 | 
updates=[] 41 | 42 | if 'constraint' in kwargs: 43 | constraint = kwargs['constraint'] 44 | else: 45 | constraint = None 46 | 47 | if method == 'adadelta': 48 | print "Using ADADELTA" 49 | delta_accumulators = [shared_zeros(p.get_value().shape) for p in params] 50 | for p, g, a, d_a in zip(params, grads, accumulators, delta_accumulators): 51 | new_a = rho * a + (1 - rho) * g ** 2 # update accumulator 52 | 53 | # use the new accumulator and the *old* delta_accumulator 54 | update = g * T.sqrt(d_a + epsilon) / T.sqrt(new_a + epsilon) 55 | new_p = p - lr * update 56 | 57 | # update delta_accumulator 58 | new_d_a = rho * d_a + (1 - rho) * update ** 2 59 | 60 | updates.append((p, new_p)) 61 | updates.append((a, new_a)) 62 | updates.append((d_a, new_d_a)) 63 | 64 | elif method == 'adagrad': 65 | print "Using ADAGRAD" 66 | for p, g, a in zip(params, grads, accumulators): 67 | new_a = a + g ** 2 # update accumulator 68 | 69 | new_p = p - lr * g / T.sqrt(new_a + epsilon) 70 | updates.append((p, new_p)) # apply constraints 71 | updates.append((a, new_a)) 72 | 73 | elif method == 'momentum': # Default 74 | print "Using MOMENTUM" 75 | momentum = kwargs['momentum'] 76 | for param, gparam in zip(params, grads): 77 | param_update = theano.shared(param.get_value()*0., broadcastable=param.broadcastable) 78 | gparam_constrained = maxnorm_constraint(gparam) 79 | param_update_update = momentum*param_update + (1. - momentum)*gparam_constrained 80 | updates.append((param, param - param_update * lr)) 81 | updates.append((param_update, param_update_update)) 82 | 83 | else: # Default 84 | print "Using DEFAULT" 85 | for param, gparam in zip(params, grads): 86 | param_update = maxnorm_constraint(gparam) 87 | updates.append((param, param - param_update * lr)) 88 | 89 | # apply constraints on self.weights update 90 | # assumes that updates[0] corresponds to self.weights param 91 | if constraint != None: 92 | updates[0] = (updates[0][0], constraint(updates[0][1])) 93 | 94 | return updates 95 | 96 | 97 | def compute_bow(input_str, word_to_id, num_words): 98 | bow = np.zeros((num_words,)) 99 | for token in input_str.split(): 100 | bow[word_to_id[token]] += 1 101 | return bow 102 | 103 | def compute_seq(input_str, word_to_id, num_words): 104 | seq = [] 105 | for token in input_str.split(): 106 | seq.append(word_to_id[token]) 107 | return seq 108 | 109 | def transform_ques(question, word_to_id, num_words): 110 | question.append(compute_seq(question[2], word_to_id, num_words)) 111 | question[2] = compute_bow(question[2], word_to_id, num_words) 112 | return question 113 | 114 | def parse_dataset(input_file, word_id=0, word_to_id={}, update_word_ids=True): 115 | dataset = [] 116 | questions = [] 117 | with open(input_file) as f: 118 | statements = [] 119 | article_no = 0 120 | line_no = 0 121 | stmt_to_line = {} 122 | for line in f: 123 | line = line.strip() 124 | if len(line) > 0 and line[:2] == '1 ' and len(statements) > 0: # new article 125 | dataset.append(statements) 126 | statements = [] 127 | line_no = 0 128 | stmt_to_line = {} 129 | article_no += 1 130 | if '\t' in line: 131 | question_parts = line.split('\t') 132 | tokens = re.sub(r'([\.\?])$', r' \1', question_parts[0].strip()).split() 133 | if update_word_ids: 134 | for token in tokens[1:]: 135 | if token not in word_to_id: 136 | word_to_id[token] = word_id 137 | word_id += 1 138 | 139 | # To handle the case of "3 6" 140 | lines = None 141 | if ' ' in question_parts[2]: 142 | stmts = question_parts[2].split(' ') 143 | lines = '' 144 | for stmt in stmts: 145 | lines 
+= str(stmt_to_line[stmt]) + ' ' 146 | lines = lines.strip() 147 | else: 148 | lines = str(stmt_to_line[question_parts[2]]) 149 | 150 | questions.append([article_no, line_no, ' '.join(tokens[1:]), word_to_id[question_parts[1]], lines]) 151 | else: 152 | tokens = re.sub(r'([\.\?])$', r' \1', line).split() 153 | stmt_to_line[tokens[0]] = line_no 154 | if update_word_ids: 155 | for token in tokens[1:]: 156 | if token not in word_to_id: 157 | word_to_id[token] = word_id 158 | word_id += 1 159 | statements.append(' '.join(tokens[1:])) 160 | line_no += 1 161 | if len(statements) > 0: 162 | dataset.append(statements) 163 | dataset_bow = map(lambda y: map(lambda x: compute_bow(x, word_to_id, word_id), y), dataset) 164 | dataset_seq = map(lambda y: map(lambda x: compute_seq(x, word_to_id, word_id), y), dataset) 165 | questions_bow = map(lambda x: transform_ques(x, word_to_id, word_id), questions) 166 | return dataset_seq, dataset_bow, questions_bow, word_to_id, word_id 167 | 168 | def pad_statement(stmt, null_word, max_words=20): 169 | if len(stmt) >= max_words: 170 | return stmt[-max_words:] 171 | else: 172 | return stmt + [null_word for i in range(max_words - len(stmt))] 173 | 174 | def pad_memories(stmts, null_word, max_stmts=20, max_words=20): 175 | if len(stmts) >= max_stmts: # truncate by statement count 176 | return stmts[-max_stmts:] 177 | else: 178 | 179 | return stmts + [[null_word for j in range(max_words)] for i in range(max_stmts - len(stmts))] 180 | 181 | def parse_dataset_weak(input_file, word_id=0, word_to_id={}, update_word_ids=True, max_stmts=20, max_words=20): 182 | dataset = [] 183 | questions = [] 184 | null_word = '' 185 | if null_word not in word_to_id: 186 | if update_word_ids == True: 187 | word_to_id[null_word] = word_id 188 | word_id += 1 189 | else: 190 | print "Null word not found!!
AAAAA" 191 | sys.exit(1) 192 | null_word_id = word_to_id[null_word] 193 | 194 | with open(input_file) as f: 195 | statements = [] 196 | article_no = 0 197 | line_no = 0 198 | stmt_to_line = {} 199 | for line in f: 200 | line = line.strip() 201 | if len(line) > 0 and line[:2] == '1 ' and len(statements) > 0: # new article 202 | dataset.append(statements) 203 | statements = [] 204 | line_no = 0 205 | stmt_to_line = {} 206 | article_no += 1 207 | if '\t' in line: 208 | question_parts = line.split('\t') 209 | tokens = re.sub(r'([\.\?])$', r' \1', question_parts[0].strip()).split() 210 | if update_word_ids: 211 | for token in tokens[1:]: 212 | if token not in word_to_id: 213 | word_to_id[token] = word_id 214 | word_id += 1 215 | 216 | padded_stmts = pad_memories(statements[:line_no], null_word, max_stmts, max_words) 217 | padded_ques = pad_statement(tokens[1:], null_word, max_words) 218 | questions.append([article_no, line_no, padded_stmts, padded_ques, question_parts[1]]) 219 | else: 220 | tokens = re.sub(r'([\.\?])$', r' \1', line).split() 221 | stmt_to_line[tokens[0]] = line_no 222 | if update_word_ids: 223 | for token in tokens[1:]: 224 | if token not in word_to_id: 225 | word_to_id[token] = word_id 226 | word_id += 1 227 | statements.append(pad_statement(tokens[1:], null_word, max_words)) 228 | line_no += 1 229 | if len(statements) > 0: 230 | dataset.append(statements) 231 | questions_seq = map(lambda x: transform_ques_weak(x, word_to_id, word_id), questions) 232 | return dataset, questions_seq, word_to_id, word_id, null_word_id 233 | 234 | def transform_ques_weak(question, word_to_id, num_words): 235 | indices = [] 236 | for stmt in question[2]: 237 | index_stmt = map(lambda x: word_to_id[x], stmt) 238 | indices.append(index_stmt) 239 | question[2] = indices 240 | question[3] = map(lambda x: word_to_id[x], question[3]) 241 | question[4] = word_to_id[question[4]] 242 | return question 243 | 244 | if __name__ == "__main__": 245 | train_file = sys.argv[1] 246 | test_file = train_file.replace('train', 'test') 247 | 248 | train_dataset, train_questions, word_to_id, num_words = parse_dataset_weak(train_file) 249 | test_dataset, test_questions, _, _ = parse_dataset_weak(test_file, word_id=num_words, word_to_id=word_to_id, update_word_ids=False) 250 | 251 | # each element of train_questions contains: [article_no, line_no, [lists of indices of statements and question], index of answer word] 252 | print train_questions[0] 253 | -------------------------------------------------------------------------------- /wmemnn.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano 3 | import theano.tensor as T 4 | import sys, random, pprint 5 | 6 | from theano_util import * 7 | from keras.activations import tanh, hard_sigmoid 8 | from keras.initializations import glorot_uniform, orthogonal 9 | from keras.utils.theano_utils import shared_zeros, alloc_zeros_matrix 10 | from keras.preprocessing import sequence 11 | 12 | from qa_dataset_parser import parse_qa_dataset 13 | 14 | import cPickle 15 | 16 | # theano.config.exception_verbosity = 'high' 17 | # theano.config.allow_gc = False 18 | #theano.config.profile = True 19 | 20 | def inspect_inputs(i, node, fn): 21 | print i, node, "inputs:", [input[0] for input in fn.inputs], 22 | 23 | def inspect_outputs(i, node, fn): 24 | print i, node, "outputs:", [output[0] for output in fn.outputs] 25 | 26 | class WMemNN: 27 | def __init__(self, n_words=20, n_embedding=100, lr=0.01, 28 | momentum=0.9, 
word_to_id=None, null_word_id=-1, 29 | max_stmts=20, max_words=20, load_from_file=None): 30 | if load_from_file: 31 | self.load_model(load_from_file) 32 | else: 33 | self.regularization = 0.001 34 | self.n_embedding = n_embedding 35 | self.lr = lr 36 | self.momentum = momentum 37 | self.n_words = n_words 38 | self.batch_size = 4 39 | self.max_stmts = max_stmts 40 | self.max_words = max_words 41 | 42 | self.word_to_id = word_to_id 43 | self.id_to_word = dict((v, k) for k, v in word_to_id.iteritems()) 44 | self.null_word_id = null_word_id 45 | 46 | # Question embedding 47 | # self.B = init_shared_normal(self.n_words, self.n_embedding, 0.1) 48 | 49 | # Statement input, output embeddings 50 | self.weights = init_shared_normal_tensor(4, self.n_words, self.n_embedding, 0.1) 51 | 52 | # Linear mapping between layers 53 | self.H = init_shared_normal(self.n_embedding, self.n_embedding, 0.1) 54 | 55 | # Final outut weight matrix 56 | # self.W = init_shared_normal(self.n_embedding, self.n_words, 0.1) 57 | 58 | 59 | zero_vector = T.vector('zv', dtype=theano.config.floatX) 60 | 61 | # Statement 62 | x = T.imatrix('x') 63 | xbatch = T.tensor3('xb', dtype='int32') 64 | 65 | # Positional encoding matrix 66 | pe = T.tensor3('pe') 67 | 68 | # Question 69 | q = T.ivector('q') 70 | qbatch = T.imatrix('qb') 71 | 72 | # True word 73 | r = T.iscalar('r') 74 | rbatch = T.ivector('rb') 75 | 76 | memory_cost = self.memnn_cost(x, q, pe) 77 | # memory_loss = -T.log(memory_cost[r]) # cross entropy on softmax 78 | memory_loss = self.memnn_batch_cost(xbatch, qbatch, rbatch, pe) 79 | 80 | params = [ 81 | self.weights, 82 | # self.B, 83 | # self.W, 84 | self.H 85 | ] 86 | 87 | regularization_cost = reduce( 88 | lambda x,y: x + y, 89 | map(lambda x: self.regularization * T.sum(x ** 2), params) 90 | ) 91 | 92 | cost = memory_loss + regularization_cost 93 | 94 | grads = T.grad(cost, params) 95 | 96 | l_rate = T.scalar('l_rate') 97 | 98 | # Parameter updates 99 | updates = get_param_updates(params, grads, lr=l_rate, method='momentum', momentum=0.9, 100 | constraint=self._constrain_embedding(self.null_word_id, zero_vector)) 101 | 102 | self.train_function = theano.function( 103 | inputs = [ 104 | xbatch, qbatch, rbatch, pe, 105 | theano.Param(l_rate, default=self.lr), 106 | theano.Param(zero_vector, default=np.zeros((self.n_embedding,), theano.config.floatX)) 107 | ], 108 | outputs = cost, 109 | updates = updates, 110 | allow_input_downcast=True, 111 | # mode='FAST_COMPILE', 112 | #mode='DebugMode' 113 | #mode=theano.compile.MonitorMode(pre_func=inspect_inputs,post_func=inspect_outputs) 114 | on_unused_input='warn' 115 | ) 116 | 117 | self.predict_function = theano.function( 118 | inputs = [ 119 | x, q, pe 120 | ], 121 | outputs = memory_cost, 122 | allow_input_downcast=True, 123 | # mode='FAST_COMPILE', 124 | on_unused_input='warn' 125 | ) 126 | 127 | def _constrain_embedding(self, null_id, zero_vector): 128 | def wrapper(p): 129 | for i in range(4): 130 | p = T.set_subtensor(p[i,null_id], zero_vector) 131 | return p 132 | return wrapper 133 | 134 | def _compute_memories(self, statement, previous, weights, pe_matrix): 135 | pe_weights = pe_matrix * weights[statement] 136 | memories = T.sum(pe_weights, axis=0) 137 | return memories 138 | 139 | def _get_PE_matrix(self, num_words, embedding_size): 140 | pe_matrix = np.ones((num_words, 4, embedding_size), theano.config.floatX) 141 | # for j in range(num_words): 142 | # for k in range(embedding_size): 143 | # value = (1 - float(j+1)/num_words) - (float(k+1)/embedding_size) * (1 - 
2*float(j+1)/num_words) 144 | # for i in range(4): 145 | # pe_matrix[j,i,k] = value 146 | return pe_matrix 147 | 148 | def save_model(self, filename): 149 | f = file(filename, 'wb') 150 | for obj in [self.regularization, self.n_embedding, self.lr, 151 | self.momentum, self.n_words, self.batch_size, 152 | self.word_to_id, self.id_to_word, self.null_word_id, 153 | self.max_stmts, self.max_words, self.weights, self.H]: 154 | cPickle.dump(obj, f, protocol=cPickle.HIGHEST_PROTOCOL) 155 | f.close() 156 | 157 | def load_model(self, filename): 158 | f = file(filename, 'rb') 159 | self.regularization = cPickle.load(f) 160 | self.n_embedding = cPickle.load(f) 161 | self.lr = cPickle.load(f) 162 | self.momentum = cPickle.load(f) 163 | self.n_words = cPickle.load(f) 164 | self.batch_size = cPickle.load(f) 165 | self.word_to_id = cPickle.load(f) 166 | self.id_to_word = cPickle.load(f) 167 | self.null_word_id = cPickle.load(f) 168 | self.max_stmts = cPickle.load(f) 169 | self.max_words = cPickle.load(f) 170 | self.weights = cPickle.load(f) 171 | self.H = cPickle.load(f) 172 | f.close() 173 | 174 | 175 | def memnn_batch_cost(self, statements_batch, question_batch, r_batch, pe_matrix): 176 | l = statements_batch.shape[0] 177 | s, _ = theano.scan(fn=lambda i, c, xb, qb, rb, pe: c - T.log(self.memnn_cost(xb[i], qb[i], pe)[rb[i]]), 178 | outputs_info=T.as_tensor_variable(np.asarray(0, theano.config.floatX)), 179 | non_sequences=[statements_batch, question_batch, r_batch, pe_matrix], 180 | sequences=[theano.tensor.arange(l, dtype='int64')]) 181 | return s[-1] 182 | 183 | def memnn_cost(self, statements, question, pe_matrix): 184 | # statements: list of list of word indices 185 | # question: list of word indices 186 | 187 | computed_memories, updates = theano.scan( 188 | self._compute_memories, 189 | sequences = [statements], 190 | outputs_info = [ 191 | alloc_zeros_matrix(self.weights.shape[0], self.n_embedding) 192 | ], 193 | non_sequences = [ 194 | self.weights.dimshuffle(1, 0, 2), 195 | pe_matrix 196 | ], 197 | truncate_gradient = -1, 198 | ) 199 | 200 | memories = T.stacklists(computed_memories).dimshuffle(1, 0, 2) 201 | 202 | # Embed question 203 | u1 = T.sum(self.weights[0][question], axis=0) 204 | 205 | # Layer 1 206 | p = T.nnet.softmax(T.dot(u1, memories[0].T)) 207 | o1 = T.dot(p, memories[1]) 208 | 209 | # Layer 2 210 | u2 = o1 + T.dot(u1, self.H) 211 | p = T.nnet.softmax(T.dot(u2, memories[1].T)) 212 | o2 = T.dot(p, memories[2]) 213 | 214 | # Layer 3 215 | u3 = o2 + T.dot(u2, self.H) 216 | p = T.nnet.softmax(T.dot(u3, memories[2].T)) 217 | o3 = T.dot(p, memories[3]) 218 | 219 | # Final 220 | output = T.nnet.softmax(T.dot(o3 + u3, self.weights[3].T)) 221 | 222 | return output[0] 223 | 224 | def train(self, dataset, questions, n_epochs=100, lr_schedule=None, start_epoch=0, max_words=20): 225 | l_rate = self.lr 226 | index_array = np.arange(len(questions)) 227 | 228 | # (max_words, ) 229 | pe_matrix = self._get_PE_matrix(max_words, self.n_embedding) 230 | 231 | for epoch in xrange(start_epoch, start_epoch + n_epochs): 232 | costs = [] 233 | 234 | if lr_schedule != None and epoch in lr_schedule: 235 | l_rate = lr_schedule[epoch] 236 | 237 | np.random.shuffle(index_array) 238 | seen = 0 239 | 240 | batches = make_batches(len(questions), self.batch_size) 241 | for batch_index, (batch_start, batch_end) in enumerate(batches): 242 | batch_ids = index_array[batch_start:batch_end] 243 | seen += len(batch_ids) 244 | questions_batch = [] 245 | for index in batch_ids: 246 | 
questions_batch.append(questions[index]) 247 | 248 | # (batch_size * max_stmts * max_words) 249 | statements_seq_batch = np.asarray(map(lambda x: x[2], questions_batch), theano.config.floatX) 250 | # (batch_size * max_words) 251 | question_seq_batch = np.asarray(map(lambda x: x[3], questions_batch), theano.config.floatX) 252 | # (batch_size) 253 | correct_word_batch = np.asarray(map(lambda x: x[4], questions_batch), theano.config.floatX) 254 | 255 | cost = self.train_function( 256 | statements_seq_batch, 257 | question_seq_batch, 258 | correct_word_batch, 259 | pe_matrix, 260 | l_rate 261 | ) 262 | 263 | # print "Epoch %d, sample %d: %f" % (epoch, i, cost) 264 | costs.append(cost) 265 | 266 | print "Epoch %d: %f" % (epoch, np.mean(costs)) 267 | 268 | def predict(self, dataset, questions, max_words=20, print_errors=False): 269 | correct_answers = 0 270 | wrong_answers = 0 271 | pe_matrix = self._get_PE_matrix(max_words, self.n_embedding) 272 | 273 | for i, question in enumerate(questions): 274 | statements_seq = np.asarray(question[2], theano.config.floatX) 275 | question_seq = np.asarray(question[3], theano.config.floatX) 276 | correct = question[4] 277 | 278 | probs = self.predict_function( 279 | statements_seq, question_seq, pe_matrix 280 | ) 281 | predicted = np.argmax(probs) 282 | 283 | if len(question) == 6: 284 | ## For mc_test 285 | options = question[5] 286 | options_probs = probs[options] 287 | best_idx = np.argmax(options_probs) 288 | predicted = options[best_idx] 289 | ## 290 | 291 | if predicted == correct: 292 | correct_answers += 1 293 | else: 294 | if print_errors and np.random.rand() < 0.02: 295 | print 'Correct: %s (%d %.3f), Guess: %s (%d %.3f)' % (self.id_to_word[correct], correct, probs[correct], self.id_to_word[predicted], predicted, probs[predicted]) 296 | wrong_answers += 1 297 | 298 | if len(questions) > 1000: 299 | print '(%d/%d) %d correct, %d wrong' % (i+1, len(questions), correct_answers, wrong_answers) 300 | 301 | print '%d correct, %d wrong' % (correct_answers, wrong_answers) 302 | 303 | if __name__ == "__main__": 304 | train_file = sys.argv[1] 305 | test_file = train_file.replace('train', 'test') 306 | 307 | if len(sys.argv) > 2: 308 | n_epochs = int(sys.argv[2]) 309 | else: 310 | n_epochs = 10 311 | 312 | if len(sys.argv) > 3: 313 | n_embedding = int(sys.argv[3]) 314 | else: 315 | n_embedding = 20 316 | 317 | mode = 'babi' # babi or wiki 318 | 319 | if '.pickle' in train_file: 320 | mode = 'wiki' 321 | 322 | max_stmts = 20 323 | max_words = 20 324 | 325 | if mode == 'babi': 326 | train_dataset, train_questions, word_to_id, num_words, null_word_id = parse_dataset_weak(train_file, max_stmts=max_stmts, max_words=max_words) 327 | test_dataset, test_questions, _, _, _ = parse_dataset_weak(test_file, word_id=num_words, word_to_id=word_to_id, update_word_ids=False, max_stmts=max_stmts, max_words=max_words) 328 | elif mode == 'wiki': 329 | # Check for pickled dataset 330 | print("Loading pickled train dataset") 331 | f = file(train_file, 'rb') 332 | import cPickle 333 | obj = cPickle.load(f) 334 | train_dataset, train_questions, word_to_id, num_words, null_word_id = obj 335 | 336 | print("Loading pickled test dataset") 337 | f = file(test_file, 'rb') 338 | obj = cPickle.load(f) 339 | test_dataset, test_questions, _, _, _ = obj 340 | elif mode == 'debug': 341 | train_dataset = [] 342 | train_questions = [[0, 2, [[0, 1, 2, 3, 4, 5], [6, 7, 2, 3, 8, 5], [9, 10, 0, 11]], 4]] 343 | num_words = 12 344 | word_to_id = {} 345 | 346 | print "Dataset has %d words" % num_words 
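
For readers skimming wmemnn.py, the scan/dimshuffle plumbing can obscure what memnn_cost actually computes. Below is a plain-NumPy sketch (toy shapes, not the repository's API) of the same three-hop attention; `weights` stands in for self.weights (four embedding banks), `H` for the inter-hop linear map, and the positional-encoding matrix is left out because _get_PE_matrix currently returns all ones.

import numpy as np

def softmax(z):
    e = np.exp(z - z.max())
    return e / e.sum()

rng = np.random.RandomState(0)
n_words, n_embed, n_stmts, n_tokens = 30, 10, 5, 6
weights = rng.normal(scale=0.1, size=(4, n_words, n_embed))   # four embedding banks
H = rng.normal(scale=0.1, size=(n_embed, n_embed))            # inter-hop linear map

stmts = rng.randint(0, n_words, size=(n_stmts, n_tokens))     # word ids per statement
question = rng.randint(0, n_words, size=(n_tokens,))

# One memory bank per embedding matrix: sum of word embeddings per statement
memories = np.stack([weights[k][stmts].sum(axis=1) for k in range(4)])  # (4, n_stmts, n_embed)

u = weights[0][question].sum(axis=0)              # embed the question with bank 0
for hop in range(3):
    p = softmax(memories[hop].dot(u))             # attention over statements
    o = p.dot(memories[hop + 1])                  # weighted read from the next bank
    u = o + (u.dot(H) if hop < 2 else u)          # u_{k+1} = o_k + u_k H; last hop adds o_3 + u_3
answer_scores = softmax(weights[3].dot(u))        # rank every vocabulary word
print(answer_scores.argmax())

Each hop attends over the statement memories of one bank, reads from the next bank, and mixes the read vector back into the query state; the final state is scored against the last bank to rank answer words, matching layers 1-3 and the output step in memnn_cost.
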
347 | # print train_questions[0] 348 | 349 | model_file = "mctest500_dim100_wmemnn.pickle" 350 | train_my_model = False 351 | save_my_model = True 352 | 353 | if train_my_model: 354 | wmemNN = WMemNN(n_words=num_words, n_embedding=100, lr=0.01, word_to_id=word_to_id, null_word_id=null_word_id, 355 | max_stmts=max_stmts, max_words=max_words) 356 | 357 | lr_schedule = dict([(0, 0.01), (25, 0.01/2), (50, 0.01/4), (75, 0.01/8)]) 358 | 359 | for i in xrange(n_epochs/5): 360 | wmemNN.train(train_dataset, train_questions, 5, lr_schedule, 5*i, max_words) 361 | wmemNN.predict(train_dataset, train_questions, max_words) 362 | wmemNN.predict(test_dataset, test_questions, max_words) 363 | 364 | if save_my_model: 365 | print "Saving model to", model_file 366 | wmemNN.save_model(model_file) 367 | else: 368 | wmemNN = WMemNN(load_from_file=model_file) 369 | wmemNN.predict(train_dataset, train_questions, max_words) 370 | wmemNN.predict(test_dataset, test_questions, max_words) 371 | 372 | 373 | -------------------------------------------------------------------------------- /wmemnnmc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import theano 3 | import theano.tensor as T 4 | import sys, random, pprint 5 | 6 | from theano_util import * 7 | from keras.activations import tanh, hard_sigmoid 8 | from keras.initializations import glorot_uniform, orthogonal 9 | from keras.utils.theano_utils import shared_zeros, alloc_zeros_matrix 10 | from keras.preprocessing import sequence 11 | 12 | import cPickle 13 | 14 | # theano.config.exception_verbosity = 'high' 15 | # theano.config.allow_gc = False 16 | #theano.config.profile = True 17 | 18 | class WMemNN: 19 | def __init__(self, n_words=20, n_embedding=100, lr=0.01, 20 | momentum=0.9, word_to_id=None, null_word_id=-1, 21 | load_from_file=None): 22 | if load_from_file: 23 | self.load_model(load_from_file) 24 | else: 25 | self.regularization = 0.01 26 | self.n_embedding = n_embedding 27 | self.lr = lr 28 | self.momentum = momentum 29 | self.n_words = n_words 30 | self.batch_size = 4 31 | 32 | self.word_to_id = word_to_id 33 | self.id_to_word = dict((v, k) for k, v in word_to_id.iteritems()) 34 | self.null_word_id = null_word_id 35 | 36 | # Question embedding 37 | # self.B = init_shared_normal(self.n_words, self.n_embedding, 0.1) 38 | 39 | # Statement input, output embeddings 40 | self.weights = init_shared_normal_tensor(4, self.n_words, self.n_embedding, 0.1) 41 | 42 | # Linear mapping between layers 43 | self.H = init_shared_normal(self.n_embedding, self.n_embedding, 0.1) 44 | 45 | # Final outut weight matrix 46 | # self.W = init_shared_normal(self.n_embedding, self.n_words, 0.1) 47 | 48 | # Answer embedding matrix 49 | self.A = init_shared_normal(self.n_words, self.n_embedding, 0.1) 50 | 51 | # Final scoring matrix 52 | self.U = init_shared_normal(self.n_embedding, self.n_embedding, 0.1) 53 | 54 | zero_vector = T.vector('zv', dtype=theano.config.floatX) 55 | 56 | # Statement 57 | x = T.imatrix('x') 58 | xbatch = T.tensor3('xb', dtype='int32') 59 | 60 | # Positional encoding matrix 61 | pe = T.tensor3('pe') 62 | 63 | # Question 64 | q = T.ivector('q') 65 | qbatch = T.imatrix('qb') 66 | 67 | # True word 68 | r = T.iscalar('r') 69 | rbatch = T.ivector('rb') 70 | 71 | # Stacked answer vectors 72 | a = T.imatrix('a') 73 | abatch = T.tensor3('ab', dtype='int32') 74 | 75 | memory_cost = self.memnn_cost(x, q, a, pe) 76 | # memory_loss = -T.log(memory_cost[r]) # cross entropy on softmax 77 | memory_loss = 
self.memnn_batch_cost(xbatch, qbatch, rbatch, abatch, pe) 78 | 79 | params = [ 80 | self.weights, 81 | # self.B, 82 | # self.W, 83 | self.H, 84 | self.A, 85 | self.U, 86 | ] 87 | 88 | regularization_cost = reduce( 89 | lambda x,y: x + y, 90 | map(lambda x: self.regularization * T.sum(x ** 2), params) 91 | ) 92 | 93 | cost = memory_loss + regularization_cost 94 | 95 | grads = T.grad(cost, params) 96 | 97 | l_rate = T.scalar('l_rate') 98 | 99 | # Parameter updates 100 | updates = get_param_updates(params, grads, lr=l_rate, method='adagrad', momentum=0.9, 101 | constraint=self._constrain_embedding(self.null_word_id, zero_vector)) 102 | 103 | self.train_function = theano.function( 104 | inputs = [ 105 | xbatch, qbatch, rbatch, abatch, pe, 106 | theano.Param(l_rate, default=self.lr), 107 | theano.Param(zero_vector, default=np.zeros((self.n_embedding,), theano.config.floatX)) 108 | ], 109 | outputs = cost, 110 | updates = updates, 111 | allow_input_downcast=True, 112 | # mode='FAST_COMPILE', 113 | #mode='DebugMode' 114 | #mode=theano.compile.MonitorMode(pre_func=inspect_inputs,post_func=inspect_outputs) 115 | on_unused_input='warn' 116 | ) 117 | 118 | self.predict_function = theano.function( 119 | inputs = [ 120 | x, q, a, pe 121 | ], 122 | outputs = memory_cost, 123 | allow_input_downcast=True, 124 | # mode='FAST_COMPILE', 125 | on_unused_input='warn' 126 | ) 127 | 128 | def _constrain_embedding(self, null_id, zero_vector): 129 | def wrapper(p): 130 | for i in range(4): 131 | p = T.set_subtensor(p[i,null_id], zero_vector) 132 | return p 133 | return wrapper 134 | 135 | def _compute_memories(self, statement, previous, weights, pe_matrix): 136 | pe_weights = pe_matrix * weights[statement] 137 | memories = T.sum(pe_weights, axis=0) 138 | return memories 139 | 140 | def _get_PE_matrix(self, num_words, embedding_size): 141 | pe_matrix = np.ones((num_words, 4, embedding_size), theano.config.floatX) 142 | # for j in range(num_words): 143 | # for k in range(embedding_size): 144 | # value = (1 - float(j+1)/num_words) - (float(k+1)/embedding_size) * (1 - 2*float(j+1)/num_words) 145 | # for i in range(4): 146 | # pe_matrix[j,i,k] = value 147 | return pe_matrix 148 | 149 | def save_model(self, filename): 150 | f = file(filename, 'wb') 151 | for obj in [self.regularization, self.n_embedding, self.lr, 152 | self.momentum, self.n_words, self.batch_size, 153 | self.word_to_id, self.id_to_word, self.null_word_id, 154 | self.weights, self.H, self.A, self.U]: 155 | cPickle.dump(obj, f, protocol=cPickle.HIGHEST_PROTOCOL) 156 | f.close() 157 | 158 | def load_model(self, filename): 159 | f = file(filename, 'rb') 160 | self.regularization = cPickle.load(f) 161 | self.n_embedding = cPickle.load(f) 162 | self.lr = cPickle.load(f) 163 | self.momentum = cPickle.load(f) 164 | self.n_words = cPickle.load(f) 165 | self.batch_size = cPickle.load(f) 166 | self.word_to_id = cPickle.load(f) 167 | self.id_to_word = cPickle.load(f) 168 | self.null_word_id = cPickle.load(f) 169 | self.weights = cPickle.load(f) 170 | self.H = cPickle.load(f) 171 | self.A = cPickle.load(f) 172 | self.U = cPickle.load(f) 173 | f.close() 174 | 175 | 176 | def memnn_batch_cost(self, statements_batch, question_batch, r_batch, ans_batch, pe_matrix): 177 | l = statements_batch.shape[0] 178 | s, _ = theano.scan(fn=lambda i, c, xb, qb, rb, ab, pe: c - T.log(self.memnn_cost(xb[i], qb[i], ab[i], pe)[rb[i]]), 179 | outputs_info=T.as_tensor_variable(np.asarray(0, theano.config.floatX)), 180 | non_sequences=[statements_batch, question_batch, r_batch, ans_batch, 
181 |                            sequences=[theano.tensor.arange(l, dtype='int64')])
182 |         return s[-1]
183 | 
184 |     def memnn_cost(self, statements, question, ans, pe_matrix):
185 |         # statements: list of lists of word indices
186 |         # question: list of word indices
187 | 
188 |         computed_memories, updates = theano.scan(
189 |             self._compute_memories,
190 |             sequences = [statements],
191 |             outputs_info = [
192 |                 alloc_zeros_matrix(self.weights.shape[0], self.n_embedding)
193 |             ],
194 |             non_sequences = [
195 |                 self.weights.dimshuffle(1, 0, 2),
196 |                 pe_matrix
197 |             ],
198 |             truncate_gradient = -1,
199 |         )
200 | 
201 |         memories = T.stacklists(computed_memories).dimshuffle(1, 0, 2)
202 | 
203 |         # Embed question
204 |         u1 = T.sum(self.weights[0][question], axis=0)
205 | 
206 |         # Layer 1
207 |         p = T.nnet.softmax(T.dot(u1, memories[0].T))
208 |         o1 = T.dot(p, memories[1])
209 | 
210 |         # Layer 2
211 |         u2 = o1 + T.dot(u1, self.H)
212 |         p = T.nnet.softmax(T.dot(u2, memories[1].T))
213 |         o2 = T.dot(p, memories[2])
214 | 
215 |         # Layer 3
216 |         u3 = o2 + T.dot(u2, self.H)
217 |         p = T.nnet.softmax(T.dot(u3, memories[2].T))
218 |         o3 = T.dot(p, memories[3])
219 | 
220 |         # Score answers
221 |         u4 = o3 + T.dot(u3, self.H)
222 | 
223 |         # Embed the four candidate answers
224 |         a1 = T.sum(self.A[ans[0]], axis=0)
225 |         a2 = T.sum(self.A[ans[1]], axis=0)
226 |         a3 = T.sum(self.A[ans[2]], axis=0)
227 |         a4 = T.sum(self.A[ans[3]], axis=0)
228 |         a = T.stack(a1, a2, a3, a4)
229 |         scores = T.dot(T.dot(u4, self.U.T), T.dot(self.U, a.T))
230 |         #scores = T.dot(T.dot(u4, self.U.T), T.dot(self.U, a.T))
231 |         output = T.nnet.softmax(scores)
232 | 
233 |         return output[0]
234 | 
235 |     def train(self, dataset, questions, n_epochs=100, lr_schedule=None, start_epoch=0, max_words=20):
236 |         l_rate = self.lr
237 |         index_array = np.arange(len(questions))
238 | 
239 |         # (max_words, 4, n_embedding)
240 |         pe_matrix = self._get_PE_matrix(max_words, self.n_embedding)
241 | 
242 |         for epoch in xrange(start_epoch, start_epoch + n_epochs):
243 |             costs = []
244 | 
245 |             if lr_schedule is not None and epoch in lr_schedule:
246 |                 l_rate = lr_schedule[epoch]
247 | 
248 |             np.random.shuffle(index_array)
249 |             seen = 0
250 | 
251 |             batches = make_batches(len(questions), self.batch_size)
252 |             for batch_index, (batch_start, batch_end) in enumerate(batches):
253 |                 batch_ids = index_array[batch_start:batch_end]
254 |                 seen += len(batch_ids)
255 |                 questions_batch = []
256 |                 for index in batch_ids:
257 |                     questions_batch.append(questions[index])
258 | 
259 |                 # pprint.pprint(questions_batch)
260 | 
261 |                 # (batch_size * max_stmts * max_words)
262 |                 statements_seq_batch = np.asarray(map(lambda x: x[2], questions_batch), theano.config.floatX)
263 |                 # (batch_size * max_words)
264 |                 question_seq_batch = np.asarray(map(lambda x: x[3], questions_batch), theano.config.floatX)
265 |                 # (batch_size)
266 |                 correct_word_batch = np.asarray(map(lambda x: x[4], questions_batch), theano.config.floatX)
267 |                 # (batch_size * 4 * max_words)
268 |                 ans_batch = np.asarray(map(lambda x: x[5], questions_batch), theano.config.floatX)
269 | 
270 |                 cost = self.train_function(
271 |                     statements_seq_batch,
272 |                     question_seq_batch,
273 |                     correct_word_batch,
274 |                     ans_batch,
275 |                     pe_matrix,
276 |                     l_rate
277 |                 )
278 | 
279 |                 # print "Epoch %d, sample %d: %f" % (epoch, i, cost)
280 |                 costs.append(cost)
281 | 
282 |             print "Epoch %d: %f" % (epoch, np.mean(costs))
283 | 
284 |     def predict(self, dataset, questions, max_words=20, print_errors=False):
285 |         correct_answers = 0
286 |         wrong_answers = 0
287 |         pe_matrix = self._get_PE_matrix(max_words, self.n_embedding)
288 | 
289 |         for i, question in enumerate(questions):
290 |             statements_seq = np.asarray(question[2], theano.config.floatX)
291 |             question_seq = np.asarray(question[3], theano.config.floatX)
292 |             answers = np.asarray(question[5], theano.config.floatX)
293 |             correct = question[4]
294 | 
295 |             probs = self.predict_function(
296 |                 statements_seq, question_seq, answers, pe_matrix
297 |             )
298 |             predicted = np.argmax(probs)
299 | 
300 |             if predicted == correct:
301 |                 correct_answers += 1
302 |             else:
303 |                 if print_errors and np.random.rand() < 0.1:
304 |                     correct_words = map(lambda x: self.id_to_word[x], question[5][correct])
305 |                     predicted_words = map(lambda x: self.id_to_word[x], question[5][predicted])
306 |                     print 'Correct: %s (%d %.3f), Guess: %s (%d %.3f)' % (correct_words, correct, probs[correct], predicted_words, predicted, probs[predicted])
307 |                 wrong_answers += 1
308 | 
309 |             # if len(questions) > 1000:
310 |             #     print '(%d/%d) %d correct, %d wrong' % (i+1, len(questions), correct_answers, wrong_answers)
311 | 
312 |         accuracy = 100.0 * float(correct_answers) / (correct_answers + wrong_answers)
313 |         print '%d correct, %d wrong, %.2f%% acc' % (correct_answers, wrong_answers, accuracy)
314 | 
315 | if __name__ == "__main__":
316 |     train_file = sys.argv[1]
317 |     test_file = train_file.replace('train', 'test')
318 | 
319 |     if len(sys.argv) > 2:
320 |         n_epochs = int(sys.argv[2])
321 |     else:
322 |         n_epochs = 10
323 | 
324 |     if len(sys.argv) > 3:
325 |         n_embedding = int(sys.argv[3])
326 |     else:
327 |         n_embedding = 20
328 | 
329 |     print("Loading pickled train dataset")
330 |     f = file(train_file, 'rb')
331 |     obj = cPickle.load(f)
332 |     train_dataset, train_questions, word_to_id, num_words, null_word_id, train_max_stmts, train_max_words = obj
333 | 
334 |     print("Loading pickled test dataset")
335 |     f = file(test_file, 'rb')
336 |     obj = cPickle.load(f)
337 |     test_dataset, test_questions, _, _, _, test_max_stmts, test_max_words = obj
338 | 
339 |     print "Dataset has %d words" % num_words
340 | 
341 |     model_file = train_file.replace("train", "model")
342 |     train_my_model = True
343 |     save_my_model = True
344 | 
345 |     if train_my_model:
346 |         wmemNN = WMemNN(n_words=num_words, n_embedding=n_embedding, lr=0.01, word_to_id=word_to_id, null_word_id=null_word_id)
347 | 
348 |         lr_schedule = dict([(0, 0.01), (25, 0.01/2), (50, 0.01/4), (75, 0.01/8)])
349 | 
350 |         for i in xrange(n_epochs/5):
351 |             wmemNN.train(train_dataset, train_questions, 5, lr_schedule, 5*i, train_max_words)
352 |             wmemNN.predict(train_dataset, train_questions, train_max_words)
353 |             wmemNN.predict(test_dataset, test_questions, test_max_words)
354 | 
355 |         if save_my_model:
356 |             print "Saving model to", model_file
357 |             wmemNN.save_model(model_file)
358 |     else:
359 |         wmemNN = WMemNN(load_from_file=model_file)
360 |         wmemNN.predict(train_dataset, train_questions, train_max_words)
361 |         wmemNN.predict(test_dataset, test_questions, test_max_words)
362 | 
363 | 
--------------------------------------------------------------------------------
/wordvec_pruning.py:
--------------------------------------------------------------------------------
 1 | from gensim.models import Word2Vec
 2 | import numpy
 3 | 
 4 | def prune_statements(dataset, questions):
 5 |     total_old = 0
 6 |     total_new = 0
 7 | 
 8 |     wvs = Word2Vec(dataset, min_count=0)
 9 | 
10 |     for i in range(len(questions)):
11 |         question = questions[i]
12 |         new_statements = []
13 |         old_statements = question[2][:-1]
14 | 
15 |         # Use word vectors and keep only the top 5
16 | 
17 |         sims = []
18 |         q = question[2][-1]
19 |         for s in old_statements:
20 |             sims.append(wvs.n_similarity(q,s))
21 | 
22 |         sims2 = map(lambda x: x if type(x) is numpy.float64 else 0.0, sims)
23 |         top = sorted(range(len(sims2)), key=sims2.__getitem__, reverse=True)
24 |         new_statements = map(lambda x: old_statements[x], top[:5])
25 | 
26 |         questions[i][2] = new_statements
27 |         total_old += len(old_statements)
28 |         total_new += len(new_statements)
29 |         # print("Question: ", questions[i][2][-1], " before %d after %d" % (len(old_statements), len(new_statements)))
30 | 
31 |     print("Before %d After %d" % (total_old, total_new))
32 |     return questions
33 | 
--------------------------------------------------------------------------------
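A minimal usage sketch for prune_statements, assuming each question's element [2] holds its supporting statements with the tokenized question appended last (as the indexing above implies); the toy corpus and placeholder question fields below are illustrative only and not part of the repository.

from wordvec_pruning import prune_statements

# Toy tokenized corpus; in the project this comes from the dataset parsers.
dataset = [
    ['mary', 'moved', 'to', 'the', 'kitchen'],
    ['john', 'took', 'the', 'apple'],
    ['mary', 'went', 'back', 'to', 'the', 'garden'],
    ['where', 'is', 'the', 'apple'],
]

# question[2] = supporting statements, with the question itself as the last element.
# The first two fields are placeholders for whatever metadata the parser stores.
questions = [
    [0, 0, [dataset[0], dataset[1], dataset[2], dataset[3]]],
]

pruned = prune_statements(dataset, questions)
# After pruning, question[2] holds at most the 5 statements most similar to the question.
print(pruned[0][2])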