├── .gitattributes ├── .gitignore ├── README.md ├── dataload.py ├── debug_train.py ├── params.txt ├── params_cnn.txt ├── sentence_cnn.py ├── theano_cnn.py ├── train_model.py └── train_parallel_cnn.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ################# 2 | ## Eclipse 3 | ################# 4 | 5 | *.pydevproject 6 | .project 7 | .metadata 8 | bin/ 9 | tmp/ 10 | *.tmp 11 | *.bak 12 | *.swp 13 | *~.nib 14 | local.properties 15 | .classpath 16 | .settings/ 17 | .loadpath 18 | 19 | # External tool builders 20 | .externalToolBuilders/ 21 | 22 | # Locally stored "Eclipse launch configurations" 23 | *.launch 24 | 25 | # CDT-specific 26 | .cproject 27 | 28 | # PDT-specific 29 | .buildpath 30 | 31 | 32 | ################# 33 | ## Visual Studio 34 | ################# 35 | 36 | ## Ignore Visual Studio temporary files, build results, and 37 | ## files generated by popular Visual Studio add-ons. 38 | 39 | # User-specific files 40 | *.suo 41 | *.user 42 | *.sln.docstates 43 | 44 | # Build results 45 | 46 | [Dd]ebug/ 47 | [Rr]elease/ 48 | x64/ 49 | build/ 50 | [Bb]in/ 51 | [Oo]bj/ 52 | 53 | # MSTest test Results 54 | [Tt]est[Rr]esult*/ 55 | [Bb]uild[Ll]og.* 56 | 57 | *_i.c 58 | *_p.c 59 | *.ilk 60 | *.meta 61 | *.obj 62 | *.pch 63 | *.pdb 64 | *.pgc 65 | *.pgd 66 | *.rsp 67 | *.sbr 68 | *.tlb 69 | *.tli 70 | *.tlh 71 | *.tmp 72 | *.tmp_proj 73 | *.log 74 | *.vspscc 75 | *.vssscc 76 | .builds 77 | *.pidb 78 | *.log 79 | *.scc 80 | 81 | # Visual C++ cache files 82 | ipch/ 83 | *.aps 84 | *.ncb 85 | *.opensdf 86 | *.sdf 87 | *.cachefile 88 | 89 | # Visual Studio profiler 90 | *.psess 91 | *.vsp 92 | *.vspx 93 | 94 | # Guidance Automation Toolkit 95 | *.gpState 96 | 97 | # ReSharper is a .NET coding add-in 98 | _ReSharper*/ 99 | *.[Rr]e[Ss]harper 100 | 101 | # TeamCity is a build add-in 102 | _TeamCity* 103 | 104 | # DotCover is a Code Coverage Tool 105 | *.dotCover 106 | 107 | # NCrunch 108 | *.ncrunch* 109 | .*crunch*.local.xml 110 | 111 | # Installshield output folder 112 | [Ee]xpress/ 113 | 114 | # DocProject is a documentation generator add-in 115 | DocProject/buildhelp/ 116 | DocProject/Help/*.HxT 117 | DocProject/Help/*.HxC 118 | DocProject/Help/*.hhc 119 | DocProject/Help/*.hhk 120 | DocProject/Help/*.hhp 121 | DocProject/Help/Html2 122 | DocProject/Help/html 123 | 124 | # Click-Once directory 125 | publish/ 126 | 127 | # Publish Web Output 128 | *.Publish.xml 129 | *.pubxml 130 | *.publishproj 131 | 132 | # NuGet Packages Directory 133 | ## TODO: If you have NuGet Package Restore enabled, uncomment the next line 134 | #packages/ 135 | 136 | # Windows Azure Build Output 137 | csx 138 | *.build.csdef 139 | 140 | # Windows Store app package directory 141 | AppPackages/ 142 | 143 | # Others 144 | sql/ 145 | *.Cache 146 | ClientBin/ 147 | [Ss]tyle[Cc]op.* 148 | ~$* 149 | *~ 150 | *.dbmdl 151 | *.[Pp]ublish.xml 152 | *.pfx 153 | 
*.publishsettings 154 | 155 | # RIA/Silverlight projects 156 | Generated_Code/ 157 | 158 | # Backup & report files from converting an old project file to a newer 159 | # Visual Studio version. Backup files are not needed, because we have git ;-) 160 | _UpgradeReport_Files/ 161 | Backup*/ 162 | UpgradeLog*.XML 163 | UpgradeLog*.htm 164 | 165 | # SQL Server files 166 | App_Data/*.mdf 167 | App_Data/*.ldf 168 | 169 | ############# 170 | ## Windows detritus 171 | ############# 172 | 173 | # Windows image file caches 174 | Thumbs.db 175 | ehthumbs.db 176 | 177 | # Folder config file 178 | Desktop.ini 179 | 180 | # Recycle Bin used on file shares 181 | $RECYCLE.BIN/ 182 | 183 | # Mac crap 184 | .DS_Store 185 | 186 | 187 | ############# 188 | ## Python 189 | ############# 190 | 191 | *.py[cod] 192 | 193 | # Packages 194 | *.egg 195 | *.egg-info 196 | dist/ 197 | build/ 198 | eggs/ 199 | parts/ 200 | var/ 201 | sdist/ 202 | develop-eggs/ 203 | .installed.cfg 204 | 205 | # Installer logs 206 | pip-log.txt 207 | 208 | # Unit test / coverage reports 209 | .coverage 210 | .tox 211 | 212 | #Translations 213 | *.mo 214 | 215 | #Mr Developer 216 | .mr.developer.cfg 217 | 218 | .idea/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | The data needed to run this project is lost. 2 | -------------------------------------------------------------------------------- /dataload.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | 3 | __author__ = 'dhl' 4 | 5 | import numpy 6 | import math 7 | import copy 8 | 9 | def load_word_vectors(file_path): 10 | print 'loading word vectors...' 11 | f = open(file_path, 'rb') 12 | num_words = numpy.fromfile(f, '>i4', 1) 13 | vec_len = numpy.fromfile(f, '>i4', 1) 14 | print num_words, vec_len 15 | words = [] 16 | # word_vecs = [] 17 | # word_vecs.append(0, [0. for i in xrange(word_vec_len)]) 18 | word_vecs = numpy.zeros((num_words + 1, vec_len)) 19 | # word_vecs[0][0] = 1 20 | for i in xrange(num_words): 21 | length = ord(f.read(1)) 22 | byte_buf = f.read(length) 23 | if i == 0: 24 | words.append(byte_buf.decode('utf-8')) 25 | words.append(byte_buf.decode('utf-8')) 26 | word_vecs[i + 1] = numpy.fromfile(f, '>f4', vec_len) 27 | # vec = numpy.fromfile(f, '>f4', vec_len) 28 | # word_vecs.append(vec) 29 | 30 | f.close() 31 | print 'done.' 32 | 33 | return words, word_vecs 34 | 35 | 36 | def load_index_vec_of_entities_fixed_len(file_path): 37 | print 'loading representations of entities (word indices, fixed len) ...' 38 | f = open(file_path, 'rb') 39 | num_entities = numpy.fromfile(f, '>i4', 1) 40 | print num_entities 41 | vec_len = numpy.fromfile(f, '>i4', 1)[0] 42 | print vec_len 43 | 44 | wid_idx_dict = dict() 45 | wid_idx_dict[0] = 0 46 | entity_vecs = numpy.zeros((num_entities + 1, vec_len), dtype='int32') 47 | for i in xrange(num_entities): 48 | wid = numpy.fromfile(f, '>i4', 1) 49 | wid_idx_dict[wid[0]] = i + 1 50 | 51 | # print num_indices 52 | entity_vecs[i + 1] = numpy.fromfile(f, '>i4', vec_len) 53 | 54 | # print i 55 | if (i + 1) % 1000000 == 0: 56 | print i + 1 57 | 58 | f.close() 59 | 60 | print 'done.' 61 | return wid_idx_dict, entity_vecs, vec_len 62 | 63 | 64 | def load_entities_indices(file_path, max_num_words=50, pad_len=1): 65 | print 'loading representations of entities (word indices) ...' 
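    # Expected binary layout, inferred from the reads below (all integers are
    # big-endian int32, numpy dtype '>i4'):
    #   num_entities
    #   repeated num_entities times: wid, num_indices, then num_indices word indices
    # Each entity becomes row i + 1 of entity_vecs: its word indices are shifted
    # by +1 (so 0 stays the padding value), truncated to max_num_words, and
    # padded with pad_len zeros on both sides.  Row 0 is the all-zero "unknown"
    # row, and wid_idx_dict maps a page id to its row (with 0 -> 0).
    #
    # A minimal sketch of writing a toy file in this layout for testing; the
    # file name and values are made up and not part of the original data:
    #   import numpy as np
    #   with open('toy_entity_indices.bin', 'wb') as out:
    #       np.asarray([2], dtype='>i4').tofile(out)          # num_entities
    #       for wid, indices in [(12, [0, 5, 9]), (25, [3])]:
    #           np.asarray([wid], dtype='>i4').tofile(out)
    #           np.asarray([len(indices)], dtype='>i4').tofile(out)
    #           np.asarray(indices, dtype='>i4').tofile(out)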
66 | 67 | f = open(file_path, 'rb') 68 | num_entities = numpy.fromfile(f, '>i4', 1) 69 | print num_entities 70 | 71 | wid_idx_dict = dict() 72 | wid_idx_dict[0] = 0 73 | entity_vecs = numpy.zeros((num_entities + 1, max_num_words + 2 * pad_len), dtype='int32') 74 | for i in xrange(num_entities): 75 | wid = numpy.fromfile(f, '>i4', 1) 76 | wid_idx_dict[wid[0]] = i + 1 77 | 78 | num_indices = numpy.fromfile(f, '>i4', 1) 79 | # print num_indices 80 | indices = numpy.fromfile(f, '>i4', num_indices) 81 | for j in xrange(num_indices): 82 | if j < max_num_words: 83 | entity_vecs[i + 1][pad_len + j] = indices[j] + 1 84 | else: 85 | break 86 | 87 | # print i 88 | if (i + 1) % 1000000 == 0: 89 | print i + 1 90 | 91 | f.close() 92 | 93 | print 'done.' 94 | return wid_idx_dict, entity_vecs 95 | 96 | 97 | def load_entities(file_path, div_by_len=False, unknown_vec=None): 98 | print 'loading entity representations ...' 99 | f = open(file_path, 'rb') 100 | 101 | num_entities = numpy.fromfile(f, '>i4', 1) 102 | vec_len = numpy.fromfile(f, '>i4', 1) 103 | 104 | print num_entities, vec_len 105 | 106 | wid_idx_dict = dict() 107 | wid_idx_dict[0] = 0 108 | entity_vecs = numpy.zeros((num_entities + 1, vec_len)) 109 | if unknown_vec is None: 110 | # entity_vecs[0] = numpy.random.uniform(low=0, high=1, size=(vec_len,)) 111 | entity_vecs[0][0] = 1 112 | else: 113 | entity_vecs[0] = unknown_vec 114 | cnt = 1 115 | while True: 116 | wid = numpy.fromfile(f, '>i4', 1) 117 | 118 | if not wid: 119 | break 120 | 121 | entity_vecs[cnt] = numpy.fromfile(f, '>f4', vec_len) 122 | if div_by_len: 123 | l2_norm = 0 124 | for i in xrange(vec_len): 125 | l2_norm += entity_vecs[cnt][i] * entity_vecs[cnt][i] 126 | l2_norm = math.sqrt(l2_norm) 127 | for i in xrange(vec_len): 128 | entity_vecs[cnt][i] /= l2_norm 129 | 130 | wid_idx_dict[wid[0]] = cnt 131 | 132 | # print entity_vecs[cnt] 133 | cnt += 1 134 | # if cnt == 10: 135 | # break 136 | if cnt % 1000000 == 0: 137 | print cnt 138 | 139 | f.close() 140 | print 'done.' 
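    # wid_idx_dict maps each Wikipedia page id to its row in entity_vecs; row 0
    # is reserved for unknown entities (entity_vecs[0][0] = 1 unless an
    # unknown_vec is supplied).  With div_by_len set, every loaded row was
    # scaled to unit L2 norm by the explicit loop above; a vectorized
    # equivalent, shown only as a sketch and not used by this function:
    #   norms = numpy.sqrt((entity_vecs[1:cnt] ** 2).sum(axis=1, keepdims=True))
    #   entity_vecs[1:cnt] /= numpy.maximum(norms, 1e-8)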
141 | 142 | return wid_idx_dict, entity_vecs 143 | 144 | 145 | def skip_next_training_paragraph(f): 146 | sentence_len = numpy.fromfile(f, '>i4', 1) 147 | if not sentence_len: 148 | return False 149 | 150 | word_indices = numpy.fromfile(f, '>i4', sentence_len) 151 | 152 | num_mentions = numpy.fromfile(f, '>i4', 1) 153 | for i in xrange(num_mentions): 154 | mention_span = numpy.fromfile(f, '>i4', 2) 155 | num_mention_candidates = numpy.fromfile(f, '>i4', 1) 156 | 157 | candidates = numpy.fromfile(f, '>i4', num_mention_candidates) 158 | 159 | 160 | def load_next_training_paragraph(f): 161 | sentence_len = numpy.fromfile(f, '>i4', 1) 162 | if not sentence_len: 163 | return False 164 | 165 | word_indices = numpy.fromfile(f, '>i4', sentence_len) 166 | 167 | num_mentions = numpy.fromfile(f, '>i4', 1) 168 | mention_spans = [] 169 | candidates_mentions = [] 170 | for i in xrange(num_mentions): 171 | mention_span = numpy.fromfile(f, '>i4', 2) 172 | if mention_span is None: 173 | print i, num_mentions, 'weird' 174 | # [mention_beg, mention_end] = numpy.fromfile(f, '>i4', 2) 175 | # mention_spans.append((mention_beg, mention_end)) 176 | if not len(mention_span) == 2: 177 | print i, num_mentions, 'weird' 178 | print mention_span 179 | [mention_beg, mention_end] = mention_span 180 | mention_spans.append([mention_beg, mention_end]) 181 | num_mention_candidates = numpy.fromfile(f, '>i4', 1) 182 | 183 | if num_mention_candidates > 37: 184 | print 'num_mention_candidates', num_mention_candidates 185 | return False 186 | # else: 187 | # print 'num_mention_candidates', num_mention_candidates 188 | 189 | candidates = numpy.fromfile(f, '>i4', num_mention_candidates) 190 | candidates_mentions.append(candidates) 191 | 192 | return word_indices, mention_spans, candidates_mentions 193 | 194 | 195 | def get_mention_centered_context(word_indices, mention_span, sentence_len, pad_len): 196 | result_indices = [] 197 | for i in xrange(pad_len): 198 | result_indices.append(0) 199 | 200 | len_mention_span = mention_span[1] - mention_span[0] + 1 201 | len_side = (sentence_len - len_mention_span) / 2 202 | pos_left = mention_span[0] - len_side 203 | pos_right = mention_span[1] + len_side 204 | 205 | if pos_left < 0: 206 | pos_right -= pos_left 207 | if pos_right >= len(word_indices): 208 | pos_right = len(word_indices) - 1 209 | pos_left = 0 210 | elif pos_right >= len(word_indices): 211 | pos_left -= pos_right - len(word_indices) + 1 212 | if pos_left < 0: 213 | pos_left = 0 214 | pos_right = len(word_indices) - 1 215 | 216 | for pos in xrange(pos_left, pos_right + 1): 217 | cur_word_index = word_indices[pos] 218 | if cur_word_index > -1: 219 | result_indices.append(cur_word_index + 1) 220 | 221 | # print sentence_len, pad_len 222 | 223 | while len(result_indices) < sentence_len + 2 * pad_len: 224 | result_indices.append(0) 225 | 226 | return result_indices 227 | 228 | 229 | def get_samples_in_paragraph(word_indices, mention_spans, candidates_mentions, 230 | wid_idx_dict, dst_contexts, dst_entity_idxs, 231 | sentence_len, sentence_pad_len, num_candidates=2): 232 | cnt0 = 0 233 | cnt1 = 0 234 | if len(mention_spans) == len(candidates_mentions): 235 | entity_indices = numpy.zeros(num_candidates, dtype='int32') 236 | for mention_span, candidate_mentions in zip(mention_spans, candidates_mentions): 237 | if len(candidate_mentions) == 1: 238 | cnt0 += 1 239 | continue 240 | 241 | pos = 0 242 | last_index = 0 243 | while pos < len(candidate_mentions) and pos < num_candidates: 244 | idx = wid_idx_dict.get(candidate_mentions[pos], 
0) 245 | entity_indices[pos] = idx 246 | last_index = idx 247 | pos += 1 248 | 249 | while pos < num_candidates: 250 | entity_indices[pos] = last_index 251 | pos += 1 252 | 253 | mention_context = get_mention_centered_context(word_indices, mention_span, sentence_len, sentence_pad_len) 254 | dst_contexts.append(mention_context) 255 | dst_entity_idxs.append(copy.copy(entity_indices)) 256 | else: 257 | print 'number of mention spans does not match number of candidates of mentions' 258 | 259 | return cnt0, cnt1 260 | 261 | 262 | def load_training_samples(f, num_paragraphs, wid_idx_dict, sentence_len, sentence_pad_len): 263 | print 'loading data', num_paragraphs, 'paragraphs' 264 | contexts = [] 265 | entity_idxs = [] 266 | for i in xrange(num_paragraphs): 267 | result_tuple = load_next_training_paragraph(f) 268 | if result_tuple: 269 | word_indices, mention_spans, candidates_mentions = result_tuple 270 | get_samples_in_paragraph(word_indices, mention_spans, candidates_mentions, 271 | wid_idx_dict, contexts, entity_idxs, 272 | sentence_len, sentence_pad_len) 273 | else: 274 | return contexts, entity_idxs 275 | 276 | print 'done.' 277 | return contexts, entity_idxs 278 | 279 | 280 | def skip_training_sample(f, num_paragraphs): 281 | for i in xrange(num_paragraphs): 282 | skip_next_training_paragraph(f) 283 | 284 | 285 | def load_samples_full(file_name, wid_idx_dict, sentence_len, sentence_pad_len, skip_width=20, num_candidates=2): 286 | print 'loading', file_name, '...' 287 | contexts = [] 288 | entity_idxs = [] 289 | f = open(file_name, 'rb') 290 | cnt = 0 291 | result_tuple = load_next_training_paragraph(f) 292 | while result_tuple: 293 | if skip_width == 0 or cnt % skip_width == 0: 294 | word_indices, mention_spans, candidates_mentions = result_tuple 295 | get_samples_in_paragraph(word_indices, mention_spans, candidates_mentions, 296 | wid_idx_dict, contexts, entity_idxs, 297 | sentence_len, sentence_pad_len, num_candidates) 298 | # print entity_idxs 299 | cnt += 1 300 | 301 | if cnt % 500000 == 0: 302 | print cnt 303 | result_tuple = load_next_training_paragraph(f) 304 | 305 | f.close() 306 | print 'done.' 307 | return contexts, entity_idxs 308 | 309 | 310 | def main(): 311 | print 'data_load' 312 | # words, word_vecs = load_word_vectors('/media/dhl/Data/el/word2vec/wiki_vectors.jbin') 313 | # for i in xrange(300): 314 | # print words[i] 315 | 316 | # f = open('/media/dhl/Data/el/vec_rep/wiki_training_word_vec_indices.td', 'rb') 317 | # word_indices, mention_spans, candidates_mentions = load_next_training_paragraph(f) 318 | # f.close() 319 | 320 | # for i in range(len(word_indices)): 321 | # if word_indices[i] > -1: 322 | # print i, words[word_indices[i]] 323 | 324 | # print word_indices 325 | # print mention_spans 326 | # mention_context = get_mention_centered_context(word_indices, mention_spans[0]) 327 | # print mention_context 328 | 329 | # wid_idx_dict, entity_vecs = load_entities('/media/dhl/Data/el/vec_rep/wid_entity_rep_wiki50_unit_vec.bin', False) 330 | 331 | # wid_idx_dict, entity_vecs = load_entities_indices('/media/dhl/Data/el/vec_rep/wid_entity_rep_wiki50_indices.bin') 332 | # print wid_idx_dict[12] 333 | # print entity_vecs[wid_idx_dict[12]] 334 | # for idx in entity_vecs[wid_idx_dict[12]]: 335 | # print words[idx] 336 | 337 | 338 | if __name__ == '__main__': 339 | main() 340 | -------------------------------------------------------------------------------- /debug_train.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | __author__ = 'dhl' 3 | 4 | import sys 5 | import cPickle 6 | 7 | import numpy as np 8 | 9 | import theano 10 | import theano.tensor as T 11 | 12 | import data_load 13 | from sentence_cnn import SentenceCNN 14 | from theano_cnn import HiddenLayer, relu, sgd_updates_adadelta 15 | 16 | 17 | def to_theano_shared(vals): 18 | return theano.shared(value=np.asarray(vals, 19 | dtype=theano.config.floatX), 20 | borrow=True) 21 | 22 | 23 | def get_entity_context_similarities(unit_mc, cnn_output_for_entities, batch_size, num_candidates): 24 | entity_reps = cnn_output_for_entities.reshape((batch_size, num_candidates, 25 | cnn_output_for_entities.shape[1])) 26 | unit_entity_reps = entity_reps / T.sqrt(T.maximum( 27 | T.sum(T.sqr(entity_reps), 2), 1e-5)).dimshuffle(0, 1, 'x') 28 | return (unit_mc.dimshuffle(0, 'x', 1) * unit_entity_reps).sum(axis=2) 29 | 30 | 31 | # TODO remove these global variables 32 | def_filter_hs = [1, 2] 33 | sentence_len = 50 34 | sentence_pad_len = def_filter_hs[-1] - 1 35 | training_part_size = 50000 36 | 37 | num_train_candidates = 2 38 | 39 | 40 | max_num_entity_words = 50 41 | entity_pad_len = 1 42 | entity_rep_len = max_num_entity_words + 2 * entity_pad_len 43 | entity_hs = [1] 44 | num_entity_rep_feature_maps = 300 45 | 46 | def train_cnn_for_el(train_data_file_name, 47 | val_data_file_name, 48 | num_val_candidates, 49 | test_data_file_name, 50 | num_test_candidates, 51 | full_sentence_len, word_vec_len, 52 | all_words, # first row of all_words should be a non-existing word 53 | wid_idx_dict, 54 | entity_vecs, 55 | gold_as_first_candidate=True, 56 | skip_width_loading=40, # skip width while loading samples 57 | n_epochs=25, 58 | batch_size=50, 59 | filter_hs=def_filter_hs, 60 | num_feature_maps=100, 61 | lr_decay=0.9, 62 | sqr_norm_lim=9, 63 | hidden_out_len=50,): 64 | rng = np.random.RandomState(3435) 65 | 66 | print 'making entity_vecs...', len(entity_vecs) 67 | # shared_entity_vecs = theano.shared(value=np.asarray(entity_vecs, dtype=theano.config.floatX), 68 | # name='entity_vecs', borrow=True) 69 | shared_entity_vecs = theano.shared(value=np.asarray(entity_vecs, dtype="int32"), 70 | name='entity_vecs', borrow=True) 71 | # shared_entity_vecs = theano.shared(value=np.asarray(entity_vecs, dtype=np.float32), 72 | # name='entity_vecs', borrow=True) 73 | print 'making shared_words...', len(all_words) 74 | shared_words = theano.shared(value=np.asarray(all_words, dtype=theano.config.floatX), 75 | name='shared_words', borrow=True) 76 | print 'done' 77 | 78 | # test_contexts, test_indices = get_data_set_full(test_data_file_name, wid_idx_dict, skip_width_loading) 79 | # num_test_batches = test_indices.shape[0] / batch_size 80 | # num_val_contexts, val_contexts, val_indices = get_data_set_full(val_data_file_name, 81 | # wid_idx_dict, skip_width_loading) 82 | val_contexts, val_indices = data_load.load_samples_full(val_data_file_name, wid_idx_dict, sentence_len, 83 | sentence_pad_len, 84 | skip_width=skip_width_loading, 85 | num_candidates=num_val_candidates) 86 | num_val_batches = len(val_contexts) / batch_size 87 | print num_val_batches, 'validation batches' 88 | print len(val_indices[0]), 'candidates per mention' 89 | val_contexts = T.cast(to_theano_shared(val_contexts), 'int32') 90 | val_indices = T.cast(to_theano_shared(val_indices), 'int32') 91 | 92 | test_contexts, test_indices = data_load.load_samples_full(test_data_file_name, wid_idx_dict, sentence_len, 93 | sentence_pad_len, 94 | skip_width=skip_width_loading, 95 | num_candidates=num_test_candidates) 96 
| num_test_batches = len(test_contexts) / batch_size 97 | print num_test_batches, 'test batches' 98 | print len(test_indices[0]), 'candidates per mention' 99 | test_contexts = T.cast(to_theano_shared(test_contexts), 'int32') 100 | test_indices = T.cast(to_theano_shared(test_indices), 'int32') 101 | 102 | if gold_as_first_candidate: 103 | gold_labels = theano.shared(value=np.zeros(batch_size, 104 | dtype='int32'), 105 | borrow=True) 106 | else: 107 | gold_labels = theano.shared(value=np.ones(batch_size, 108 | dtype='int32'), 109 | borrow=True) 110 | 111 | x = T.imatrix('x') 112 | entities = T.imatrix('entities') 113 | 114 | sentence_cnn0 = SentenceCNN(x, shared_words, full_sentence_len, word_vec_len, filter_hs, num_feature_maps, 115 | batch_size, 116 | hidden_out_len, rng) 117 | mc = sentence_cnn0.output # mention contexts 118 | unit_mc = mc / T.sqrt(T.maximum(T.sum(T.sqr(mc), 1), 1e-5)).dimshuffle(0, 'x') 119 | 120 | batch_entity_vecs = shared_entity_vecs[entities] 121 | entity_vecs_reshaped = batch_entity_vecs.reshape((batch_entity_vecs.shape[0] * batch_entity_vecs.shape[1], 122 | batch_entity_vecs.shape[2])) 123 | 124 | sentence_cnn1_train = SentenceCNN(entity_vecs_reshaped, shared_words, entity_rep_len, word_vec_len, entity_hs, 125 | num_entity_rep_feature_maps, 126 | batch_size * num_train_candidates, hidden_out_len, rng) 127 | entity_reps_train = sentence_cnn1_train.output 128 | similarities_train = get_entity_context_similarities(unit_mc, entity_reps_train, batch_size, num_train_candidates) 129 | loss = T.maximum(0, 1 - similarities_train[:, 0] + similarities_train[:, 1]).sum() 130 | 131 | # entity_reps_train = entity_reps_train.reshape((batch_size, num_train_candidates, entity_reps_train.shape[1])) 132 | # matcher1 = HiddenLayer(rng, batch_entity_vecs, len(entity_vecs[0]), hidden_out_len, relu) 133 | # entity_reps = matcher1.output 134 | 135 | # unit_entity_reps_train = entity_reps_train / T.sqrt(T.maximum( 136 | # T.sum(T.sqr(entity_reps_train), 2), 0.0001)).dimshuffle(0, 1, 'x') 137 | # 138 | # similarities = (unit_mc.dimshuffle(0, 'x', 1) * unit_entity_reps).sum(axis=2) 139 | 140 | sentence_cnn1_val = SentenceCNN(entity_vecs_reshaped, shared_words, entity_rep_len, word_vec_len, entity_hs, 141 | num_entity_rep_feature_maps, 142 | batch_size * num_val_candidates, 143 | hidden_out_len, rng, 144 | hidden_W=sentence_cnn1_train.hiddenW, 145 | hidden_b=sentence_cnn1_train.hiddenb, 146 | conv_Ws=sentence_cnn1_train.convWs, 147 | conv_bs=sentence_cnn1_train.convbs) 148 | entity_reps_val = sentence_cnn1_val.output 149 | similarities_val = get_entity_context_similarities(unit_mc, entity_reps_val, batch_size, num_val_candidates) 150 | correct_rate = T.mean(T.eq(gold_labels, T.argmax(similarities_val, axis=1))) 151 | 152 | # similarities = (mc.dimshuffle(0, 'x', 1) * batch_entity_vecs).sum(axis=2) # / mc_norm 153 | 154 | # params = sentence_cnn0.params + matcher1.params 155 | params = sentence_cnn0.params + sentence_cnn1_train.params 156 | grad_updates = sgd_updates_adadelta(params, loss, lr_decay, 1e-6, sqr_norm_lim) 157 | 158 | index = T.lscalar() 159 | 160 | val_model = theano.function( 161 | [index], 162 | correct_rate, 163 | givens={x: val_contexts[index * batch_size: (index + 1) * batch_size], 164 | entities: val_indices[index * batch_size: (index + 1) * batch_size]} 165 | ) 166 | 167 | test_model = theano.function( 168 | [index], 169 | correct_rate, 170 | givens={x: test_contexts[index * batch_size: (index + 1) * batch_size], 171 | entities: test_indices[index * batch_size: (index + 1) 
* batch_size]} 172 | ) 173 | 174 | train_contexts = theano.shared( 175 | value=np.zeros((3, 2)), 176 | borrow=True) 177 | int_train_contexts = T.cast(train_contexts, 'int32') 178 | train_indices = theano.shared( 179 | value=np.zeros((3, 2)), 180 | borrow=True) 181 | int_train_indices = T.cast(train_indices, 'int32') 182 | train_model = theano.function( 183 | [index], 184 | loss, 185 | updates=grad_updates, 186 | givens={x: int_train_contexts[index * batch_size: (index + 1) * batch_size], 187 | entities: int_train_indices[index * batch_size: (index + 1) * batch_size]} 188 | ) 189 | 190 | # fdebug = theano.function( 191 | # [index], 192 | # similarities_train, 193 | # givens={x: int_train_contexts[index * batch_size: (index + 1) * batch_size], 194 | # entities: int_train_indices[index * batch_size: (index + 1) * batch_size]} 195 | # ) 196 | fdebug0 = theano.function( 197 | [index], 198 | entity_reps_train.sum(axis=1), 199 | givens={entities: int_train_indices[index * batch_size: (index + 1) * batch_size]} 200 | ) 201 | fdebug1 = theano.function( 202 | [index], 203 | similarities_train, 204 | givens={x: int_train_contexts[index * batch_size: (index + 1) * batch_size], 205 | entities: int_train_indices[index * batch_size: (index + 1) * batch_size]} 206 | ) 207 | fdebug2 = theano.function( 208 | [index], 209 | unit_mc.sum(axis=1), 210 | givens={x: int_train_contexts[index * batch_size: (index + 1) * batch_size]} 211 | ) 212 | # print fdebug(0) 213 | 214 | # val_perfs = [val_model(i) for i in xrange(num_val_batches)] 215 | # print('init val perf %f' % np.mean(val_perfs)) 216 | 217 | epoch = 0 218 | max_val_perf = 0 219 | test_perf = 0 220 | print 'training ...' 221 | # while epoch < n_epochs: 222 | epoch += 1 223 | 224 | train_part_cnt = 0 225 | 226 | # f_train = open(train_data_file_name, 'rb') 227 | # for i in xrange(143): 228 | # data_load.skip_training_sample(f_train, 50000) 229 | # if i % 40 == 0: 230 | # print i 231 | # print 'skipped' 232 | # 233 | # f_train = open(train_data_file_name, 'rb') 234 | # cur_train_contexts, cur_train_indices = data_load.load_training_samples(f_train, 235 | # training_part_size, 236 | # wid_idx_dict, 237 | # sentence_len, 238 | # sentence_pad_len) 239 | # f_train.close() 240 | 241 | f_debug = open('debug_data.bin', 'rb') 242 | cur_train_contexts, cur_train_indices = cPickle.load(f_debug) 243 | f_debug.close() 244 | 245 | # print cur_train_contexts[9 * batch_size: (9 + 1) * batch_size] 246 | # print cur_train_indices[8 * batch_size: (8 + 1) * batch_size] 247 | 248 | train_contexts.set_value(cur_train_contexts, borrow=True) 249 | train_indices.set_value(cur_train_indices, borrow=True) 250 | 251 | # entity_index_vecs = fdebug0(8) 252 | # for entity_index_vec in entity_index_vecs: 253 | # print entity_index_vec 254 | 255 | train_part_cnt += 1 256 | num_train_batches = len(cur_train_contexts) / batch_size 257 | # print 'num_train_batches', num_train_batches 258 | mean_loss = 0 259 | for minibatch_index in xrange(num_train_batches): 260 | # if minibatch_index == 8: 261 | # continue 262 | # if 6 < minibatch_index < 10: 263 | # print minibatch_index 264 | # print sentence_cnn1_train.hiddenb.get_value() 265 | # print fdebug0(minibatch_index) 266 | cur_loss = train_model(minibatch_index) 267 | # if 6 < minibatch_index < 10: 268 | # print minibatch_index 269 | # print sentence_cnn1_train.hiddenb.get_value() 270 | # print fdebug0(minibatch_index) 271 | print minibatch_index, cur_loss 272 | mean_loss += cur_loss 273 | # if 11 > minibatch_index > 8: 274 | # print 
minibatch_index, cur_loss 275 | # print fdebug(minibatch_index) 276 | # print minibatch_index, cur_loss 277 | print 'loss:', mean_loss / num_train_batches 278 | # print fdebug(0) 279 | 280 | val_perfs = [val_model(i) for i in xrange(num_val_batches)] 281 | val_perf = np.mean(val_perfs) 282 | print('epoch %i, training part %i, val perf %f(%f), test perf %f' 283 | % (epoch, train_part_cnt, val_perf, max_val_perf, test_perf)) 284 | 285 | if val_perf > max_val_perf: 286 | max_val_perf = val_perf 287 | test_perfs = [test_model(i) for i in xrange(num_test_batches)] 288 | test_perf = np.mean(test_perfs) 289 | print('\tepoch %i, training part %i, test_perf %f' 290 | % (epoch, train_part_cnt, test_perf)) 291 | 292 | 293 | def dump_debug_data(): 294 | train_data_file_name = '/media/dhl/Data/el/vec_rep/wiki_train_word_vec_indices_wiki50.td' 295 | entity_rep_file_name = '/media/dhl/Data/el/vec_rep/' + \ 296 | 'wid_entity_rep_wiki50_indices_with_keywords_fixed_len_10kw.bin' 297 | # entity_rep_file_name = '/media/dhl/Data/el/vec_rep/' + \ 298 | # 'wid_entity_rep_wiki50_indices.bin' 299 | 300 | # wid_idx_dict, entity_vecs = data_load.load_entities_indices( 301 | # entity_rep_file_name, max_num_entity_words, entity_pad_len) 302 | global entity_rep_len 303 | wid_idx_dict, entity_vecs, entity_rep_len = data_load.load_index_vec_of_entities_fixed_len( 304 | entity_rep_file_name) 305 | f_train = open(train_data_file_name, 'rb') 306 | for i in xrange(143): 307 | data_load.skip_training_sample(f_train, 50000) 308 | if i % 40 == 0: 309 | print i 310 | print 'skipped' 311 | 312 | cur_train_contexts, cur_train_indices = data_load.load_training_samples(f_train, 313 | training_part_size, 314 | wid_idx_dict, 315 | sentence_len, 316 | sentence_pad_len) 317 | f_debug = open('debug_data_vlen.bin', 'wb') 318 | cPickle.dump([cur_train_contexts, cur_train_indices], f_debug) 319 | f_debug.close() 320 | 321 | 322 | def main(): 323 | local_flg = True 324 | if len(sys.argv) > 1: 325 | if sys.argv[1] == '0': 326 | local_flg = False 327 | 328 | if local_flg: 329 | word_vec_file_name = '/media/dhl/Data/el/word2vec/wiki_vectors.jbin' 330 | entity_rep_file_name = '/media/dhl/Data/el/vec_rep/' + \ 331 | 'wid_entity_rep_wiki50_indices_with_keywords_fixed_len.bin' 332 | # entity_rep_file_name = '/media/dhl/Data/el/vec_rep/' + \ 333 | # 'wid_entity_rep_wiki50_indices.bin' 334 | train_data_file_name = '/media/dhl/Data/el/vec_rep/wiki_train_word_vec_indices_wiki50.td' 335 | val_data_file_name = '/media/dhl/Data/el/vec_rep/tac_2014_training.bin' 336 | test_data_file_name = '/media/dhl/Data/el/vec_rep/tac_2014_eval.bin' 337 | else: 338 | word_vec_file_name = '/home/dhl/data/word_vec/wiki_vectors.jbin' 339 | entity_rep_file_name = '/home/dhl/data/vec_rep/wid_entity_rep_wiki50_indices_with_keywords_fixed_len_0kw.bin' 340 | train_data_file_name = '/home/dhl/data/vec_rep/wiki_train_word_vec_indices_wiki50.td' 341 | val_data_file_name = '/home/dhl/data/vec_rep/tac_2014_training.bin' 342 | test_data_file_name = '/home/dhl/data/vec_rep/tac_2014_eval.bin' 343 | 344 | _, word_vecs = data_load.load_word_vectors(word_vec_file_name) 345 | word_vec_len = len(word_vecs[0]) 346 | 347 | # wid_idx_dict, entity_vecs = data_load.load_entities( 348 | # '/media/dhl/Data/el/vec_rep/wid_entity_rep_wiki50_cat.bin', 349 | # False) 350 | 351 | # wid_idx_dict, entity_vecs = data_load.load_entities_indices( 352 | # entity_rep_file_name, max_num_entity_words, entity_pad_len) 353 | 354 | global entity_rep_len 355 | wid_idx_dict, entity_vecs, entity_rep_len = 
data_load.load_index_vec_of_entities_fixed_len( 356 | entity_rep_file_name) 357 | 358 | num_val_candidates = 30 359 | num_test_candidates = 30 360 | skipwidth_loading = 0 361 | img_h = sentence_len + 2 * sentence_pad_len 362 | train_cnn_for_el(train_data_file_name, 363 | val_data_file_name, 364 | num_val_candidates, 365 | test_data_file_name, 366 | num_test_candidates, 367 | img_h, word_vec_len, 368 | word_vecs, 369 | wid_idx_dict, 370 | entity_vecs, 371 | gold_as_first_candidate=False, 372 | skip_width_loading=skipwidth_loading, 373 | n_epochs=1) 374 | 375 | 376 | if __name__ == '__main__': 377 | # dump_debug_data() 378 | main() 379 | -------------------------------------------------------------------------------- /params.txt: -------------------------------------------------------------------------------- 1 | entity_side_cnn 0 2 | training_part_size 25000 3 | context_sentence_len 50 4 | word_vec_file /media/dhl/Data/el/word2vec/wiki_vectors.jbin 5 | entity_rep_indices_file /media/dhl/Data/el/vec_rep/wid_entity_rep_wiki50_indices_with_keywords_fixed_len_0kw.bin 6 | entity_rep_vec_file /media/dhl/Data/el/vec_rep/wid_entity_rep_wiki50_cat.bin 7 | train_data_file /media/dhl/Data/el/vec_rep/wiki_train_word_vec_indices_wiki50.td 8 | val_data_file /media/dhl/Data/el/vec_rep/tac_2014_train_gold_second.bin 9 | test_data_file /media/dhl/Data/el/vec_rep/tac_2014_eval_gold_second.bin 10 | -------------------------------------------------------------------------------- /params_cnn.txt: -------------------------------------------------------------------------------- 1 | entity_side_cnn 1 2 | training_part_size 25000 3 | context_sentence_len 50 4 | word_vec_file /home/dhl/data/word2vec/wiki_vectors.jbin 5 | entity_rep_indices_file /home/dhl/data/vec_rep/wid_entity_rep_wiki50_indices_with_keywords_fixed_len.bin 6 | entity_rep_vec_file /home/dhl/data/vec_rep/wid_entity_rep_wiki50_cat.bin 7 | train_data_file /home/dhl/data/vec_rep/wiki_train_word_vec_indices_wiki50.td 8 | val_data_file /home/dhl/data/vec_rep/tac_2014_training.bin 9 | test_data_file /home/dhl/data/vec_rep/tac_2014_eval.bin 10 | -------------------------------------------------------------------------------- /sentence_cnn.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dhl' 2 | 3 | import theano.tensor as T 4 | 5 | from theano_cnn import LeNetConvPoolLayer, HiddenLayer, relu 6 | 7 | 8 | class SentenceCNN: 9 | def __init__(self, input_sentences, shared_words, full_sentence_len, word_vec_len, filter_hs, num_feature_maps, 10 | batch_size, 11 | output_len, rng, conv_non_linear=relu, 12 | hidden_W=None, hidden_b=None, conv_Ws=None, conv_bs=None): 13 | # self.input_x = input_x 14 | self.input = input_sentences 15 | self.non_linear = conv_non_linear 16 | 17 | # batch_size = input_sentences.shape[0] 18 | # full_sentence_len = input_sentences.shape[1] 19 | # word_vec_len = shared_words.shape[1] 20 | 21 | filter_shapes = [] 22 | pool_sizes = [] 23 | filter_w = word_vec_len 24 | for filter_h in filter_hs: 25 | filter_shapes.append((num_feature_maps, 1, filter_h, filter_w)) 26 | pool_sizes.append((full_sentence_len - filter_h + 1, word_vec_len - filter_w + 1)) 27 | 28 | layer0_input = shared_words[input_sentences.flatten()].reshape((input_sentences.shape[0], 1, 29 | input_sentences.shape[1], 30 | shared_words.shape[1])) 31 | conv_layers = [] 32 | layer1_inputs = [] 33 | for i in xrange(len(filter_hs)): 34 | filter_shape = filter_shapes[i] 35 | pool_size = pool_sizes[i] 36 | conv_W = None 37 | 
conv_b = None 38 | if conv_Ws is not None: 39 | conv_W = conv_Ws[i] 40 | if conv_bs is not None: 41 | conv_b = conv_bs[i] 42 | conv_layer = LeNetConvPoolLayer(rng, input=layer0_input, 43 | image_shape=(batch_size, 1, full_sentence_len, word_vec_len), 44 | filter_shape=filter_shape, poolsize=pool_size, 45 | non_linear=conv_non_linear.func_name, 46 | W=conv_W, b=conv_b) 47 | layer1_input = conv_layer.output.flatten(2) 48 | conv_layers.append(conv_layer) 49 | layer1_inputs.append(layer1_input) 50 | 51 | layer1_input = T.concatenate(layer1_inputs, 1) 52 | matcher0 = HiddenLayer(rng, layer1_input, num_feature_maps * len(filter_hs), 53 | output_len, relu, W=hidden_W, b=hidden_b) 54 | 55 | self.hiddenW = matcher0.W 56 | self.hiddenb = matcher0.b 57 | self.convWs = list() 58 | self.convbs = list() 59 | for conv_layer in conv_layers: 60 | self.convWs.append(conv_layer.W) 61 | self.convbs.append(conv_layer.b) 62 | 63 | self.output = matcher0.output # mention contexts 64 | self.params = matcher0.params 65 | for conv_layer in conv_layers: 66 | self.params += conv_layer.params 67 | 68 | # unit_mc = mc / T.sqrt(T.maximum(T.sum(T.sqr(mc), 1), 0.0001)).dimshuffle(0, 'x') 69 | -------------------------------------------------------------------------------- /theano_cnn.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dhl' 2 | 3 | import numpy 4 | import theano 5 | import theano.tensor as T 6 | from theano.tensor.signal import downsample 7 | from theano.tensor.nnet import conv 8 | from collections import OrderedDict 9 | 10 | def relu(x): 11 | return T.maximum(0.0, x) 12 | 13 | 14 | def sigmoid(x): 15 | return T.nnet.sigmoid(x) 16 | 17 | 18 | def as_floatx(variable): 19 | if isinstance(variable, float): 20 | return numpy.cast[theano.config.floatX](variable) 21 | 22 | if isinstance(variable, numpy.ndarray): 23 | return numpy.cast[theano.config.floatX](variable) 24 | return theano.tensor.cast(variable, theano.config.floatX) 25 | 26 | 27 | def sgd_updates_adadelta(params, cost, rho=0.95, epsilon=1e-6, norm_lim=9): 28 | """ 29 | adadelta update rule, mostly from 30 | https://groups.google.com/forum/#!topic/pylearn-dev/3QbKtCumAW4 (for Adadelta) 31 | """ 32 | updates = OrderedDict({}) 33 | exp_sqr_grads = OrderedDict({}) 34 | exp_sqr_ups = OrderedDict({}) 35 | gparams = [] 36 | for param in params: 37 | empty = numpy.zeros_like(param.get_value()) 38 | exp_sqr_grads[param] = theano.shared(value=as_floatx(empty), name="exp_grad_%s" % param.name) 39 | gp = T.grad(cost, param) 40 | exp_sqr_ups[param] = theano.shared(value=as_floatx(empty), name="exp_grad_%s" % param.name) 41 | gparams.append(gp) 42 | for param, gp in zip(params, gparams): 43 | exp_sg = exp_sqr_grads[param] 44 | exp_su = exp_sqr_ups[param] 45 | up_exp_sg = rho * exp_sg + (1 - rho) * T.sqr(gp) 46 | updates[exp_sg] = up_exp_sg 47 | step = -(T.sqrt(exp_su + epsilon) / T.sqrt(up_exp_sg + epsilon)) * gp 48 | updates[exp_su] = rho * exp_su + (1 - rho) * T.sqr(step) 49 | stepped_param = param + step 50 | if param.get_value(borrow=True).ndim == 2: 51 | col_norms = T.sqrt(T.sum(T.sqr(stepped_param), axis=0)) 52 | desired_norms = T.clip(col_norms, 0, T.sqrt(norm_lim)) 53 | scale = desired_norms / (1e-7 + col_norms) 54 | updates[param] = stepped_param * scale 55 | else: 56 | updates[param] = stepped_param 57 | return updates 58 | 59 | 60 | class LogisticRegression(object): 61 | """Multi-class Logistic Regression Class 62 | 63 | The logistic regression is fully described by a weight matrix :math:`W` 64 | and 
bias vector :math:`b`. Classification is done by projecting data 65 | points onto a set of hyperplanes, the distance to which is used to 66 | determine a class membership probability. 67 | """ 68 | 69 | def __init__(self, input, n_in, n_out, W=None, b=None): 70 | """ Initialize the parameters of the logistic regression 71 | 72 | :type input: theano.tensor.TensorType 73 | :param input: symbolic variable that describes the input of the 74 | architecture (one minibatch) 75 | 76 | :type n_in: int 77 | :param n_in: number of input units, the dimension of the space in 78 | which the datapoints lie 79 | 80 | :type n_out: int 81 | :param n_out: number of output units, the dimension of the space in 82 | which the labels lie 83 | """ 84 | 85 | # initialize with 0 the weights W as a matrix of shape (n_in, n_out) 86 | if W is None: 87 | self.W = theano.shared( 88 | value=numpy.zeros((n_in, n_out), dtype=theano.config.floatX), 89 | name='W') 90 | else: 91 | self.W = W 92 | 93 | # initialize the baises b as a vector of n_out 0s 94 | if b is None: 95 | self.b = theano.shared( 96 | value=numpy.zeros((n_out,), dtype=theano.config.floatX), 97 | name='b') 98 | else: 99 | self.b = b 100 | 101 | # compute vector of class-membership probabilities in symbolic form 102 | self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b) 103 | 104 | # compute prediction as class whose probability is maximal in 105 | # symbolic form 106 | self.y_pred = T.argmax(self.p_y_given_x, axis=1) 107 | 108 | # parameters of the model 109 | self.params = [self.W, self.b] 110 | 111 | def negative_log_likelihood(self, y): 112 | """Return the mean of the negative log-likelihood of the prediction 113 | of this model under a given target distribution. 114 | :type y: theano.tensor.TensorType 115 | :param y: corresponds to a vector that gives for each example the 116 | correct label 117 | """ 118 | return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y]) 119 | 120 | def errors(self, y): 121 | """Return a float representing the number of errors in the minibatch ; 122 | zero one loss over the size of the minibatch 123 | 124 | :type y: theano.tensor.TensorType 125 | :param y: corresponds to a vector that gives for each example the 126 | correct label 127 | """ 128 | 129 | # check if y has same dimension of y_pred 130 | if y.ndim != self.y_pred.ndim: 131 | raise TypeError('y should have the same shape as self.y_pred', 132 | ('y', y.type, 'y_pred', self.y_pred.type)) 133 | # check if y is of the correct datatype 134 | if y.dtype.startswith('int'): 135 | # the T.neq operator returns a vector of 0s and 1s, where 1 136 | # represents a mistake in prediction 137 | return T.mean(T.neq(self.y_pred, y)) 138 | else: 139 | raise NotImplementedError() 140 | 141 | 142 | class HiddenLayer(object): 143 | def __init__(self, rng, input, n_in, n_out, activation, W=None, b=None, 144 | use_bias=True): 145 | self.input = input 146 | self.activation = activation 147 | 148 | if W is None: 149 | if activation.func_name == 'relu': 150 | W_values = numpy.asarray(0.01 * rng.standard_normal(size=(n_in, n_out)), 151 | dtype=theano.config.floatX) 152 | else: 153 | W_bound = numpy.sqrt(6. 
/ (n_in + n_out)) 154 | W_values = numpy.asarray( 155 | rng.uniform(low=-W_bound, high=W_bound, size=(n_in, n_out)), 156 | dtype=theano.config.floatX 157 | ) 158 | W = theano.shared(value=W_values, name='W') 159 | # if broadcast: 160 | # W = W.dimshuffle('x', 0, 1) 161 | if b is None: 162 | b_values = numpy.zeros((n_out,), dtype=theano.config.floatX) 163 | b = theano.shared(value=b_values, name='b') 164 | # if broadcast: 165 | # b = b.dimshuffle('x', 0) 166 | 167 | self.W = W 168 | self.b = b 169 | 170 | lin_output = T.dot(input, self.W) 171 | if use_bias: 172 | lin_output += self.b 173 | 174 | self.output = (lin_output if activation is None else activation(lin_output)) 175 | 176 | self.params = [self.W, self.b] if use_bias else [self.W] 177 | 178 | 179 | # CNN pooling 180 | class LeNetConvPoolLayer(object): 181 | def __init__(self, rng, input, filter_shape, image_shape, poolsize=(2, 2), non_linear='tanh', 182 | W=None, b=None): 183 | assert image_shape[1] == filter_shape[1] 184 | self.input = input 185 | self.filter_shape = filter_shape 186 | self.image_shape = image_shape 187 | self.poolsize = poolsize 188 | self.non_linear = non_linear 189 | 190 | fan_in = numpy.prod(filter_shape[1:]) 191 | fan_out = (filter_shape[0] * numpy.prod(filter_shape[2:]) / numpy.prod(poolsize)) 192 | if W is None: 193 | if self.non_linear == 'none' or self.non_linear == 'relu': 194 | self.W = theano.shared(numpy.asarray(rng.uniform(low=-0.01, high=0.01, 195 | size=filter_shape), 196 | dtype=theano.config.floatX), 197 | borrow=True, 198 | name='W_conv') 199 | else: 200 | W_bound = numpy.sqrt(6. / (fan_in + fan_out)) 201 | self.W = theano.shared(numpy.asarray(rng.uniform(low=-W_bound, high=W_bound, size=filter_shape), 202 | dtype=theano.config.floatX), 203 | borrow=True, 204 | name="W_conv") 205 | else: 206 | self.W = W 207 | 208 | if b is None: 209 | b_values = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX) 210 | self.b = theano.shared(value=b_values, borrow=True, name="b_conv") 211 | else: 212 | self.b = b 213 | 214 | # ftmp = theano.function([], self.W.shape) 215 | # print ftmp() 216 | conv_out = conv.conv2d(input=input, 217 | filters=self.W, 218 | filter_shape=self.filter_shape, 219 | image_shape=self.image_shape) 220 | 221 | if self.non_linear == 'tanh': 222 | conv_out_tanh = T.tanh(conv_out + self.b.dimshuffle('x', 0, 'x', 'x')) 223 | self.output = downsample.max_pool_2d(input=conv_out_tanh, ds=self.poolsize, ignore_border=True) 224 | elif self.non_linear == 'relu': 225 | conv_out_tanh = relu(conv_out + self.b.dimshuffle('x', 0, 'x', 'x')) 226 | self.output = downsample.max_pool_2d(input=conv_out_tanh, ds=self.poolsize, ignore_border=True) 227 | else: 228 | pooled_out = downsample.max_pool_2d(input=conv_out, ds=self.poolsize, ignore_border=True) 229 | self.output = pooled_out + self.b.dimshuffle('x', 0, 'x', 'x') 230 | 231 | self.params = [self.W, self.b] 232 | 233 | def predict(self, new_data, batch_size): 234 | """ 235 | predict for new data 236 | """ 237 | img_shape = (batch_size, 1, self.image_shape[2], self.image_shape[3]) 238 | conv_out = conv.conv2d(input=new_data, filters=self.W, 239 | filter_shape=self.filter_shape, image_shape=img_shape) 240 | if self.non_linear == 'tanh': 241 | conv_out_tanh = T.tanh(conv_out + self.b.dimshuffle('x', 0, 'x', 'x')) 242 | output = downsample.max_pool_2d(input=conv_out_tanh, ds=self.poolsize, ignore_border=True) 243 | elif self.non_linear == 'relu': 244 | conv_out_relu = relu(conv_out + self.b.dimshuffle('x', 0, 'x', 'x')) 245 | output = 
downsample.max_pool_2d(input=conv_out_relu, ds=self.poolsize, ignore_border=True) 246 | else: 247 | pooled_out = downsample.max_pool_2d(input=conv_out, ds=self.poolsize, ignore_border=True) 248 | output = pooled_out + self.b.dimshuffle('x', 0, 'x', 'x') 249 | return output 250 | -------------------------------------------------------------------------------- /train_model.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dhl' 2 | 3 | 4 | import theano 5 | import theano.tensor as T 6 | import numpy as np 7 | 8 | from theano_cnn import LeNetConvPoolLayer, HiddenLayer, sgd_updates_adadelta 9 | 10 | import data_load 11 | 12 | def_filter_hs = [2, 3] 13 | sentence_len = 50 14 | sentence_pad_len = def_filter_hs[-1] - 1 15 | training_part_size = 25000 16 | 17 | 18 | def relu(x): 19 | return T.maximum(0.0, x) 20 | 21 | 22 | def to_theano_shared(vals): 23 | return theano.shared(value=np.asarray(vals, 24 | dtype=theano.config.floatX), 25 | borrow=True) 26 | 27 | 28 | def train_cnn_for_el(train_data_file_name, 29 | val_data_file_name, 30 | num_val_candidates, 31 | test_data_file_name, 32 | num_test_candidates, 33 | img_h, img_w, 34 | all_words, # first row of all_words should be a non-existing word 35 | wid_idx_dict, 36 | entity_vecs, 37 | gold_as_first_candidate=False, 38 | skip_width_loading=40, # skip width while loading samples 39 | n_epochs=25, 40 | batch_size=50, 41 | filter_hs=def_filter_hs, 42 | num_feature_maps=100, 43 | conv_non_linear="relu", 44 | lr_decay=0.9, 45 | sqr_norm_lim=9, 46 | hidden_out_len=50,): 47 | rng = np.random.RandomState(3435) 48 | 49 | x = T.imatrix('x') 50 | # es = T.imatrix('es') 51 | # es_test = T.imatrix('es_test') 52 | entities = T.imatrix('entities') 53 | 54 | print 'making entity_vecs...', len(entity_vecs) 55 | shared_entity_vecs = theano.shared(value=np.asarray(entity_vecs, dtype=theano.config.floatX), 56 | name='entity_vecs', borrow=True) 57 | # shared_entity_vecs = theano.shared(value=np.asarray(entity_vecs, dtype=np.float32), 58 | # name='entity_vecs', borrow=True) 59 | print 'making shared_words...', len(all_words) 60 | shared_words = theano.shared(value=np.asarray(all_words, dtype=theano.config.floatX), 61 | name='shared_words', borrow=True) 62 | print 'done' 63 | 64 | # test_contexts, test_indices = get_data_set_full(test_data_file_name, wid_idx_dict, skip_width_loading) 65 | # num_test_batches = test_indices.shape[0] / batch_size 66 | # num_val_contexts, val_contexts, val_indices = get_data_set_full(val_data_file_name, 67 | # wid_idx_dict, skip_width_loading) 68 | val_contexts, val_indices = data_load.load_samples_full(val_data_file_name, wid_idx_dict, sentence_len, 69 | sentence_pad_len, 70 | skip_width=skip_width_loading, 71 | num_candidates=num_val_candidates) 72 | num_val_batches = len(val_contexts) / batch_size 73 | print num_val_batches, 'validation batches' 74 | print len(val_indices[0]), 'candidates per mention' 75 | 76 | if gold_as_first_candidate: 77 | gold_labels = theano.shared(value=np.zeros(batch_size, 78 | dtype='int32'), 79 | borrow=True) 80 | else: 81 | gold_labels = theano.shared(value=np.ones(batch_size, 82 | dtype='int32'), 83 | borrow=True) 84 | 85 | val_contexts = T.cast(to_theano_shared(val_contexts), 'int32') 86 | val_indices = T.cast(to_theano_shared(val_indices), 'int32') 87 | 88 | filter_shapes = [] 89 | pool_sizes = [] 90 | filter_w = img_w 91 | for filter_h in filter_hs: 92 | filter_shapes.append((num_feature_maps, 1, filter_h, filter_w)) 93 | pool_sizes.append((img_h - 
filter_h + 1, img_w - filter_w + 1)) 94 | 95 | layer0_input = shared_words[x.flatten()].reshape((x.shape[0], 1, x.shape[1], shared_words.shape[1])) 96 | conv_layers = [] 97 | layer1_inputs = [] 98 | for i in xrange(len(filter_hs)): 99 | filter_shape = filter_shapes[i] 100 | pool_size = pool_sizes[i] 101 | conv_layer = LeNetConvPoolLayer(rng, input=layer0_input, image_shape=(batch_size, 1, img_h, img_w), 102 | filter_shape=filter_shape, poolsize=pool_size, non_linear=conv_non_linear) 103 | layer1_input = conv_layer.output.flatten(2) 104 | conv_layers.append(conv_layer) 105 | layer1_inputs.append(layer1_input) 106 | 107 | layer1_input = T.concatenate(layer1_inputs, 1) 108 | matcher0 = HiddenLayer(rng, layer1_input, num_feature_maps * len(filter_hs), 109 | hidden_out_len, relu) 110 | mc = matcher0.output # mention contexts 111 | 112 | unit_mc = mc / T.sqrt(T.maximum(T.sum(T.sqr(mc), 1), 0.0001)).dimshuffle(0, 'x') 113 | 114 | batch_entity_vecs = shared_entity_vecs[entities] 115 | matcher1 = HiddenLayer(rng, batch_entity_vecs, len(entity_vecs[0]), hidden_out_len, relu) 116 | entity_reps = matcher1.output 117 | # entity_reps = batch_entity_vecs 118 | 119 | unit_entity_reps = entity_reps / T.sqrt(T.maximum(T.sum(T.sqr(entity_reps), 2), 0.0001)).dimshuffle(0, 1, 'x') 120 | 121 | similarities = (unit_mc.dimshuffle(0, 'x', 1) * unit_entity_reps).sum(axis=2) 122 | correct_rate = T.mean(T.eq(gold_labels, T.argmax(similarities, axis=1))) 123 | 124 | loss = T.maximum(0, 1 - similarities[:, 0] + similarities[:, 1]).sum() 125 | 126 | # similarities = (mc.dimshuffle(0, 'x', 1) * batch_entity_vecs).sum(axis=2) # / mc_norm 127 | 128 | params = matcher0.params + matcher1.params 129 | # params = matcher0.params 130 | for conv_layer in conv_layers: 131 | params += conv_layer.params 132 | grad_updates = sgd_updates_adadelta(params, loss, lr_decay, 1e-6, sqr_norm_lim) 133 | 134 | index = T.lscalar() 135 | 136 | # test_model = theano.function( 137 | # [index], 138 | # error_rate, 139 | # givens={x: test_contexts[index * batch_size: (index + 1) * batch_size], 140 | # es: test_indices[index * batch_size: (index + 1) * batch_size]} 141 | # ) 142 | 143 | val_model = theano.function( 144 | [index], 145 | correct_rate, 146 | givens={x: val_contexts[index * batch_size: (index + 1) * batch_size], 147 | entities: val_indices[index * batch_size: (index + 1) * batch_size]} 148 | ) 149 | 150 | train_contexts = theano.shared( 151 | value=np.zeros((3, 2)), 152 | borrow=True) 153 | int_train_contexts = T.cast(train_contexts, 'int32') 154 | train_indices = theano.shared( 155 | value=np.zeros((3, 2)), 156 | borrow=True) 157 | int_train_indices = T.cast(train_indices, 'int32') 158 | train_model = theano.function( 159 | [index], 160 | loss, 161 | updates=grad_updates, 162 | givens={x: int_train_contexts[index * batch_size: (index + 1) * batch_size], 163 | entities: int_train_indices[index * batch_size: (index + 1) * batch_size]} 164 | ) 165 | 166 | fdebug = theano.function( 167 | [index], 168 | similarities, 169 | givens={x: int_train_contexts[index * batch_size: (index + 1) * batch_size], 170 | entities: int_train_indices[index * batch_size: (index + 1) * batch_size]} 171 | ) 172 | # print fdebug(0) 173 | 174 | val_perfs = [val_model(i) for i in xrange(num_val_batches)] 175 | print('init val perf %f' % np.mean(val_perfs)) 176 | 177 | print 'training ...' 
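    # The loop below streams the training file in chunks of training_part_size
    # paragraphs, copies each chunk into the shared train_contexts /
    # train_indices variables, and minimizes a pairwise hinge ranking loss over
    # the cosine similarities defined above: candidate 0 is scored against
    # candidate 1 via max(0, 1 - similarities[:, 0] + similarities[:, 1]).
    # A tiny numpy illustration of that loss on made-up scores (a sketch only):
    #   import numpy as np
    #   sims = np.array([[0.9, 0.2], [0.3, 0.8]])  # rows: mentions, cols: candidates
    #   hinge = np.maximum(0, 1 - sims[:, 0] + sims[:, 1])
    #   print(hinge.sum())  # 0.3 + 1.5 = 1.8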
178 | f_train = open(train_data_file_name, 'rb') 179 | epoch = 0 180 | while epoch < n_epochs: 181 | epoch += 1 182 | 183 | train_part_cnt = 0 184 | # num_train_contexts, cur_train_contexts, cur_train_indices = get_data_set_part( 185 | # f_train, wid_idx_dict, 50000) 186 | cur_train_contexts, cur_train_indices = data_load.load_training_samples(f_train, 187 | training_part_size, 188 | wid_idx_dict, 189 | sentence_len, 190 | sentence_pad_len) 191 | while not len(cur_train_contexts) == 0: 192 | train_contexts.set_value(cur_train_contexts, borrow=True) 193 | train_indices.set_value(cur_train_indices, borrow=True) 194 | # print fdebug(0) 195 | 196 | train_part_cnt += 1 197 | num_train_batches = len(cur_train_contexts) / batch_size 198 | # print 'num_train_batches', num_train_batches 199 | mean_loss = 0 200 | for minibatch_index in xrange(num_train_batches): 201 | cur_loss = train_model(minibatch_index) 202 | mean_loss += cur_loss 203 | # if (minibatch_index + 1) % (num_train_batches / 3) == 0: # show some progress 204 | # print minibatch_index, num_train_batches 205 | print 'loss:', mean_loss / num_train_batches 206 | # print fdebug(0) 207 | 208 | val_perfs = [val_model(i) for i in xrange(num_val_batches)] 209 | val_perf = np.mean(val_perfs) 210 | print('epoch %i, training part %i, val perf %f' 211 | % (epoch, train_part_cnt, val_perf)) 212 | cur_train_contexts, cur_train_indices = data_load.load_training_samples(f_train, 213 | training_part_size, 214 | wid_idx_dict, 215 | sentence_len, 216 | sentence_pad_len) 217 | # num_train_contexts, cur_train_contexts, cur_train_indices = get_data_set_part( 218 | # f_train, wid_idx_dict, 50000) 219 | 220 | f_train.close() 221 | 222 | 223 | def main(): 224 | _, word_vecs = data_load.load_word_vectors('/media/dhl/Data/el/word2vec/wiki_vectors.jbin') 225 | word_vec_len = len(word_vecs[0]) 226 | 227 | wid_idx_dict, entity_vecs = data_load.load_entities( 228 | '/media/dhl/Data/el/vec_rep/wid_entity_rep_wiki50_cat.bin', 229 | False) 230 | # wid_idx_dict, entity_vecs = data_load.load_entities('/media/dhl/Data/el/vec_rep/wid_entity_rep_wiki50.bin', 231 | # True) 232 | 233 | # all_word_vecs = 234 | num_val_candidates = 30 235 | num_test_candidates = 30 236 | skipwidth_loading = 0 237 | img_h = sentence_len + 2 * sentence_pad_len 238 | train_cnn_for_el('/media/dhl/Data/el/vec_rep/wiki_train_word_vec_indices_wiki50.td', 239 | '/media/dhl/Data/el/vec_rep/tac_2014_training.bin', 240 | # '/media/dhl/Data/el/vec_rep/wiki_val_word_vec_indices_wiki50.td', 241 | num_val_candidates, 242 | '/media/dhl/Data/el/vec_rep/wiki_test_word_vec_indices_wiki50.td', 243 | num_test_candidates, 244 | img_h, word_vec_len, 245 | word_vecs, 246 | wid_idx_dict, 247 | entity_vecs, 248 | skip_width_loading=skipwidth_loading, 249 | n_epochs=1) 250 | 251 | 252 | if __name__ == '__main__': 253 | main() 254 | -------------------------------------------------------------------------------- /train_parallel_cnn.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | __author__ = 'dhl' 3 | 4 | import sys 5 | 6 | import numpy as np 7 | 8 | import theano 9 | import theano.tensor as T 10 | 11 | import data_load 12 | from sentence_cnn import SentenceCNN 13 | from theano_cnn import HiddenLayer, relu, sgd_updates_adadelta 14 | 15 | 16 | # TODO remove these global variables 17 | def_filter_hs = [2, 3] 18 | sentence_pad_len = def_filter_hs[-1] - 1 19 | 20 | 21 | max_num_entity_words = 50 22 | entity_pad_len = 1 23 | entity_rep_len = max_num_entity_words + 2 * entity_pad_len 24 | entity_hs = [1] 25 | num_entity_rep_feature_maps = 300 26 | 27 | 28 | def to_theano_shared(vals): 29 | return theano.shared(value=np.asarray(vals, 30 | dtype=theano.config.floatX), 31 | borrow=True) 32 | 33 | 34 | def get_entity_context_similarities(unit_mc, cnn_output_for_entities, batch_size, num_candidates): 35 | entity_reps = cnn_output_for_entities.reshape((batch_size, num_candidates, 36 | cnn_output_for_entities.shape[1])) 37 | unit_entity_reps = entity_reps / T.sqrt(T.maximum( 38 | T.sum(T.sqr(entity_reps), 2), 0.0001)).dimshuffle(0, 1, 'x') 39 | return (unit_mc.dimshuffle(0, 'x', 1) * unit_entity_reps).sum(axis=2) 40 | 41 | 42 | def get_training_variables_with_entity_side_cnn(batch_entity_vecs, shared_words, word_vec_len, batch_size, 43 | hidden_out_len, unit_mc, num_train_candidates, 44 | num_val_candidates, gold_labels, rng): 45 | entity_vecs_reshaped = batch_entity_vecs.reshape((batch_entity_vecs.shape[0] * batch_entity_vecs.shape[1], 46 | batch_entity_vecs.shape[2])) 47 | 48 | sentence_cnn1_train = SentenceCNN(entity_vecs_reshaped, shared_words, entity_rep_len, word_vec_len, entity_hs, 49 | num_entity_rep_feature_maps, 50 | batch_size * num_train_candidates, hidden_out_len, rng) 51 | entity_reps_train = sentence_cnn1_train.output 52 | similarities_train = get_entity_context_similarities(unit_mc, entity_reps_train, batch_size, num_train_candidates) 53 | loss = T.maximum(0, 1 - similarities_train[:, 0] + similarities_train[:, 1]).sum() 54 | 55 | sentence_cnn1_val = SentenceCNN(entity_vecs_reshaped, shared_words, entity_rep_len, word_vec_len, entity_hs, 56 | num_entity_rep_feature_maps, 57 | batch_size * num_val_candidates, 58 | hidden_out_len, rng, 59 | hidden_W=sentence_cnn1_train.hiddenW, 60 | hidden_b=sentence_cnn1_train.hiddenb, 61 | conv_Ws=sentence_cnn1_train.convWs, 62 | conv_bs=sentence_cnn1_train.convbs) 63 | entity_reps_val = sentence_cnn1_val.output 64 | 65 | similarities_val = get_entity_context_similarities(unit_mc, entity_reps_val, batch_size, num_val_candidates) 66 | correct_rate = T.mean(T.eq(gold_labels, T.argmax(similarities_val, axis=1))) 67 | 68 | params = sentence_cnn1_train.params 69 | 70 | return loss, correct_rate, params 71 | 72 | 73 | def get_training_variables_no_entity_side_cnn(batch_entity_vecs, entity_vec_len, hidden_out_len, unit_mc, 74 | gold_labels, rng): 75 | matcher1 = HiddenLayer(rng, batch_entity_vecs, entity_vec_len, hidden_out_len, relu) 76 | entity_reps = matcher1.output 77 | unit_entity_reps = entity_reps / T.sqrt(T.maximum( 78 | T.sum(T.sqr(entity_reps), 2), 0.0001)).dimshuffle(0, 1, 'x') 79 | 80 | similarities = (unit_mc.dimshuffle(0, 'x', 1) * unit_entity_reps).sum(axis=2) 81 | loss = T.maximum(0, 1 - similarities[:, 0] + similarities[:, 1]).sum() 82 | correct_rate = T.mean(T.eq(gold_labels, T.argmax(similarities, axis=1))) 83 | params = matcher1.params 84 | return loss, correct_rate, params 85 | 86 | 87 | def train_cnn_for_el(train_data_file_name, 88 | val_data_file_name, 89 | num_val_candidates, 90 | 
test_data_file_name, 91 | num_test_candidates, 92 | sentence_len, word_vec_len, 93 | all_words, # first row of all_words should be a non-existing word 94 | wid_idx_dict, 95 | entity_vecs, 96 | entity_side_cnn=False, 97 | gold_as_first_candidate=False, 98 | skip_width_loading=40, # skip width while loading samples 99 | n_epochs=25, 100 | batch_size=50, 101 | filter_hs=def_filter_hs, 102 | num_feature_maps=100, 103 | lr_decay=0.9, 104 | sqr_norm_lim=9, 105 | hidden_out_len=50, 106 | training_part_size=50000, 107 | num_train_candidates=2): 108 | full_sentence_len = sentence_len + 2 * sentence_pad_len 109 | rng = np.random.RandomState(3435) 110 | 111 | print 'making entity_vecs...', len(entity_vecs) 112 | if entity_side_cnn: 113 | shared_entity_vecs = theano.shared(value=np.asarray(entity_vecs, dtype="int32"), 114 | name='entity_vecs', borrow=True) 115 | else: 116 | shared_entity_vecs = theano.shared(value=np.asarray(entity_vecs, dtype=theano.config.floatX), 117 | name='entity_vecs', borrow=True) 118 | 119 | print 'making shared_words...', len(all_words) 120 | shared_words = theano.shared(value=np.asarray(all_words, dtype=theano.config.floatX), 121 | name='shared_words', borrow=True) 122 | print 'done' 123 | 124 | val_contexts, val_indices = data_load.load_samples_full(val_data_file_name, wid_idx_dict, sentence_len, 125 | sentence_pad_len, 126 | skip_width=skip_width_loading, 127 | num_candidates=num_val_candidates) 128 | num_val_batches = len(val_contexts) / batch_size 129 | print num_val_batches, 'validation batches' 130 | print len(val_indices[0]), 'candidates per mention' 131 | val_contexts = T.cast(to_theano_shared(val_contexts), 'int32') 132 | val_indices = T.cast(to_theano_shared(val_indices), 'int32') 133 | 134 | test_contexts, test_indices = data_load.load_samples_full(test_data_file_name, wid_idx_dict, sentence_len, 135 | sentence_pad_len, 136 | skip_width=skip_width_loading, 137 | num_candidates=num_test_candidates) 138 | num_test_batches = len(test_contexts) / batch_size 139 | print num_test_batches, 'test batches' 140 | print len(test_indices[0]), 'candidates per mention' 141 | test_contexts = T.cast(to_theano_shared(test_contexts), 'int32') 142 | test_indices = T.cast(to_theano_shared(test_indices), 'int32') 143 | 144 | if gold_as_first_candidate: 145 | gold_labels = theano.shared(value=np.zeros(batch_size, 146 | dtype='int32'), 147 | borrow=True) 148 | else: 149 | gold_labels = theano.shared(value=np.ones(batch_size, 150 | dtype='int32'), 151 | borrow=True) 152 | 153 | x = T.imatrix('x') 154 | entities = T.imatrix('entities') 155 | 156 | sentence_cnn0 = SentenceCNN(x, shared_words, full_sentence_len, word_vec_len, filter_hs, num_feature_maps, 157 | batch_size, 158 | hidden_out_len, rng) 159 | mc = sentence_cnn0.output # mention contexts 160 | unit_mc = mc / T.sqrt(T.maximum(T.sum(T.sqr(mc), 1), 0.0001)).dimshuffle(0, 'x') 161 | 162 | batch_entity_vecs = shared_entity_vecs[entities] 163 | 164 | if entity_side_cnn: 165 | loss, correct_rate, entity_side_params = get_training_variables_with_entity_side_cnn(batch_entity_vecs, 166 | shared_words, 167 | word_vec_len, batch_size, 168 | hidden_out_len, unit_mc, 169 | num_train_candidates, 170 | num_val_candidates, 171 | gold_labels, rng) 172 | else: 173 | loss, correct_rate, entity_side_params = get_training_variables_no_entity_side_cnn(batch_entity_vecs, 174 | len(entity_vecs[0]), 175 | hidden_out_len, 176 | unit_mc, 177 | gold_labels, rng) 178 | # params = matcher0.params + entity_side_params 179 | # for conv_layer in conv_layers: 180 
| # params += conv_layer.params 181 | 182 | # params = sentence_cnn0.params + matcher1.params 183 | # params = sentence_cnn0.params + sentence_cnn1_train.params 184 | 185 | params = sentence_cnn0.params + entity_side_params 186 | grad_updates = sgd_updates_adadelta(params, loss, lr_decay, 1e-6, sqr_norm_lim) 187 | 188 | index = T.lscalar() 189 | 190 | val_model = theano.function( 191 | [index], 192 | correct_rate, 193 | givens={x: val_contexts[index * batch_size: (index + 1) * batch_size], 194 | entities: val_indices[index * batch_size: (index + 1) * batch_size]} 195 | ) 196 | 197 | test_model = theano.function( 198 | [index], 199 | correct_rate, 200 | givens={x: test_contexts[index * batch_size: (index + 1) * batch_size], 201 | entities: test_indices[index * batch_size: (index + 1) * batch_size]} 202 | ) 203 | 204 | train_contexts = theano.shared( 205 | value=np.zeros((3, 2)), 206 | borrow=True) 207 | int_train_contexts = T.cast(train_contexts, 'int32') 208 | train_indices = theano.shared( 209 | value=np.zeros((3, 2)), 210 | borrow=True) 211 | int_train_indices = T.cast(train_indices, 'int32') 212 | train_model = theano.function( 213 | [index], 214 | loss, 215 | updates=grad_updates, 216 | givens={x: int_train_contexts[index * batch_size: (index + 1) * batch_size], 217 | entities: int_train_indices[index * batch_size: (index + 1) * batch_size]} 218 | ) 219 | 220 | fdebug = theano.function( 221 | [index], 222 | batch_entity_vecs, 223 | givens={entities: int_train_indices[index * batch_size: (index + 1) * batch_size]} 224 | ) 225 | # print fdebug(0) 226 | 227 | val_perfs = [val_model(i) for i in xrange(num_val_batches)] 228 | print('init val perf %f' % np.mean(val_perfs)) 229 | 230 | epoch = 0 231 | max_val_perf = 0 232 | test_perf = 0 233 | print 'training ...' 
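# The loop below streams the training file in chunks of training_part_size samples:
# each chunk is copied into the shared variables and trained in minibatches of
# batch_size (at most 100 chunks per epoch, see train_part_cnt < 100). After every
# chunk the validation batches are scored, and the test set is re-evaluated
# whenever the validation accuracy improves on the best value seen so far.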
234 | while epoch < n_epochs: 235 | f_train = open(train_data_file_name, 'rb') 236 | epoch += 1 237 | 238 | train_part_cnt = 0 239 | 240 | cur_train_contexts, cur_train_indices = data_load.load_training_samples(f_train, 241 | training_part_size, 242 | wid_idx_dict, 243 | sentence_len, 244 | sentence_pad_len) 245 | 246 | while not len(cur_train_contexts) == 0 and train_part_cnt < 100: 247 | train_contexts.set_value(cur_train_contexts, borrow=True) 248 | train_indices.set_value(cur_train_indices, borrow=True) 249 | 250 | train_part_cnt += 1 251 | num_train_batches = len(cur_train_contexts) / batch_size 252 | # print 'num_train_batches', num_train_batches 253 | mean_loss = 0 254 | for minibatch_index in xrange(num_train_batches): 255 | cur_loss = train_model(minibatch_index) 256 | mean_loss += cur_loss 257 | # print minibatch_index, cur_loss 258 | print 'loss:', mean_loss / num_train_batches 259 | # print fdebug(0) 260 | 261 | val_perfs = [val_model(i) for i in xrange(num_val_batches)] 262 | val_perf = np.mean(val_perfs) 263 | print('epoch %i, training part %i, val perf %f(%f), test perf %f' 264 | % (epoch, train_part_cnt, val_perf, max_val_perf, test_perf)) 265 | 266 | if val_perf > max_val_perf: 267 | max_val_perf = val_perf 268 | test_perfs = [test_model(i) for i in xrange(num_test_batches)] 269 | test_perf = np.mean(test_perfs) 270 | print('\tepoch %i, training part %i, test_perf %f' 271 | % (epoch, train_part_cnt, test_perf)) 272 | 273 | cur_train_contexts, cur_train_indices = data_load.load_training_samples(f_train, 274 | training_part_size, 275 | wid_idx_dict, 276 | sentence_len, 277 | sentence_pad_len) 278 | f_train.close() 279 | 280 | 281 | def load_params(param_file_name): 282 | f = open(param_file_name, 'rb') 283 | params = dict() 284 | line = f.readline() 285 | while line: 286 | vals = line.decode('utf8').strip() 287 | if not vals == '': 288 | vals = vals.split(' ') 289 | params[vals[0]] = vals[1] 290 | line = f.readline() 291 | f.close() 292 | return params 293 | 294 | 295 | def main(): 296 | if len(sys.argv) < 2: 297 | print 'need params file' 298 | 299 | params = load_params(sys.argv[1]) 300 | 301 | entity_side_cnn = params['entity_side_cnn'] == '1' 302 | word_vec_file_name = params['word_vec_file'] 303 | 304 | if entity_side_cnn: 305 | entity_rep_file_name = params['entity_rep_indices_file'] 306 | else: 307 | entity_rep_file_name = params['entity_rep_vec_file'] 308 | 309 | train_data_file_name = params['train_data_file'] 310 | val_data_file_name = params['val_data_file'] 311 | test_data_file_name = params['test_data_file'] 312 | 313 | training_part_size = int(params['training_part_size']) 314 | sentence_len = int(params['context_sentence_len']) 315 | 316 | _, word_vecs = data_load.load_word_vectors(word_vec_file_name) 317 | word_vec_len = len(word_vecs[0]) 318 | 319 | if entity_side_cnn: 320 | print 'entity use cnn' 321 | global entity_rep_len 322 | wid_idx_dict, entity_vecs, entity_rep_len = data_load.load_index_vec_of_entities_fixed_len( 323 | entity_rep_file_name) 324 | else: 325 | wid_idx_dict, entity_vecs = data_load.load_entities( 326 | entity_rep_file_name, 327 | False) 328 | 329 | # wid_idx_dict, entity_vecs = data_load.load_entities_indices( 330 | # entity_rep_file_name, max_num_entity_words, entity_pad_len) 331 | 332 | num_val_candidates = 30 333 | num_test_candidates = 30 334 | skipwidth_loading = 0 335 | train_cnn_for_el(train_data_file_name, 336 | val_data_file_name, 337 | num_val_candidates, 338 | test_data_file_name, 339 | num_test_candidates, 340 | 
sentence_len, word_vec_len, 341 | word_vecs, 342 | wid_idx_dict, 343 | entity_vecs, 344 | entity_side_cnn=entity_side_cnn, 345 | gold_as_first_candidate=False, 346 | skip_width_loading=skipwidth_loading, 347 | n_epochs=1, 348 | training_part_size=training_part_size) 349 | 350 | 351 | if __name__ == '__main__': 352 | main() 353 | --------------------------------------------------------------------------------
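train_parallel_cnn.py takes the path of a parameter file as its only command-line argument (e.g. `python train_parallel_cnn.py params_cnn.txt`). load_params reads that file as one space-separated key/value pair per line, and main() looks up entity_side_cnn, word_vec_file, train_data_file, val_data_file, test_data_file, training_part_size, context_sentence_len, plus either entity_rep_indices_file (when entity_side_cnn is 1) or entity_rep_vec_file (otherwise). The sketch below only illustrates that layout; the paths and numeric values are placeholders, not the contents of the original params.txt / params_cnn.txt.

    entity_side_cnn 0
    word_vec_file /path/to/wiki_vectors.jbin
    entity_rep_vec_file /path/to/wid_entity_rep.bin
    entity_rep_indices_file /path/to/wid_entity_rep_indices.bin
    train_data_file /path/to/train_word_vec_indices.td
    val_data_file /path/to/val_word_vec_indices.td
    test_data_file /path/to/test_word_vec_indices.td
    training_part_size 50000
    context_sentence_len 50

When entity_side_cnn is 0 the entity_rep_indices_file line is unused (and vice versa for entity_rep_vec_file), since main() only reads the one matching the chosen entity representation.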
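The scoring used throughout train_parallel_cnn.py is cosine similarity between an L2-normalized mention-context vector and L2-normalized candidate-entity vectors, trained with a ranking hinge loss max(0, 1 - s_gold + s_negative) (see get_entity_context_similarities and the loss expressions above). The NumPy sketch below mirrors that computation outside Theano for a single minibatch; the function names, shapes, and random inputs are illustrative only and do not come from the original code.

    import numpy as np

    def unit_rows(a, eps=1e-4):
        # L2-normalize along the last axis, with the same max(sum_of_squares, 0.0001)
        # guard against division by zero that the Theano code uses
        norms = np.sqrt(np.maximum(np.sum(a * a, axis=-1, keepdims=True), eps))
        return a / norms

    def similarities(mention_contexts, entity_reps):
        # mention_contexts: (batch_size, hidden_out_len)
        # entity_reps:      (batch_size, num_candidates, hidden_out_len)
        unit_mc = unit_rows(mention_contexts)
        unit_er = unit_rows(entity_reps)
        # cosine similarity of each mention context with each of its candidates
        return np.sum(unit_mc[:, None, :] * unit_er, axis=2)  # (batch_size, num_candidates)

    def ranking_hinge_loss(sims):
        # candidate 0 plays the role of the gold entity and candidate 1 the negative
        # sample, as in the hinge expression max(0, 1 - s[:, 0] + s[:, 1]) above
        return np.maximum(0.0, 1.0 - sims[:, 0] + sims[:, 1]).sum()

    if __name__ == '__main__':
        rng = np.random.RandomState(0)
        mc = rng.randn(4, 50)     # 4 mentions, hidden_out_len = 50
        er = rng.randn(4, 2, 50)  # 2 candidates per mention
        s = similarities(mc, er)
        print 'similarities shape:', s.shape
        print 'hinge loss:', ranking_hinge_loss(s)

At prediction time the same similarity matrix is used directly: the candidate with the highest cosine similarity (argmax over axis 1) is taken as the linked entity, which is what the correct_rate expressions compute against gold_labels.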