├── .gitattributes ├── .gitignore ├── README.md ├── dataload.py ├── debug_train.py ├── params.txt ├── params_cnn.txt ├── sentence_cnn.py ├── theano_cnn.py ├── train_model.py └── train_parallel_cnn.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ################# 2 | ## Eclipse 3 | ################# 4 | 5 | *.pydevproject 6 | .project 7 | .metadata 8 | bin/ 9 | tmp/ 10 | *.tmp 11 | *.bak 12 | *.swp 13 | *~.nib 14 | local.properties 15 | .classpath 16 | .settings/ 17 | .loadpath 18 | 19 | # External tool builders 20 | .externalToolBuilders/ 21 | 22 | # Locally stored "Eclipse launch configurations" 23 | *.launch 24 | 25 | # CDT-specific 26 | .cproject 27 | 28 | # PDT-specific 29 | .buildpath 30 | 31 | 32 | ################# 33 | ## Visual Studio 34 | ################# 35 | 36 | ## Ignore Visual Studio temporary files, build results, and 37 | ## files generated by popular Visual Studio add-ons. 38 | 39 | # User-specific files 40 | *.suo 41 | *.user 42 | *.sln.docstates 43 | 44 | # Build results 45 | 46 | [Dd]ebug/ 47 | [Rr]elease/ 48 | x64/ 49 | build/ 50 | [Bb]in/ 51 | [Oo]bj/ 52 | 53 | # MSTest test Results 54 | [Tt]est[Rr]esult*/ 55 | [Bb]uild[Ll]og.* 56 | 57 | *_i.c 58 | *_p.c 59 | *.ilk 60 | *.meta 61 | *.obj 62 | *.pch 63 | *.pdb 64 | *.pgc 65 | *.pgd 66 | *.rsp 67 | *.sbr 68 | *.tlb 69 | *.tli 70 | *.tlh 71 | *.tmp 72 | *.tmp_proj 73 | *.log 74 | *.vspscc 75 | *.vssscc 76 | .builds 77 | *.pidb 78 | *.log 79 | *.scc 80 | 81 | # Visual C++ cache files 82 | ipch/ 83 | *.aps 84 | *.ncb 85 | *.opensdf 86 | *.sdf 87 | *.cachefile 88 | 89 | # Visual Studio profiler 90 | *.psess 91 | *.vsp 92 | *.vspx 93 | 94 | # Guidance Automation Toolkit 95 | *.gpState 96 | 97 | # ReSharper is a .NET coding add-in 98 | _ReSharper*/ 99 | *.[Rr]e[Ss]harper 100 | 101 | # TeamCity is a build add-in 102 | _TeamCity* 103 | 104 | # DotCover is a Code Coverage Tool 105 | *.dotCover 106 | 107 | # NCrunch 108 | *.ncrunch* 109 | .*crunch*.local.xml 110 | 111 | # Installshield output folder 112 | [Ee]xpress/ 113 | 114 | # DocProject is a documentation generator add-in 115 | DocProject/buildhelp/ 116 | DocProject/Help/*.HxT 117 | DocProject/Help/*.HxC 118 | DocProject/Help/*.hhc 119 | DocProject/Help/*.hhk 120 | DocProject/Help/*.hhp 121 | DocProject/Help/Html2 122 | DocProject/Help/html 123 | 124 | # Click-Once directory 125 | publish/ 126 | 127 | # Publish Web Output 128 | *.Publish.xml 129 | *.pubxml 130 | *.publishproj 131 | 132 | # NuGet Packages Directory 133 | ## TODO: If you have NuGet Package Restore enabled, uncomment the next line 134 | #packages/ 135 | 136 | # Windows Azure Build Output 137 | csx 138 | *.build.csdef 139 | 140 | # Windows Store app package directory 141 | AppPackages/ 142 | 143 | # Others 144 | sql/ 145 | *.Cache 146 | ClientBin/ 147 | [Ss]tyle[Cc]op.* 148 | ~$* 149 | *~ 150 | *.dbmdl 151 | *.[Pp]ublish.xml 152 | *.pfx 153 | 
*.publishsettings 154 | 155 | # RIA/Silverlight projects 156 | Generated_Code/ 157 | 158 | # Backup & report files from converting an old project file to a newer 159 | # Visual Studio version. Backup files are not needed, because we have git ;-) 160 | _UpgradeReport_Files/ 161 | Backup*/ 162 | UpgradeLog*.XML 163 | UpgradeLog*.htm 164 | 165 | # SQL Server files 166 | App_Data/*.mdf 167 | App_Data/*.ldf 168 | 169 | ############# 170 | ## Windows detritus 171 | ############# 172 | 173 | # Windows image file caches 174 | Thumbs.db 175 | ehthumbs.db 176 | 177 | # Folder config file 178 | Desktop.ini 179 | 180 | # Recycle Bin used on file shares 181 | $RECYCLE.BIN/ 182 | 183 | # Mac crap 184 | .DS_Store 185 | 186 | 187 | ############# 188 | ## Python 189 | ############# 190 | 191 | *.py[cod] 192 | 193 | # Packages 194 | *.egg 195 | *.egg-info 196 | dist/ 197 | build/ 198 | eggs/ 199 | parts/ 200 | var/ 201 | sdist/ 202 | develop-eggs/ 203 | .installed.cfg 204 | 205 | # Installer logs 206 | pip-log.txt 207 | 208 | # Unit test / coverage reports 209 | .coverage 210 | .tox 211 | 212 | #Translations 213 | *.mo 214 | 215 | #Mr Developer 216 | .mr.developer.cfg 217 | 218 | .idea/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | The data needed to run this project is lost. 2 | -------------------------------------------------------------------------------- /dataload.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | 3 | __author__ = 'dhl' 4 | 5 | import numpy 6 | import math 7 | import copy 8 | 9 | def load_word_vectors(file_path): 10 | print 'loading word vectors...' 11 | f = open(file_path, 'rb') 12 | num_words = numpy.fromfile(f, '>i4', 1) 13 | vec_len = numpy.fromfile(f, '>i4', 1) 14 | print num_words, vec_len 15 | words = [] 16 | # word_vecs = [] 17 | # word_vecs.append(0, [0. for i in xrange(word_vec_len)]) 18 | word_vecs = numpy.zeros((num_words + 1, vec_len)) 19 | # word_vecs[0][0] = 1 20 | for i in xrange(num_words): 21 | length = ord(f.read(1)) 22 | byte_buf = f.read(length) 23 | if i == 0: 24 | words.append(byte_buf.decode('utf-8')) 25 | words.append(byte_buf.decode('utf-8')) 26 | word_vecs[i + 1] = numpy.fromfile(f, '>f4', vec_len) 27 | # vec = numpy.fromfile(f, '>f4', vec_len) 28 | # word_vecs.append(vec) 29 | 30 | f.close() 31 | print 'done.' 32 | 33 | return words, word_vecs 34 | 35 | 36 | def load_index_vec_of_entities_fixed_len(file_path): 37 | print 'loading representations of entities (word indices, fixed len) ...' 38 | f = open(file_path, 'rb') 39 | num_entities = numpy.fromfile(f, '>i4', 1) 40 | print num_entities 41 | vec_len = numpy.fromfile(f, '>i4', 1)[0] 42 | print vec_len 43 | 44 | wid_idx_dict = dict() 45 | wid_idx_dict[0] = 0 46 | entity_vecs = numpy.zeros((num_entities + 1, vec_len), dtype='int32') 47 | for i in xrange(num_entities): 48 | wid = numpy.fromfile(f, '>i4', 1) 49 | wid_idx_dict[wid[0]] = i + 1 50 | 51 | # print num_indices 52 | entity_vecs[i + 1] = numpy.fromfile(f, '>i4', vec_len) 53 | 54 | # print i 55 | if (i + 1) % 1000000 == 0: 56 | print i + 1 57 | 58 | f.close() 59 | 60 | print 'done.' 61 | return wid_idx_dict, entity_vecs, vec_len 62 | 63 | 64 | def load_entities_indices(file_path, max_num_words=50, pad_len=1): 65 | print 'loading representations of entities (word indices) ...' 
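    # Expected binary layout, inferred from the reads below (all integers are
    # big-endian int32, numpy dtype '>i4'):
    #   num_entities
    #   repeated num_entities times: wid, num_indices, then num_indices word indices
    # Each entity becomes row i + 1 of entity_vecs: its word indices are shifted
    # by +1 (so 0 stays the padding value), truncated to max_num_words, and
    # padded with pad_len zeros on both sides.  Row 0 is the all-zero "unknown"
    # row, and wid_idx_dict maps a page id to its row (with 0 -> 0).
    #
    # A minimal sketch of writing a toy file in this layout for testing; the
    # file name and values are made up and not part of the original data:
    #   import numpy as np
    #   with open('toy_entity_indices.bin', 'wb') as out:
    #       np.asarray([2], dtype='>i4').tofile(out)          # num_entities
    #       for wid, indices in [(12, [0, 5, 9]), (25, [3])]:
    #           np.asarray([wid], dtype='>i4').tofile(out)
    #           np.asarray([len(indices)], dtype='>i4').tofile(out)
    #           np.asarray(indices, dtype='>i4').tofile(out)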
66 | 67 | f = open(file_path, 'rb') 68 | num_entities = numpy.fromfile(f, '>i4', 1) 69 | print num_entities 70 | 71 | wid_idx_dict = dict() 72 | wid_idx_dict[0] = 0 73 | entity_vecs = numpy.zeros((num_entities + 1, max_num_words + 2 * pad_len), dtype='int32') 74 | for i in xrange(num_entities): 75 | wid = numpy.fromfile(f, '>i4', 1) 76 | wid_idx_dict[wid[0]] = i + 1 77 | 78 | num_indices = numpy.fromfile(f, '>i4', 1) 79 | # print num_indices 80 | indices = numpy.fromfile(f, '>i4', num_indices) 81 | for j in xrange(num_indices): 82 | if j < max_num_words: 83 | entity_vecs[i + 1][pad_len + j] = indices[j] + 1 84 | else: 85 | break 86 | 87 | # print i 88 | if (i + 1) % 1000000 == 0: 89 | print i + 1 90 | 91 | f.close() 92 | 93 | print 'done.' 94 | return wid_idx_dict, entity_vecs 95 | 96 | 97 | def load_entities(file_path, div_by_len=False, unknown_vec=None): 98 | print 'loading entity representations ...' 99 | f = open(file_path, 'rb') 100 | 101 | num_entities = numpy.fromfile(f, '>i4', 1) 102 | vec_len = numpy.fromfile(f, '>i4', 1) 103 | 104 | print num_entities, vec_len 105 | 106 | wid_idx_dict = dict() 107 | wid_idx_dict[0] = 0 108 | entity_vecs = numpy.zeros((num_entities + 1, vec_len)) 109 | if unknown_vec is None: 110 | # entity_vecs[0] = numpy.random.uniform(low=0, high=1, size=(vec_len,)) 111 | entity_vecs[0][0] = 1 112 | else: 113 | entity_vecs[0] = unknown_vec 114 | cnt = 1 115 | while True: 116 | wid = numpy.fromfile(f, '>i4', 1) 117 | 118 | if not wid: 119 | break 120 | 121 | entity_vecs[cnt] = numpy.fromfile(f, '>f4', vec_len) 122 | if div_by_len: 123 | l2_norm = 0 124 | for i in xrange(vec_len): 125 | l2_norm += entity_vecs[cnt][i] * entity_vecs[cnt][i] 126 | l2_norm = math.sqrt(l2_norm) 127 | for i in xrange(vec_len): 128 | entity_vecs[cnt][i] /= l2_norm 129 | 130 | wid_idx_dict[wid[0]] = cnt 131 | 132 | # print entity_vecs[cnt] 133 | cnt += 1 134 | # if cnt == 10: 135 | # break 136 | if cnt % 1000000 == 0: 137 | print cnt 138 | 139 | f.close() 140 | print 'done.' 
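    # wid_idx_dict maps each Wikipedia page id to its row in entity_vecs; row 0
    # is reserved for unknown entities (entity_vecs[0][0] = 1 unless an
    # unknown_vec is supplied).  With div_by_len set, every loaded row was
    # scaled to unit L2 norm by the explicit loop above; a vectorized
    # equivalent, shown only as a sketch and not used by this function:
    #   norms = numpy.sqrt((entity_vecs[1:cnt] ** 2).sum(axis=1, keepdims=True))
    #   entity_vecs[1:cnt] /= numpy.maximum(norms, 1e-8)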
141 | 142 | return wid_idx_dict, entity_vecs 143 | 144 | 145 | def skip_next_training_paragraph(f): 146 | sentence_len = numpy.fromfile(f, '>i4', 1) 147 | if not sentence_len: 148 | return False 149 | 150 | word_indices = numpy.fromfile(f, '>i4', sentence_len) 151 | 152 | num_mentions = numpy.fromfile(f, '>i4', 1) 153 | for i in xrange(num_mentions): 154 | mention_span = numpy.fromfile(f, '>i4', 2) 155 | num_mention_candidates = numpy.fromfile(f, '>i4', 1) 156 | 157 | candidates = numpy.fromfile(f, '>i4', num_mention_candidates) 158 | 159 | 160 | def load_next_training_paragraph(f): 161 | sentence_len = numpy.fromfile(f, '>i4', 1) 162 | if not sentence_len: 163 | return False 164 | 165 | word_indices = numpy.fromfile(f, '>i4', sentence_len) 166 | 167 | num_mentions = numpy.fromfile(f, '>i4', 1) 168 | mention_spans = [] 169 | candidates_mentions = [] 170 | for i in xrange(num_mentions): 171 | mention_span = numpy.fromfile(f, '>i4', 2) 172 | if mention_span is None: 173 | print i, num_mentions, 'weird' 174 | # [mention_beg, mention_end] = numpy.fromfile(f, '>i4', 2) 175 | # mention_spans.append((mention_beg, mention_end)) 176 | if not len(mention_span) == 2: 177 | print i, num_mentions, 'weird' 178 | print mention_span 179 | [mention_beg, mention_end] = mention_span 180 | mention_spans.append([mention_beg, mention_end]) 181 | num_mention_candidates = numpy.fromfile(f, '>i4', 1) 182 | 183 | if num_mention_candidates > 37: 184 | print 'num_mention_candidates', num_mention_candidates 185 | return False 186 | # else: 187 | # print 'num_mention_candidates', num_mention_candidates 188 | 189 | candidates = numpy.fromfile(f, '>i4', num_mention_candidates) 190 | candidates_mentions.append(candidates) 191 | 192 | return word_indices, mention_spans, candidates_mentions 193 | 194 | 195 | def get_mention_centered_context(word_indices, mention_span, sentence_len, pad_len): 196 | result_indices = [] 197 | for i in xrange(pad_len): 198 | result_indices.append(0) 199 | 200 | len_mention_span = mention_span[1] - mention_span[0] + 1 201 | len_side = (sentence_len - len_mention_span) / 2 202 | pos_left = mention_span[0] - len_side 203 | pos_right = mention_span[1] + len_side 204 | 205 | if pos_left < 0: 206 | pos_right -= pos_left 207 | if pos_right >= len(word_indices): 208 | pos_right = len(word_indices) - 1 209 | pos_left = 0 210 | elif pos_right >= len(word_indices): 211 | pos_left -= pos_right - len(word_indices) + 1 212 | if pos_left < 0: 213 | pos_left = 0 214 | pos_right = len(word_indices) - 1 215 | 216 | for pos in xrange(pos_left, pos_right + 1): 217 | cur_word_index = word_indices[pos] 218 | if cur_word_index > -1: 219 | result_indices.append(cur_word_index + 1) 220 | 221 | # print sentence_len, pad_len 222 | 223 | while len(result_indices) < sentence_len + 2 * pad_len: 224 | result_indices.append(0) 225 | 226 | return result_indices 227 | 228 | 229 | def get_samples_in_paragraph(word_indices, mention_spans, candidates_mentions, 230 | wid_idx_dict, dst_contexts, dst_entity_idxs, 231 | sentence_len, sentence_pad_len, num_candidates=2): 232 | cnt0 = 0 233 | cnt1 = 0 234 | if len(mention_spans) == len(candidates_mentions): 235 | entity_indices = numpy.zeros(num_candidates, dtype='int32') 236 | for mention_span, candidate_mentions in zip(mention_spans, candidates_mentions): 237 | if len(candidate_mentions) == 1: 238 | cnt0 += 1 239 | continue 240 | 241 | pos = 0 242 | last_index = 0 243 | while pos < len(candidate_mentions) and pos < num_candidates: 244 | idx = wid_idx_dict.get(candidate_mentions[pos], 
0) 245 | entity_indices[pos] = idx 246 | last_index = idx 247 | pos += 1 248 | 249 | while pos < num_candidates: 250 | entity_indices[pos] = last_index 251 | pos += 1 252 | 253 | mention_context = get_mention_centered_context(word_indices, mention_span, sentence_len, sentence_pad_len) 254 | dst_contexts.append(mention_context) 255 | dst_entity_idxs.append(copy.copy(entity_indices)) 256 | else: 257 | print 'number of mention spans does not match number of candidates of mentions' 258 | 259 | return cnt0, cnt1 260 | 261 | 262 | def load_training_samples(f, num_paragraphs, wid_idx_dict, sentence_len, sentence_pad_len): 263 | print 'loading data', num_paragraphs, 'paragraphs' 264 | contexts = [] 265 | entity_idxs = [] 266 | for i in xrange(num_paragraphs): 267 | result_tuple = load_next_training_paragraph(f) 268 | if result_tuple: 269 | word_indices, mention_spans, candidates_mentions = result_tuple 270 | get_samples_in_paragraph(word_indices, mention_spans, candidates_mentions, 271 | wid_idx_dict, contexts, entity_idxs, 272 | sentence_len, sentence_pad_len) 273 | else: 274 | return contexts, entity_idxs 275 | 276 | print 'done.' 277 | return contexts, entity_idxs 278 | 279 | 280 | def skip_training_sample(f, num_paragraphs): 281 | for i in xrange(num_paragraphs): 282 | skip_next_training_paragraph(f) 283 | 284 | 285 | def load_samples_full(file_name, wid_idx_dict, sentence_len, sentence_pad_len, skip_width=20, num_candidates=2): 286 | print 'loading', file_name, '...' 287 | contexts = [] 288 | entity_idxs = [] 289 | f = open(file_name, 'rb') 290 | cnt = 0 291 | result_tuple = load_next_training_paragraph(f) 292 | while result_tuple: 293 | if skip_width == 0 or cnt % skip_width == 0: 294 | word_indices, mention_spans, candidates_mentions = result_tuple 295 | get_samples_in_paragraph(word_indices, mention_spans, candidates_mentions, 296 | wid_idx_dict, contexts, entity_idxs, 297 | sentence_len, sentence_pad_len, num_candidates) 298 | # print entity_idxs 299 | cnt += 1 300 | 301 | if cnt % 500000 == 0: 302 | print cnt 303 | result_tuple = load_next_training_paragraph(f) 304 | 305 | f.close() 306 | print 'done.' 307 | return contexts, entity_idxs 308 | 309 | 310 | def main(): 311 | print 'data_load' 312 | # words, word_vecs = load_word_vectors('/media/dhl/Data/el/word2vec/wiki_vectors.jbin') 313 | # for i in xrange(300): 314 | # print words[i] 315 | 316 | # f = open('/media/dhl/Data/el/vec_rep/wiki_training_word_vec_indices.td', 'rb') 317 | # word_indices, mention_spans, candidates_mentions = load_next_training_paragraph(f) 318 | # f.close() 319 | 320 | # for i in range(len(word_indices)): 321 | # if word_indices[i] > -1: 322 | # print i, words[word_indices[i]] 323 | 324 | # print word_indices 325 | # print mention_spans 326 | # mention_context = get_mention_centered_context(word_indices, mention_spans[0]) 327 | # print mention_context 328 | 329 | # wid_idx_dict, entity_vecs = load_entities('/media/dhl/Data/el/vec_rep/wid_entity_rep_wiki50_unit_vec.bin', False) 330 | 331 | # wid_idx_dict, entity_vecs = load_entities_indices('/media/dhl/Data/el/vec_rep/wid_entity_rep_wiki50_indices.bin') 332 | # print wid_idx_dict[12] 333 | # print entity_vecs[wid_idx_dict[12]] 334 | # for idx in entity_vecs[wid_idx_dict[12]]: 335 | # print words[idx] 336 | 337 | 338 | if __name__ == '__main__': 339 | main() 340 | -------------------------------------------------------------------------------- /debug_train.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | __author__ = 'dhl' 3 | 4 | import sys 5 | import cPickle 6 | 7 | import numpy as np 8 | 9 | import theano 10 | import theano.tensor as T 11 | 12 | import data_load 13 | from sentence_cnn import SentenceCNN 14 | from theano_cnn import HiddenLayer, relu, sgd_updates_adadelta 15 | 16 | 17 | def to_theano_shared(vals): 18 | return theano.shared(value=np.asarray(vals, 19 | dtype=theano.config.floatX), 20 | borrow=True) 21 | 22 | 23 | def get_entity_context_similarities(unit_mc, cnn_output_for_entities, batch_size, num_candidates): 24 | entity_reps = cnn_output_for_entities.reshape((batch_size, num_candidates, 25 | cnn_output_for_entities.shape[1])) 26 | unit_entity_reps = entity_reps / T.sqrt(T.maximum( 27 | T.sum(T.sqr(entity_reps), 2), 1e-5)).dimshuffle(0, 1, 'x') 28 | return (unit_mc.dimshuffle(0, 'x', 1) * unit_entity_reps).sum(axis=2) 29 | 30 | 31 | # TODO remove these global variables 32 | def_filter_hs = [1, 2] 33 | sentence_len = 50 34 | sentence_pad_len = def_filter_hs[-1] - 1 35 | training_part_size = 50000 36 | 37 | num_train_candidates = 2 38 | 39 | 40 | max_num_entity_words = 50 41 | entity_pad_len = 1 42 | entity_rep_len = max_num_entity_words + 2 * entity_pad_len 43 | entity_hs = [1] 44 | num_entity_rep_feature_maps = 300 45 | 46 | def train_cnn_for_el(train_data_file_name, 47 | val_data_file_name, 48 | num_val_candidates, 49 | test_data_file_name, 50 | num_test_candidates, 51 | full_sentence_len, word_vec_len, 52 | all_words, # first row of all_words should be a non-existing word 53 | wid_idx_dict, 54 | entity_vecs, 55 | gold_as_first_candidate=True, 56 | skip_width_loading=40, # skip width while loading samples 57 | n_epochs=25, 58 | batch_size=50, 59 | filter_hs=def_filter_hs, 60 | num_feature_maps=100, 61 | lr_decay=0.9, 62 | sqr_norm_lim=9, 63 | hidden_out_len=50,): 64 | rng = np.random.RandomState(3435) 65 | 66 | print 'making entity_vecs...', len(entity_vecs) 67 | # shared_entity_vecs = theano.shared(value=np.asarray(entity_vecs, dtype=theano.config.floatX), 68 | # name='entity_vecs', borrow=True) 69 | shared_entity_vecs = theano.shared(value=np.asarray(entity_vecs, dtype="int32"), 70 | name='entity_vecs', borrow=True) 71 | # shared_entity_vecs = theano.shared(value=np.asarray(entity_vecs, dtype=np.float32), 72 | # name='entity_vecs', borrow=True) 73 | print 'making shared_words...', len(all_words) 74 | shared_words = theano.shared(value=np.asarray(all_words, dtype=theano.config.floatX), 75 | name='shared_words', borrow=True) 76 | print 'done' 77 | 78 | # test_contexts, test_indices = get_data_set_full(test_data_file_name, wid_idx_dict, skip_width_loading) 79 | # num_test_batches = test_indices.shape[0] / batch_size 80 | # num_val_contexts, val_contexts, val_indices = get_data_set_full(val_data_file_name, 81 | # wid_idx_dict, skip_width_loading) 82 | val_contexts, val_indices = data_load.load_samples_full(val_data_file_name, wid_idx_dict, sentence_len, 83 | sentence_pad_len, 84 | skip_width=skip_width_loading, 85 | num_candidates=num_val_candidates) 86 | num_val_batches = len(val_contexts) / batch_size 87 | print num_val_batches, 'validation batches' 88 | print len(val_indices[0]), 'candidates per mention' 89 | val_contexts = T.cast(to_theano_shared(val_contexts), 'int32') 90 | val_indices = T.cast(to_theano_shared(val_indices), 'int32') 91 | 92 | test_contexts, test_indices = data_load.load_samples_full(test_data_file_name, wid_idx_dict, sentence_len, 93 | sentence_pad_len, 94 | skip_width=skip_width_loading, 95 | num_candidates=num_test_candidates) 96 
| num_test_batches = len(test_contexts) / batch_size 97 | print num_test_batches, 'test batches' 98 | print len(test_indices[0]), 'candidates per mention' 99 | test_contexts = T.cast(to_theano_shared(test_contexts), 'int32') 100 | test_indices = T.cast(to_theano_shared(test_indices), 'int32') 101 | 102 | if gold_as_first_candidate: 103 | gold_labels = theano.shared(value=np.zeros(batch_size, 104 | dtype='int32'), 105 | borrow=True) 106 | else: 107 | gold_labels = theano.shared(value=np.ones(batch_size, 108 | dtype='int32'), 109 | borrow=True) 110 | 111 | x = T.imatrix('x') 112 | entities = T.imatrix('entities') 113 | 114 | sentence_cnn0 = SentenceCNN(x, shared_words, full_sentence_len, word_vec_len, filter_hs, num_feature_maps, 115 | batch_size, 116 | hidden_out_len, rng) 117 | mc = sentence_cnn0.output # mention contexts 118 | unit_mc = mc / T.sqrt(T.maximum(T.sum(T.sqr(mc), 1), 1e-5)).dimshuffle(0, 'x') 119 | 120 | batch_entity_vecs = shared_entity_vecs[entities] 121 | entity_vecs_reshaped = batch_entity_vecs.reshape((batch_entity_vecs.shape[0] * batch_entity_vecs.shape[1], 122 | batch_entity_vecs.shape[2])) 123 | 124 | sentence_cnn1_train = SentenceCNN(entity_vecs_reshaped, shared_words, entity_rep_len, word_vec_len, entity_hs, 125 | num_entity_rep_feature_maps, 126 | batch_size * num_train_candidates, hidden_out_len, rng) 127 | entity_reps_train = sentence_cnn1_train.output 128 | similarities_train = get_entity_context_similarities(unit_mc, entity_reps_train, batch_size, num_train_candidates) 129 | loss = T.maximum(0, 1 - similarities_train[:, 0] + similarities_train[:, 1]).sum() 130 | 131 | # entity_reps_train = entity_reps_train.reshape((batch_size, num_train_candidates, entity_reps_train.shape[1])) 132 | # matcher1 = HiddenLayer(rng, batch_entity_vecs, len(entity_vecs[0]), hidden_out_len, relu) 133 | # entity_reps = matcher1.output 134 | 135 | # unit_entity_reps_train = entity_reps_train / T.sqrt(T.maximum( 136 | # T.sum(T.sqr(entity_reps_train), 2), 0.0001)).dimshuffle(0, 1, 'x') 137 | # 138 | # similarities = (unit_mc.dimshuffle(0, 'x', 1) * unit_entity_reps).sum(axis=2) 139 | 140 | sentence_cnn1_val = SentenceCNN(entity_vecs_reshaped, shared_words, entity_rep_len, word_vec_len, entity_hs, 141 | num_entity_rep_feature_maps, 142 | batch_size * num_val_candidates, 143 | hidden_out_len, rng, 144 | hidden_W=sentence_cnn1_train.hiddenW, 145 | hidden_b=sentence_cnn1_train.hiddenb, 146 | conv_Ws=sentence_cnn1_train.convWs, 147 | conv_bs=sentence_cnn1_train.convbs) 148 | entity_reps_val = sentence_cnn1_val.output 149 | similarities_val = get_entity_context_similarities(unit_mc, entity_reps_val, batch_size, num_val_candidates) 150 | correct_rate = T.mean(T.eq(gold_labels, T.argmax(similarities_val, axis=1))) 151 | 152 | # similarities = (mc.dimshuffle(0, 'x', 1) * batch_entity_vecs).sum(axis=2) # / mc_norm 153 | 154 | # params = sentence_cnn0.params + matcher1.params 155 | params = sentence_cnn0.params + sentence_cnn1_train.params 156 | grad_updates = sgd_updates_adadelta(params, loss, lr_decay, 1e-6, sqr_norm_lim) 157 | 158 | index = T.lscalar() 159 | 160 | val_model = theano.function( 161 | [index], 162 | correct_rate, 163 | givens={x: val_contexts[index * batch_size: (index + 1) * batch_size], 164 | entities: val_indices[index * batch_size: (index + 1) * batch_size]} 165 | ) 166 | 167 | test_model = theano.function( 168 | [index], 169 | correct_rate, 170 | givens={x: test_contexts[index * batch_size: (index + 1) * batch_size], 171 | entities: test_indices[index * batch_size: (index + 1) 
* batch_size]} 172 | ) 173 | 174 | train_contexts = theano.shared( 175 | value=np.zeros((3, 2)), 176 | borrow=True) 177 | int_train_contexts = T.cast(train_contexts, 'int32') 178 | train_indices = theano.shared( 179 | value=np.zeros((3, 2)), 180 | borrow=True) 181 | int_train_indices = T.cast(train_indices, 'int32') 182 | train_model = theano.function( 183 | [index], 184 | loss, 185 | updates=grad_updates, 186 | givens={x: int_train_contexts[index * batch_size: (index + 1) * batch_size], 187 | entities: int_train_indices[index * batch_size: (index + 1) * batch_size]} 188 | ) 189 | 190 | # fdebug = theano.function( 191 | # [index], 192 | # similarities_train, 193 | # givens={x: int_train_contexts[index * batch_size: (index + 1) * batch_size], 194 | # entities: int_train_indices[index * batch_size: (index + 1) * batch_size]} 195 | # ) 196 | fdebug0 = theano.function( 197 | [index], 198 | entity_reps_train.sum(axis=1), 199 | givens={entities: int_train_indices[index * batch_size: (index + 1) * batch_size]} 200 | ) 201 | fdebug1 = theano.function( 202 | [index], 203 | similarities_train, 204 | givens={x: int_train_contexts[index * batch_size: (index + 1) * batch_size], 205 | entities: int_train_indices[index * batch_size: (index + 1) * batch_size]} 206 | ) 207 | fdebug2 = theano.function( 208 | [index], 209 | unit_mc.sum(axis=1), 210 | givens={x: int_train_contexts[index * batch_size: (index + 1) * batch_size]} 211 | ) 212 | # print fdebug(0) 213 | 214 | # val_perfs = [val_model(i) for i in xrange(num_val_batches)] 215 | # print('init val perf %f' % np.mean(val_perfs)) 216 | 217 | epoch = 0 218 | max_val_perf = 0 219 | test_perf = 0 220 | print 'training ...' 221 | # while epoch < n_epochs: 222 | epoch += 1 223 | 224 | train_part_cnt = 0 225 | 226 | # f_train = open(train_data_file_name, 'rb') 227 | # for i in xrange(143): 228 | # data_load.skip_training_sample(f_train, 50000) 229 | # if i % 40 == 0: 230 | # print i 231 | # print 'skipped' 232 | # 233 | # f_train = open(train_data_file_name, 'rb') 234 | # cur_train_contexts, cur_train_indices = data_load.load_training_samples(f_train, 235 | # training_part_size, 236 | # wid_idx_dict, 237 | # sentence_len, 238 | # sentence_pad_len) 239 | # f_train.close() 240 | 241 | f_debug = open('debug_data.bin', 'rb') 242 | cur_train_contexts, cur_train_indices = cPickle.load(f_debug) 243 | f_debug.close() 244 | 245 | # print cur_train_contexts[9 * batch_size: (9 + 1) * batch_size] 246 | # print cur_train_indices[8 * batch_size: (8 + 1) * batch_size] 247 | 248 | train_contexts.set_value(cur_train_contexts, borrow=True) 249 | train_indices.set_value(cur_train_indices, borrow=True) 250 | 251 | # entity_index_vecs = fdebug0(8) 252 | # for entity_index_vec in entity_index_vecs: 253 | # print entity_index_vec 254 | 255 | train_part_cnt += 1 256 | num_train_batches = len(cur_train_contexts) / batch_size 257 | # print 'num_train_batches', num_train_batches 258 | mean_loss = 0 259 | for minibatch_index in xrange(num_train_batches): 260 | # if minibatch_index == 8: 261 | # continue 262 | # if 6 < minibatch_index < 10: 263 | # print minibatch_index 264 | # print sentence_cnn1_train.hiddenb.get_value() 265 | # print fdebug0(minibatch_index) 266 | cur_loss = train_model(minibatch_index) 267 | # if 6 < minibatch_index < 10: 268 | # print minibatch_index 269 | # print sentence_cnn1_train.hiddenb.get_value() 270 | # print fdebug0(minibatch_index) 271 | print minibatch_index, cur_loss 272 | mean_loss += cur_loss 273 | # if 11 > minibatch_index > 8: 274 | # print 
minibatch_index, cur_loss 275 | # print fdebug(minibatch_index) 276 | # print minibatch_index, cur_loss 277 | print 'loss:', mean_loss / num_train_batches 278 | # print fdebug(0) 279 | 280 | val_perfs = [val_model(i) for i in xrange(num_val_batches)] 281 | val_perf = np.mean(val_perfs) 282 | print('epoch %i, training part %i, val perf %f(%f), test perf %f' 283 | % (epoch, train_part_cnt, val_perf, max_val_perf, test_perf)) 284 | 285 | if val_perf > max_val_perf: 286 | max_val_perf = val_perf 287 | test_perfs = [test_model(i) for i in xrange(num_test_batches)] 288 | test_perf = np.mean(test_perfs) 289 | print('\tepoch %i, training part %i, test_perf %f' 290 | % (epoch, train_part_cnt, test_perf)) 291 | 292 | 293 | def dump_debug_data(): 294 | train_data_file_name = '/media/dhl/Data/el/vec_rep/wiki_train_word_vec_indices_wiki50.td' 295 | entity_rep_file_name = '/media/dhl/Data/el/vec_rep/' + \ 296 | 'wid_entity_rep_wiki50_indices_with_keywords_fixed_len_10kw.bin' 297 | # entity_rep_file_name = '/media/dhl/Data/el/vec_rep/' + \ 298 | # 'wid_entity_rep_wiki50_indices.bin' 299 | 300 | # wid_idx_dict, entity_vecs = data_load.load_entities_indices( 301 | # entity_rep_file_name, max_num_entity_words, entity_pad_len) 302 | global entity_rep_len 303 | wid_idx_dict, entity_vecs, entity_rep_len = data_load.load_index_vec_of_entities_fixed_len( 304 | entity_rep_file_name) 305 | f_train = open(train_data_file_name, 'rb') 306 | for i in xrange(143): 307 | data_load.skip_training_sample(f_train, 50000) 308 | if i % 40 == 0: 309 | print i 310 | print 'skipped' 311 | 312 | cur_train_contexts, cur_train_indices = data_load.load_training_samples(f_train, 313 | training_part_size, 314 | wid_idx_dict, 315 | sentence_len, 316 | sentence_pad_len) 317 | f_debug = open('debug_data_vlen.bin', 'wb') 318 | cPickle.dump([cur_train_contexts, cur_train_indices], f_debug) 319 | f_debug.close() 320 | 321 | 322 | def main(): 323 | local_flg = True 324 | if len(sys.argv) > 1: 325 | if sys.argv[1] == '0': 326 | local_flg = False 327 | 328 | if local_flg: 329 | word_vec_file_name = '/media/dhl/Data/el/word2vec/wiki_vectors.jbin' 330 | entity_rep_file_name = '/media/dhl/Data/el/vec_rep/' + \ 331 | 'wid_entity_rep_wiki50_indices_with_keywords_fixed_len.bin' 332 | # entity_rep_file_name = '/media/dhl/Data/el/vec_rep/' + \ 333 | # 'wid_entity_rep_wiki50_indices.bin' 334 | train_data_file_name = '/media/dhl/Data/el/vec_rep/wiki_train_word_vec_indices_wiki50.td' 335 | val_data_file_name = '/media/dhl/Data/el/vec_rep/tac_2014_training.bin' 336 | test_data_file_name = '/media/dhl/Data/el/vec_rep/tac_2014_eval.bin' 337 | else: 338 | word_vec_file_name = '/home/dhl/data/word_vec/wiki_vectors.jbin' 339 | entity_rep_file_name = '/home/dhl/data/vec_rep/wid_entity_rep_wiki50_indices_with_keywords_fixed_len_0kw.bin' 340 | train_data_file_name = '/home/dhl/data/vec_rep/wiki_train_word_vec_indices_wiki50.td' 341 | val_data_file_name = '/home/dhl/data/vec_rep/tac_2014_training.bin' 342 | test_data_file_name = '/home/dhl/data/vec_rep/tac_2014_eval.bin' 343 | 344 | _, word_vecs = data_load.load_word_vectors(word_vec_file_name) 345 | word_vec_len = len(word_vecs[0]) 346 | 347 | # wid_idx_dict, entity_vecs = data_load.load_entities( 348 | # '/media/dhl/Data/el/vec_rep/wid_entity_rep_wiki50_cat.bin', 349 | # False) 350 | 351 | # wid_idx_dict, entity_vecs = data_load.load_entities_indices( 352 | # entity_rep_file_name, max_num_entity_words, entity_pad_len) 353 | 354 | global entity_rep_len 355 | wid_idx_dict, entity_vecs, entity_rep_len = 
data_load.load_index_vec_of_entities_fixed_len( 356 | entity_rep_file_name) 357 | 358 | num_val_candidates = 30 359 | num_test_candidates = 30 360 | skipwidth_loading = 0 361 | img_h = sentence_len + 2 * sentence_pad_len 362 | train_cnn_for_el(train_data_file_name, 363 | val_data_file_name, 364 | num_val_candidates, 365 | test_data_file_name, 366 | num_test_candidates, 367 | img_h, word_vec_len, 368 | word_vecs, 369 | wid_idx_dict, 370 | entity_vecs, 371 | gold_as_first_candidate=False, 372 | skip_width_loading=skipwidth_loading, 373 | n_epochs=1) 374 | 375 | 376 | if __name__ == '__main__': 377 | # dump_debug_data() 378 | main() 379 | -------------------------------------------------------------------------------- /params.txt: -------------------------------------------------------------------------------- 1 | entity_side_cnn 0 2 | training_part_size 25000 3 | context_sentence_len 50 4 | word_vec_file /media/dhl/Data/el/word2vec/wiki_vectors.jbin 5 | entity_rep_indices_file /media/dhl/Data/el/vec_rep/wid_entity_rep_wiki50_indices_with_keywords_fixed_len_0kw.bin 6 | entity_rep_vec_file /media/dhl/Data/el/vec_rep/wid_entity_rep_wiki50_cat.bin 7 | train_data_file /media/dhl/Data/el/vec_rep/wiki_train_word_vec_indices_wiki50.td 8 | val_data_file /media/dhl/Data/el/vec_rep/tac_2014_train_gold_second.bin 9 | test_data_file /media/dhl/Data/el/vec_rep/tac_2014_eval_gold_second.bin 10 | -------------------------------------------------------------------------------- /params_cnn.txt: -------------------------------------------------------------------------------- 1 | entity_side_cnn 1 2 | training_part_size 25000 3 | context_sentence_len 50 4 | word_vec_file /home/dhl/data/word2vec/wiki_vectors.jbin 5 | entity_rep_indices_file /home/dhl/data/vec_rep/wid_entity_rep_wiki50_indices_with_keywords_fixed_len.bin 6 | entity_rep_vec_file /home/dhl/data/vec_rep/wid_entity_rep_wiki50_cat.bin 7 | train_data_file /home/dhl/data/vec_rep/wiki_train_word_vec_indices_wiki50.td 8 | val_data_file /home/dhl/data/vec_rep/tac_2014_training.bin 9 | test_data_file /home/dhl/data/vec_rep/tac_2014_eval.bin 10 | -------------------------------------------------------------------------------- /sentence_cnn.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dhl' 2 | 3 | import theano.tensor as T 4 | 5 | from theano_cnn import LeNetConvPoolLayer, HiddenLayer, relu 6 | 7 | 8 | class SentenceCNN: 9 | def __init__(self, input_sentences, shared_words, full_sentence_len, word_vec_len, filter_hs, num_feature_maps, 10 | batch_size, 11 | output_len, rng, conv_non_linear=relu, 12 | hidden_W=None, hidden_b=None, conv_Ws=None, conv_bs=None): 13 | # self.input_x = input_x 14 | self.input = input_sentences 15 | self.non_linear = conv_non_linear 16 | 17 | # batch_size = input_sentences.shape[0] 18 | # full_sentence_len = input_sentences.shape[1] 19 | # word_vec_len = shared_words.shape[1] 20 | 21 | filter_shapes = [] 22 | pool_sizes = [] 23 | filter_w = word_vec_len 24 | for filter_h in filter_hs: 25 | filter_shapes.append((num_feature_maps, 1, filter_h, filter_w)) 26 | pool_sizes.append((full_sentence_len - filter_h + 1, word_vec_len - filter_w + 1)) 27 | 28 | layer0_input = shared_words[input_sentences.flatten()].reshape((input_sentences.shape[0], 1, 29 | input_sentences.shape[1], 30 | shared_words.shape[1])) 31 | conv_layers = [] 32 | layer1_inputs = [] 33 | for i in xrange(len(filter_hs)): 34 | filter_shape = filter_shapes[i] 35 | pool_size = pool_sizes[i] 36 | conv_W = None 37 | 
conv_b = None 38 | if conv_Ws is not None: 39 | conv_W = conv_Ws[i] 40 | if conv_bs is not None: 41 | conv_b = conv_bs[i] 42 | conv_layer = LeNetConvPoolLayer(rng, input=layer0_input, 43 | image_shape=(batch_size, 1, full_sentence_len, word_vec_len), 44 | filter_shape=filter_shape, poolsize=pool_size, 45 | non_linear=conv_non_linear.func_name, 46 | W=conv_W, b=conv_b) 47 | layer1_input = conv_layer.output.flatten(2) 48 | conv_layers.append(conv_layer) 49 | layer1_inputs.append(layer1_input) 50 | 51 | layer1_input = T.concatenate(layer1_inputs, 1) 52 | matcher0 = HiddenLayer(rng, layer1_input, num_feature_maps * len(filter_hs), 53 | output_len, relu, W=hidden_W, b=hidden_b) 54 | 55 | self.hiddenW = matcher0.W 56 | self.hiddenb = matcher0.b 57 | self.convWs = list() 58 | self.convbs = list() 59 | for conv_layer in conv_layers: 60 | self.convWs.append(conv_layer.W) 61 | self.convbs.append(conv_layer.b) 62 | 63 | self.output = matcher0.output # mention contexts 64 | self.params = matcher0.params 65 | for conv_layer in conv_layers: 66 | self.params += conv_layer.params 67 | 68 | # unit_mc = mc / T.sqrt(T.maximum(T.sum(T.sqr(mc), 1), 0.0001)).dimshuffle(0, 'x') 69 | -------------------------------------------------------------------------------- /theano_cnn.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dhl' 2 | 3 | import numpy 4 | import theano 5 | import theano.tensor as T 6 | from theano.tensor.signal import downsample 7 | from theano.tensor.nnet import conv 8 | from collections import OrderedDict 9 | 10 | def relu(x): 11 | return T.maximum(0.0, x) 12 | 13 | 14 | def sigmoid(x): 15 | return T.nnet.sigmoid(x) 16 | 17 | 18 | def as_floatx(variable): 19 | if isinstance(variable, float): 20 | return numpy.cast[theano.config.floatX](variable) 21 | 22 | if isinstance(variable, numpy.ndarray): 23 | return numpy.cast[theano.config.floatX](variable) 24 | return theano.tensor.cast(variable, theano.config.floatX) 25 | 26 | 27 | def sgd_updates_adadelta(params, cost, rho=0.95, epsilon=1e-6, norm_lim=9): 28 | """ 29 | adadelta update rule, mostly from 30 | https://groups.google.com/forum/#!topic/pylearn-dev/3QbKtCumAW4 (for Adadelta) 31 | """ 32 | updates = OrderedDict({}) 33 | exp_sqr_grads = OrderedDict({}) 34 | exp_sqr_ups = OrderedDict({}) 35 | gparams = [] 36 | for param in params: 37 | empty = numpy.zeros_like(param.get_value()) 38 | exp_sqr_grads[param] = theano.shared(value=as_floatx(empty), name="exp_grad_%s" % param.name) 39 | gp = T.grad(cost, param) 40 | exp_sqr_ups[param] = theano.shared(value=as_floatx(empty), name="exp_grad_%s" % param.name) 41 | gparams.append(gp) 42 | for param, gp in zip(params, gparams): 43 | exp_sg = exp_sqr_grads[param] 44 | exp_su = exp_sqr_ups[param] 45 | up_exp_sg = rho * exp_sg + (1 - rho) * T.sqr(gp) 46 | updates[exp_sg] = up_exp_sg 47 | step = -(T.sqrt(exp_su + epsilon) / T.sqrt(up_exp_sg + epsilon)) * gp 48 | updates[exp_su] = rho * exp_su + (1 - rho) * T.sqr(step) 49 | stepped_param = param + step 50 | if param.get_value(borrow=True).ndim == 2: 51 | col_norms = T.sqrt(T.sum(T.sqr(stepped_param), axis=0)) 52 | desired_norms = T.clip(col_norms, 0, T.sqrt(norm_lim)) 53 | scale = desired_norms / (1e-7 + col_norms) 54 | updates[param] = stepped_param * scale 55 | else: 56 | updates[param] = stepped_param 57 | return updates 58 | 59 | 60 | class LogisticRegression(object): 61 | """Multi-class Logistic Regression Class 62 | 63 | The logistic regression is fully described by a weight matrix :math:`W` 64 | and 
bias vector :math:`b`. Classification is done by projecting data 65 | points onto a set of hyperplanes, the distance to which is used to 66 | determine a class membership probability. 67 | """ 68 | 69 | def __init__(self, input, n_in, n_out, W=None, b=None): 70 | """ Initialize the parameters of the logistic regression 71 | 72 | :type input: theano.tensor.TensorType 73 | :param input: symbolic variable that describes the input of the 74 | architecture (one minibatch) 75 | 76 | :type n_in: int 77 | :param n_in: number of input units, the dimension of the space in 78 | which the datapoints lie 79 | 80 | :type n_out: int 81 | :param n_out: number of output units, the dimension of the space in 82 | which the labels lie 83 | """ 84 | 85 | # initialize with 0 the weights W as a matrix of shape (n_in, n_out) 86 | if W is None: 87 | self.W = theano.shared( 88 | value=numpy.zeros((n_in, n_out), dtype=theano.config.floatX), 89 | name='W') 90 | else: 91 | self.W = W 92 | 93 | # initialize the baises b as a vector of n_out 0s 94 | if b is None: 95 | self.b = theano.shared( 96 | value=numpy.zeros((n_out,), dtype=theano.config.floatX), 97 | name='b') 98 | else: 99 | self.b = b 100 | 101 | # compute vector of class-membership probabilities in symbolic form 102 | self.p_y_given_x = T.nnet.softmax(T.dot(input, self.W) + self.b) 103 | 104 | # compute prediction as class whose probability is maximal in 105 | # symbolic form 106 | self.y_pred = T.argmax(self.p_y_given_x, axis=1) 107 | 108 | # parameters of the model 109 | self.params = [self.W, self.b] 110 | 111 | def negative_log_likelihood(self, y): 112 | """Return the mean of the negative log-likelihood of the prediction 113 | of this model under a given target distribution. 114 | :type y: theano.tensor.TensorType 115 | :param y: corresponds to a vector that gives for each example the 116 | correct label 117 | """ 118 | return -T.mean(T.log(self.p_y_given_x)[T.arange(y.shape[0]), y]) 119 | 120 | def errors(self, y): 121 | """Return a float representing the number of errors in the minibatch ; 122 | zero one loss over the size of the minibatch 123 | 124 | :type y: theano.tensor.TensorType 125 | :param y: corresponds to a vector that gives for each example the 126 | correct label 127 | """ 128 | 129 | # check if y has same dimension of y_pred 130 | if y.ndim != self.y_pred.ndim: 131 | raise TypeError('y should have the same shape as self.y_pred', 132 | ('y', y.type, 'y_pred', self.y_pred.type)) 133 | # check if y is of the correct datatype 134 | if y.dtype.startswith('int'): 135 | # the T.neq operator returns a vector of 0s and 1s, where 1 136 | # represents a mistake in prediction 137 | return T.mean(T.neq(self.y_pred, y)) 138 | else: 139 | raise NotImplementedError() 140 | 141 | 142 | class HiddenLayer(object): 143 | def __init__(self, rng, input, n_in, n_out, activation, W=None, b=None, 144 | use_bias=True): 145 | self.input = input 146 | self.activation = activation 147 | 148 | if W is None: 149 | if activation.func_name == 'relu': 150 | W_values = numpy.asarray(0.01 * rng.standard_normal(size=(n_in, n_out)), 151 | dtype=theano.config.floatX) 152 | else: 153 | W_bound = numpy.sqrt(6. 
/ (n_in + n_out)) 154 | W_values = numpy.asarray( 155 | rng.uniform(low=-W_bound, high=W_bound, size=(n_in, n_out)), 156 | dtype=theano.config.floatX 157 | ) 158 | W = theano.shared(value=W_values, name='W') 159 | # if broadcast: 160 | # W = W.dimshuffle('x', 0, 1) 161 | if b is None: 162 | b_values = numpy.zeros((n_out,), dtype=theano.config.floatX) 163 | b = theano.shared(value=b_values, name='b') 164 | # if broadcast: 165 | # b = b.dimshuffle('x', 0) 166 | 167 | self.W = W 168 | self.b = b 169 | 170 | lin_output = T.dot(input, self.W) 171 | if use_bias: 172 | lin_output += self.b 173 | 174 | self.output = (lin_output if activation is None else activation(lin_output)) 175 | 176 | self.params = [self.W, self.b] if use_bias else [self.W] 177 | 178 | 179 | # CNN pooling 180 | class LeNetConvPoolLayer(object): 181 | def __init__(self, rng, input, filter_shape, image_shape, poolsize=(2, 2), non_linear='tanh', 182 | W=None, b=None): 183 | assert image_shape[1] == filter_shape[1] 184 | self.input = input 185 | self.filter_shape = filter_shape 186 | self.image_shape = image_shape 187 | self.poolsize = poolsize 188 | self.non_linear = non_linear 189 | 190 | fan_in = numpy.prod(filter_shape[1:]) 191 | fan_out = (filter_shape[0] * numpy.prod(filter_shape[2:]) / numpy.prod(poolsize)) 192 | if W is None: 193 | if self.non_linear == 'none' or self.non_linear == 'relu': 194 | self.W = theano.shared(numpy.asarray(rng.uniform(low=-0.01, high=0.01, 195 | size=filter_shape), 196 | dtype=theano.config.floatX), 197 | borrow=True, 198 | name='W_conv') 199 | else: 200 | W_bound = numpy.sqrt(6. / (fan_in + fan_out)) 201 | self.W = theano.shared(numpy.asarray(rng.uniform(low=-W_bound, high=W_bound, size=filter_shape), 202 | dtype=theano.config.floatX), 203 | borrow=True, 204 | name="W_conv") 205 | else: 206 | self.W = W 207 | 208 | if b is None: 209 | b_values = numpy.zeros((filter_shape[0],), dtype=theano.config.floatX) 210 | self.b = theano.shared(value=b_values, borrow=True, name="b_conv") 211 | else: 212 | self.b = b 213 | 214 | # ftmp = theano.function([], self.W.shape) 215 | # print ftmp() 216 | conv_out = conv.conv2d(input=input, 217 | filters=self.W, 218 | filter_shape=self.filter_shape, 219 | image_shape=self.image_shape) 220 | 221 | if self.non_linear == 'tanh': 222 | conv_out_tanh = T.tanh(conv_out + self.b.dimshuffle('x', 0, 'x', 'x')) 223 | self.output = downsample.max_pool_2d(input=conv_out_tanh, ds=self.poolsize, ignore_border=True) 224 | elif self.non_linear == 'relu': 225 | conv_out_tanh = relu(conv_out + self.b.dimshuffle('x', 0, 'x', 'x')) 226 | self.output = downsample.max_pool_2d(input=conv_out_tanh, ds=self.poolsize, ignore_border=True) 227 | else: 228 | pooled_out = downsample.max_pool_2d(input=conv_out, ds=self.poolsize, ignore_border=True) 229 | self.output = pooled_out + self.b.dimshuffle('x', 0, 'x', 'x') 230 | 231 | self.params = [self.W, self.b] 232 | 233 | def predict(self, new_data, batch_size): 234 | """ 235 | predict for new data 236 | """ 237 | img_shape = (batch_size, 1, self.image_shape[2], self.image_shape[3]) 238 | conv_out = conv.conv2d(input=new_data, filters=self.W, 239 | filter_shape=self.filter_shape, image_shape=img_shape) 240 | if self.non_linear == 'tanh': 241 | conv_out_tanh = T.tanh(conv_out + self.b.dimshuffle('x', 0, 'x', 'x')) 242 | output = downsample.max_pool_2d(input=conv_out_tanh, ds=self.poolsize, ignore_border=True) 243 | elif self.non_linear == 'relu': 244 | conv_out_relu = relu(conv_out + self.b.dimshuffle('x', 0, 'x', 'x')) 245 | output = 
downsample.max_pool_2d(input=conv_out_relu, ds=self.poolsize, ignore_border=True) 246 | else: 247 | pooled_out = downsample.max_pool_2d(input=conv_out, ds=self.poolsize, ignore_border=True) 248 | output = pooled_out + self.b.dimshuffle('x', 0, 'x', 'x') 249 | return output 250 | -------------------------------------------------------------------------------- /train_model.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dhl' 2 | 3 | 4 | import theano 5 | import theano.tensor as T 6 | import numpy as np 7 | 8 | from theano_cnn import LeNetConvPoolLayer, HiddenLayer, sgd_updates_adadelta 9 | 10 | import data_load 11 | 12 | def_filter_hs = [2, 3] 13 | sentence_len = 50 14 | sentence_pad_len = def_filter_hs[-1] - 1 15 | training_part_size = 25000 16 | 17 | 18 | def relu(x): 19 | return T.maximum(0.0, x) 20 | 21 | 22 | def to_theano_shared(vals): 23 | return theano.shared(value=np.asarray(vals, 24 | dtype=theano.config.floatX), 25 | borrow=True) 26 | 27 | 28 | def train_cnn_for_el(train_data_file_name, 29 | val_data_file_name, 30 | num_val_candidates, 31 | test_data_file_name, 32 | num_test_candidates, 33 | img_h, img_w, 34 | all_words, # first row of all_words should be a non-existing word 35 | wid_idx_dict, 36 | entity_vecs, 37 | gold_as_first_candidate=False, 38 | skip_width_loading=40, # skip width while loading samples 39 | n_epochs=25, 40 | batch_size=50, 41 | filter_hs=def_filter_hs, 42 | num_feature_maps=100, 43 | conv_non_linear="relu", 44 | lr_decay=0.9, 45 | sqr_norm_lim=9, 46 | hidden_out_len=50,): 47 | rng = np.random.RandomState(3435) 48 | 49 | x = T.imatrix('x') 50 | # es = T.imatrix('es') 51 | # es_test = T.imatrix('es_test') 52 | entities = T.imatrix('entities') 53 | 54 | print 'making entity_vecs...', len(entity_vecs) 55 | shared_entity_vecs = theano.shared(value=np.asarray(entity_vecs, dtype=theano.config.floatX), 56 | name='entity_vecs', borrow=True) 57 | # shared_entity_vecs = theano.shared(value=np.asarray(entity_vecs, dtype=np.float32), 58 | # name='entity_vecs', borrow=True) 59 | print 'making shared_words...', len(all_words) 60 | shared_words = theano.shared(value=np.asarray(all_words, dtype=theano.config.floatX), 61 | name='shared_words', borrow=True) 62 | print 'done' 63 | 64 | # test_contexts, test_indices = get_data_set_full(test_data_file_name, wid_idx_dict, skip_width_loading) 65 | # num_test_batches = test_indices.shape[0] / batch_size 66 | # num_val_contexts, val_contexts, val_indices = get_data_set_full(val_data_file_name, 67 | # wid_idx_dict, skip_width_loading) 68 | val_contexts, val_indices = data_load.load_samples_full(val_data_file_name, wid_idx_dict, sentence_len, 69 | sentence_pad_len, 70 | skip_width=skip_width_loading, 71 | num_candidates=num_val_candidates) 72 | num_val_batches = len(val_contexts) / batch_size 73 | print num_val_batches, 'validation batches' 74 | print len(val_indices[0]), 'candidates per mention' 75 | 76 | if gold_as_first_candidate: 77 | gold_labels = theano.shared(value=np.zeros(batch_size, 78 | dtype='int32'), 79 | borrow=True) 80 | else: 81 | gold_labels = theano.shared(value=np.ones(batch_size, 82 | dtype='int32'), 83 | borrow=True) 84 | 85 | val_contexts = T.cast(to_theano_shared(val_contexts), 'int32') 86 | val_indices = T.cast(to_theano_shared(val_indices), 'int32') 87 | 88 | filter_shapes = [] 89 | pool_sizes = [] 90 | filter_w = img_w 91 | for filter_h in filter_hs: 92 | filter_shapes.append((num_feature_maps, 1, filter_h, filter_w)) 93 | pool_sizes.append((img_h - 
filter_h + 1, img_w - filter_w + 1)) 94 | 95 | layer0_input = shared_words[x.flatten()].reshape((x.shape[0], 1, x.shape[1], shared_words.shape[1])) 96 | conv_layers = [] 97 | layer1_inputs = [] 98 | for i in xrange(len(filter_hs)): 99 | filter_shape = filter_shapes[i] 100 | pool_size = pool_sizes[i] 101 | conv_layer = LeNetConvPoolLayer(rng, input=layer0_input, image_shape=(batch_size, 1, img_h, img_w), 102 | filter_shape=filter_shape, poolsize=pool_size, non_linear=conv_non_linear) 103 | layer1_input = conv_layer.output.flatten(2) 104 | conv_layers.append(conv_layer) 105 | layer1_inputs.append(layer1_input) 106 | 107 | layer1_input = T.concatenate(layer1_inputs, 1) 108 | matcher0 = HiddenLayer(rng, layer1_input, num_feature_maps * len(filter_hs), 109 | hidden_out_len, relu) 110 | mc = matcher0.output # mention contexts 111 | 112 | unit_mc = mc / T.sqrt(T.maximum(T.sum(T.sqr(mc), 1), 0.0001)).dimshuffle(0, 'x') 113 | 114 | batch_entity_vecs = shared_entity_vecs[entities] 115 | matcher1 = HiddenLayer(rng, batch_entity_vecs, len(entity_vecs[0]), hidden_out_len, relu) 116 | entity_reps = matcher1.output 117 | # entity_reps = batch_entity_vecs 118 | 119 | unit_entity_reps = entity_reps / T.sqrt(T.maximum(T.sum(T.sqr(entity_reps), 2), 0.0001)).dimshuffle(0, 1, 'x') 120 | 121 | similarities = (unit_mc.dimshuffle(0, 'x', 1) * unit_entity_reps).sum(axis=2) 122 | correct_rate = T.mean(T.eq(gold_labels, T.argmax(similarities, axis=1))) 123 | 124 | loss = T.maximum(0, 1 - similarities[:, 0] + similarities[:, 1]).sum() 125 | 126 | # similarities = (mc.dimshuffle(0, 'x', 1) * batch_entity_vecs).sum(axis=2) # / mc_norm 127 | 128 | params = matcher0.params + matcher1.params 129 | # params = matcher0.params 130 | for conv_layer in conv_layers: 131 | params += conv_layer.params 132 | grad_updates = sgd_updates_adadelta(params, loss, lr_decay, 1e-6, sqr_norm_lim) 133 | 134 | index = T.lscalar() 135 | 136 | # test_model = theano.function( 137 | # [index], 138 | # error_rate, 139 | # givens={x: test_contexts[index * batch_size: (index + 1) * batch_size], 140 | # es: test_indices[index * batch_size: (index + 1) * batch_size]} 141 | # ) 142 | 143 | val_model = theano.function( 144 | [index], 145 | correct_rate, 146 | givens={x: val_contexts[index * batch_size: (index + 1) * batch_size], 147 | entities: val_indices[index * batch_size: (index + 1) * batch_size]} 148 | ) 149 | 150 | train_contexts = theano.shared( 151 | value=np.zeros((3, 2)), 152 | borrow=True) 153 | int_train_contexts = T.cast(train_contexts, 'int32') 154 | train_indices = theano.shared( 155 | value=np.zeros((3, 2)), 156 | borrow=True) 157 | int_train_indices = T.cast(train_indices, 'int32') 158 | train_model = theano.function( 159 | [index], 160 | loss, 161 | updates=grad_updates, 162 | givens={x: int_train_contexts[index * batch_size: (index + 1) * batch_size], 163 | entities: int_train_indices[index * batch_size: (index + 1) * batch_size]} 164 | ) 165 | 166 | fdebug = theano.function( 167 | [index], 168 | similarities, 169 | givens={x: int_train_contexts[index * batch_size: (index + 1) * batch_size], 170 | entities: int_train_indices[index * batch_size: (index + 1) * batch_size]} 171 | ) 172 | # print fdebug(0) 173 | 174 | val_perfs = [val_model(i) for i in xrange(num_val_batches)] 175 | print('init val perf %f' % np.mean(val_perfs)) 176 | 177 | print 'training ...' 
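    # The loop below streams the training file in chunks of training_part_size
    # paragraphs, copies each chunk into the shared train_contexts /
    # train_indices variables, and minimizes a pairwise hinge ranking loss over
    # the cosine similarities defined above: candidate 0 is scored against
    # candidate 1 via max(0, 1 - similarities[:, 0] + similarities[:, 1]).
    # A tiny numpy illustration of that loss on made-up scores (a sketch only):
    #   import numpy as np
    #   sims = np.array([[0.9, 0.2], [0.3, 0.8]])  # rows: mentions, cols: candidates
    #   hinge = np.maximum(0, 1 - sims[:, 0] + sims[:, 1])
    #   print(hinge.sum())  # 0.3 + 1.5 = 1.8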
178 | f_train = open(train_data_file_name, 'rb') 179 | epoch = 0 180 | while epoch < n_epochs: 181 | epoch += 1 182 | 183 | train_part_cnt = 0 184 | # num_train_contexts, cur_train_contexts, cur_train_indices = get_data_set_part( 185 | # f_train, wid_idx_dict, 50000) 186 | cur_train_contexts, cur_train_indices = data_load.load_training_samples(f_train, 187 | training_part_size, 188 | wid_idx_dict, 189 | sentence_len, 190 | sentence_pad_len) 191 | while not len(cur_train_contexts) == 0: 192 | train_contexts.set_value(cur_train_contexts, borrow=True) 193 | train_indices.set_value(cur_train_indices, borrow=True) 194 | # print fdebug(0) 195 | 196 | train_part_cnt += 1 197 | num_train_batches = len(cur_train_contexts) / batch_size 198 | # print 'num_train_batches', num_train_batches 199 | mean_loss = 0 200 | for minibatch_index in xrange(num_train_batches): 201 | cur_loss = train_model(minibatch_index) 202 | mean_loss += cur_loss 203 | # if (minibatch_index + 1) % (num_train_batches / 3) == 0: # show some progress 204 | # print minibatch_index, num_train_batches 205 | print 'loss:', mean_loss / num_train_batches 206 | # print fdebug(0) 207 | 208 | val_perfs = [val_model(i) for i in xrange(num_val_batches)] 209 | val_perf = np.mean(val_perfs) 210 | print('epoch %i, training part %i, val perf %f' 211 | % (epoch, train_part_cnt, val_perf)) 212 | cur_train_contexts, cur_train_indices = data_load.load_training_samples(f_train, 213 | training_part_size, 214 | wid_idx_dict, 215 | sentence_len, 216 | sentence_pad_len) 217 | # num_train_contexts, cur_train_contexts, cur_train_indices = get_data_set_part( 218 | # f_train, wid_idx_dict, 50000) 219 | 220 | f_train.close() 221 | 222 | 223 | def main(): 224 | _, word_vecs = data_load.load_word_vectors('/media/dhl/Data/el/word2vec/wiki_vectors.jbin') 225 | word_vec_len = len(word_vecs[0]) 226 | 227 | wid_idx_dict, entity_vecs = data_load.load_entities( 228 | '/media/dhl/Data/el/vec_rep/wid_entity_rep_wiki50_cat.bin', 229 | False) 230 | # wid_idx_dict, entity_vecs = data_load.load_entities('/media/dhl/Data/el/vec_rep/wid_entity_rep_wiki50.bin', 231 | # True) 232 | 233 | # all_word_vecs = 234 | num_val_candidates = 30 235 | num_test_candidates = 30 236 | skipwidth_loading = 0 237 | img_h = sentence_len + 2 * sentence_pad_len 238 | train_cnn_for_el('/media/dhl/Data/el/vec_rep/wiki_train_word_vec_indices_wiki50.td', 239 | '/media/dhl/Data/el/vec_rep/tac_2014_training.bin', 240 | # '/media/dhl/Data/el/vec_rep/wiki_val_word_vec_indices_wiki50.td', 241 | num_val_candidates, 242 | '/media/dhl/Data/el/vec_rep/wiki_test_word_vec_indices_wiki50.td', 243 | num_test_candidates, 244 | img_h, word_vec_len, 245 | word_vecs, 246 | wid_idx_dict, 247 | entity_vecs, 248 | skip_width_loading=skipwidth_loading, 249 | n_epochs=1) 250 | 251 | 252 | if __name__ == '__main__': 253 | main() 254 | -------------------------------------------------------------------------------- /train_parallel_cnn.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | __author__ = 'dhl' 3 | 4 | import sys 5 | 6 | import numpy as np 7 | 8 | import theano 9 | import theano.tensor as T 10 | 11 | import data_load 12 | from sentence_cnn import SentenceCNN 13 | from theano_cnn import HiddenLayer, relu, sgd_updates_adadelta 14 | 15 | 16 | # TODO remove these global variables 17 | def_filter_hs = [2, 3] 18 | sentence_pad_len = def_filter_hs[-1] - 1 19 | 20 | 21 | max_num_entity_words = 50 22 | entity_pad_len = 1 23 | entity_rep_len = max_num_entity_words + 2 * entity_pad_len 24 | entity_hs = [1] 25 | num_entity_rep_feature_maps = 300 26 | 27 | 28 | def to_theano_shared(vals): 29 | return theano.shared(value=np.asarray(vals, 30 | dtype=theano.config.floatX), 31 | borrow=True) 32 | 33 | 34 | def get_entity_context_similarities(unit_mc, cnn_output_for_entities, batch_size, num_candidates): 35 | entity_reps = cnn_output_for_entities.reshape((batch_size, num_candidates, 36 | cnn_output_for_entities.shape[1])) 37 | unit_entity_reps = entity_reps / T.sqrt(T.maximum( 38 | T.sum(T.sqr(entity_reps), 2), 0.0001)).dimshuffle(0, 1, 'x') 39 | return (unit_mc.dimshuffle(0, 'x', 1) * unit_entity_reps).sum(axis=2) 40 | 41 | 42 | def get_training_variables_with_entity_side_cnn(batch_entity_vecs, shared_words, word_vec_len, batch_size, 43 | hidden_out_len, unit_mc, num_train_candidates, 44 | num_val_candidates, gold_labels, rng): 45 | entity_vecs_reshaped = batch_entity_vecs.reshape((batch_entity_vecs.shape[0] * batch_entity_vecs.shape[1], 46 | batch_entity_vecs.shape[2])) 47 | 48 | sentence_cnn1_train = SentenceCNN(entity_vecs_reshaped, shared_words, entity_rep_len, word_vec_len, entity_hs, 49 | num_entity_rep_feature_maps, 50 | batch_size * num_train_candidates, hidden_out_len, rng) 51 | entity_reps_train = sentence_cnn1_train.output 52 | similarities_train = get_entity_context_similarities(unit_mc, entity_reps_train, batch_size, num_train_candidates) 53 | loss = T.maximum(0, 1 - similarities_train[:, 0] + similarities_train[:, 1]).sum() 54 | 55 | sentence_cnn1_val = SentenceCNN(entity_vecs_reshaped, shared_words, entity_rep_len, word_vec_len, entity_hs, 56 | num_entity_rep_feature_maps, 57 | batch_size * num_val_candidates, 58 | hidden_out_len, rng, 59 | hidden_W=sentence_cnn1_train.hiddenW, 60 | hidden_b=sentence_cnn1_train.hiddenb, 61 | conv_Ws=sentence_cnn1_train.convWs, 62 | conv_bs=sentence_cnn1_train.convbs) 63 | entity_reps_val = sentence_cnn1_val.output 64 | 65 | similarities_val = get_entity_context_similarities(unit_mc, entity_reps_val, batch_size, num_val_candidates) 66 | correct_rate = T.mean(T.eq(gold_labels, T.argmax(similarities_val, axis=1))) 67 | 68 | params = sentence_cnn1_train.params 69 | 70 | return loss, correct_rate, params 71 | 72 | 73 | def get_training_variables_no_entity_side_cnn(batch_entity_vecs, entity_vec_len, hidden_out_len, unit_mc, 74 | gold_labels, rng): 75 | matcher1 = HiddenLayer(rng, batch_entity_vecs, entity_vec_len, hidden_out_len, relu) 76 | entity_reps = matcher1.output 77 | unit_entity_reps = entity_reps / T.sqrt(T.maximum( 78 | T.sum(T.sqr(entity_reps), 2), 0.0001)).dimshuffle(0, 1, 'x') 79 | 80 | similarities = (unit_mc.dimshuffle(0, 'x', 1) * unit_entity_reps).sum(axis=2) 81 | loss = T.maximum(0, 1 - similarities[:, 0] + similarities[:, 1]).sum() 82 | correct_rate = T.mean(T.eq(gold_labels, T.argmax(similarities, axis=1))) 83 | params = matcher1.params 84 | return loss, correct_rate, params 85 | 86 | 87 | def train_cnn_for_el(train_data_file_name, 88 | val_data_file_name, 89 | num_val_candidates, 90 | 
test_data_file_name, 91 | num_test_candidates, 92 | sentence_len, word_vec_len, 93 | all_words, # first row of all_words should be a non-existing word 94 | wid_idx_dict, 95 | entity_vecs, 96 | entity_side_cnn=False, 97 | gold_as_first_candidate=False, 98 | skip_width_loading=40, # skip width while loading samples 99 | n_epochs=25, 100 | batch_size=50, 101 | filter_hs=def_filter_hs, 102 | num_feature_maps=100, 103 | lr_decay=0.9, 104 | sqr_norm_lim=9, 105 | hidden_out_len=50, 106 | training_part_size=50000, 107 | num_train_candidates=2): 108 | full_sentence_len = sentence_len + 2 * sentence_pad_len 109 | rng = np.random.RandomState(3435) 110 | 111 | print 'making entity_vecs...', len(entity_vecs) 112 | if entity_side_cnn: 113 | shared_entity_vecs = theano.shared(value=np.asarray(entity_vecs, dtype="int32"), 114 | name='entity_vecs', borrow=True) 115 | else: 116 | shared_entity_vecs = theano.shared(value=np.asarray(entity_vecs, dtype=theano.config.floatX), 117 | name='entity_vecs', borrow=True) 118 | 119 | print 'making shared_words...', len(all_words) 120 | shared_words = theano.shared(value=np.asarray(all_words, dtype=theano.config.floatX), 121 | name='shared_words', borrow=True) 122 | print 'done' 123 | 124 | val_contexts, val_indices = data_load.load_samples_full(val_data_file_name, wid_idx_dict, sentence_len, 125 | sentence_pad_len, 126 | skip_width=skip_width_loading, 127 | num_candidates=num_val_candidates) 128 | num_val_batches = len(val_contexts) / batch_size 129 | print num_val_batches, 'validation batches' 130 | print len(val_indices[0]), 'candidates per mention' 131 | val_contexts = T.cast(to_theano_shared(val_contexts), 'int32') 132 | val_indices = T.cast(to_theano_shared(val_indices), 'int32') 133 | 134 | test_contexts, test_indices = data_load.load_samples_full(test_data_file_name, wid_idx_dict, sentence_len, 135 | sentence_pad_len, 136 | skip_width=skip_width_loading, 137 | num_candidates=num_test_candidates) 138 | num_test_batches = len(test_contexts) / batch_size 139 | print num_test_batches, 'test batches' 140 | print len(test_indices[0]), 'candidates per mention' 141 | test_contexts = T.cast(to_theano_shared(test_contexts), 'int32') 142 | test_indices = T.cast(to_theano_shared(test_indices), 'int32') 143 | 144 | if gold_as_first_candidate: 145 | gold_labels = theano.shared(value=np.zeros(batch_size, 146 | dtype='int32'), 147 | borrow=True) 148 | else: 149 | gold_labels = theano.shared(value=np.ones(batch_size, 150 | dtype='int32'), 151 | borrow=True) 152 | 153 | x = T.imatrix('x') 154 | entities = T.imatrix('entities') 155 | 156 | sentence_cnn0 = SentenceCNN(x, shared_words, full_sentence_len, word_vec_len, filter_hs, num_feature_maps, 157 | batch_size, 158 | hidden_out_len, rng) 159 | mc = sentence_cnn0.output # mention contexts 160 | unit_mc = mc / T.sqrt(T.maximum(T.sum(T.sqr(mc), 1), 0.0001)).dimshuffle(0, 'x') 161 | 162 | batch_entity_vecs = shared_entity_vecs[entities] 163 | 164 | if entity_side_cnn: 165 | loss, correct_rate, entity_side_params = get_training_variables_with_entity_side_cnn(batch_entity_vecs, 166 | shared_words, 167 | word_vec_len, batch_size, 168 | hidden_out_len, unit_mc, 169 | num_train_candidates, 170 | num_val_candidates, 171 | gold_labels, rng) 172 | else: 173 | loss, correct_rate, entity_side_params = get_training_variables_no_entity_side_cnn(batch_entity_vecs, 174 | len(entity_vecs[0]), 175 | hidden_out_len, 176 | unit_mc, 177 | gold_labels, rng) 178 | # params = matcher0.params + entity_side_params 179 | # for conv_layer in conv_layers: 180 
| # params += conv_layer.params 181 | 182 | # params = sentence_cnn0.params + matcher1.params 183 | # params = sentence_cnn0.params + sentence_cnn1_train.params 184 | 185 | params = sentence_cnn0.params + entity_side_params 186 | grad_updates = sgd_updates_adadelta(params, loss, lr_decay, 1e-6, sqr_norm_lim) 187 | 188 | index = T.lscalar() 189 | 190 | val_model = theano.function( 191 | [index], 192 | correct_rate, 193 | givens={x: val_contexts[index * batch_size: (index + 1) * batch_size], 194 | entities: val_indices[index * batch_size: (index + 1) * batch_size]} 195 | ) 196 | 197 | test_model = theano.function( 198 | [index], 199 | correct_rate, 200 | givens={x: test_contexts[index * batch_size: (index + 1) * batch_size], 201 | entities: test_indices[index * batch_size: (index + 1) * batch_size]} 202 | ) 203 | 204 | train_contexts = theano.shared( 205 | value=np.zeros((3, 2)), 206 | borrow=True) 207 | int_train_contexts = T.cast(train_contexts, 'int32') 208 | train_indices = theano.shared( 209 | value=np.zeros((3, 2)), 210 | borrow=True) 211 | int_train_indices = T.cast(train_indices, 'int32') 212 | train_model = theano.function( 213 | [index], 214 | loss, 215 | updates=grad_updates, 216 | givens={x: int_train_contexts[index * batch_size: (index + 1) * batch_size], 217 | entities: int_train_indices[index * batch_size: (index + 1) * batch_size]} 218 | ) 219 | 220 | fdebug = theano.function( 221 | [index], 222 | batch_entity_vecs, 223 | givens={entities: int_train_indices[index * batch_size: (index + 1) * batch_size]} 224 | ) 225 | # print fdebug(0) 226 | 227 | val_perfs = [val_model(i) for i in xrange(num_val_batches)] 228 | print('init val perf %f' % np.mean(val_perfs)) 229 | 230 | epoch = 0 231 | max_val_perf = 0 232 | test_perf = 0 233 | print 'training ...' 
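# The loop below streams the training file in chunks of training_part_size samples:
# each chunk is copied into the shared variables and trained in minibatches of
# batch_size (at most 100 chunks per epoch, see train_part_cnt < 100). After every
# chunk the validation batches are scored, and the test set is re-evaluated
# whenever the validation accuracy improves on the best value seen so far.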
234 | while epoch < n_epochs: 235 | f_train = open(train_data_file_name, 'rb') 236 | epoch += 1 237 | 238 | train_part_cnt = 0 239 | 240 | cur_train_contexts, cur_train_indices = data_load.load_training_samples(f_train, 241 | training_part_size, 242 | wid_idx_dict, 243 | sentence_len, 244 | sentence_pad_len) 245 | 246 | while not len(cur_train_contexts) == 0 and train_part_cnt < 100: 247 | train_contexts.set_value(cur_train_contexts, borrow=True) 248 | train_indices.set_value(cur_train_indices, borrow=True) 249 | 250 | train_part_cnt += 1 251 | num_train_batches = len(cur_train_contexts) / batch_size 252 | # print 'num_train_batches', num_train_batches 253 | mean_loss = 0 254 | for minibatch_index in xrange(num_train_batches): 255 | cur_loss = train_model(minibatch_index) 256 | mean_loss += cur_loss 257 | # print minibatch_index, cur_loss 258 | print 'loss:', mean_loss / num_train_batches 259 | # print fdebug(0) 260 | 261 | val_perfs = [val_model(i) for i in xrange(num_val_batches)] 262 | val_perf = np.mean(val_perfs) 263 | print('epoch %i, training part %i, val perf %f(%f), test perf %f' 264 | % (epoch, train_part_cnt, val_perf, max_val_perf, test_perf)) 265 | 266 | if val_perf > max_val_perf: 267 | max_val_perf = val_perf 268 | test_perfs = [test_model(i) for i in xrange(num_test_batches)] 269 | test_perf = np.mean(test_perfs) 270 | print('\tepoch %i, training part %i, test_perf %f' 271 | % (epoch, train_part_cnt, test_perf)) 272 | 273 | cur_train_contexts, cur_train_indices = data_load.load_training_samples(f_train, 274 | training_part_size, 275 | wid_idx_dict, 276 | sentence_len, 277 | sentence_pad_len) 278 | f_train.close() 279 | 280 | 281 | def load_params(param_file_name): 282 | f = open(param_file_name, 'rb') 283 | params = dict() 284 | line = f.readline() 285 | while line: 286 | vals = line.decode('utf8').strip() 287 | if not vals == '': 288 | vals = vals.split(' ') 289 | params[vals[0]] = vals[1] 290 | line = f.readline() 291 | f.close() 292 | return params 293 | 294 | 295 | def main(): 296 | if len(sys.argv) < 2: 297 | print 'need params file' 298 | 299 | params = load_params(sys.argv[1]) 300 | 301 | entity_side_cnn = params['entity_side_cnn'] == '1' 302 | word_vec_file_name = params['word_vec_file'] 303 | 304 | if entity_side_cnn: 305 | entity_rep_file_name = params['entity_rep_indices_file'] 306 | else: 307 | entity_rep_file_name = params['entity_rep_vec_file'] 308 | 309 | train_data_file_name = params['train_data_file'] 310 | val_data_file_name = params['val_data_file'] 311 | test_data_file_name = params['test_data_file'] 312 | 313 | training_part_size = int(params['training_part_size']) 314 | sentence_len = int(params['context_sentence_len']) 315 | 316 | _, word_vecs = data_load.load_word_vectors(word_vec_file_name) 317 | word_vec_len = len(word_vecs[0]) 318 | 319 | if entity_side_cnn: 320 | print 'entity use cnn' 321 | global entity_rep_len 322 | wid_idx_dict, entity_vecs, entity_rep_len = data_load.load_index_vec_of_entities_fixed_len( 323 | entity_rep_file_name) 324 | else: 325 | wid_idx_dict, entity_vecs = data_load.load_entities( 326 | entity_rep_file_name, 327 | False) 328 | 329 | # wid_idx_dict, entity_vecs = data_load.load_entities_indices( 330 | # entity_rep_file_name, max_num_entity_words, entity_pad_len) 331 | 332 | num_val_candidates = 30 333 | num_test_candidates = 30 334 | skipwidth_loading = 0 335 | train_cnn_for_el(train_data_file_name, 336 | val_data_file_name, 337 | num_val_candidates, 338 | test_data_file_name, 339 | num_test_candidates, 340 | 
sentence_len, word_vec_len, 341 | word_vecs, 342 | wid_idx_dict, 343 | entity_vecs, 344 | entity_side_cnn=entity_side_cnn, 345 | gold_as_first_candidate=False, 346 | skip_width_loading=skipwidth_loading, 347 | n_epochs=1, 348 | training_part_size=training_part_size) 349 | 350 | 351 | if __name__ == '__main__': 352 | main() 353 | --------------------------------------------------------------------------------
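train_parallel_cnn.py takes the path of a parameter file as its only command-line argument (e.g. `python train_parallel_cnn.py params_cnn.txt`). load_params reads that file as one space-separated key/value pair per line, and main() looks up entity_side_cnn, word_vec_file, train_data_file, val_data_file, test_data_file, training_part_size, context_sentence_len, plus either entity_rep_indices_file (when entity_side_cnn is 1) or entity_rep_vec_file (otherwise). The sketch below only illustrates that layout; the paths and numeric values are placeholders, not the contents of the original params.txt / params_cnn.txt.

    entity_side_cnn 0
    word_vec_file /path/to/wiki_vectors.jbin
    entity_rep_vec_file /path/to/wid_entity_rep.bin
    entity_rep_indices_file /path/to/wid_entity_rep_indices.bin
    train_data_file /path/to/train_word_vec_indices.td
    val_data_file /path/to/val_word_vec_indices.td
    test_data_file /path/to/test_word_vec_indices.td
    training_part_size 50000
    context_sentence_len 50

When entity_side_cnn is 0 the entity_rep_indices_file line is unused (and vice versa for entity_rep_vec_file), since main() only reads the one matching the chosen entity representation.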
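The scoring used throughout train_parallel_cnn.py is cosine similarity between an L2-normalized mention-context vector and L2-normalized candidate-entity vectors, trained with a ranking hinge loss max(0, 1 - s_gold + s_negative) (see get_entity_context_similarities and the loss expressions above). The NumPy sketch below mirrors that computation outside Theano for a single minibatch; the function names, shapes, and random inputs are illustrative only and do not come from the original code.

    import numpy as np

    def unit_rows(a, eps=1e-4):
        # L2-normalize along the last axis, with the same max(sum_of_squares, 0.0001)
        # guard against division by zero that the Theano code uses
        norms = np.sqrt(np.maximum(np.sum(a * a, axis=-1, keepdims=True), eps))
        return a / norms

    def similarities(mention_contexts, entity_reps):
        # mention_contexts: (batch_size, hidden_out_len)
        # entity_reps:      (batch_size, num_candidates, hidden_out_len)
        unit_mc = unit_rows(mention_contexts)
        unit_er = unit_rows(entity_reps)
        # cosine similarity of each mention context with each of its candidates
        return np.sum(unit_mc[:, None, :] * unit_er, axis=2)  # (batch_size, num_candidates)

    def ranking_hinge_loss(sims):
        # candidate 0 plays the role of the gold entity and candidate 1 the negative
        # sample, as in the hinge expression max(0, 1 - s[:, 0] + s[:, 1]) above
        return np.maximum(0.0, 1.0 - sims[:, 0] + sims[:, 1]).sum()

    if __name__ == '__main__':
        rng = np.random.RandomState(0)
        mc = rng.randn(4, 50)     # 4 mentions, hidden_out_len = 50
        er = rng.randn(4, 2, 50)  # 2 candidates per mention
        s = similarities(mc, er)
        print 'similarities shape:', s.shape
        print 'hinge loss:', ranking_hinge_loss(s)

At prediction time the same similarity matrix is used directly: the candidate with the highest cosine similarity (argmax over axis 1) is taken as the linked entity, which is what the correct_rate expressions compute against gold_labels.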