├── CITATION.cff ├── MLBiNet.py ├── README.md ├── ace_model_evaluation.py ├── data-ACE ├── example_new.dev ├── example_new.test └── example_new.train ├── dict ├── dict_gen.py ├── event_types.txt ├── ner_1.txt ├── ner_2.txt └── vocab.txt ├── embedding └── embeddings.txt ├── requirements.txt ├── run_experiments_multi.py ├── train_MLBiNet.py └── utils_init.py /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: "1.0.0" 2 | message: "If you use this code, please cite it using these metadata." 3 | title: "doced" 4 | repository-code: "https://github.com/zjunlp/DocED" 5 | authors: 6 | - family-names: Lou 7 | given-names: Dongfang 8 | - family-names: Liao 9 | given-names: Zhilin 10 | - family-names: Deng 11 | given-names: Shumin 12 | - family-names: Zhang 13 | given-names: Ningyu 14 | - family-names: Chen 15 | given-names: Huajun 16 | preferred-citation: 17 | type: article 18 | title: "MLBiNet: A Cross-Sentence Collective Event Detection Network" 19 | authors: 20 | - family-names: Lou 21 | given-names: Dongfang 22 | - family-names: Liao 23 | given-names: Zhilin 24 | - family-names: Deng 25 | given-names: Shumin 26 | - family-names: Zhang 27 | given-names: Ningyu 28 | - family-names: Chen 29 | given-names: Huajun 30 | journal: "arXiv preprint arXiv:2105.09458" 31 | year: 2021 32 | -------------------------------------------------------------------------------- /MLBiNet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import tensorflow as tf 5 | from tensorflow.contrib.layers.python.layers import initializers 6 | from tensorflow.contrib import rnn 7 | 8 | 9 | class MLBiNet: 10 | def __init__(self, 11 | encode_h, # hidden size of sentence encoding 12 | decode_h, # hidden size of sentence decoding 13 | tag_dim, # hidden size of event tag 14 | event_info_h, # hidden size of event info integration model 15 | word_emb_mat, # word embedding matrix 16 | batch_size, # batch size 17 | max_doc_len, # max length of doc 18 | max_seq_len, # max length of sequence 19 | id_O, # location of other event / negative event 20 | num_tag_layers, # number of tagging layers 21 | weight_decay, # weight decay of each tagging layer 22 | reverse_seq, # reverse the sequence or not when aggregating information of next sentence 23 | class_size, # class size 24 | tagging_mechanism="bidirectional_decoder", # forward_decoder, backward_decoder, bidirectional_decoder 25 | ner_size_1=None, # size of level-1 ner vocab 26 | ner_dim_1=None, # dimension of level-1 ner embedding 27 | ner_size_2=None, # size of level-2 ner vocab 28 | ner_dim_2=None, # dimension of level-2 ner embedding 29 | self_att_not=1, # concat word embedding or not 30 | context_info=1, # 0: single sentence information, 1: information of two neighbor sentences 31 | event_vector_trans=1 # nonlinear transformation for the event vector 32 | ): 33 | self.encode_h = encode_h 34 | self.decode_h = decode_h 35 | self.tag_dim = tag_dim 36 | self.event_info_h = event_info_h 37 | self.word_emb_mat = word_emb_mat 38 | self.batch_size = batch_size 39 | self.max_doc_len = max_doc_len 40 | self.max_seq_len = max_seq_len 41 | self.id_O = id_O 42 | self.num_tag_layers = num_tag_layers 43 | self.weight_decay = weight_decay 44 | self.reverse_seq = reverse_seq 45 | self.class_size = class_size 46 | self.tagging_mechanism = tagging_mechanism 47 | 48 | self.ner_size_1 = ner_size_1 49 | self.ner_dim_1 = ner_dim_1 50 | self.ner_size_2 = 
ner_size_2 51 | self.ner_dim_2 = ner_dim_2 52 | self.self_att_not = self_att_not 53 | 54 | self.context_info = context_info 55 | self.event_vector_trans = event_vector_trans 56 | 57 | # global initializer 58 | self.initializer = initializers.xavier_initializer() 59 | 60 | # initialize the word embedding matrix 61 | self.word_emb_mat = tf.cast(self.word_emb_mat, dtype=tf.float32) 62 | self.word_embedding_init() 63 | 64 | # placeholders 65 | self.input_docs = tf.placeholder(dtype=tf.int32, shape=[self.batch_size, 66 | self.max_doc_len, self.max_seq_len], name='input_docs') 67 | self.ner_docs_1 = tf.placeholder(dtype=tf.int32, shape=[self.batch_size, 68 | self.max_doc_len, self.max_seq_len], name='ner_docs_1') 69 | self.ner_docs_2 = tf.placeholder(dtype=tf.int32, shape=[self.batch_size, 70 | self.max_doc_len, self.max_seq_len], name='ner_docs_2') 71 | self.input_label_docs = tf.placeholder(dtype=tf.int32, 72 | shape=[self.batch_size, self.max_doc_len, self.max_seq_len], 73 | name='input_label_docs') 74 | self.valid_batch = tf.placeholder(dtype=tf.int32, shape=(), name='valid_batch') 75 | self.valid_sent_len = tf.placeholder(dtype=tf.int32, shape=[self.batch_size], name='valid_sent_len') 76 | self.valid_words_len = tf.placeholder(dtype=tf.int32, shape=[self.batch_size, self.max_doc_len], 77 | name='valid_words_len') 78 | self.dropout_rate = tf.placeholder(dtype=tf.float32, shape=(), name='dropout_rate') 79 | self.positive_weights = tf.placeholder(dtype=tf.float32, shape=(), name='positive_weights') 80 | 81 | # embedding layer 82 | self.word_embedding_lookup = self.embedding_layer() 83 | 84 | # [unk] event and semantic information aggregation embedding 85 | self.unk_event_semantic = tf.Variable(tf.truncated_normal(shape=[1, self.event_info_h], stddev=0.1), 86 | trainable=True, name="unk_event_semantic") 87 | # self.unk_event_semantic = tf.zeros(shape=[1,self.event_info_h]) 88 | 89 | # sentence encoding layer 90 | emb_size_curr = self.word_embedding_lookup.get_shape().as_list()[-1] 91 | self.lstm_inputs = tf.nn.dropout(self.word_embedding_lookup, keep_prob=1 - self.dropout_rate) 92 | 93 | print("embedding dimension before encoding layer:\t", emb_size_curr) 94 | 95 | words_enc, _, _ = self.sent_encode_layer( 96 | tf.reshape(self.lstm_inputs, [self.batch_size * self.max_doc_len, 97 | self.max_seq_len, emb_size_curr]), 98 | tf.reshape(self.valid_words_len, shape=[-1]), name='sent_enc_model') 99 | 100 | print("embedding dimension after encoding layer:\t", words_enc.get_shape().as_list()[-1]) 101 | 102 | # self-attention 103 | words_enc = tf.reshape(words_enc, [self.batch_size, self.max_doc_len, self.max_seq_len, -1]) 104 | if self.self_att_not: 105 | words_enc = self.sent_self_att(words_enc, self.valid_words_len) 106 | 107 | print("embedding dimension after self-attention:\t", words_enc.get_shape().as_list()[-1]) 108 | 109 | # concat with looking up embedding 110 | words_enc = tf.concat([words_enc, self.word_embedding_lookup], axis=-1) 111 | words_enc = tf.nn.dropout(words_enc, keep_prob=1 - self.dropout_rate) 112 | 113 | print("embedding dimension before decoding:\t", words_enc.get_shape().as_list()[-1]) 114 | 115 | # mask all padding vectors 116 | dim_curr = words_enc.get_shape().as_list()[-1] 117 | mask_padding_ind = tf.sequence_mask(self.valid_words_len, maxlen=self.max_seq_len, dtype=tf.float32) 118 | self.mask_padding_ind = tf.tile(tf.expand_dims(mask_padding_ind, axis=3), multiples=[1, 1, 1, dim_curr]) 119 | 120 | self.words_enc = words_enc * self.mask_padding_ind 121 | 122 | # tagging via 
multi-tagging network 123 |         if self.tagging_mechanism == "forward_decoder": 124 |             tag_vect, tag_vect_layerwise = self.forward_cross_sent_ED(words_enc=self.words_enc, tag_dim=self.tag_dim, 125 |                                                                       num_tag_layers=self.num_tag_layers, 126 |                                                                       weight_decay=self.weight_decay) 127 |         elif self.tagging_mechanism == "backward_decoder": 128 |             tag_vect, tag_vect_layerwise = self.backward_cross_sent_ED(words_enc=self.words_enc, tag_dim=self.tag_dim, 129 |                                                                        num_tag_layers=self.num_tag_layers, 130 |                                                                        weight_decay=self.weight_decay) 131 |         elif self.tagging_mechanism == "bidirectional_decoder": 132 |             tag_vect_fw, tag_vect_bw, tag_vect_lw_fw, tag_vect_lw_bw = self.biderectional_cross_sent_ED( 133 |                 words_enc=self.words_enc, tag_dim=self.tag_dim, num_tag_layers=self.num_tag_layers, 134 |                 weight_decay=self.weight_decay) 135 |             tag_vect = tf.concat([tag_vect_fw, tag_vect_bw], axis=-1) 136 |             tag_vect_layerwise = tf.concat([tag_vect_lw_fw, tag_vect_lw_bw], axis=-1) 137 |         elif self.tagging_mechanism == "agg_average": 138 |             tag_vect_fw, tag_vect_bw, tag_vect_lw_fw, tag_vect_lw_bw = self.agg_choice_cross_sent_ED( 139 |                 words_enc=self.words_enc, 140 |                 tag_dim=self.tag_dim, 141 |                 num_tag_layers=self.num_tag_layers, 142 |                 weight_decay=self.weight_decay, 143 |                 agg_choice="average") 144 |             tag_vect = tf.concat([tag_vect_fw, tag_vect_bw], axis=-1) 145 |             tag_vect_layerwise = tf.concat([tag_vect_lw_fw, tag_vect_lw_bw], axis=-1) 146 |         elif self.tagging_mechanism == "agg_concat": 147 |             tag_vect_fw, tag_vect_bw, tag_vect_lw_fw, tag_vect_lw_bw = self.agg_choice_cross_sent_ED( 148 |                 words_enc=self.words_enc, 149 |                 tag_dim=self.tag_dim, 150 |                 num_tag_layers=self.num_tag_layers, 151 |                 weight_decay=self.weight_decay, 152 |                 agg_choice="concat") 153 |             tag_vect = tf.concat([tag_vect_fw, tag_vect_bw], axis=-1) 154 |             tag_vect_layerwise = tf.concat([tag_vect_lw_fw, tag_vect_lw_bw], axis=-1) 155 |         else: 156 |             raise ValueError("tagging_mechanism assigned is not supported!") 157 | 158 |         # loss function 159 |         self.loss, self.label_true, self.label_pred, self.valid_len_list = self.loss_layer(tag_vect) 160 | 161 |     def word_embedding_init(self): 162 |         """ 163 |         initialize the word embedding matrix 164 |         """ 165 |         if self.word_emb_mat is None: 166 |             print("The embedding matrix must be initialized!") 167 |         else: 168 |             self.word_emb_mat = tf.Variable(self.word_emb_mat, trainable=True, name='word_emb_mat') 169 | 170 |     def embedding_layer(self): 171 |         """ 172 |         embedding layer with respect to the word embedding matrix 173 |         """ 174 |         embedding_tmp = tf.nn.embedding_lookup(self.word_emb_mat, self.input_docs) 175 |         # looking up the level-1 ner embedding 176 |         if self.ner_size_1 is not None: 177 |             ner_mat_1 = tf.get_variable(name="ner_mat_1", shape=[self.ner_size_1, self.ner_dim_1], 178 |                                         dtype=tf.float32, initializer=self.initializer) 179 |             emb_ner1_tmp = tf.nn.embedding_lookup(ner_mat_1, self.ner_docs_1) 180 |             embedding_tmp = tf.concat([embedding_tmp, emb_ner1_tmp], axis=-1) 181 |         # looking up the level-2 ner embedding 182 |         if self.ner_size_2 is not None: 183 |             ner_mat_2 = tf.get_variable(name="ner_mat_2", shape=[self.ner_size_2, self.ner_dim_2], 184 |                                         dtype=tf.float32, initializer=self.initializer) 185 |             emb_ner2_tmp = tf.nn.embedding_lookup(ner_mat_2, self.ner_docs_2) 186 |             embedding_tmp = tf.concat([embedding_tmp, emb_ner2_tmp], axis=-1) 187 |         return embedding_tmp 188 | 189 |     def sent_encode_layer(self, embedding_input, valid_len, name): 190 |         """ 191 |         sentence encoding layer to get representation of each word 192 |         """ 193 |         with tf.variable_scope(name, 
reuse=tf.AUTO_REUSE): 194 | lstm_cell = {} 195 | for direction in ["forward", "backward"]: 196 | with tf.variable_scope(direction): 197 | lstm_cell[direction] = rnn.CoupledInputForgetGateLSTMCell( 198 | self.encode_h, 199 | use_peepholes=True, 200 | initializer=self.initializer, 201 | state_is_tuple=True 202 | ) 203 | (outputs, 204 | (encoder_fw_final_state, 205 | encoder_bw_final_state)) = tf.nn.bidirectional_dynamic_rnn( 206 | lstm_cell["forward"], 207 | lstm_cell["backward"], 208 | inputs=embedding_input, 209 | dtype=tf.float32, 210 | sequence_length=valid_len 211 | ) 212 | words_out = tf.concat(outputs, axis=-1) 213 | final_state = tf.concat((encoder_fw_final_state.h, encoder_bw_final_state.h), -1) 214 | final_state_add = (encoder_fw_final_state.h + encoder_bw_final_state.h) / 2 215 | return words_out, final_state, final_state_add 216 | 217 | def sent_self_att(self, words_enc, valid_words_len): 218 | """ 219 | sentence-level self-attention 220 | :param words_enc: batch_size * max_doc_size * max_seq_len * dim 221 | :param valid_words_len: batch_size * max_doc_size 222 | """ 223 | enc_dim_tmp = words_enc.get_shape().as_list()[-1] 224 | words_enc_new0 = tf.reshape(words_enc, [self.batch_size * self.max_doc_len, self.max_seq_len, enc_dim_tmp]) 225 | valid_words_len_new = tf.reshape(valid_words_len, shape=[-1]) 226 | 227 | def self_att(variable_scope="attention", weight_name="att_W"): 228 | """ 229 | sentence level self attention with different window size 230 | """ 231 | with tf.variable_scope(variable_scope, reuse=tf.AUTO_REUSE): 232 | W = tf.get_variable(weight_name, 233 | shape=[enc_dim_tmp, enc_dim_tmp], 234 | dtype=tf.float32, 235 | initializer=self.initializer, 236 | ) 237 | # x'Wx 238 | words_enc_new = tf.reshape(words_enc, 239 | [self.batch_size * self.max_doc_len * self.max_seq_len, enc_dim_tmp]) 240 | words_enc_new = tf.matmul(words_enc_new, W) 241 | words_enc_new = tf.reshape(words_enc_new, 242 | [self.batch_size * self.max_doc_len, self.max_seq_len, enc_dim_tmp]) 243 | # tanh(x'Wx) 244 | logit_self_att = tf.matmul(words_enc_new, tf.transpose(words_enc_new0, perm=[0, 2, 1])) 245 | logit_self_att = tf.tanh(logit_self_att) 246 | probs = tf.nn.softmax(logit_self_att) 247 | 248 | # mask invalid words 249 | mask_words = tf.sequence_mask(valid_words_len_new, maxlen=self.max_seq_len, 250 | dtype=tf.float32) # 160 * 100 251 | mask_words = tf.tile(tf.expand_dims(mask_words, axis=1), 252 | multiples=[1, self.max_seq_len, 1]) # 160 * 100 * 100 253 | probs = probs * mask_words 254 | probs = tf.matmul(tf.matrix_diag(1 / (tf.reduce_sum(probs, axis=-1) + 1e-8)), 255 | probs) # re-standardize the probability 256 | # attention output 257 | att_output = tf.matmul(probs, words_enc_new0) 258 | att_output = tf.reshape(att_output, 259 | shape=[self.batch_size, self.max_doc_len, self.max_seq_len, enc_dim_tmp]) 260 | return att_output 261 | 262 | att_output = self_att(variable_scope="attention", weight_name="att_W") 263 | return att_output 264 | 265 | def info_agg_layer(self, pred_tag_vect, reverse_seq=False): 266 | """ 267 | sentence-level event and semantic information aggregation layer 268 | """ 269 | dim_curr = pred_tag_vect.get_shape().as_list()[-1] 270 | 271 | # mask invalid words 272 | mask_padding_ind = tf.sequence_mask(self.valid_words_len, maxlen=self.max_seq_len, dtype=tf.float32) 273 | mask_padding_ind = tf.tile(tf.expand_dims(mask_padding_ind, axis=3), multiples=[1, 1, 1, dim_curr]) 274 | pred_tag_vect = pred_tag_vect * mask_padding_ind 275 | 276 | # reverse the sequence 277 | if 
reverse_seq: 278 | pred_tag_vect = pred_tag_vect[:, :, ::-1, :] 279 | var_name = "reversed_sent_info_agg_layer" 280 | else: 281 | var_name = "sent_info_agg_layer" 282 | 283 | info_agg_lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(self.event_info_h, forget_bias=0.0, state_is_tuple=True, 284 | name=var_name, reuse=tf.AUTO_REUSE) 285 | info_agg_lstm_cell = tf.nn.rnn_cell.DropoutWrapper(info_agg_lstm_cell, output_keep_prob=1 - self.dropout_rate) 286 | # todo, change to bidirectional_dynamic_rnn 287 | # _, _, sent_event_sematic_info = self.sent_encode_layer( 288 | # embedding_input=tf.reshape(pred_tag_vect, shape=[self.batch_size * self.max_doc_len, self.max_seq_len, -1]), 289 | # valid_len=tf.reshape(self.valid_words_len, [-1]), 290 | # name=var_name 291 | # ) 292 | _, (_, sent_event_sematic_info) = tf.nn.dynamic_rnn(cell=info_agg_lstm_cell, 293 | inputs=tf.reshape(pred_tag_vect, 294 | shape=[self.batch_size * self.max_doc_len, 295 | self.max_seq_len, -1]), 296 | sequence_length=tf.reshape(self.valid_words_len, [-1]), 297 | dtype=tf.float32 298 | ) 299 | sent_event_sematic_info = tf.reshape(sent_event_sematic_info, 300 | shape=[self.batch_size, self.max_doc_len, -1]) 301 | return sent_event_sematic_info 302 | 303 | def info_agg_layer_bi(self, pred_tag_vect, reverse_seq=False): 304 | """ 305 | sentence-level event and semantic information aggregation layer 306 | """ 307 | dim_curr = pred_tag_vect.get_shape().as_list()[-1] 308 | 309 | # mask invalid words 310 | mask_padding_ind = tf.sequence_mask(self.valid_words_len, maxlen=self.max_seq_len, dtype=tf.float32) 311 | mask_padding_ind = tf.tile(tf.expand_dims(mask_padding_ind, axis=3), multiples=[1, 1, 1, dim_curr]) 312 | pred_tag_vect = pred_tag_vect * mask_padding_ind 313 | 314 | # reverse the sequence 315 | if reverse_seq: 316 | pred_tag_vect = pred_tag_vect[:, :, ::-1, :] 317 | var_name = "reversed_sent_info_agg_layer" 318 | else: 319 | var_name = "sent_info_agg_layer" 320 | 321 | # info_agg_lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(self.event_info_h, forget_bias=0.0, state_is_tuple=True, 322 | # name=var_name, reuse=tf.AUTO_REUSE) 323 | # info_agg_lstm_cell = tf.nn.rnn_cell.DropoutWrapper(info_agg_lstm_cell, output_keep_prob=1 - self.dropout_rate) 324 | # todo, change to bidirectional_dynamic_rnn 325 | _, _, sent_event_sematic_info = self.sent_encode_layer( 326 | embedding_input=tf.reshape(pred_tag_vect, shape=[self.batch_size * self.max_doc_len, self.max_seq_len, -1]), 327 | valid_len=tf.reshape(self.valid_words_len, [-1]), 328 | name=var_name 329 | ) 330 | # _, (_, sent_event_sematic_info) = tf.nn.dynamic_rnn(cell=info_agg_lstm_cell, 331 | # inputs=tf.reshape(pred_tag_vect, 332 | # shape=[self.batch_size * self.max_doc_len, 333 | # self.max_seq_len, -1]), 334 | # sequence_length=tf.reshape(self.valid_words_len, [-1]), 335 | # dtype=tf.float32 336 | # ) 337 | sent_event_sematic_info = tf.reshape(sent_event_sematic_info, 338 | shape=[self.batch_size, self.max_doc_len, -1]) 339 | return sent_event_sematic_info 340 | 341 | def project(self, h_state, lstm_dim): 342 | """ 343 | project the output of decoder model to a tag vector 344 | """ 345 | enc_dim = h_state.get_shape().as_list()[-1] 346 | with tf.variable_scope("tag_project_layer", reuse=tf.AUTO_REUSE): 347 | W = tf.get_variable("W", 348 | shape=[enc_dim, lstm_dim], 349 | dtype=tf.float32, 350 | initializer=self.initializer, 351 | ) 352 | b = tf.get_variable("b", 353 | shape=[lstm_dim], 354 | dtype=tf.float32, 355 | initializer=tf.zeros_initializer() 356 | ) 357 | y_pre = tf.add(tf.matmul(h_state, 
W), b) 358 | tag_pre = tf.cast(tf.argmax(tf.nn.softmax(y_pre), axis=-1), tf.float32) 359 | return y_pre, tag_pre 360 | 361 | def forward_cross_sent_ED(self, words_enc, tag_dim, num_tag_layers, weight_decay): 362 | """ 363 | forward-wise cross-sentence event tag event detection, modeling the forward-wise event correlation 364 | :param words_enc: words encoding 365 | :param num_tag_layers: number of tagging layers 366 | :param weight_decay: weight decay of tagging vectors of different layers 367 | """ 368 | # decoding layer 369 | # all layers share the same decoder layer 370 | # for the first decoder layer, we set c_{i-1} and c_{i+1} with unk_event_semantic 371 | lstm_outputs = tf.reshape(words_enc, shape=[self.batch_size * self.max_doc_len, self.max_seq_len, -1]) 372 | lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(self.decode_h, forget_bias=0.0, state_is_tuple=True, 373 | name="forward_rnn_decoder", reuse=tf.AUTO_REUSE) 374 | lstm_cell = tf.nn.rnn_cell.DropoutWrapper(lstm_cell, output_keep_prob=1 - self.dropout_rate) 375 | 376 | # mutli-tagging block 377 | tag_final = tf.zeros(shape=[self.batch_size, self.max_doc_len, self.max_seq_len, tag_dim], dtype=tf.float32) 378 | tag_final_list = [] 379 | 380 | init_state = lstm_cell.zero_state(self.batch_size * self.max_doc_len, dtype=tf.float32) 381 | # event and semantic information of the previous sentence and next sentence sentence 382 | info_event_sem_pre_sent = tf.tile(self.unk_event_semantic, 383 | multiples=[self.batch_size * self.max_doc_len, 1]) 384 | info_event_sem_next_sent = tf.tile(self.unk_event_semantic, 385 | multiples=[self.batch_size * self.max_doc_len, 1]) 386 | 387 | # event and semantic information of the beginning sentence 388 | info_event_sem_init_sent = tf.tile(self.unk_event_semantic, multiples=[self.batch_size, 1]) 389 | info_event_sem_init_sent = tf.expand_dims(info_event_sem_init_sent, axis=1) 390 | info_event_sem_mat0 = tf.tile(tf.expand_dims(self.unk_event_semantic, axis=0), 391 | multiples=[self.batch_size, self.max_doc_len, 1]) 392 | with tf.variable_scope("forward_rnn_decoding_layer", reuse=tf.AUTO_REUSE): 393 | for layer_id in range(num_tag_layers): 394 | # initialize for each layer 395 | c_state, h_state = init_state 396 | tag_pre = tf.zeros([self.batch_size * self.max_doc_len, tag_dim]) 397 | tag_outputs = [] 398 | for time_step in range(self.max_seq_len): 399 | if time_step > 0: 400 | tf.get_variable_scope().reuse_variables() 401 | if self.num_tag_layers > 1: 402 | two_info = tf.concat([info_event_sem_pre_sent, info_event_sem_next_sent], axis=-1) 403 | input_all = tf.concat([lstm_outputs[:, time_step, :], two_info, tag_pre], axis=-1) 404 | else: 405 | input_all = tf.concat([lstm_outputs[:, time_step, :], tag_pre], axis=-1) 406 | (cell_output, (c_state, h_state)) = lstm_cell(input_all, (c_state, h_state)) 407 | tag_pre, tag_result = self.project(cell_output, tag_dim) 408 | if self.event_vector_trans: 409 | tag_pre = tf.tanh(tag_pre) 410 | tag_outputs.append(tag_pre) 411 | tag_outputs = tf.reshape(tf.transpose(tag_outputs, [1, 0, 2]), [self.batch_size, self.max_doc_len, 412 | self.max_seq_len, tag_dim]) 413 | if self.num_tag_layers > 1: 414 | # info aggregation of current sentence, [batch_size, max_doc_len,event_info_h] 415 | info_event_sem_current_sent = self.info_agg_layer(tag_outputs, reverse_seq=False) 416 | 417 | # corresponds to the information of previous sentence 418 | info_event_sem_pre_sent = tf.concat([info_event_sem_init_sent, 419 | info_event_sem_current_sent[:, :-1, :]], axis=1) 420 | 
info_event_sem_pre_sent = tf.reshape(info_event_sem_pre_sent, 421 | shape=[self.batch_size * self.max_doc_len, -1]) 422 | 423 | # find valid sentence firstly, and replace with emebedding of unk 424 | info_event_sem_current_sent_bw = self.info_agg_layer(tag_outputs, reverse_seq=self.reverse_seq) 425 | 426 | valid_sent_ind = tf.sequence_mask(self.valid_sent_len, maxlen=self.max_doc_len, dtype=tf.float32) 427 | valid_sent_ind = tf.tile(tf.expand_dims(valid_sent_ind, axis=2), multiples=[1, 1, self.event_info_h]) 428 | info_event_sem_current_sent_bw = info_event_sem_current_sent_bw * valid_sent_ind + \ 429 | info_event_sem_mat0 * (1 - valid_sent_ind) 430 | 431 | # corresponds to the information of previous sentence 432 | info_event_sem_next_sent = tf.concat([info_event_sem_current_sent_bw[:, 1:, :], info_event_sem_init_sent], 433 | axis=1) 434 | info_event_sem_next_sent = tf.reshape(info_event_sem_next_sent, 435 | shape=[self.batch_size * self.max_doc_len, -1]) 436 | 437 | tag_final += weight_decay ** layer_id * tag_outputs 438 | tag_final_list.append(tag_outputs) 439 | return tag_final, tag_final_list 440 | 441 | 442 | def backward_cross_sent_ED(self, words_enc, tag_dim, num_tag_layers, weight_decay): 443 | """ 444 | backward-wise cross-sentence event tag event detection, modeling the backward-wise event correlation 445 | """ 446 | # reshape the inputs and reverse it to cater to backward event extraction 447 | lstm_outputs = tf.reshape(words_enc, shape=[self.batch_size * self.max_doc_len, self.max_seq_len, -1]) 448 | lstm_outputs = lstm_outputs[:, ::-1, :] 449 | 450 | # decoding layer 451 | # all layers share the same decoder layer 452 | # for the first decoder layer, we set c_{i-1} and c_{i+1} with unk_event_semantic 453 | lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(self.decode_h, forget_bias=0.0, state_is_tuple=True, 454 | name="backward_rnn_decoder", reuse=tf.AUTO_REUSE) 455 | lstm_cell = tf.nn.rnn_cell.DropoutWrapper(lstm_cell, output_keep_prob=1 - self.dropout_rate) 456 | 457 | # mutli-tagging block 458 | tag_final = tf.zeros(shape=[self.batch_size, self.max_doc_len, self.max_seq_len, tag_dim], dtype=tf.float32) 459 | tag_final_list = [] 460 | 461 | init_state = lstm_cell.zero_state(self.batch_size * self.max_doc_len, dtype=tf.float32) 462 | # event and semantic information of the previous sentence and next sentence sentence 463 | info_event_sem_pre_sent = tf.tile(self.unk_event_semantic, 464 | multiples=[self.batch_size * self.max_doc_len, 1]) 465 | info_event_sem_next_sent = tf.tile(self.unk_event_semantic, 466 | multiples=[self.batch_size * self.max_doc_len, 1]) 467 | 468 | # event and semantic information of the final sentence 469 | info_event_sem_init_sent = tf.tile(self.unk_event_semantic, multiples=[self.batch_size, 1]) 470 | info_event_sem_init_sent = tf.expand_dims(info_event_sem_init_sent, axis=1) 471 | info_event_sem_mat0 = tf.tile(tf.expand_dims(self.unk_event_semantic, axis=0), 472 | multiples=[self.batch_size, self.max_doc_len, 1]) 473 | 474 | with tf.variable_scope("backward_rnn_decoding_layer", reuse=tf.AUTO_REUSE): 475 | for layer_id in range(num_tag_layers): 476 | # initialize for each layer 477 | c_state, h_state = init_state 478 | tag_next = tf.zeros([self.batch_size * self.max_doc_len, tag_dim]) 479 | tag_outputs = [] 480 | for time_step in range(self.max_seq_len): 481 | if time_step > 0: 482 | tf.get_variable_scope().reuse_variables() 483 | if self.num_tag_layers > 1: 484 | two_info = tf.concat([info_event_sem_pre_sent, info_event_sem_next_sent], axis=-1) 485 | 
input_all = tf.concat([lstm_outputs[:, time_step, :], two_info, tag_next], axis=-1) 486 | else: 487 | input_all = tf.concat([lstm_outputs[:, time_step, :], tag_next], axis=-1) 488 | (cell_output, (c_state, h_state)) = lstm_cell(input_all, (c_state, h_state)) 489 | tag_next, tag_result = self.project(cell_output, tag_dim) 490 | if self.event_vector_trans: 491 | tag_next = tf.tanh(tag_next) 492 | tag_outputs.append(tag_next) 493 | tag_outputs = tf.reshape(tf.transpose(tag_outputs, [1, 0, 2]), [self.batch_size, self.max_doc_len, 494 | self.max_seq_len, tag_dim]) 495 | # recover the tag_outputs in order 496 | tag_outputs = tag_outputs[:, :, ::-1, :] 497 | 498 | if self.num_tag_layers > 1: 499 | # info aggregation of current sentence, [batch_size, max_doc_len,event_info_h] 500 | info_event_sem_current_sent = self.info_agg_layer(tag_outputs, reverse_seq=self.reverse_seq) 501 | 502 | # find valid sentence firstly, and replace with emebedding of unk 503 | valid_sent_ind = tf.sequence_mask(self.valid_sent_len, maxlen=self.max_doc_len, dtype=tf.float32) 504 | valid_sent_ind = tf.tile(tf.expand_dims(valid_sent_ind, axis=2), multiples=[1, 1, self.event_info_h]) 505 | info_event_sem_current_sent = info_event_sem_current_sent * valid_sent_ind + \ 506 | info_event_sem_mat0 * (1 - valid_sent_ind) 507 | 508 | # corresponds to the information of previous sentence 509 | info_event_sem_next_sent = tf.concat([info_event_sem_current_sent[:, 1:, :], info_event_sem_init_sent], 510 | axis=1) 511 | info_event_sem_next_sent = tf.reshape(info_event_sem_next_sent, 512 | shape=[self.batch_size * self.max_doc_len, -1]) 513 | 514 | # information of previous sentence, [batch_size, max_doc_len,event_info_h] 515 | info_event_sem_current_sent = self.info_agg_layer(tag_outputs, reverse_seq=False) 516 | info_event_sem_pre_sent = tf.concat([info_event_sem_init_sent, info_event_sem_current_sent[:, :-1, :]], 517 | axis=1) 518 | info_event_sem_pre_sent = tf.reshape(info_event_sem_pre_sent, 519 | shape=[self.batch_size * self.max_doc_len, -1]) 520 | 521 | tag_final += weight_decay ** layer_id * tag_outputs 522 | tag_final_list.append(tag_outputs) 523 | return tag_final, tag_final_list 524 | 525 | 526 | def biderectional_cross_sent_ED(self, words_enc, tag_dim, num_tag_layers, weight_decay): 527 | """ 528 | birectional cross-sentence event tag event detection, modeling birectional event correlation 529 | """ 530 | # decoding layer 531 | # all layers share the same decoder layer 532 | # for the first decoder layer, we set c_{i-1} and c_{i+1} with unk_event_semantic 533 | lstm_outputs = tf.reshape(words_enc, shape=[self.batch_size * self.max_doc_len, self.max_seq_len, -1]) 534 | backward_lstm_outputs = lstm_outputs[:, ::-1, :] 535 | 536 | fw_lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(self.decode_h, forget_bias=0.0, state_is_tuple=True, 537 | name="forward_rnn_decoder", reuse=tf.AUTO_REUSE) 538 | fw_lstm_cell = tf.nn.rnn_cell.DropoutWrapper(fw_lstm_cell, output_keep_prob=1 - self.dropout_rate) 539 | 540 | bw_lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(self.decode_h, forget_bias=0.0, state_is_tuple=True, 541 | name="backward_rnn_decoder", reuse=tf.AUTO_REUSE) 542 | bw_lstm_cell = tf.nn.rnn_cell.DropoutWrapper(bw_lstm_cell, output_keep_prob=1 - self.dropout_rate) 543 | 544 | # mutli-tagging block 545 | tag_final_fw = tf.zeros(shape=[self.batch_size, self.max_doc_len, self.max_seq_len, tag_dim], dtype=tf.float32) 546 | tag_final_bw = tf.zeros(shape=[self.batch_size, self.max_doc_len, self.max_seq_len, tag_dim], dtype=tf.float32) 547 | 
tag_final_list_fw = [] 548 | tag_final_list_bw = [] 549 | 550 | fw_init_state = fw_lstm_cell.zero_state(self.batch_size * self.max_doc_len, dtype=tf.float32) 551 | bw_init_state = bw_lstm_cell.zero_state(self.batch_size * self.max_doc_len, dtype=tf.float32) 552 | # event and semantic information of the previous sentence and next sentence sentence 553 | info_event_sem_pre_sent = tf.tile(self.unk_event_semantic, 554 | multiples=[self.batch_size * self.max_doc_len, 1]) 555 | info_event_sem_next_sent = tf.tile(self.unk_event_semantic, 556 | multiples=[self.batch_size * self.max_doc_len, 1]) 557 | 558 | # event and semantic information of the beginning sentence 559 | info_event_sem_init_sent = tf.tile(self.unk_event_semantic, multiples=[self.batch_size, 1]) 560 | info_event_sem_init_sent = tf.expand_dims(info_event_sem_init_sent, axis=1) 561 | info_event_sem_mat0 = tf.tile(tf.expand_dims(self.unk_event_semantic, axis=0), 562 | multiples=[self.batch_size, self.max_doc_len, 1]) 563 | 564 | with tf.variable_scope("bidirectional_rnn_decoding_layer", reuse=tf.AUTO_REUSE): 565 | for layer_id in range(num_tag_layers): 566 | # initialize for each layer 567 | fw_c_state, fw_h_state = fw_init_state 568 | bw_c_state, bw_h_state = bw_init_state 569 | tag_fw = tf.zeros([self.batch_size * self.max_doc_len, tag_dim]) 570 | tag_bw = tf.zeros([self.batch_size * self.max_doc_len, tag_dim]) 571 | fw_tag_outputs = [] 572 | bw_tag_outputs = [] 573 | for time_step in range(self.max_seq_len): 574 | if time_step > 0: 575 | tf.get_variable_scope().reuse_variables() 576 | # concat two event information 577 | if self.num_tag_layers > 1: 578 | if not self.context_info: 579 | fw_input_all = tf.concat([lstm_outputs[:, time_step, :], info_event_sem_pre_sent, tag_fw], 580 | axis=-1) 581 | bw_input_all = tf.concat([backward_lstm_outputs[:, time_step, :], 582 | info_event_sem_next_sent, tag_bw], axis=-1) 583 | else: 584 | two_info = tf.concat([info_event_sem_pre_sent, info_event_sem_next_sent], axis=-1) 585 | fw_input_all = tf.concat([lstm_outputs[:, time_step, :], two_info, tag_fw], axis=-1) 586 | bw_input_all = tf.concat([backward_lstm_outputs[:, time_step, :], two_info, tag_bw], axis=-1) 587 | else: 588 | fw_input_all = tf.concat([lstm_outputs[:, time_step, :], tag_fw], 589 | axis=-1) 590 | bw_input_all = tf.concat([backward_lstm_outputs[:, time_step, :], tag_bw], axis=-1) 591 | # forward decoder 592 | (fw_cell_output, (fw_c_state, fw_h_state)) = fw_lstm_cell(fw_input_all, (fw_c_state, fw_h_state)) 593 | tag_fw, _ = self.project(fw_cell_output, tag_dim) 594 | if self.event_vector_trans: 595 | tag_fw = tf.tanh(tag_fw) 596 | fw_tag_outputs.append(tag_fw) 597 | 598 | # backward decoder 599 | (bw_cell_output, (bw_c_state, bw_h_state)) = bw_lstm_cell(bw_input_all, (bw_c_state, bw_h_state)) 600 | tag_bw, _ = self.project(bw_cell_output, tag_dim) 601 | if self.event_vector_trans: 602 | tag_bw = tf.tanh(tag_bw) 603 | bw_tag_outputs.append(tag_bw) 604 | 605 | fw_tag_outputs = tf.reshape(tf.transpose(fw_tag_outputs, [1, 0, 2]), [self.batch_size, self.max_doc_len, 606 | self.max_seq_len, tag_dim]) 607 | bw_tag_outputs = tf.reshape(tf.transpose(bw_tag_outputs, [1, 0, 2]), [self.batch_size, self.max_doc_len, 608 | self.max_seq_len, tag_dim]) 609 | # recover the bw_tag_outputs in order 610 | bw_tag_outputs = bw_tag_outputs[:, :, ::-1, :] 611 | 612 | tag_final_fw += weight_decay ** layer_id * fw_tag_outputs 613 | tag_final_list_fw.append(fw_tag_outputs) 614 | tag_final_bw += weight_decay ** layer_id * bw_tag_outputs 615 | 
tag_final_list_bw.append(bw_tag_outputs) 616 | if self.num_tag_layers > 1: 617 | # -----------update event and semantic information for the previous and next setence---------- 618 | # info aggregation of current sentence, [batch_size, max_doc_len,event_info_h] 619 | info_event_sem_current_sent_fw = self.info_agg_layer(tf.concat([fw_tag_outputs, bw_tag_outputs], 620 | axis=-1), reverse_seq=False) 621 | # corresponds to the information of previous sentence 622 | info_event_sem_pre_sent = tf.concat([info_event_sem_init_sent, info_event_sem_current_sent_fw[:, :-1, :]], 623 | axis=1) 624 | info_event_sem_pre_sent = tf.reshape(info_event_sem_pre_sent, 625 | shape=[self.batch_size * self.max_doc_len, -1]) 626 | 627 | # find valid sentence firstly, and replace with emebedding of unk 628 | # if self.reverse_seq: 629 | info_event_sem_current_sent_bw = self.info_agg_layer(tf.concat([fw_tag_outputs, bw_tag_outputs], 630 | axis=-1), reverse_seq=self.reverse_seq) 631 | # else: 632 | # info_event_sem_current_sent_bw = info_event_sem_current_sent_fw 633 | 634 | valid_sent_ind = tf.sequence_mask(self.valid_sent_len, maxlen=self.max_doc_len, dtype=tf.float32) 635 | valid_sent_ind = tf.tile(tf.expand_dims(valid_sent_ind, axis=2), multiples=[1, 1, self.event_info_h]) 636 | info_event_sem_current_sent_bw = info_event_sem_current_sent_bw * valid_sent_ind + \ 637 | info_event_sem_mat0 * (1 - valid_sent_ind) 638 | 639 | # corresponds to the information of previous sentence 640 | info_event_sem_next_sent = tf.concat([info_event_sem_current_sent_bw[:, 1:, :], info_event_sem_init_sent], 641 | axis=1) 642 | info_event_sem_next_sent = tf.reshape(info_event_sem_next_sent, 643 | shape=[self.batch_size * self.max_doc_len, -1]) 644 | return tag_final_fw, tag_final_bw, tag_final_list_fw, tag_final_list_bw 645 | 646 | 647 | def agg_choice_cross_sent_ED(self, words_enc, tag_dim, num_tag_layers, weight_decay, agg_choice="lstm"): 648 | """ 649 | different choice of aggregation function 650 | agg_choice: average, lstm, or concat (concat state) 651 | """ 652 | # decoding layer 653 | # all layers share the same decoder layer 654 | # for the first decoder layer, we set c_{i-1} and c_{i+1} with unk_event_semantic 655 | lstm_outputs = tf.reshape(words_enc, shape=[self.batch_size * self.max_doc_len, self.max_seq_len, -1]) 656 | backward_lstm_outputs = lstm_outputs[:, ::-1, :] 657 | 658 | fw_lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(self.decode_h, forget_bias=0.0, state_is_tuple=True, 659 | name="forward_rnn_decoder", reuse=tf.AUTO_REUSE) 660 | fw_lstm_cell = tf.nn.rnn_cell.DropoutWrapper(fw_lstm_cell, output_keep_prob=1 - self.dropout_rate) 661 | 662 | bw_lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(self.decode_h, forget_bias=0.0, state_is_tuple=True, 663 | name="backward_rnn_decoder", reuse=tf.AUTO_REUSE) 664 | bw_lstm_cell = tf.nn.rnn_cell.DropoutWrapper(bw_lstm_cell, output_keep_prob=1 - self.dropout_rate) 665 | 666 | # mutli-tagging block 667 | tag_final_fw = tf.zeros(shape=[self.batch_size, self.max_doc_len, self.max_seq_len, tag_dim], dtype=tf.float32) 668 | tag_final_bw = tf.zeros(shape=[self.batch_size, self.max_doc_len, self.max_seq_len, tag_dim], dtype=tf.float32) 669 | tag_final_list_fw = [] 670 | tag_final_list_bw = [] 671 | 672 | fw_init_state = fw_lstm_cell.zero_state(self.batch_size * self.max_doc_len, dtype=tf.float32) 673 | bw_init_state = bw_lstm_cell.zero_state(self.batch_size * self.max_doc_len, dtype=tf.float32) 674 | # event and semantic information of the previous sentence and next sentence sentence 675 | if 
agg_choice == "lstm": 676 | info_event_sem_pre_sent = tf.tile(self.unk_event_semantic, 677 | multiples=[self.batch_size * self.max_doc_len, 1]) 678 | info_event_sem_next_sent = tf.tile(self.unk_event_semantic, 679 | multiples=[self.batch_size * self.max_doc_len, 1]) 680 | else: 681 | info_event_sem_pre_sent = tf.zeros(shape=[self.batch_size * self.max_doc_len, 1 * tag_dim]) 682 | info_event_sem_next_sent = tf.zeros(shape=[self.batch_size * self.max_doc_len, 1 * tag_dim]) 683 | 684 | # event and semantic information of the beginning sentence 685 | info_event_sem_init_sent = tf.tile(self.unk_event_semantic, multiples=[self.batch_size, 1]) 686 | info_event_sem_init_sent = tf.expand_dims(info_event_sem_init_sent, axis=1) 687 | info_event_sem_mat0 = tf.tile(tf.expand_dims(self.unk_event_semantic, axis=0), 688 | multiples=[self.batch_size, self.max_doc_len, 1]) 689 | 690 | with tf.variable_scope("bidirectional_rnn_decoding_layer", reuse=tf.AUTO_REUSE): 691 | for layer_id in range(num_tag_layers): 692 | # initialize for each layer 693 | fw_c_state, fw_h_state = fw_init_state 694 | bw_c_state, bw_h_state = bw_init_state 695 | tag_fw = tf.zeros([self.batch_size * self.max_doc_len, tag_dim]) 696 | tag_bw = tf.zeros([self.batch_size * self.max_doc_len, tag_dim]) 697 | fw_tag_outputs = [] 698 | bw_tag_outputs = [] 699 | for time_step in range(self.max_seq_len): 700 | if time_step > 0: 701 | tf.get_variable_scope().reuse_variables() 702 | # concat two event information 703 | if self.num_tag_layers > 1: 704 | if not self.context_info: 705 | fw_input_all = tf.concat([lstm_outputs[:, time_step, :], info_event_sem_pre_sent, tag_fw], 706 | axis=-1) 707 | bw_input_all = tf.concat([backward_lstm_outputs[:, time_step, :], 708 | info_event_sem_next_sent, tag_bw], axis=-1) 709 | else: 710 | two_info = tf.concat([info_event_sem_pre_sent, info_event_sem_next_sent], axis=-1) 711 | fw_input_all = tf.concat([lstm_outputs[:, time_step, :], two_info, tag_fw], axis=-1) 712 | bw_input_all = tf.concat([backward_lstm_outputs[:, time_step, :], two_info, tag_bw], 713 | axis=-1) 714 | else: 715 | fw_input_all = tf.concat([lstm_outputs[:, time_step, :], tag_fw], 716 | axis=-1) 717 | bw_input_all = tf.concat([backward_lstm_outputs[:, time_step, :], tag_bw], axis=-1) 718 | # forward decoder 719 | (fw_cell_output, (fw_c_state, fw_h_state)) = fw_lstm_cell(fw_input_all, (fw_c_state, fw_h_state)) 720 | tag_fw, _ = self.project(fw_cell_output, tag_dim) 721 | if self.event_vector_trans: 722 | tag_fw = tf.tanh(tag_fw) 723 | fw_tag_outputs.append(tag_fw) 724 | 725 | # backward decoder 726 | (bw_cell_output, (bw_c_state, bw_h_state)) = bw_lstm_cell(bw_input_all, (bw_c_state, bw_h_state)) 727 | tag_bw, _ = self.project(bw_cell_output, tag_dim) 728 | if self.event_vector_trans: 729 | tag_bw = tf.tanh(tag_bw) 730 | bw_tag_outputs.append(tag_bw) 731 | 732 | fw_tag_outputs = tf.reshape(tf.transpose(fw_tag_outputs, [1, 0, 2]), [self.batch_size, self.max_doc_len, 733 | self.max_seq_len, tag_dim]) 734 | bw_tag_outputs = tf.reshape(tf.transpose(bw_tag_outputs, [1, 0, 2]), [self.batch_size, self.max_doc_len, 735 | self.max_seq_len, tag_dim]) 736 | # recover the bw_tag_outputs in order 737 | bw_tag_outputs = bw_tag_outputs[:, :, ::-1, :] 738 | 739 | tag_final_fw += weight_decay ** layer_id * fw_tag_outputs 740 | tag_final_list_fw.append(fw_tag_outputs) 741 | tag_final_bw += weight_decay ** layer_id * bw_tag_outputs 742 | tag_final_list_bw.append(bw_tag_outputs) 743 | if self.num_tag_layers > 1: 744 | # -----------update event and semantic 
information for the previous and next setence---------- 745 | # info aggregation of current sentence, [batch_size, max_doc_len,event_info_h] 746 | if agg_choice == "lstm": 747 | info_event_sem_current_sent_fw = self.info_agg_layer(tf.concat([fw_tag_outputs, bw_tag_outputs], 748 | axis=-1), reverse_seq=False) 749 | # corresponds to the information of previous sentence 750 | info_event_sem_pre_sent = tf.concat( 751 | [info_event_sem_init_sent, info_event_sem_current_sent_fw[:, :-1, :]], 752 | axis=1) 753 | info_event_sem_pre_sent = tf.reshape(info_event_sem_pre_sent, 754 | shape=[self.batch_size * self.max_doc_len, -1]) 755 | 756 | # find valid sentence firstly, and replace with emebedding of unk 757 | info_event_sem_current_sent_bw = self.info_agg_layer( 758 | tf.concat([fw_tag_outputs, bw_tag_outputs],axis=-1), 759 | reverse_seq=self.reverse_seq) 760 | 761 | valid_sent_ind = tf.sequence_mask(self.valid_sent_len, maxlen=self.max_doc_len, dtype=tf.float32) 762 | valid_sent_ind = tf.tile(tf.expand_dims(valid_sent_ind, axis=2), 763 | multiples=[1, 1, self.event_info_h]) 764 | info_event_sem_current_sent_bw = info_event_sem_current_sent_bw * valid_sent_ind + \ 765 | info_event_sem_mat0 * (1 - valid_sent_ind) 766 | 767 | # corresponds to the information of previous sentence 768 | info_event_sem_next_sent = tf.concat( 769 | [info_event_sem_current_sent_bw[:, 1:, :], info_event_sem_init_sent], 770 | axis=1) 771 | info_event_sem_next_sent = tf.reshape(info_event_sem_next_sent, 772 | shape=[self.batch_size * self.max_doc_len, -1]) 773 | elif agg_choice == "average": 774 | # two_outputs = tf.concat([fw_tag_outputs, bw_tag_outputs], axis=-1) 775 | two_outputs = (fw_tag_outputs + bw_tag_outputs) / 2 776 | dim_tmp = two_outputs.get_shape().as_list()[-1] 777 | 778 | valid_sent_ind = tf.sequence_mask(self.valid_words_len, maxlen= self.max_seq_len) 779 | valid_sent_ind = tf.tile(tf.expand_dims(valid_sent_ind, axis=3), [1, 1, 1, dim_tmp]) 780 | avg_vect = tf.reduce_sum(two_outputs * tf.cast(valid_sent_ind, dtype=tf.float32), axis=-2) 781 | 782 | valid_words_inv = tf.tile(tf.expand_dims(1/self.valid_words_len, axis=2), 783 | [1, 1, dim_tmp]) 784 | avg_vect = avg_vect * tf.cast(valid_words_inv, dtype=tf.float32) 785 | pad_vect = tf.zeros(shape=[self.batch_size, 1, dim_tmp]) 786 | 787 | info_event_sem_pre_sent = tf.concat([pad_vect, avg_vect[:, :-1, :]], axis=1) 788 | info_event_sem_pre_sent = tf.reshape(info_event_sem_pre_sent, 789 | shape=[self.batch_size * self.max_doc_len, -1]) 790 | info_event_sem_next_sent = tf.concat([avg_vect[:, 1:, :], pad_vect], axis=1) 791 | info_event_sem_next_sent = tf.reshape(info_event_sem_next_sent, 792 | shape=[self.batch_size * self.max_doc_len, -1]) 793 | elif agg_choice == "concat": 794 | """ 795 | element-wise sum 796 | """ 797 | # two_outputs = tf.concat([fw_tag_outputs, bw_tag_outputs], axis=-1) 798 | two_outputs = (fw_tag_outputs + bw_tag_outputs) / 2 799 | dim_tmp = two_outputs.get_shape().as_list()[-1] 800 | 801 | valid_sent_ind = tf.one_hot(self.valid_words_len, depth=self.max_seq_len) 802 | valid_sent_ind = tf.tile(tf.expand_dims(valid_sent_ind, axis=3),[1,1,1,dim_tmp]) 803 | print("shape of two_outputs:\t",two_outputs.get_shape()) 804 | print("shape of valid_sent_ind:\t",valid_sent_ind.get_shape()) 805 | 806 | first_vect = two_outputs[:, :, 0, :] 807 | last_vect = tf.reduce_sum(two_outputs * valid_sent_ind,axis=-2) 808 | print("shape of last_vect:\t",last_vect.get_shape()) 809 | 810 | # sent_vect = tf.concat([first_vect, last_vect], axis=-1) 811 | sent_vect = 
(first_vect + last_vect) / 2 812 |                         pad_vect = tf.zeros(shape=[self.batch_size, 1, sent_vect.get_shape().as_list()[-1]]) 813 |                         info_event_sem_pre_sent = tf.concat([pad_vect, sent_vect[:, :-1, :]], axis=1) 814 |                         info_event_sem_pre_sent = tf.reshape(info_event_sem_pre_sent, 815 |                                                              shape=[self.batch_size * self.max_doc_len, -1]) 816 |                         info_event_sem_next_sent = tf.concat([sent_vect[:, 1:, :], pad_vect], axis=1) 817 |                         info_event_sem_next_sent = tf.reshape(info_event_sem_next_sent, 818 |                                                               shape=[self.batch_size * self.max_doc_len, -1]) 819 |                     else: 820 |                         raise ValueError("agg_choice is not supported!") 821 |         return tag_final_fw, tag_final_bw, tag_final_list_fw, tag_final_list_bw 822 | 823 | 824 |     def fully_connected_layer(self, tag_vects): 825 |         """ 826 |         fully connected layer 827 |         """ 828 |         tag_vects = tf.nn.dropout(tag_vects, keep_prob=1 - self.dropout_rate) 829 |         enc_dim = tag_vects.get_shape().as_list()[-1] 830 |         with tf.variable_scope("logits"): 831 |             W = tf.get_variable("W", 832 |                                 shape=[enc_dim, self.class_size], 833 |                                 dtype=tf.float32, 834 |                                 initializer=self.initializer 835 |                                 ) 836 |             b = tf.get_variable("b", 837 |                                 shape=[self.class_size], 838 |                                 dtype=tf.float32, 839 |                                 initializer=tf.zeros_initializer() 840 |                                 ) 841 |             output = tf.reshape(tag_vects, shape=[-1, enc_dim]) 842 |             logits_ed = tf.nn.xw_plus_b(output, W, b) 843 |             logits_ed = tf.reshape(logits_ed, [self.batch_size, self.max_doc_len, self.max_seq_len, self.class_size]) 844 |         return logits_ed 845 | 846 | 847 |     def loss_layer(self, tag_vects): 848 |         """ 849 |         define the loss function 850 |         """ 851 |         # projection layer 852 |         logits_ed = self.fully_connected_layer(tag_vects) 853 | 854 |         # calculate loss 855 |         with tf.variable_scope("loss"): 856 |             losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits_ed, labels=self.input_label_docs) 857 | 858 |             # mask invalid batches 859 |             mask_batches_0 = tf.sequence_mask(self.valid_batch, maxlen=self.batch_size) 860 |             valid_len_list = tf.boolean_mask(self.valid_words_len, mask_batches_0) 861 |             mask_batches = tf.tile(tf.expand_dims(tf.expand_dims(mask_batches_0, 1), 2), 862 |                                    multiples=[1, self.max_doc_len, self.max_seq_len]) 863 |             # mask invalid sents 864 |             mask_sents = tf.sequence_mask(self.valid_sent_len, maxlen=self.max_doc_len) 865 |             valid_len_list = tf.boolean_mask(valid_len_list, tf.boolean_mask(mask_sents, mask_batches_0)) 866 |             mask_sents = tf.tile(tf.expand_dims(mask_sents, axis=2), multiples=[1, 1, self.max_seq_len]) 867 | 868 |             # mask invalid words 869 |             mask_words = tf.sequence_mask(self.valid_words_len, maxlen=self.max_seq_len) 870 | 871 |             valid_ind = tf.cast(mask_batches, tf.float32) * tf.cast(mask_sents, tf.float32) * tf.cast(mask_words, 872 |                                                                                                       tf.float32) 873 |             losses = losses * valid_ind 874 | 875 |             # weight the loss of positive events 876 |             ind_id_O = tf.cast(tf.equal(self.input_label_docs, self.id_O), tf.float32) 877 |             losses = losses * ind_id_O + self.positive_weights * losses * (1 - ind_id_O) 878 | 879 |             loss = tf.reduce_sum(losses) / tf.reduce_sum(valid_ind) 880 | 881 |             mask_all_invalid = tf.cast(valid_ind, dtype=tf.bool) 882 | 883 |             label_pred = tf.boolean_mask(tf.cast(tf.argmax(logits_ed, axis=-1), tf.float32), mask_all_invalid) 884 |             label_true = tf.boolean_mask(tf.cast(self.input_label_docs, dtype=tf.float32), mask_all_invalid) 885 | 886 |             self.final_words_id = tf.boolean_mask(self.input_docs, mask_all_invalid) 887 | 888 |         return loss, label_true, label_pred, valid_len_list 889 | 890 | 891 | if __name__ == "__main__": 892 |     pass 893 | 
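Note on usage: MLBiNet builds its entire TensorFlow 1.x graph in the constructor, from the input placeholders down to self.loss. The following minimal smoke-test sketch is not part of the repository; the hyperparameter values, id_O=0, and class_size=67 are illustrative assumptions read off dict/event_types.txt, and train_MLBiNet.py remains the actual training entry point.

```python
# Minimal smoke test, assuming TensorFlow 1.x (the model relies on tf.contrib).
import numpy as np
import tensorflow as tf

from MLBiNet import MLBiNet

batch_size, max_doc_len, max_seq_len = 2, 4, 10
toy_emb = np.random.randn(100, 50).astype(np.float32)  # toy vocab: 100 words, dim 50

model = MLBiNet(encode_h=64, decode_h=64, tag_dim=100, event_info_h=64,
                word_emb_mat=toy_emb, batch_size=batch_size,
                max_doc_len=max_doc_len, max_seq_len=max_seq_len,
                id_O=0,                # assumes "O" has id 0, as in dict/event_types.txt
                num_tag_layers=2, weight_decay=1.0, reverse_seq=False,
                class_size=67)         # 33 B-/I- event type pairs plus "O"

train_op = tf.train.AdamOptimizer(1e-3).minimize(model.loss)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    feed = {
        model.input_docs: np.random.randint(0, 100, (batch_size, max_doc_len, max_seq_len)),
        model.input_label_docs: np.zeros((batch_size, max_doc_len, max_seq_len), np.int32),
        model.valid_batch: batch_size,  # all documents in the batch are valid
        model.valid_sent_len: np.full((batch_size,), max_doc_len, np.int32),
        model.valid_words_len: np.full((batch_size, max_doc_len), max_seq_len, np.int32),
        model.dropout_rate: 0.0,
        model.positive_weights: 1.0,
    }
    loss_val, _ = sess.run([model.loss, train_op], feed_dict=feed)
    print("toy loss:", loss_val)
```

The ner_docs_1 and ner_docs_2 placeholders only enter the graph when ner_size_1 / ner_size_2 are given, so they need not be fed here.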
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DocED 2 | This repository is the official implementation of the ACL 2021 paper [MLBiNet: A Cross-Sentence Collective Event Detection Network](https://arxiv.org/pdf/2105.09458v1.pdf). 3 | 4 | ## Requirements 5 | ### To install basic requirements: 6 | pip install -r requirements.txt 7 | 8 | ## Datasets 9 | ACE2005 can be found here: https://catalog.ldc.upenn.edu/LDC2006T06 10 | 11 | ## Basic training 12 | ### To evaluate a setting with several random trials, execute 13 | python run_experiments_multi.py 14 | 15 | #### Main hyperparameters in train_MLBiNet.py include (see the example invocation below): 16 | --tagging_mechanism, the mechanism used to model event inter-dependency; choose one of "forward_decoder", "backward_decoder" or "bidirectional_decoder" 17 | 18 | --num_tag_layers, number of tagging layers; 1 performs sentence-level ED, 2 additionally aggregates information from adjacent sentences, and so on 19 | 20 | --max_doc_len, maximum number of consecutive sentences extracted as a mini-document; typical values are 8 or 16 21 | 22 | --tag_dim, dimension of a uni-directional event tagging vector 23 | 24 | --self_att_not, whether to apply the self-attention mechanism in the sentence encoder 25 | 26 | ## Main results 27 | ### Overall performance on ACE2005 28 | ![image](https://user-images.githubusercontent.com/32415352/118842889-252e6900-b8fc-11eb-9de8-dba5f82377f4.png) 29 | 30 | ### Performance on detecting multiple events collectively 31 | ![image](https://user-images.githubusercontent.com/32415352/118843522-b9003500-b8fc-11eb-8e3f-759f6d37f98a.png) 32 | 33 | where 1/1 denotes sentences containing exactly one event and 1/n denotes sentences containing multiple events. 
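For reference, a typical single run combining the flags above might look like the following; the flag names come from the hyperparameter list, while the values shown are illustrative rather than the paper's tuned settings:

python train_MLBiNet.py --tagging_mechanism bidirectional_decoder --num_tag_layers 2 --max_doc_len 8 --tag_dim 100 --self_att_not 1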
34 | 35 | ### Performance of our proposed method with different multi-layer settings or decoder methods 36 | ![image](https://user-images.githubusercontent.com/32415352/118843910-11cfcd80-b8fd-11eb-965c-fbcde1319983.png) 37 | 38 | ## How to Cite 39 | 40 | ```bibtex 41 | @inproceedings{ACL2021_MLBiNet, 42 | author = {Dongfang Lou and 43 | Zhilin Liao and 44 | Shumin Deng and 45 | Ningyu Zhang and 46 | Huajun Chen}, 47 | title = {MLBiNet: A Cross-Sentence Collective Event Detection Network}, 48 | booktitle = {{ACL}}, 49 | publisher = {Association for Computational Linguistics}, 50 | year = {2021} 51 | } 52 | ``` 53 | -------------------------------------------------------------------------------- /ace_model_evaluation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding: utf-8 -*- 3 | 4 | 5 | def ace_pred_result_stat(filename): 6 | wlast_true = "" 7 | wlast_pred = "" 8 | true_dict = set() 9 | pred_dict = set() 10 | id_true_init, id_true_end, id_pred_init, id_pred_end = 0, 0, 0, 0 11 | with open(filename,encoding='utf-8',mode='r') as f: 12 | for i,line in enumerate(f): 13 | line = line.strip() 14 | if len(line) > 0: 15 | line_split = line.split('\t') 16 | ## true label stats 17 | if line_split[1].startswith("B-"): 18 | if wlast_true != '': 19 | true_dict.add('\t'.join([str(id_true_init), str(max(id_true_init, id_true_end)), wlast_true])) 20 | id_true_init = i # init id 21 | id_true_end = i # end id 22 | wlast_true = line_split[1][2:] 23 | elif "I-" + wlast_true != line_split[1]: # the last id is end of a trigger 24 | if wlast_true != '': # the last one is a trigger 25 | true_dict.add('\t'.join([str(id_true_init), str(max(id_true_init, id_true_end)), wlast_true])) 26 | wlast_true = "" 27 | elif "I-" + wlast_true == line_split[1]: # the same with the last event type 28 | id_true_end = i 29 | wlast_true = line_split[1][2:] 30 | else: # different from last label, and not start with B- 31 | if wlast_true != '': 32 | true_dict.add('\t'.join([str(id_true_init), str(max(id_true_init, id_true_end)), wlast_true])) 33 | wlast_true = "" 34 | 35 | ## pred label stats 36 | if line_split[2].startswith("B-"): 37 | if wlast_pred != '': 38 | pred_dict.add('\t'.join([str(id_pred_init), str(max(id_pred_init, id_pred_end)), wlast_pred])) 39 | id_pred_init = i 40 | id_pred_end = i 41 | wlast_pred = line_split[2][2:] 42 | elif "I-" + wlast_pred != line_split[2]: # begging of new trigger 43 | if wlast_pred != '': 44 | pred_dict.add('\t'.join([str(id_pred_init), str(max(id_pred_init, id_pred_end)), wlast_pred])) 45 | wlast_pred = "" 46 | elif "I-" + wlast_pred == line_split[2]: 47 | id_pred_end = i 48 | wlast_pred = line_split[2][2:] 49 | else: 50 | if wlast_pred != '': 51 | pred_dict.add('\t'.join([str(id_pred_init), str(max(id_pred_init, id_pred_end)), wlast_pred])) 52 | wlast_pred = "" 53 | else: 54 | if wlast_true != '': 55 | true_dict.add('\t'.join([str(id_true_init), str(max(id_true_init, id_true_end)), wlast_true])) 56 | if wlast_pred != '': 57 | pred_dict.add('\t'.join([str(id_pred_init), str(max(id_pred_init, id_pred_end)), wlast_pred])) 58 | wlast_true = "" 59 | wlast_pred = "" 60 | 61 | true_cnt = len(true_dict) 62 | pred_cnt = len(pred_dict) 63 | acc_cnt = len(pred_dict & true_dict) 64 | prec_tmp = acc_cnt / (pred_cnt + 1e-8) 65 | recall_tmp = acc_cnt / (true_cnt + 1e-8) 66 | f1_tmp = 2 * prec_tmp * recall_tmp / (prec_tmp + recall_tmp + 1e-8) 67 | return prec_tmp,recall_tmp,f1_tmp 68 | 69 | 70 | def write_2_file(filename, ED_2_id, 
label_true_list,valid_len_list,words_sents, label_pred_list, id_2_vocab): 71 | id_to_ner_final = {v: u for u, v in ED_2_id.items()} 72 | with open(filename, encoding='utf-8', mode='w') as f: 73 | init_step = 0 74 | k = 0 75 | len_all = len(label_true_list) 76 | while init_step < len_all: 77 | end_step = init_step + valid_len_list[k] 78 | words_tmp = words_sents[init_step:end_step] 79 | pred_label_tmp_tmp = label_pred_list[init_step:end_step] 80 | true_label_tmp_tmp = label_true_list[init_step:end_step] 81 | for i in range(len(words_tmp)): 82 | f.write('\t'.join([id_2_vocab[words_tmp[i]], 83 | id_to_ner_final[true_label_tmp_tmp[i]], 84 | id_to_ner_final[pred_label_tmp_tmp[i]]]) + '\n') 85 | f.write('\n') 86 | init_step = end_step 87 | k += 1 88 | 89 | 90 | if __name__ == "__main__": 91 | pass 92 | 93 | 94 | 95 | 96 | 97 | 98 | -------------------------------------------------------------------------------- /data-ACE/example_new.dev: -------------------------------------------------------------------------------- 1 | BEGIN CNN_IP_20030405.1600.01-1 O O O 2 | VIDEOTAPE CNN_IP_20030405.1600.01-1 O O O 3 | -RRB- CNN_IP_20030405.1600.01-1 O O O 4 | MILES CNN_IP_20030405.1600.01-1 B-1_PER B-2_Individual O 5 | O'BRIEN CNN_IP_20030405.1600.01-1 I-1_PER I-2_Individual O 6 | , CNN_IP_20030405.1600.01-1 O O O 7 | CNN CNN_IP_20030405.1600.01-1 B-1_ORG B-2_Media O 8 | CORRESPONDENT CNN_IP_20030405.1600.01-1 B-1_PER B-2_Individual O 9 | -LRB- CNN_IP_20030405.1600.01-1 O O O 10 | voice-over CNN_IP_20030405.1600.01-1 O O O 11 | -RRB- CNN_IP_20030405.1600.01-1 O O O 12 | Seven-eleven CNN_IP_20030405.1600.01-1 B-1_Time B-2_Time O 13 | a.m. CNN_IP_20030405.1600.01-1 I-1_Time I-2_Time O 14 | Eastern CNN_IP_20030405.1600.01-1 I-1_Time I-2_Time O 15 | , CNN_IP_20030405.1600.01-1 O O O 16 | 4:11 CNN_IP_20030405.1600.01-1 B-1_Time B-2_Time O 17 | p.m. 
CNN_IP_20030405.1600.01-1 I-1_Time I-2_Time O 18 | in CNN_IP_20030405.1600.01-1 I-1_Time I-2_Time O 19 | Iraq CNN_IP_20030405.1600.01-1 B-1_GPE B-2_Nation O 20 | -------------------------------------------------------------------------------- /data-ACE/example_new.test: -------------------------------------------------------------------------------- 1 | WASHINGTON APW_ENG_20030304.0555 B-1_GPE B-2_Population_Center O 2 | -LRB- APW_ENG_20030304.0555 O O O 3 | AP APW_ENG_20030304.0555 B-1_ORG B-2_Media O 4 | -RRB- APW_ENG_20030304.0555 O O O 5 | With APW_ENG_20030304.0555 O O O 6 | opposition APW_ENG_20030304.0555 O O O 7 | hardening APW_ENG_20030304.0555 O O O 8 | , APW_ENG_20030304.0555 O O O 9 | the APW_ENG_20030304.0555 O O O 10 | White APW_ENG_20030304.0555 B-1_GPE B-2_Nation O 11 | House APW_ENG_20030304.0555 I-1_GPE I-2_Nation O 12 | left APW_ENG_20030304.0555 O O O 13 | open APW_ENG_20030304.0555 O O O 14 | the APW_ENG_20030304.0555 O O O 15 | possibility APW_ENG_20030304.0555 O O O 16 | Tuesday APW_ENG_20030304.0555 B-1_Time B-2_Time O 17 | that APW_ENG_20030304.0555 O O O 18 | it APW_ENG_20030304.0555 B-1_GPE B-2_Nation O 19 | would APW_ENG_20030304.0555 O O O 20 | not APW_ENG_20030304.0555 O O O 21 | seek APW_ENG_20030304.0555 O O O 22 | a APW_ENG_20030304.0555 O O O 23 | United APW_ENG_20030304.0555 B-1_ORG B-2_Non_Governmental O 24 | Nations APW_ENG_20030304.0555 I-1_ORG I-2_Non_Governmental O 25 | vote APW_ENG_20030304.0555 O O O 26 | on APW_ENG_20030304.0555 O O O 27 | its APW_ENG_20030304.0555 B-1_GPE B-2_Nation O 28 | war-making APW_ENG_20030304.0555 O O O 29 | resolution APW_ENG_20030304.0555 O O O 30 | if APW_ENG_20030304.0555 O O O 31 | the APW_ENG_20030304.0555 O O O 32 | measure APW_ENG_20030304.0555 O O O 33 | was APW_ENG_20030304.0555 O O O 34 | headed APW_ENG_20030304.0555 O O O 35 | for APW_ENG_20030304.0555 O O O 36 | defeat APW_ENG_20030304.0555 O O O 37 | '' APW_ENG_20030304.0555 O O O 38 | 39 | The APW_ENG_20030304.0555 O O O 40 | vote APW_ENG_20030304.0555 O O O 41 | is APW_ENG_20030304.0555 O O O 42 | desirable APW_ENG_20030304.0555 O O O 43 | -------------------------------------------------------------------------------- /data-ACE/example_new.train: -------------------------------------------------------------------------------- 1 | the CNN_ENG_20030512_190454.7 O O O 2 | story CNN_ENG_20030512_190454.7 O O O 3 | of CNN_ENG_20030512_190454.7 O O O 4 | a CNN_ENG_20030512_190454.7 O O O 5 | woman CNN_ENG_20030512_190454.7 B-1_PER B-2_Individual O 6 | in CNN_ENG_20030512_190454.7 O O O 7 | an CNN_ENG_20030512_190454.7 O O O 8 | east CNN_ENG_20030512_190454.7 B-1_LOC B-2_Region_General O 9 | texas CNN_ENG_20030512_190454.7 I-1_LOC I-2_Region_General O 10 | jail CNN_ENG_20030512_190454.7 O O O 11 | cell CNN_ENG_20030512_190454.7 B-1_FAC B-2_Subarea_Facility O 12 | tonight CNN_ENG_20030512_190454.7 B-1_Time B-2_Time O 13 | 14 | we CNN_ENG_20030512_190454.7 B-1_PER B-2_Group O 15 | 're CNN_ENG_20030512_190454.7 O O O 16 | told CNN_ENG_20030512_190454.7 O O O 17 | sometimes CNN_ENG_20030512_190454.7 O O O 18 | she CNN_ENG_20030512_190454.7 B-1_PER B-2_Individual O 19 | sits CNN_ENG_20030512_190454.7 O O O 20 | in CNN_ENG_20030512_190454.7 O O O 21 | the CNN_ENG_20030512_190454.7 O O O 22 | fetal CNN_ENG_20030512_190454.7 O O O 23 | position CNN_ENG_20030512_190454.7 O O O 24 | , CNN_ENG_20030512_190454.7 O O O 25 | other CNN_ENG_20030512_190454.7 O O O 26 | times CNN_ENG_20030512_190454.7 O O O 27 | singing CNN_ENG_20030512_190454.7 O O O 28 | gospel 
CNN_ENG_20030512_190454.7 O O O 29 | music CNN_ENG_20030512_190454.7 O O O 30 | 31 | occasionally CNN_ENG_20030512_190454.7 O O O 32 | she CNN_ENG_20030512_190454.7 B-1_PER B-2_Individual O 33 | 'll CNN_ENG_20030512_190454.7 O O O 34 | pray CNN_ENG_20030512_190454.7 O O O 35 | , CNN_ENG_20030512_190454.7 O O O 36 | sometimes CNN_ENG_20030512_190454.7 O O O 37 | cries CNN_ENG_20030512_190454.7 O O O 38 | hysterically CNN_ENG_20030512_190454.7 O O O 39 | 40 | her CNN_ENG_20030512_190454.7 B-1_PER B-2_Individual O 41 | name CNN_ENG_20030512_190454.7 O O O 42 | is CNN_ENG_20030512_190454.7 O O O 43 | deanna CNN_ENG_20030512_190454.7 B-1_PER B-2_Individual O 44 | lejeune CNN_ENG_20030512_190454.7 I-1_PER I-2_Individual O 45 | laney CNN_ENG_20030512_190454.7 I-1_PER I-2_Individual O 46 | 47 | she CNN_ENG_20030512_190454.7 B-1_PER B-2_Individual O 48 | 's CNN_ENG_20030512_190454.7 O O O 49 | accused CNN_ENG_20030512_190454.7 O O O 50 | of CNN_ENG_20030512_190454.7 O O O 51 | beating CNN_ENG_20030512_190454.7 O O O 52 | two CNN_ENG_20030512_190454.7 B-1_PER B-2_Group O 53 | of CNN_ENG_20030512_190454.7 O O O 54 | her CNN_ENG_20030512_190454.7 B-1_PER B-2_Individual O 55 | three CNN_ENG_20030512_190454.7 O O O 56 | children CNN_ENG_20030512_190454.7 B-1_PER B-2_Group O 57 | to CNN_ENG_20030512_190454.7 O O O 58 | death CNN_ENG_20030512_190454.7 O O B-Life_Die 59 | because CNN_ENG_20030512_190454.7 O O O 60 | , CNN_ENG_20030512_190454.7 O O O 61 | she CNN_ENG_20030512_190454.7 B-1_PER B-2_Individual O 62 | says CNN_ENG_20030512_190454.7 O O O 63 | , CNN_ENG_20030512_190454.7 O O O 64 | god CNN_ENG_20030512_190454.7 B-1_PER B-2_Individual O 65 | told CNN_ENG_20030512_190454.7 O O O 66 | her CNN_ENG_20030512_190454.7 B-1_PER B-2_Individual O 67 | to CNN_ENG_20030512_190454.7 O O O 68 | 69 | -------------------------------------------------------------------------------- /dict/dict_gen.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding: utf-8 -*- 3 | 4 | """ 5 | @version: 6 | @software:PyCharm 7 | @file:dict_gen.py 8 | @time:2020/10/27 16:29 9 | """ 10 | 11 | def event_dict_gen(): 12 | files = ["../data-ACE/example_new.train", "../data-ACE/example_new.dev", "../data-ACE/example_new.test"] 13 | vocab_set = set() 14 | vocab_set_ner_1 = set() 15 | vocab_set_ner_2 = set() 16 | for filei in files: 17 | with open(filei, encoding="utf-8", mode="r") as f: 18 | for line in f: 19 | line = line.strip().split(" ") 20 | if len(line) == 5: 21 | vocab_set.add(line[-1]) 22 | vocab_set_ner_1.add(line[-3]) 23 | vocab_set_ner_2.add(line[-2]) 24 | vocab_list = list(vocab_set) 25 | vocab_list = [x for x in vocab_list if x == "O" or x.startswith("B-")] 26 | vocab_list += ["I-" + x[2:] for x in vocab_list if x.startswith("B-")] 27 | vocab_list = sorted(vocab_list, key=lambda x: x, reverse=True) 28 | with open("event_types.txt", encoding="utf-8", mode="w") as fw: 29 | for line in vocab_list: 30 | fw.write(line + "\n") 31 | 32 | vocab_list_ner_1 = list(vocab_set_ner_1) 33 | vocab_list_ner_1 = sorted(vocab_list_ner_1, key=lambda x: x, reverse=True) 34 | vocab_set_ner_2 = list(vocab_set_ner_2) 35 | vocab_set_ner_2 = sorted(vocab_set_ner_2, key=lambda x: x, reverse=True) 36 | 37 | with open("ner_1.txt", encoding="utf-8", mode="w") as fw: 38 | for line in vocab_list_ner_1: 39 | fw.write(line + "\n") 40 | 41 | with open("ner_2.txt", encoding="utf-8", mode="w") as fw: 42 | for line in vocab_set_ner_2: 43 | fw.write(line + "\n") 44 | 45 | 46 | if __name__ == 
"__main__": 47 | event_dict_gen() 48 | 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /dict/event_types.txt: -------------------------------------------------------------------------------- 1 | O 2 | I-Transaction_Transfer_Ownership 3 | I-Transaction_Transfer_Money 4 | I-Personnel_Start_Position 5 | I-Personnel_Nominate 6 | I-Personnel_End_Position 7 | I-Personnel_Elect 8 | I-Movement_Transport 9 | I-Life_Marry 10 | I-Life_Injure 11 | I-Life_Divorce 12 | I-Life_Die 13 | I-Life_Be_Born 14 | I-Justice_Trial_Hearing 15 | I-Justice_Sue 16 | I-Justice_Sentence 17 | I-Justice_Release_Parole 18 | I-Justice_Pardon 19 | I-Justice_Fine 20 | I-Justice_Extradite 21 | I-Justice_Execute 22 | I-Justice_Convict 23 | I-Justice_Charge_Indict 24 | I-Justice_Arrest_Jail 25 | I-Justice_Appeal 26 | I-Justice_Acquit 27 | I-Contact_Phone_Write 28 | I-Contact_Meet 29 | I-Conflict_Demonstrate 30 | I-Conflict_Attack 31 | I-Business_Start_Org 32 | I-Business_Merge_Org 33 | I-Business_End_Org 34 | I-Business_Declare_Bankruptcy 35 | B-Transaction_Transfer_Ownership 36 | B-Transaction_Transfer_Money 37 | B-Personnel_Start_Position 38 | B-Personnel_Nominate 39 | B-Personnel_End_Position 40 | B-Personnel_Elect 41 | B-Movement_Transport 42 | B-Life_Marry 43 | B-Life_Injure 44 | B-Life_Divorce 45 | B-Life_Die 46 | B-Life_Be_Born 47 | B-Justice_Trial_Hearing 48 | B-Justice_Sue 49 | B-Justice_Sentence 50 | B-Justice_Release_Parole 51 | B-Justice_Pardon 52 | B-Justice_Fine 53 | B-Justice_Extradite 54 | B-Justice_Execute 55 | B-Justice_Convict 56 | B-Justice_Charge_Indict 57 | B-Justice_Arrest_Jail 58 | B-Justice_Appeal 59 | B-Justice_Acquit 60 | B-Contact_Phone_Write 61 | B-Contact_Meet 62 | B-Conflict_Demonstrate 63 | B-Conflict_Attack 64 | B-Business_Start_Org 65 | B-Business_Merge_Org 66 | B-Business_End_Org 67 | B-Business_Declare_Bankruptcy 68 | -------------------------------------------------------------------------------- /dict/ner_1.txt: -------------------------------------------------------------------------------- 1 | O 2 | I-1_WEA 3 | I-1_VEH 4 | I-1_Time 5 | I-1_Sentence 6 | I-1_PER 7 | I-1_ORG 8 | I-1_Numeric 9 | I-1_LOC 10 | I-1_Job_Title 11 | I-1_GPE 12 | I-1_FAC 13 | I-1_Crime 14 | I-1_Contact_Info 15 | B-1_WEA 16 | B-1_VEH 17 | B-1_Time 18 | B-1_Sentence 19 | B-1_PER 20 | B-1_ORG 21 | B-1_Numeric 22 | B-1_LOC 23 | B-1_Job_Title 24 | B-1_GPE 25 | B-1_FAC 26 | B-1_Crime 27 | B-1_Contact_Info 28 | -------------------------------------------------------------------------------- /dict/ner_2.txt: -------------------------------------------------------------------------------- 1 | O 2 | I-2_Water_Body 3 | I-2_Water 4 | I-2_Underspecified 5 | I-2_Time 6 | I-2_Subarea_Vehicle 7 | I-2_Subarea_Facility 8 | I-2_State_or_Province 9 | I-2_Sports 10 | I-2_Special 11 | I-2_Shooting 12 | I-2_Sentence 13 | I-2_Religious 14 | I-2_Region_International 15 | I-2_Region_General 16 | I-2_Projectile 17 | I-2_Population_Center 18 | I-2_Path 19 | I-2_Numeric 20 | I-2_Non_Governmental 21 | I-2_Nation 22 | I-2_Medical_Science 23 | I-2_Media 24 | I-2_Land_Region_Natural 25 | I-2_Land 26 | I-2_Job_Title 27 | I-2_Individual 28 | I-2_Indeterminate 29 | I-2_Group 30 | I-2_Government 31 | I-2_GPE_Cluster 32 | I-2_Exploding 33 | I-2_Entertainment 34 | I-2_Educational 35 | I-2_Crime 36 | I-2_County_or_District 37 | I-2_Continent 38 | I-2_Contact_Info 39 | I-2_Commercial 40 | I-2_Celestial 41 | I-2_Building_Grounds 42 | I-2_Biological 43 | I-2_Airport 44 | I-2_Air 45 | I-2_Address 46 | 
B-2_Water_Body 47 | B-2_Water 48 | B-2_Underspecified 49 | B-2_Time 50 | B-2_Subarea_Vehicle 51 | B-2_Subarea_Facility 52 | B-2_State_or_Province 53 | B-2_Sports 54 | B-2_Special 55 | B-2_Shooting 56 | B-2_Sharp 57 | B-2_Sentence 58 | B-2_Religious 59 | B-2_Region_International 60 | B-2_Region_General 61 | B-2_Projectile 62 | B-2_Population_Center 63 | B-2_Plant 64 | B-2_Path 65 | B-2_Numeric 66 | B-2_Nuclear 67 | B-2_Non_Governmental 68 | B-2_Nation 69 | B-2_Medical_Science 70 | B-2_Media 71 | B-2_Land_Region_Natural 72 | B-2_Land 73 | B-2_Job_Title 74 | B-2_Individual 75 | B-2_Indeterminate 76 | B-2_Group 77 | B-2_Government 78 | B-2_GPE_Cluster 79 | B-2_Exploding 80 | B-2_Entertainment 81 | B-2_Educational 82 | B-2_Crime 83 | B-2_County_or_District 84 | B-2_Continent 85 | B-2_Contact_Info 86 | B-2_Commercial 87 | B-2_Chemical 88 | B-2_Celestial 89 | B-2_Building_Grounds 90 | B-2_Boundary 91 | B-2_Blunt 92 | B-2_Biological 93 | B-2_Airport 94 | B-2_Air 95 | B-2_Address 96 | -------------------------------------------------------------------------------- /dict/vocab.txt: -------------------------------------------------------------------------------- 1 | EU 2 | German 3 | call 4 | to 5 | boycott 6 | British 7 | . 8 | Peter 9 | Blackburn 10 | BRUSSELS 11 | 1996-08-22 12 | The 13 | European 14 | Commission 15 | said 16 | on 17 | Thursday 18 | it 19 | with 20 | advice 21 | consumers 22 | until 23 | scientists 24 | determine 25 | whether 26 | mad 27 | cow 28 | disease 29 | can 30 | be 31 | sheep 32 | Germany 33 | 's 34 | representative 35 | the 36 | Union 37 | veterinary 38 | committee 39 | Wednesday 40 | should 41 | buy 42 | from 43 | countries 44 | other 45 | than 46 | Britain 47 | scientific 48 | was 49 | " 50 | We 51 | do 52 | n't 53 | support 54 | any 55 | such 56 | because 57 | we 58 | see 59 | grounds 60 | for 61 | , 62 | chief 63 | spokesman 64 | van 65 | der 66 | told 67 | a 68 | news 69 | briefing 70 | He 71 | further 72 | study 73 | required 74 | and 75 | if 76 | found 77 | that 78 | action 79 | needed 80 | taken 81 | by 82 | proposal 83 | last 84 | month 85 | Commissioner 86 | Fischler 87 | ban 88 | spinal 89 | human 90 | animal 91 | food 92 | chains 93 | highly 94 | specific 95 | move 96 | health 97 | proposed 98 | measures 99 | after 100 | reports 101 | France 102 | under 103 | conditions 104 | could 105 | contract 106 | ( 107 | BSE 108 | ) 109 | -- 110 | But 111 | agreed 112 | review 113 | his 114 | standing 115 | officials 116 | questioned 117 | as 118 | there 119 | only 120 | slight 121 | risk 122 | Spanish 123 | Minister 124 | de 125 | had 126 | earlier 127 | accused 128 | at 129 | an 130 | farm 131 | ministers 132 | ' 133 | meeting 134 | of 135 | through 136 | dangerous 137 | Only 138 | backed 139 | are 140 | due 141 | issue 142 | early 143 | next 144 | make 145 | senior 146 | have 147 | long 148 | been 149 | known 150 | similar 151 | which 152 | is 153 | believed 154 | cattle 155 | feed 156 | containing 157 | waste 158 | farmers 159 | denied 160 | danger 161 | their 162 | but 163 | expressed 164 | concern 165 | government 166 | avoid 167 | might 168 | across 169 | Europe 170 | What 171 | extremely 172 | how 173 | going 174 | take 175 | lead 176 | National 177 | chairman 178 | John 179 | Lloyd 180 | Jones 181 | radio 182 | Bonn 183 | has 184 | led 185 | efforts 186 | public 187 | consumer 188 | confidence 189 | in 190 | March 191 | report 192 | illness 193 | eating 194 | beef 195 | imported 196 | year 197 | nearly 198 | half 199 | total 200 | imports 201 | It 202 | brought 
203 | tonnes 204 | some 205 | 10 206 | percent 207 | overall 208 | -DOCSTART- 209 | Hendrix 210 | draft 211 | almost 212 | $ 213 | LONDON 214 | A 215 | rare 216 | U.S. 217 | sold 218 | auction 219 | late 220 | Florida 221 | restaurant 222 | paid 223 | pounds 224 | no 225 | telling 226 | piece 227 | London 228 | hotel 229 | At 230 | end 231 | January 232 | English 233 | city 234 | Nottingham 235 | he 236 | threw 237 | sheet 238 | paper 239 | into 240 | audience 241 | where 242 | also 243 | snapped 244 | up 245 | 16 246 | items 247 | were 248 | put 249 | former 250 | who 251 | lived 252 | him 253 | They 254 | included 255 | black 256 | mother 257 | box 258 | used 259 | store 260 | drugs 261 | Australian 262 | bought 263 | died 264 | aged 265 | 27 266 | China 267 | says 268 | Taiwan 269 | atmosphere 270 | talks 271 | BEIJING 272 | Taipei 273 | visit 274 | Ukraine 275 | President 276 | this 277 | week 278 | Beijing 279 | hours 280 | Chinese 281 | state 282 | media 283 | time 284 | right 285 | political 286 | Foreign 287 | Ministry 288 | Reuters 289 | : 290 | necessary 291 | opening 292 | authorities 293 | State 294 | quoted 295 | top 296 | negotiator 297 | Tang 298 | visiting 299 | group 300 | rivals 301 | hold 302 | Now 303 | two 304 | sides 305 | ... 306 | overseas 307 | edition 308 | People 309 | Daily 310 | saying 311 | foreign 312 | ministry 313 | interview 314 | read 315 | comments 316 | gave 317 | details 318 | why 319 | considered 320 | renegade 321 | province 322 | opposed 323 | all 324 | gain 325 | international 326 | rival 327 | island 328 | towards 329 | goal 330 | held 331 | set 332 | official 333 | agency 334 | executive 335 | vice 336 | Association 337 | July 338 | car 339 | registrations 340 | pct 341 | yr 342 | / 343 | FRANKFURT 344 | motor 345 | vehicles 346 | period 347 | Federal 348 | office 349 | new 350 | cars 351 | registered 352 | 1996 353 | passenger 354 | figures 355 | increase 356 | decline 357 | 1995 358 | registration 359 | rose 360 | growth 361 | partly 362 | increased 363 | number 364 | buying 365 | abroad 366 | while 367 | manufacturers 368 | domestic 369 | demand 370 | weak 371 | federal 372 | posted 373 | gains 374 | numbers 375 | AG 376 | won 377 | slightly 378 | more 379 | quarter 380 | together 381 | General 382 | came 383 | second 384 | place 385 | figure 386 | Third 387 | Ford 388 | or 389 | fewer 390 | compared 391 | fell 392 | TO 393 | PM 394 | FOR 395 | ATHENS 396 | Greek 397 | party 398 | bureau 399 | green 400 | light 401 | Prime 402 | Costas 403 | Simitis 404 | snap 405 | elections 406 | its 407 | general 408 | secretary 409 | reporters 410 | announcement 411 | cabinet 412 | later 413 | Dimitris 414 | Kontogiannis 415 | Athens 416 | Newsroom 417 | +301 418 | 3311812-4 419 | sets 420 | C$ 421 | 100 422 | million 423 | bond 424 | following 425 | announced 426 | manager 427 | Toronto 428 | ISS 429 | PRICE 430 | PAY 431 | DATE 432 | BP 433 | MOODY 434 | = 435 | S&P 436 | US 437 | UK 438 | GERMAN 439 | 7.0 440 | PCT 441 | 2001 442 | NOTES 443 | +44 444 | 171 445 | 542 446 | 300 447 | 1999 448 | Lehman 449 | International 450 | NATIONAL 451 | - 452 | LAST 453 | FIRST 454 | 2 455 | ENGLISH 456 | 5 457 | Port 458 | Syria 459 | Service 460 | Aug 461 | waiting 462 | 24 463 | Israel 464 | plays 465 | down 466 | fears 467 | war 468 | JERUSALEM 469 | peace 470 | current 471 | between 472 | appeared 473 | storm 474 | ambassador 475 | Washington 476 | conducted 477 | negotiations 478 | Radio 479 | looked 480 | like 481 | Damascus 482 | wanted 483 | talk 484 | 
rather 485 | fight 486 | appears 487 | me 488 | still 489 | they 490 | definitely 491 | tense 492 | assessment 493 | here 494 | term 495 | will 496 | replaced 497 | Israeli 498 | envoy 499 | Egypt 500 | right-wing 501 | politician 502 | sent 503 | message 504 | via 505 | committed 506 | open 507 | without 508 | what 509 | called 510 | campaign 511 | against 512 | television 513 | reported 514 | recently 515 | test 516 | fired 517 | arms 518 | ready 519 | enter 520 | David 521 | Levy 522 | since 523 | Benjamin 524 | Netanyahu 525 | took 526 | June 527 | retain 528 | captured 529 | Middle 530 | East 531 | over 532 | 1991 533 | despite 534 | previous 535 | Peace 536 | February 537 | coming 538 | out 539 | bad 540 | not 541 | good 542 | full 543 | must 544 | very 545 | those 546 | become 547 | prisoners 548 | expect 549 | face 550 | answer 551 | our 552 | want 553 | God 554 | No 555 | one 556 | Two 557 | signal 558 | source 559 | confirm 560 | Cairo 561 | United 562 | States 563 | Moscow 564 | Polish 565 | diplomat 566 | nurses 567 | Libya 568 | trying 569 | return 570 | home 571 | working 572 | North 573 | African 574 | country 575 | This 576 | true 577 | Up 578 | today 579 | kept 580 | her 581 | received 582 | embassy 583 | charge 584 | telephone 585 | Poland 586 | labour 587 | would 588 | send 589 | team 590 | probe 591 | prompted 592 | about 593 | work 594 | estimated 595 | 800 596 | Iranian 597 | opposition 598 | leaders 599 | meet 600 | Baghdad 601 | Hassan 602 | BAGHDAD 603 | An 604 | exile 605 | based 606 | Iraq 607 | vowed 608 | Iran 609 | Kurdish 610 | rebels 611 | attacked 612 | troops 613 | inside 614 | statement 615 | leader 616 | met 617 | Kurdistan 618 | Democratic 619 | Party 620 | rebel 621 | Kurds 622 | continue 623 | stand 624 | side 625 | movement 626 | level 627 | cooperation 628 | heavily 629 | targets 630 | northern 631 | pursuit 632 | guerrillas 633 | Iraqi 634 | areas 635 | outside 636 | control 637 | Patriotic 638 | PUK 639 | KDP 640 | main 641 | factions 642 | forces 643 | ousted 644 | Kuwait 645 | Gulf 646 | War 647 | parties 648 | broke 649 | weekend 650 | most 651 | serious 652 | fighting 653 | ceasefire 654 | shelling 655 | positions 656 | region 657 | near 658 | border 659 | days 660 | killed 661 | wounded 662 | attack 663 | Both 664 | Turkey 665 | air 666 | land 667 | strikes 668 | own 669 | U.S.-led 670 | force 671 | southern 672 | possible 673 | attacks 674 | Saudi 675 | riyal 676 | rates 677 | steady 678 | quiet 679 | summer 680 | trade 681 | spot 682 | dollar 683 | deposit 684 | mainly 685 | dealers 686 | kingdom 687 | There 688 | changes 689 | market 690 | dealer 691 | three 692 | months 693 | 1/2 694 | six 695 | 5/8 696 | funds 697 | Arafat 698 | flight 699 | West 700 | Bank 701 | Palestinian 702 | Yasser 703 | permission 704 | fly 705 | territory 706 | ending 707 | brief 708 | crisis 709 | problem 710 | president 711 | aircraft 712 | pass 713 | expected 714 | travel 715 | before 716 | Monday 717 | Abu 718 | scheduled 719 | prime 720 | minister 721 | Shimon 722 | Peres 723 | town 724 | Ramallah 725 | venue 726 | changed 727 | Gaza 728 | stop 729 | keeping 730 | cancelled 731 | PLO 732 | civilian 733 | affairs 734 | Allenby 735 | Bridge 736 | crossing 737 | Jordan 738 | decided 739 | flying 740 | lifted 741 | schedule 742 | free 743 | Palestinians 744 | barred 745 | planned 746 | helicopter 747 | attempt 748 | defeated 749 | May 750 | Afghan 751 | UAE 752 | Taleban 753 | guards 754 | DUBAI 755 | Three 756 | Arab 757 | Russian 758 | hostages 759 | escaped 
760 | militia 761 | Afghanistan 762 | few 763 | Our 764 | Their 765 | them 766 | documents 767 | added 768 | Islamic 769 | seven 770 | Friday 771 | board 772 | Kandahar 773 | hand 774 | Red 775 | possibly 776 | Tuesday 777 | When 778 | asked 779 | back 780 | capital 781 | Kabul 782 | That 783 | headquarters 784 | controlled 785 | Rabbani 786 | men 787 | currently 788 | did 789 | Russians 790 | firm 791 | republic 792 | hostage 793 | forced 794 | cargo 795 | plane 796 | August 797 | shipment 798 | Albania 799 | evidence 800 | military 801 | crew 802 | diplomatic 803 | attempts 804 | failed 805 | armed 806 | doing 807 | regular 808 | 76 809 | left 810 | Sunday 811 | Saddam 812 | meets 813 | Russia 814 | Zhirinovsky 815 | Hussein 816 | Vladimir 817 | maintain 818 | newspapers 819 | during 820 | parliament 821 | calling 822 | immediate 823 | lifting 824 | embargo 825 | imposed 826 | 1990 827 | press 828 | help 829 | U.N. 830 | sanctions 831 | blamed 832 | establishment 833 | ties 834 | economic 835 | resume 836 | visited 837 | twice 838 | Last 839 | October 840 | invited 841 | attend 842 | referendum 843 | presidency 844 | extended 845 | years 846 | PRESS 847 | DIGEST 848 | 22 849 | These 850 | leading 851 | stories 852 | verified 853 | these 854 | does 855 | vouch 856 | accuracy 857 | democratic 858 | Turkish 859 | part 860 | fair 861 | November 862 | 12 863 | rice 864 | arrives 865 | port 866 | Lebanon 867 | Beirut 868 | threats 869 | serve 870 | Parliament 871 | Speaker 872 | preparing 873 | battle 874 | .. 875 | prepared 876 | law 877 | violation 878 | incidents 879 | occurred 880 | Financial 881 | Pakistan 882 | step 883 | election 884 | list 885 | violations 886 | live 887 | calls 888 | range 889 | mixed 890 | CHICAGO 891 | futures 892 | cent 893 | higher 894 | lower 895 | livestock 896 | analysts 897 | continued 898 | strong 899 | cash 900 | markets 901 | prompt 902 | bullish 903 | However 904 | likely 905 | prices 906 | evening 907 | ahead 908 | Cash 909 | record 910 | amount 911 | traded 912 | debt 913 | hit 914 | results 915 | Inc 916 | mean 917 | loss 918 | 1.2 919 | fiscal 920 | 1997 921 | first 922 | company 923 | began 924 | 1 925 | 30 926 | par 927 | value 928 | outstanding 929 | Philip 930 | financial 931 | officer 932 | child 933 | care 934 | offered 935 | opportunity 936 | reduce 937 | average 938 | interest 939 | costs 940 | improve 941 | future 942 | earnings 943 | RESEARCH 944 | ALERT 945 | starts 946 | analyst 947 | started 948 | Southern 949 | New 950 | England 951 | Corp 952 | outperform 953 | rating 954 | price 955 | target 956 | 45 957 | estimate 958 | per 959 | share 960 | immediately 961 | available 962 | closed 963 | Wall 964 | Street 965 | Data 966 | Q2 967 | net 968 | rises 969 | Summary 970 | In 971 | Thousands 972 | except 973 | data 974 | Six 975 | Jul 976 | 31 977 | Income 978 | Total 979 | Revenue 980 | Operating 981 | 599 982 | Net 983 | Jan 984 | Capital 985 | 93 986 | give 987 | backing 988 | Le 989 | Monde 990 | PARIS 991 | afternoon 992 | daily 993 | dated 994 | 23 995 | seeking 996 | residence 997 | rights 998 | say 999 | Alain 1000 | Juppe 1001 | proposals 1002 | strike 1003 | day 1004 | Paris 1005 | church 1006 | rally 1007 | 8,000 1008 | nationalist 1009 | truce 1010 | night 1011 | French 1012 | points 1013 | industry 1014 | competition 1015 | failure 1016 | keep 1017 | trends 1018 | Secretary 1019 | union 1020 | social 1021 | unrest 1022 | weeks 1023 | 42 1024 | 21 1025 | 53 1026 | 81 1027 | lift 1028 | oil 1029 | output 1030 | water 1031 | wells 
1032 | off 1033 | reopened 1034 | operator 1035 | AS 1036 | 30,000 1037 | barrels 1038 | bpd 1039 | according 1040 | problems 1041 | newsroom 1042 | 50 1043 | 41 1044 | April 1045 | surplus 1046 | 3.8 1047 | billion 1048 | markka 1049 | HELSINKI 1050 | Finland 1051 | Board 1052 | exports 1053 | Trade 1054 | balance 1055 | 96 1056 | 95 1057 | import 1058 | revised 1059 | export 1060 | 3.2 1061 | monthly 1062 | behind 1063 | customs 1064 | when 1065 | joined 1066 | start 1067 | 0 1068 | Dutch 1069 | sale 1070 | AMSTERDAM 1071 | Finance 1072 | raised 1073 | sales 1074 | September 1075 | being 1076 | GMT 1077 | guilders 1078 | close 1079 | Amsterdam 1080 | +31 1081 | 20 1082 | 504 1083 | 5000 1084 | BONN 1085 | Agriculture 1086 | animals 1087 | cleared 1088 | done 1089 | quickly 1090 | I 1091 | concrete 1092 | too 1093 | many 1094 | holes 1095 | know 1096 | filled 1097 | ensure 1098 | protection 1099 | dealing 1100 | erupted 1101 | use 1102 | suspect 1103 | experts 1104 | members 1105 | given 1106 | question 1107 | After 1108 | admitted 1109 | link 1110 | fatal 1111 | equivalent 1112 | worldwide 1113 | takes 1114 | GOLF 1115 | SCORES 1116 | AT 1117 | WORLD 1118 | SERIES 1119 | OF 1120 | AKRON 1121 | Ohio 1122 | Scores 1123 | NEC 1124 | World 1125 | Series 1126 | Golf 1127 | round 1128 | 70 1129 | course 1130 | players 1131 | unless 1132 | stated 1133 | 66 1134 | Paul 1135 | Billy 1136 | Mayfair 1137 | Japan 1138 | 68 1139 | Steve 1140 | 69 1141 | Justin 1142 | Mark 1143 | Brooks 1144 | Tim 1145 | Davis 1146 | Anders 1147 | Sweden 1148 | Nick 1149 | Phil 1150 | Mickelson 1151 | Greg 1152 | Norman 1153 | Australia 1154 | 71 1155 | Els 1156 | South 1157 | Africa 1158 | Scott 1159 | 72 1160 | Rose 1161 | Fred 1162 | Sven 1163 | Alexander 1164 | Tom 1165 | 73 1166 | Brad 1167 | Craig 1168 | Stewart 1169 | Stadler 1170 | 74 1171 | Costantino 1172 | Rocca 1173 | Italy 1174 | 75 1175 | Jim 1176 | 77 1177 | Wayne 1178 | 79 1179 | SOCCER 1180 | BEAT 1181 | 2-1 1182 | F.C. 1183 | Gloria 1184 | Bistrita 1185 | Romania 1186 | beat 1187 | halftime 1188 | 1-1 1189 | Valletta 1190 | Malta 1191 | Cup 1192 | winners 1193 | match 1194 | leg 1195 | preliminary 1196 | Scorers 1197 | La 1198 | Gilbert 1199 | 24th 1200 | Attendance 1201 | 4-2 1202 | aggregate 1203 | qualified 1204 | RACING 1205 | YORK 1206 | Sir 1207 | landed 1208 | victory 1209 | 25 1210 | chance 1211 | veteran 1212 | George 1213 | short 1214 | head 1215 | deny 1216 | trained 1217 | Henry 1218 | Chris 1219 | Prix 1220 | winner 1221 | finished 1222 | third 1223 | away 1224 | 7-4 1225 | Games 1226 | fourth 1227 | Royal 1228 | may 1229 | now 1230 | aimed 1231 | season 1232 | sprint 1233 | race 1234 | reluctant 1235 | go 1236 | result 1237 | never 1238 | so 1239 | thought 1240 | better 1241 | wait 1242 | bit 1243 | longer 1244 | am 1245 | daughter 1246 | Jack 1247 | gone 1248 | search 1249 | success 1250 | around 1251 | disappointed 1252 | feel 1253 | well 1254 | metres 1255 | just 1256 | RESULTS 1257 | Result 1258 | run 1259 | five 1260 | km 1261 | 1. 1262 | 2. 1263 | 3. 
1264 | Jason 1265 | Weaver 1266 | Eight 1267 | ran 1268 | owned 1269 | Park 1270 | sterling 1271 | TENNIS 1272 | TOSHIBA 1273 | CLASSIC 1274 | CARLSBAD 1275 | California 1276 | 1996-08-21 1277 | Results 1278 | 450,000 1279 | Toshiba 1280 | Classic 1281 | tennis 1282 | tournament 1283 | prefix 1284 | denotes 1285 | seeding 1286 | Second 1287 | Arantxa 1288 | Sanchez 1289 | Vicario 1290 | Spain 1291 | Naoko 1292 | 1-6 1293 | 6-4 1294 | 6-3 1295 | 4 1296 | Kimiko 1297 | Date 1298 | 6-2 1299 | 7-5 1300 | 7 1301 | 4-6 1302 | 8 1303 | Nathalie 1304 | Tauziat 1305 | Wang 1306 | CUP 1307 | York 1308 | Hamlet 1309 | Michael 1310 | Chang 1311 | Sergi 1312 | Bruguera 1313 | Joyce 1314 | 3 1315 | 3-6 1316 | Martin 1317 | Damm 1318 | Czech 1319 | Republic 1320 | 6 1321 | El 1322 | Morocco 1323 | 5-7 1324 | 3-0 1325 | retired 1326 | Karol 1327 | Kucera 1328 | Slovakia 1329 | 7-6 1330 | Kenny 1331 | spoke 1332 | leaving 1333 | club 1334 | premier 1335 | league 1336 | title 1337 | mutual 1338 | confessed 1339 | taking 1340 | division 1341 | local 1342 | newspaper 1343 | holiday 1344 | same 1345 | opinion 1346 | little 1347 | If 1348 | opened 1349 | my 1350 | stayed 1351 | way 1352 | let 1353 | get 1354 | job 1355 | then 1356 | past 1357 | 15 1358 | director 1359 | football 1360 | CRICKET 1361 | COUNTY 1362 | CHAMPIONSHIP 1363 | Close 1364 | play 1365 | scores 1366 | four-day 1367 | County 1368 | Championship 1369 | cricket 1370 | matches 1371 | Durham 1372 | 326 1373 | D. 1374 | S. 1375 | Campbell 1376 | ; 1377 | G. 1378 | Somerset 1379 | M. 1380 | 85 1381 | Colchester 1382 | Gloucestershire 1383 | 280 1384 | J. 1385 | Russell 1386 | 63 1387 | A. 1388 | 52 1389 | Essex 1390 | Cardiff 1391 | Kent 1392 | Walker 1393 | 59 1394 | v 1395 | Glamorgan 1396 | Leicester 1397 | Leicestershire 1398 | P. 1399 | 108 1400 | 67 1401 | Hampshire 1402 | Northampton 1403 | Sussex 1404 | N. 1405 | 145 1406 | V. 1407 | Wells 1408 | 51 1409 | Northamptonshire 1410 | Trent 1411 | Nottinghamshire 1412 | 107 1413 | Surrey 1414 | Warwickshire 1415 | Giles 1416 | 57 1417 | W. 1418 | Khan 1419 | Worcestershire 1420 | Headingley 1421 | Yorkshire 1422 | C. 1423 | White 1424 | Moxon 1425 | Lancashire 1426 | ENGLAND 1427 | V 1428 | PAKISTAN 1429 | FINAL 1430 | TEST 1431 | SCOREBOARD 1432 | Scoreboard 1433 | final 1434 | Oval 1435 | innings 1436 | Atherton 1437 | b 1438 | Waqar 1439 | Younis 1440 | Mushtaq 1441 | Ahmed 1442 | 44 1443 | Hussain 1444 | c 1445 | Saeed 1446 | Anwar 1447 | Thorpe 1448 | lbw 1449 | Mohammad 1450 | Akram 1451 | 54 1452 | Crawley 1453 | Knight 1454 | 17 1455 | Lewis 1456 | Wasim 1457 | Salisbury 1458 | Extras 1459 | wickets 1460 | 278 1461 | Fall 1462 | To 1463 | bat 1464 | R. 
1465 | Croft 1466 | Cork 1467 | Mullally 1468 | Bowling 1469 | date 1470 | Aamir 1471 | Sohail 1472 | Ijaz 1473 | Inzamam-ul-Haq 1474 | Salim 1475 | Malik 1476 | Asif 1477 | Mujtaba 1478 | Moin 1479 | Akam 1480 | IN 1481 | SCOTTISH 1482 | SQUAD 1483 | AFTER 1484 | Everton 1485 | Ferguson 1486 | scored 1487 | Manchester 1488 | picked 1489 | Scottish 1490 | squad 1491 | Rangers 1492 | striker 1493 | another 1494 | man 1495 | form 1496 | four 1497 | named 1498 | qualifier 1499 | Austria 1500 | Vienna 1501 | served 1502 | jail 1503 | Scotland 1504 | caps 1505 | December 1506 | 1994 1507 | Brown 1508 | 've 1509 | great 1510 | scoring 1511 | moment 1512 | successful 1513 | ON 1514 | THIRD 1515 | lunch 1516 | WITH 1517 | MANCHESTER 1518 | Ireland 1519 | midfielder 1520 | signed 1521 | champions 1522 | deal 1523 | game 1524 | Alex 1525 | CANADIAN 1526 | OPEN 1527 | TORONTO 1528 | Canadian 1529 | Open 1530 | Daniel 1531 | Nestor 1532 | Canada 1533 | Thomas 1534 | Muster 1535 | Mikael 1536 | Tillstrom 1537 | Goran 1538 | Ivanisevic 1539 | Croatia 1540 | 6-7 1541 | Ferreira 1542 | Marcelo 1543 | Rios 1544 | Chile 1545 | Kenneth 1546 | Denmark 1547 | MaliVai 1548 | Todd 1549 | 7-3 1550 | Philippoussis 1551 | Marc 1552 | Rosset 1553 | Switzerland 1554 | 8-6 1555 | 9 1556 | Cedric 1557 | Pioline 1558 | 7-1 1559 | Patrick 1560 | Rafter 1561 | 11 1562 | Alberto 1563 | 6-1 1564 | Petr 1565 | Korda 1566 | Francisco 1567 | Vacek 1568 | 13 1569 | Stoltenberg 1570 | Woodbridge 1571 | O'Brien 1572 | Black 1573 | Zimbabwe 1574 | 7-2 1575 | Bohdan 1576 | Ulihrach 1577 | Andrea 1578 | Henman 1579 | walkover 1580 | provincial 1581 | fast 1582 | professional 1583 | EUROPEAN 1584 | TIRANA 1585 | Winners 1586 | qualifying 1587 | soccer 1588 | Tirana 1589 | Chemlon 1590 | Humenne 1591 | 0-0 1592 | minute 1593 | 54th 1594 | 5,000 1595 | win 1596 | Chorzow 1597 | Ruch 1598 | Wales 1599 | 1-0 1600 | 47th 1601 | Larnaca 1602 | Cyprus 1603 | 2-0 1604 | 60th 1605 | penalty 1606 | 5-1 1607 | Lithuania 1608 | Sion 1609 | Nyva 1610 | Estonia 1611 | 3,000 1612 | Aggregate 1613 | score 1614 | 2-2 1615 | goals 1616 | rule 1617 | Brann 1618 | Norway 1619 | 10th 1620 | 5-2 1621 | Sofia 1622 | Bulgaria 1623 | Slovenia 1624 | 58th 1625 | 25,000 1626 | 4-3 1627 | penalties 1628 | Vaduz 1629 | Latvia 1630 | 90th 1631 | Luxembourg 1632 | Dynamo 1633 | Georgia 1634 | Prague 1635 | Sparta 1636 | Northern 1637 | 4-0 1638 | 26th 1639 | 19th 1640 | 80th 1641 | 86th 1642 | Hearts 1643 | Star 1644 | Belgrade 1645 | Yugoslavia 1646 | 59th 1647 | Hapoel 1648 | Moldova 1649 | Hungary 1650 | Add 1651 | 1,500 1652 | OUT 1653 | BUDAPEST 1654 | drew 1655 | tie 1656 | played 1657 | 4-1 1658 | 15th 1659 | Andreas 1660 | DE 1661 | Brazilian 1662 | championship 1663 | Atletico 1664 | SYDNEY 1665 | captain 1666 | Newcombe 1667 | resignation 1668 | Wimbledon 1669 | champion 1670 | coach 1671 | Tony 1672 | determined 1673 | events 1674 | lose 1675 | look 1676 | giving 1677 | someone 1678 | else 1679 | Sydney 1680 | world 1681 | Under 1682 | leadership 1683 | slipped 1684 | Since 1685 | doubles 1686 | partner 1687 | wins 1688 | losses 1689 | selected 1690 | semifinalist 1691 | Olympic 1692 | Croatian 1693 | best 1694 | described 1695 | faced 1696 | 1986 1697 | beaten 1698 | ago 1699 | Men 1700 | singles 1701 | 9/16 1702 | Korea 1703 | 15-7 1704 | Malaysia 1705 | Abdul 1706 | 3/4 1707 | Van 1708 | Netherlands 1709 | 15-11 1710 | Indonesia 1711 | 15-6 1712 | 15-8 1713 | 15-12 1714 | Women 1715 | 11-6 1716 | Sun 1717 | Liu 1718 | Zealand 1719 | 'S 1720 | DRAW 
1721 | NEW 1722 | draw 1723 | championships 1724 | beginning 1725 | U.S 1726 | Tennis 1727 | Centre 1728 | Pete 1729 | Sampras 1730 | vs. 1731 | Adrian 1732 | Magnus 1733 | vs 1734 | Qualifier 1735 | Andrei 1736 | Roberto 1737 | ------------------------ 1738 | Christian 1739 | Grant 1740 | Fernando 1741 | Brazil 1742 | Kafelnikov 1743 | Johansson 1744 | Medvedev 1745 | Fleurian 1746 | 14 1747 | Costa 1748 | Jonathan 1749 | Bernd 1750 | Stefan 1751 | Edberg 1752 | Richard 1753 | Krajicek 1754 | Andre 1755 | Agassi 1756 | Colombia 1757 | Carlos 1758 | Kevin 1759 | Kim 1760 | Nicolas 1761 | Ecuador 1762 | Alami 1763 | Enqvist 1764 | Stephane 1765 | Belgium 1766 | Bahamas 1767 | Stich 1768 | Adams 1769 | Javier 1770 | Argentina 1771 | Stefano 1772 | Venezuela 1773 | Jeff 1774 | BASEBALL 1775 | BALTIMORE 1776 | Baltimore 1777 | Orioles 1778 | Johnson 1779 | miss 1780 | Seattle 1781 | Mariners 1782 | hospital 1783 | treated 1784 | William 1785 | adding 1786 | released 1787 | Andy 1788 | manage 1789 | absence 1790 | Angels 1791 | Columbia 1792 | Hospital 1793 | blood 1794 | eight 1795 | seasons 1796 | Cincinnati 1797 | Reds 1798 | League 1799 | games 1800 | pull 1801 | within 1802 | Yankees 1803 | American 1804 | Division 1805 | MAJOR 1806 | LEAGUE 1807 | STANDINGS 1808 | WEDNESDAY 1809 | GAMES 1810 | Major 1811 | Baseball 1812 | standings 1813 | tabulate 1814 | lost 1815 | winning 1816 | percentage 1817 | AMERICAN 1818 | EASTERN 1819 | DIVISION 1820 | W 1821 | L 1822 | GB 1823 | 58 1824 | BOSTON 1825 | 64 1826 | .496 1827 | .457 1828 | DETROIT 1829 | 82 1830 | 28 1831 | CENTRAL 1832 | CLEVELAND 1833 | MINNESOTA 1834 | .500 1835 | MILWAUKEE 1836 | 60 1837 | .469 1838 | KANSAS 1839 | CITY 1840 | 18 1841 | WESTERN 1842 | TEXAS 1843 | SEATTLE 1844 | 61 1845 | OAKLAND 1846 | 62 1847 | CALIFORNIA 1848 | THURSDAY 1849 | AUGUST 1850 | SCHEDULE 1851 | ATLANTA 1852 | 46 1853 | MONTREAL 1854 | FLORIDA 1855 | PHILADELPHIA 1856 | HOUSTON 1857 | ST 1858 | LOUIS 1859 | .504 1860 | CINCINNATI 1861 | PITTSBURGH 1862 | SAN 1863 | DIEGO 1864 | .543 1865 | LOS 1866 | ANGELES 1867 | COLORADO 1868 | 65 1869 | FRANCISCO 1870 | CAPS 1871 | Chicago 1872 | Milwaukee 1873 | Oakland 1874 | Texas 1875 | Pittsburgh 1876 | St 1877 | Louis 1878 | Philadelphia 1879 | Montreal 1880 | two-run 1881 | homer 1882 | inning 1883 | rallied 1884 | Cleveland 1885 | Indians 1886 | rubber 1887 | three-game 1888 | series 1889 | With 1890 | pitch 1891 | tried 1892 | big 1893 | crowd 1894 | shot 1895 | Rodriguez 1896 | double 1897 | Johnny 1898 | Terry 1899 | Kennedy 1900 | 40 1901 | nine 1902 | meetings 1903 | Western 1904 | Field 1905 | teams 1906 | Central 1907 | extra 1908 | ninth 1909 | single 1910 | save 1911 | got 1912 | allowing 1913 | hits 1914 | walks 1915 | strikeouts 1916 | scoreless 1917 | Dean 1918 | 30th 1919 | Ripken 1920 | solo 1921 | Bobby 1922 | Bonilla 1923 | three-run 1924 | seventh 1925 | power 1926 | runs 1927 | fifth 1928 | bottom 1929 | 21st 1930 | starter 1931 | blast 1932 | Young 1933 | allowed 1934 | relief 1935 | RBI 1936 | cut 1937 | straight 1938 | homers 1939 | dropped 1940 | row 1941 | became 1942 | major-league 1943 | history 1944 | 34 1945 | scattered 1946 | debut 1947 | each 1948 | Derek 1949 | settled 1950 | Jimmy 1951 | Key 1952 | interim 1953 | Boston 1954 | Mike 1955 | Sox 1956 | Athletics 1957 | owns 1958 | career 1959 | bases 1960 | loaded 1961 | went 1962 | walk 1963 | stole 1964 | homered 1965 | drove 1966 | Detroit 1967 | capped 1968 | eighth 1969 | Tigers 1970 | consecutive 1971 | Kansas 1972 | 
City 1973 | Juan 1974 | Guzman 1975 | span 1976 | earned 1977 | Minnesota 1978 | five-run 1979 | Brewers 1980 | Twins 1981 | Jose 1982 | PSV 1983 | WIN 1984 | Cocu 1985 | Eindhoven 1986 | Nijmegen 1987 | kick 1988 | minutes 1989 | Arthur 1990 | Nilis 1991 | Ajax 1992 | defence 1993 | NAC 1994 | Breda 1995 | DUTCH 1996 | SUMMARY 1997 | 11th 1998 | Halftime 1999 | 1-2 2000 | RESULT 2001 | GENEVA 2002 | 26 2003 | recalled 2004 | Jorge 2005 | finals 2006 | clearly 2007 | progress 2008 | beyond 2009 | phase 2010 | Euro 2011 | Squad 2012 | Marco 2013 | Grasshoppers 2014 | Hamburg 2015 | Antonio 2016 | Stuttgart 2017 | Milan 2018 | Borussia 2019 | Dortmund 2020 | ATHLETICS 2021 | RECORD 2022 | 40,000 2023 | THE 2024 | Brussels 2025 | grand 2026 | prix 2027 | athletes 2028 | glass 2029 | Belgian 2030 | pay 2031 | organisers 2032 | rounds 2033 | ever 2034 | already 2035 | records 2036 | broken 2037 | women 2038 | 1,000 2039 | ROUND 2040 | Leading 2041 | golf 2042 | Broadhurst 2043 | Raymond 2044 | Ian 2045 | Woosnam 2046 | Roe 2047 | Carl 2048 | Stephen 2049 | Lawrie 2050 | Max 2051 | Carter 2052 | Lee 2053 | Miguel 2054 | Angel 2055 | Gary 2056 | Eales 2057 | Williams 2058 | Andrew 2059 | Robert 2060 | Marcus 2061 | Pedro 2062 | Linhart 2063 | Price 2064 | UEFA 2065 | additional 2066 | headed 2067 | rankings 2068 | account 2069 | factors 2070 | including 2071 | red 2072 | national 2073 | allocated 2074 | places 2075 | 4. 2076 | 5. 2077 | 6. 2078 | 7. 2079 | 8. 2080 | Belarus 2081 | 9. 2082 | 10. 2083 | 13. 2084 | 15. 2085 | 16. 2086 | Portugal 2087 | Greece 2088 | MATCH 2089 | COLOMBO 2090 | Armed 2091 | police 2092 | ground 2093 | tour 2094 | Sri 2095 | Lanka 2096 | youth 2097 | limited 2098 | overs 2099 | includes 2100 | India 2101 | promised 2102 | presence 2103 | policemen 2104 | making 2105 | ethnic 2106 | violence 2107 | balls 2108 | fours 2109 | made 2110 | SOFIA 2111 | One 2112 | Romanian 2113 | others 2114 | injured 2115 | bus 2116 | collided 2117 | Bulgarian 2118 | morning 2119 | road 2120 | towns 2121 | woman 2122 | Maria 2123 | 35 2124 | accident 2125 | OFFICIAL 2126 | OJ 2127 | * 2128 | Note 2129 | contents 2130 | displayed 2131 | order 2132 | printed 2133 | Journal 2134 | Regulation 2135 | invitation 2136 | tender 2137 | refunds 2138 | white 2139 | certain 2140 | payments 2141 | system 2142 | producers 2143 | values 2144 | END 2145 | Home 2146 | Health 2147 | appeal 2148 | District 2149 | Court 2150 | decision 2151 | reimbursement 2152 | previously 2153 | regarding 2154 | related 2155 | community 2156 | personnel 2157 | continues 2158 | believe 2159 | majority 2160 | terms 2161 | Medicare 2162 | program 2163 | resolution 2164 | recorded 2165 | reserve 2166 | equal 2167 | revenue 2168 | ruled 2169 | court 2170 | dispute 2171 | pleased 2172 | significant 2173 | toward 2174 | Newsdesk 2175 | div 2176 | distribution 2177 | rate 2178 | quarterly 2179 | improved 2180 | ended 2181 | declared 2182 | partnership 2183 | unit 2184 | Best 2185 | sees 2186 | Co 2187 | Chairman 2188 | Chief 2189 | retailer 2190 | annual 2191 | even 2192 | emerged 2193 | always 2194 | particularly 2195 | you 2196 | something 2197 | stores 2198 | fall 2199 | closing 2200 | plan 2201 | states 2202 | For 2203 | 29 2204 | 1996-08-23 2205 | researchers 2206 | Swedish 2207 | born 2208 | caught 2209 | developed 2210 | cases 2211 | School 2212 | delivered 2213 | University 2214 | 1949 2215 | Four 2216 | children 2217 | wrote 2218 | medical 2219 | sometimes 2220 | require 2221 | surgery 2222 | pain 2223 | weight 2224 | 
involved 2225 | especially 2226 | often 2227 | cause 2228 | Most 2229 | high 2230 | baby 2231 | All 2232 | key 2233 | industrial 2234 | Following 2235 | survey 2236 | Industry 2237 | AUG 2238 | book 2239 | stocks 2240 | goods 2241 | expectations 2242 | above 2243 | normal 2244 | below 2245 | companies 2246 | representing 2247 | employees 2248 | brokers 2249 | built 2250 | 1989 2251 | buyers 2252 | Some 2253 | subject 2254 | pills 2255 | cholesterol 2256 | finds 2257 | show 2258 | doctors 2259 | Oxford 2260 | people 2261 | benefit 2262 | garlic 2263 | levels 2264 | 900 2265 | groups 2266 | receiving 2267 | Several 2268 | pressure 2269 | either 2270 | special 2271 | trial 2272 | makes 2273 | address 2274 | whole 2275 | affect 2276 | gives 2277 | aid 2278 | Caribbean 2279 | 39 2280 | development 2281 | much 2282 | population 2283 | living 2284 | south 2285 | fled 2286 | times 2287 | Plymouth 2288 | north 2289 | provide 2290 | track 2291 | programme 2292 | area 2293 | active 2294 | recent 2295 | shown 2296 | remain 2297 | she 2298 | Bill 2299 | Grand 2300 | Slam 2301 | seed 2302 | Graf 2303 | aiming 2304 | able 2305 | major 2306 | begins 2307 | opens 2308 | crown 2309 | Basuki 2310 | ceremony 2311 | both 2312 | holders 2313 | rising 2314 | defeat 2315 | While 2316 | semifinal 2317 | star 2318 | tested 2319 | probably 2320 | repeat 2321 | landing 2322 | sixth 2323 | avoided 2324 | possibility 2325 | quarter-finals 2326 | ranked 2327 | Monica 2328 | Seles 2329 | Anne 2330 | Miller 2331 | victim 2332 | Austrian 2333 | seeded 2334 | faces 2335 | playing 2336 | yet 2337 | unfortunate 2338 | first-round 2339 | popular 2340 | affair 2341 | Frenchman 2342 | injury 2343 | runner-up 2344 | Anke 2345 | Huber 2346 | Conchita 2347 | Martinez 2348 | Lindsay 2349 | Davenport 2350 | looking 2351 | opponents 2352 | Amanda 2353 | Coetzer 2354 | young 2355 | Hingis 2356 | clash 2357 | Jana 2358 | Novotna 2359 | 61-2 2360 | 9373-1800 2361 | RTRS 2362 | worked 2363 | along 2364 | unseeded 2365 | Other 2366 | hour 2367 | 55 2368 | point 2369 | break 2370 | compatriot 2371 | 13th 2372 | knew 2373 | service 2374 | serving 2375 | really 2376 | chances 2377 | every 2378 | come 2379 | Playing 2380 | maybe 2381 | difference 2382 | deficit 2383 | surprise 2384 | 32 2385 | quick 2386 | arrived 2387 | And 2388 | soon 2389 | 'm 2390 | fought 2391 | hoped 2392 | tight 2393 | helped 2394 | Soccer 2395 | Korean 2396 | SEOUL 2397 | Anyang 2398 | Puchon 2399 | Suwon 2400 | Standings 2401 | drawn 2402 | D 2403 | G 2404 | F 2405 | P 2406 | Chonan 2407 | outbreak 2408 | kills 2409 | central 2410 | staff 2411 | appear 2412 | 160 2413 | miles 2414 | southeast 2415 | Nigerian 2416 | Liberia 2417 | commander 2418 | latest 2419 | civil 2420 | officers 2421 | dozen 2422 | accords 2423 | difficult 2424 | painful 2425 | Nations 2426 | observers 2427 | travelling 2428 | western 2429 | delayed 2430 | shooting 2431 | highway 2432 | Anthony 2433 | finally 2434 | faction 2435 | Saturday 2436 | breaking 2437 | disarmament 2438 | 10,000 2439 | Community 2440 | Guinea 2441 | prayer 2442 | repeatedly 2443 | dead 2444 | reason 2445 | clear 2446 | Organisation 2447 | Conference 2448 | prayers 2449 | army 2450 | ordered 2451 | crackdown 2452 | seized 2453 | death 2454 | 1993 2455 | bottle 2456 | JOHANNESBURG 2457 | boy 2458 | girl 2459 | whose 2460 | Nelson 2461 | old 2462 | prison 2463 | son 2464 | Island 2465 | winter 2466 | letter 2467 | ordinary 2468 | post 2469 | age 2470 | reply 2471 | Atlantic 2472 | Ocean 2473 | belonging 2474 | couple 2475 | 
front 2476 | garden 2477 | house 2478 | Johannesburg 2479 | His 2480 | body 2481 | parents 2482 | unclear 2483 | enough 2484 | becoming 2485 | CPI 2486 | m 2487 | Current 2488 | NBH 2489 | bln 2490 | % 2491 | Government 2492 | 1998 2493 | & 2494 | Thomson 2495 | Moody 2496 | Investors 2497 | Rating 2498 | Agency 2499 | flow 2500 | Budapest 2501 | 36 2502 | die 2503 | MOSCOW 2504 | least 2505 | separatist 2506 | Chechen 2507 | Grozny 2508 | Interfax 2509 | command 2510 | Chechnya 2511 | 200 2512 | interior 2513 | mission 2514 | confirmed 2515 | Lebed 2516 | chief-of-staff 2517 | Aslan 2518 | Maskhadov 2519 | agreement 2520 | noon 2521 | letters 2522 | threatening 2523 | explained 2524 | money 2525 | residents 2526 | lives 2527 | nearby 2528 | railway 2529 | station 2530 | school 2531 | books 2532 | case 2533 | poor 2534 | family 2535 | trouble 2536 | index 2537 | pts 2538 | sign 2539 | Boris 2540 | Yeltsin 2541 | security 2542 | renewed 2543 | document 2544 | negotiated 2545 | village 2546 | Itar-Tass 2547 | provided 2548 | aide 2549 | Press 2550 | completed 2551 | showed 2552 | nominee 2553 | returned 2554 | Kremlin 2555 | two-day 2556 | Bosnia 2557 | Sarajevo 2558 | Bosnian 2559 | federation 2560 | common 2561 | taxes 2562 | kuna 2563 | mark 2564 | currency 2565 | introduced 2566 | Serbian 2567 | mines 2568 | According 2569 | mine 2570 | briefly 2571 | overnight 2572 | planes 2573 | artillery 2574 | although 2575 | heard 2576 | speaking 2577 | flew 2578 | firing 2579 | anything 2580 | separatists 2581 | halt 2582 | threatened 2583 | bombing 2584 | assault 2585 | passengers 2586 | rescued 2587 | Colombian 2588 | coast 2589 | BOGOTA 2590 | Coast 2591 | boat 2592 | Pacific 2593 | missing 2594 | trip 2595 | fuel 2596 | sea 2597 | Argentine 2598 | BUENOS 2599 | AIRES 2600 | iron 2601 | production 2602 | cold 2603 | Buenos 2604 | Aires 2605 | Peru 2606 | kill 2607 | terrorist 2608 | sources 2609 | Maoist 2610 | small 2611 | northeast 2612 | propaganda 2613 | centre 2614 | By 2615 | 1992 2616 | activities 2617 | guerrilla 2618 | cost 2619 | damage 2620 | 1980 2621 | Former 2622 | Brunswijk 2623 | custody 2624 | charged 2625 | attempted 2626 | murder 2627 | turned 2628 | himself 2629 | Pinas 2630 | mining 2631 | 56 2632 | 90 2633 | east 2634 | showing 2635 | cuts 2636 | wife 2637 | charges 2638 | merely 2639 | less 2640 | regime 2641 | conflict 2642 | 500 2643 | caused 2644 | thousands 2645 | neighbouring 2646 | 1980s 2647 | eventually 2648 | Despite 2649 | businessman 2650 | interests 2651 | saw 2652 | leads 2653 | Thai 2654 | heroin 2655 | BANGKOK 2656 | Hong 2657 | Kong 2658 | arrested 2659 | kg 2660 | searched 2661 | Police 2662 | several 2663 | street 2664 | baht 2665 | Officials 2666 | detained 2667 | pending 2668 | formal 2669 | follows 2670 | nations 2671 | consulate 2672 | colony 2673 | sell 2674 | issues 2675 | include 2676 | plans 2677 | pact 2678 | Canberra 2679 | Tibet 2680 | exiled 2681 | conservative 2682 | Information 2683 | written 2684 | critics 2685 | self-rule 2686 | deals 2687 | anyone 2688 | legal 2689 | Amman 2690 | owner 2691 | copies 2692 | citizen 2693 | accord 2694 | On 2695 | confiscated 2696 | insisted 2697 | Authority 2698 | strategy 2699 | freedom 2700 | resulted 2701 | mistakes 2702 | explain 2703 | journalists 2704 | banned 2705 | sure 2706 | selling 2707 | think 2708 | ? 
2709 | Jewish 2710 | handed 2711 | parts 2712 | Istanbul 2713 | CAIRO 2714 | airport 2715 | fire 2716 | taxi 2717 | onto 2718 | line 2719 | Mohamed 2720 | conference 2721 | instead 2722 | yards 2723 | aviation 2724 | noted 2725 | pilot 2726 | Its 2727 | private 2728 | wants 2729 | nothing 2730 | Sudanese 2731 | Egyptian 2732 | militants 2733 | Mubarak 2734 | speech 2735 | Moslem 2736 | Security 2737 | Council 2738 | flights 2739 | Khartoum 2740 | incident 2741 | effect 2742 | Sudan 2743 | fails 2744 | cannot 2745 | Ethiopia 2746 | Front 2747 | far 2748 | relations 2749 | shares 2750 | shed 2751 | profit-taking 2752 | amid 2753 | volume 2754 | trillion 2755 | rise 2756 | profit 2757 | session 2758 | gained 2759 | Shares 2760 | Of 2761 | stable 2762 | Miss 2763 | Universe 2764 | Machado 2765 | Mexico 2766 | questions 2767 | claims 2768 | crash 2769 | 19 2770 | Los 2771 | Angeles 2772 | attended 2773 | USA 2774 | Inc. 2775 | drop 2776 | losing 2777 | 112 2778 | blocked 2779 | appearance 2780 | stage 2781 | She 2782 | stay 2783 | returning 2784 | Sept 2785 | alleged 2786 | putting 2787 | fine 2788 | happened 2789 | associated 2790 | routine 2791 | Kevorkian 2792 | suicide 2793 | bringing 2794 | suffering 2795 | multiple 2796 | emergency 2797 | room 2798 | Patricia 2799 | Smith 2800 | midday 2801 | attending 2802 | starting 2803 | doctor 2804 | lawyer 2805 | husband 2806 | father 2807 | James 2808 | Judith 2809 | Massachusetts 2810 | suffered 2811 | life 2812 | As 2813 | Sale 2814 | Amount 2815 | Municipal 2816 | Desk 2817 | 212-859-1650 2818 | jailed 2819 | sentenced 2820 | Lauck 2821 | base 2822 | Lincoln 2823 | abuse 2824 | struggle 2825 | network 2826 | material 2827 | judge 2828 | greatest 2829 | Nazi 2830 | slaughter 2831 | millions 2832 | complex 2833 | prosecutor 2834 | demanded 2835 | five-year 2836 | satisfied 2837 | sentence 2838 | illegal 2839 | laws 2840 | produce 2841 | magazines 2842 | Interior 2843 | biggest 2844 | Social 2845 | Democrats 2846 | suit 2847 | spent 2848 | nor 2849 | extradition 2850 | truth 2851 | actions 2852 | carried 2853 | Socialist 2854 | Workers 2855 | name 2856 | Battle 2857 | magazine 2858 | request 2859 | convicted 2860 | organisations 2861 | arrest 2862 | NATIONS 2863 | arrangements 2864 | quite 2865 | monitors 2866 | carry 2867 | Department 2868 | deputy 2869 | Algeria 2870 | TV 2871 | Algerian 2872 | suspected 2873 | killing 2874 | sought 2875 | Roman 2876 | Catholic 2877 | bomb 2878 | Algiers 2879 | 50,000 2880 | 110 2881 | foreigners 2882 | radical 2883 | Islamists 2884 | commanding 2885 | flown 2886 | table 2887 | shows 2888 | airports 2889 | association 2890 | Berlin 2891 | 202 2892 | Bremen 2893 | 4.4 2894 | Frankfurt 2895 | 1.5 2896 | 3.5 2897 | Cologne 2898 | Munich 2899 | TOTAL 2900 | Air 2901 | research 2902 | Not 2903 | forecast 2904 | performance 2905 | EPS 2906 | Dividend 2907 | Fax 2908 | Clinton 2909 | Ballybunion 2910 | fans 2911 | resort 2912 | Irish 2913 | America 2914 | Dick 2915 | Spring 2916 | convention 2917 | bring 2918 | addressed 2919 | packed 2920 | Dublin 2921 | us 2922 | Frank 2923 | process 2924 | Ischinger 2925 | seek 2926 | solution 2927 | Cooperation 2928 | OSCE 2929 | personal 2930 | threat 2931 | positive 2932 | responsible 2933 | global 2934 | nuclear 2935 | treaty 2936 | Delhi 2937 | intended 2938 | Gujral 2939 | block 2940 | Geneva 2941 | entering 2942 | signing 2943 | weapons 2944 | Treaty 2945 | bilateral 2946 | position 2947 | clause 2948 | Asked 2949 | Assembly 2950 | cross 2951 | fact 2952 | weapon 2953 | 
concerns 2954 | impossible 2955 | option 2956 | carrying 2957 | tests 2958 | accept 2959 | DHAKA 2960 | Dhaka 2961 | Bangladesh 2962 | importance 2963 | Mia 2964 | Commonwealth 2965 | Affairs 2966 | Liam 2967 | Fox 2968 | arriving 2969 | injuries 2970 | complained 2971 | wrist 2972 | act 2973 | seriously 2974 | reasons 2975 | important 2976 | leave 2977 | Nepal 2978 | governments 2979 | House 2980 | matter 2981 | investigation 2982 | outcome 2983 | connection 2984 | business 2985 | Kashmir 2986 | polls 2987 | planning 2988 | troubled 2989 | seems 2990 | 1987 2991 | direct 2992 | create 2993 | among 2994 | growing 2995 | independence 2996 | hopes 2997 | 20,000 2998 | Over 2999 | engineering 3000 | banks 3001 | edged 3002 | sharply 3003 | investors 3004 | Stock 3005 | Exchange 3006 | turnover 3007 | taka 3008 | remained 3009 | unchanged 3010 | recovered 3011 | edge 3012 | ratio 3013 | Reserve 3014 | bank 3015 | governor 3016 | Rangarajan 3017 | maintained 3018 | reduced 3019 | having 3020 | supply 3021 | increasing 3022 | stood 3023 | commitment 3024 | response 3025 | gross 3026 | product 3027 | GDP 3028 | real 3029 | Bombay 3030 | Mother 3031 | Teresa 3032 | Nobel 3033 | Prize 3034 | hope 3035 | love 3036 | nun 3037 | Albanian 3038 | need 3039 | missionary 3040 | intensive 3041 | Indian 3042 | heart 3043 | 86 3044 | condition 3045 | task 3046 | alone 3047 | Calcutta 3048 | grew 3049 | 80 3050 | highest 3051 | award 3052 | Her 3053 | Vatican 3054 | Missionaries 3055 | Charity 3056 | More 3057 | delegates 3058 | elect 3059 | Rome 3060 | receive 3061 | failing 3062 | stopped 3063 | Serbia 3064 | religious 3065 | begin 3066 | approved 3067 | training 3068 | missionaries 3069 | rest 3070 | 150 3071 | homes 3072 | destitute 3073 | founded 3074 | forecasts 3075 | boost 3076 | Bernard 3077 | News 3078 | Ltd 3079 | 1995/96 3080 | profits 3081 | film 3082 | Day 3083 | 1996/97 3084 | From 3085 | begun 3086 | motion 3087 | picture 3088 | orders 3089 | advertising 3090 | shareholders 3091 | A$ 3092 | US$ 3093 | cents 3094 | Analysts 3095 | disappointing 3096 | outlook 3097 | First 3098 | offset 3099 | operations 3100 | hard 3101 | operating 3102 | cover 3103 | Times 3104 | revenues 3105 | arm 3106 | operation 3107 | hurt 3108 | San 3109 | minimum 3110 | setting 3111 | pulled 3112 | 're 3113 | internal 3114 | declined 3115 | budget 3116 | realised 3117 | raising 3118 | pledged 3119 | effects 3120 | activity 3121 | generally 3122 | long-term 3123 | investment 3124 | NZ 3125 | lending 3126 | WELLINGTON 3127 | cutting 3128 | 10.5 3129 | Wellington 3130 | Power 3131 | approach 3132 | followed 3133 | equipment 3134 | hunt 3135 | Thailand 3136 | launched 3137 | Bangkok 3138 | drug 3139 | escape 3140 | confident 3141 | trafficking 3142 | window 3143 | climbed 3144 | bed 3145 | department 3146 | prevent 3147 | 266 3148 | Tokyo 3149 | parent 3150 | TOKYO 3151 | Year 3152 | billions 3153 | yen 3154 | specified 3155 | LATEST 3156 | ACTUAL 3157 | FORECAST 3158 | YEAR-AGO 3159 | Sales 3160 | 400 3161 | NOTE 3162 | HK$ 3163 | 43 3164 | mln 3165 | HONG 3166 | KONG 3167 | placed 3168 | basis 3169 | 852 3170 | applications 3171 | believes 3172 | respect 3173 | safety 3174 | maximum 3175 | 2000 3176 | JAKARTA 3177 | Indonesian 3178 | ** 3179 | Jakarta 3180 | involving 3181 | demonstrators 3182 | Dow 3183 | streak 3184 | beating 3185 | rupiah 3186 | trading 3187 | Super 3188 | offer 3189 | subsidiary 3190 | loans 3191 | management 3192 | build 3193 | property 3194 | projects 3195 | worth 3196 | Surabaya 3197 | stock 3198 | 
change 3199 | lows 3200 | Also 3201 | Japanese 3202 | pound 3203 | gold 3204 | closes 3205 | .... 3206 | Gold 3207 | AND 3208 | Nikkei 3209 | Dec 3210 | CAC-40 3211 | marks 3212 | post-Soviet 3213 | effectively 3214 | blow 3215 | Soviet 3216 | largest 3217 | initially 3218 | accompanied 3219 | signs 3220 | economy 3221 | Palace 3222 | reforms 3223 | doubt 3224 | replace 3225 | 33 3226 | postpone 3227 | NATO 3228 | policy 3229 | protest 3230 | Kiev 3231 | itself 3232 | eastern 3233 | strategic 3234 | though 3235 | push 3236 | membership 3237 | anniversary 3238 | factor 3239 | united 3240 | complete 3241 | example 3242 | Gazeta 3243 | published 3244 | language 3245 | barter 3246 | concerned 3247 | Volkova 3248 | Committee 3249 | issued 3250 | decree 3251 | situation 3252 | traders 3253 | 180 3254 | preferred 3255 | systems 3256 | various 3257 | +7095 3258 | 941 3259 | 8520 3260 | release 3261 | version 3262 | Goldman 3263 | Brian 3264 | De 3265 | criticised 3266 | batsman 3267 | delay 3268 | century 3269 | rain 3270 | improvement 3271 | weather 3272 | announce 3273 | event 3274 | unbeaten 3275 | bowled 3276 | MOTOR 3277 | BELGIAN 3278 | GRAND 3279 | PRIX 3280 | TIMES 3281 | Gerhard 3282 | Berger 3283 | Benetton 3284 | seconds 3285 | McLaren 3286 | Jacques 3287 | Villeneuve 3288 | Mika 3289 | Hakkinen 3290 | Jean 3291 | Alesi 3292 | Damon 3293 | Hill 3294 | Schumacher 3295 | 11. 3296 | Herbert 3297 | Sauber 3298 | 12. 3299 | Olivier 3300 | Ligier 3301 | [ 3302 | ] 3303 | Karina 3304 | Habsudova 3305 | Portsmouth 3306 | Queens 3307 | Tranmere 3308 | Grimsby 3309 | Stirling 3310 | Gooch 3311 | 389 3312 | K. 3313 | 310 3314 | 83 3315 | 181 3316 | 109 3317 | Fairbrother 3318 | Pollock 3319 | county 3320 | returns 3321 | 106 3322 | 116 3323 | wicket 3324 | 1-106 3325 | 4-95 3326 | Middlesbrough 3327 | Italian 3328 | seen 3329 | forward 3330 | fun 3331 | foot 3332 | Chelsea 3333 | reached 3334 | advanced 3335 | Americans 3336 | 2-6 3337 | getting 3338 | pretty 3339 | things 3340 | missed 3341 | felt 3342 | midnight 3343 | person 3344 | aggressive 3345 | affected 3346 | Tour 3347 | hitting 3348 | ball 3349 | lot 3350 | shots 3351 | mostly 3352 | RUGBY 3353 | UNION 3354 | SECOND 3355 | check 3356 | means 3357 | speculation 3358 | wing 3359 | shortly 3360 | seedings 3361 | 15-10 3362 | 15-9 3363 | Rubin 3364 | player 3365 | moved 3366 | Mary 3367 | McGrath 3368 | 47 3369 | .531 3370 | slam 3371 | pair 3372 | Brady 3373 | Anderson 3374 | walked 3375 | none 3376 | struck 3377 | Charlton 3378 | Ken 3379 | Edgar 3380 | advantage 3381 | heading 3382 | Roger 3383 | pinch-hitter 3384 | majors 3385 | pitched 3386 | 0-1 3387 | apiece 3388 | claimed 3389 | Thompson 3390 | route 3391 | Erik 3392 | Fernandez 3393 | LISBON 3394 | Sporting 3395 | Luis 3396 | 3-1 3397 | Although 3398 | 35th 3399 | restored 3400 | 38th 3401 | 57th 3402 | reigning 3403 | Porto 3404 | Benfica 3405 | Portuguese 3406 | Pauli 3407 | candidates 3408 | produced 3409 | Bundesliga 3410 | Schalke 3411 | 64th 3412 | thanks 3413 | Hansa 3414 | Rostock 3415 | clocked 3416 | SUMMARIES 3417 | Summaries 3418 | FRENCH 3419 | Nancy 3420 | Germain 3421 | 15,000 3422 | Svetlana 3423 | Masterkova 3424 | Zurich 3425 | lap 3426 | Mozambique 3427 | Mutola 3428 | stadium 3429 | pushed 3430 | capacity 3431 | Atlanta 3432 | plus 3433 | 100,000 3434 | tabulated 3435 | 129 3436 | Diego 3437 | Trinidad 3438 | Cea 3439 | Barry 3440 | Dennis 3441 | Mitchell 3442 | present 3443 | bright 3444 | Donovan 3445 | Bailey 3446 | Linford 3447 | Christie 3448 | 1988 3449 | 
sport 3450 | bronze 3451 | medallist 3452 | hurdles 3453 | Allen 3454 | Colin 3455 | Kingdom 3456 | seemed 3457 | finish 3458 | faster 3459 | speed 3460 | stepped 3461 | crashed 3462 | Michelle 3463 | Freeman 3464 | Cuban 3465 | Lopez 3466 | Seven 3467 | medal 3468 | settle 3469 | 11.00 3470 | Natalya 3471 | Irina 3472 | Jamaica 3473 | Cuba 3474 | Julie 3475 | Johan 3476 | Kenya 3477 | Nigeria 3478 | Jon 3479 | Sarah 3480 | Joseph 3481 | Uganda 3482 | Frankie 3483 | Fredericks 3484 | Namibia 3485 | Bob 3486 | jump 3487 | Edwards 3488 | Wellman 3489 | Burundi 3490 | Rwanda 3491 | Anton 3492 | UP 3493 | Paulo 3494 | Juventus 3495 | member 3496 | Santos 3497 | entire 3498 | friendly 3499 | Zabrze 3500 | nominated 3501 | Jens 3502 | however 3503 | defender 3504 | formally 3505 | 'll 3506 | themselves 3507 | Oliver 3508 | Klinsmann 3509 | cup 3510 | Legia 3511 | Warsaw 3512 | Brugge 3513 | SRI 3514 | LANKA 3515 | guilty 3516 | controversial 3517 | excellent 3518 | Colombo 3519 | Lankan 3520 | heavy 3521 | Healy 3522 | Angolan 3523 | Unita 3524 | joint 3525 | administration 3526 | installed 3527 | timetable 3528 | sending 3529 | estimates 3530 | MON 3531 | Gencor 3532 | YR 3533 | DIV 3534 | N 3535 | McCarthy 3536 | Group 3537 | falls 3538 | Shr 3539 | shr 3540 | Bureau 3541 | 416 3542 | discuss 3543 | fever 3544 | junior 3545 | changing 3546 | hands 3547 | developments 3548 | formed 3549 | Research 3550 | newsdesk 3551 | stake 3552 | WASHINGTON 3553 | 8.0 3554 | Securities 3555 | holding 3556 | holds 3557 | investments 3558 | large 3559 | reach 3560 | allow 3561 | operate 3562 | Miami 3563 | shift 3564 | weekly 3565 | add 3566 | status 3567 | services 3568 | Mideast 3569 | IOC 3570 | options 3571 | Samsung 3572 | Singapore 3573 | LG 3574 | Akron 3575 | worst 3576 | My 3577 | Maybe 3578 | your 3579 | field 3580 | Taibe 3581 | fields 3582 | Arabs 3583 | crowds 3584 | thing 3585 | promoted 3586 | Jerusalem 3587 | Haj 3588 | Yihye 3589 | treatment 3590 | driving 3591 | hearing 3592 | load 3593 | elsewhere 3594 | mayor 3595 | Football 3596 | Liverpool 3597 | Lynch 3598 | C 3599 | Rapid 3600 | Old 3601 | Trafford 3602 | Barcelona 3603 | suspects 3604 | Rwandan 3605 | Hutu 3606 | refugees 3607 | Zaire 3608 | Rally 3609 | Democracy 3610 | blown 3611 | Serb 3612 | casualties 3613 | find 3614 | Serbs 3615 | Yugoslav 3616 | minority 3617 | 200,000 3618 | reserves 3619 | assets 3620 | crime 3621 | WARSAW 3622 | organised 3623 | Kohl 3624 | sealed 3625 | links 3626 | materials 3627 | Chechens 3628 | fighters 3629 | soldiers 3630 | effective 3631 | relatively 3632 | roads 3633 | soldier 3634 | dark 3635 | try 3636 | BELGRADE 3637 | aboard 3638 | jet 3639 | diplomats 3640 | ends 3641 | powers 3642 | presidential 3643 | palace 3644 | different 3645 | sectors 3646 | elected 3647 | platform 3648 | nation 3649 | BRASILIA 3650 | bid 3651 | difficulties 3652 | huge 3653 | river 3654 | Tsang 3655 | Ali 3656 | Deputy 3657 | bread 3658 | riots 3659 | passed 3660 | Ibrahim 3661 | spying 3662 | comment 3663 | assistant 3664 | tension 3665 | King 3666 | Jordanian 3667 | secure 3668 | Islamist 3669 | houses 3670 | streets 3671 | communist 3672 | smaller 3673 | clashes 3674 | policies 3675 | blame 3676 | protests 3677 | Kurd 3678 | ANKARA 3679 | PKK 3680 | 12-year-old 3681 | autonomy 3682 | Davies 3683 | contacts 3684 | pursue 3685 | apparently 3686 | intent 3687 | sexual 3688 | a.m. 
3689 | girls 3690 | Virginia 3691 | entered 3692 | sexually 3693 | II 3694 | detective 3695 | allegedly 3696 | stabbed 3697 | once 3698 | identified 3699 | USDA 3700 | monitoring 3701 | Dan 3702 | corn 3703 | purchase 3704 | wheat 3705 | Republican 3706 | Gov 3707 | Weld 3708 | Senate 3709 | seat 3710 | incumbent 3711 | vote 3712 | facing 3713 | Cambridge 3714 | friend 3715 | Roosevelt 3716 | politics 3717 | friends 3718 | transport 3719 | Dm 3720 | 125 3721 | inch 3722 | mm 3723 | showers 3724 | isolated 3725 | Corporation 3726 | Tamil 3727 | directed 3728 | terrorism 3729 | finance 3730 | Liberation 3731 | king 3732 | Premier 3733 | giant 3734 | neighbours 3735 | distance 3736 | Queen 3737 | Nepali 3738 | Post 3739 | volatility 3740 | falling 3741 | trader 3742 | low 3743 | size 3744 | perhaps 3745 | You 3746 | makers 3747 | potential 3748 | yield 3749 | Bonds 3750 | Sachs 3751 | warrants 3752 | controls 3753 | DEM 3754 | X 3755 | challenge 3756 | Diana 3757 | Princess 3758 | criminal 3759 | Stenning 3760 | contest 3761 | everything 3762 | driver 3763 | rider 3764 | compensation 3765 | Prince 3766 | Charles 3767 | remove 3768 | asking 3769 | spring 3770 | effort 3771 | Necmettin 3772 | Erbakan 3773 | B 3774 | Turnover 3775 | listed 3776 | Index 3777 | editorial 3778 | Zenith 3779 | plant 3780 | set-top 3781 | boxes 3782 | gets 3783 | develop 3784 | addition 3785 | partners 3786 | Corp. 3787 | SBC 3788 | join 3789 | jobs 3790 | profitable 3791 | retail 3792 | Law 3793 | hundreds 3794 | comes 3795 | candidate 3796 | running 3797 | Many 3798 | involves 3799 | word 3800 | Windows 3801 | Microsoft 3802 | computer 3803 | fallen 3804 | software 3805 | launch 3806 | lines 3807 | customers 3808 | dollars 3809 | building 3810 | impact 3811 | lawsuit 3812 | tied 3813 | space 3814 | products 3815 | Office 3816 | handling 3817 | technical 3818 | corporate 3819 | adopted 3820 | 4.0 3821 | sweeping 3822 | slow 3823 | intercepted 3824 | everyone 3825 | hijacking 3826 | hijackers 3827 | hijacked 3828 | commercial 3829 | welfare 3830 | Wisconsin 3831 | administrative 3832 | Tommy 3833 | Human 3834 | reform 3835 | acquire 3836 | directly 3837 | limits 3838 | train 3839 | Alaska 3840 | boxcar 3841 | Phan 3842 | agents 3843 | Arkansas 3844 | churches 3845 | kms 3846 | located 3847 | investigating 3848 | idea 3849 | established 3850 | scene 3851 | St. 3852 | Turner 3853 | surrounded 3854 | cotton 3855 | rural 3856 | shared 3857 | suspicion 3858 | income 3859 | short-term 3860 | Calif. 
3861 | Systems 3862 | Business 3863 | 212 3864 | PTT 3865 | largely 3866 | view 3867 | ING 3868 | continuing 3869 | sound 3870 | weaker 3871 | bourse 3872 | prospects 3873 | first-half 3874 | ordering 3875 | poll 3876 | wounds 3877 | investigations 3878 | H1 3879 | traffic 3880 | freight 3881 | whales 3882 | lions 3883 | ca 3884 | article 3885 | model 3886 | using 3887 | Howard 3888 | warned 3889 | sharp 3890 | spending 3891 | Greens 3892 | Toyota 3893 | workers 3894 | 2,000 3895 | striking 3896 | voted 3897 | Melbourne 3898 | assembly 3899 | Niugini 3900 | surge 3901 | 38 3902 | considering 3903 | copper 3904 | project 3905 | students 3906 | gathered 3907 | staged 3908 | violent 3909 | university 3910 | intervention 3911 | supporting 3912 | nil 3913 | 48 3914 | Shanghai 3915 | stockpile 3916 | Just 3917 | metal 3918 | Export 3919 | CNIEC 3920 | Traders 3921 | tax 3922 | otherwise 3923 | spend 3924 | moving 3925 | exact 3926 | LME 3927 | behalf 3928 | tonne 3929 | secret 3930 | owners 3931 | posts 3932 | Company 3933 | Holdings 3934 | construction 3935 | exchange 3936 | acquisition 3937 | maker 3938 | yuan 3939 | Promodes 3940 | decide 3941 | Metro 3942 | discussions 3943 | 4221 3944 | highlights 3945 | fund 3946 | Taylor 3947 | council 3948 | yesterday 3949 | Merrill 3950 | 1996-08-24 3951 | truck 3952 | vehicle 3953 | engine 3954 | RALLYING 3955 | JYVASKYLA 3956 | Juha 3957 | Kankkunen 3958 | 37 3959 | stages 3960 | Lakes 3961 | prospect 3962 | Tommi 3963 | Makinen 3964 | Mitsubishi 3965 | fined 3966 | sports 3967 | Subaru 3968 | 6.5 3969 | boss 3970 | pace 3971 | Spaniard 3972 | kph 3973 | Ferrari 3974 | Eddie 3975 | Tyrrell 3976 | 14. 3977 | Did 3978 | RALLY 3979 | unnamed 3980 | suspended 3981 | suspension 3982 | +1 3983 | +3 3984 | SUPER 3985 | rugby 3986 | Bradford 3987 | Wigan 3988 | 78 3989 | Helens 3990 | Warrington 3991 | 555 3992 | Sheffield 3993 | Oldham 3994 | Leeds 3995 | 398 3996 | 325 3997 | TITLE 3998 | favour 3999 | removed 4000 | Graham 4001 | 88 4002 | Notts 4003 | successive 4004 | follow 4005 | Darren 4006 | Gough 4007 | 176 4008 | Final 4009 | Match 4010 | abandoned 4011 | 111 4012 | Irani 4013 | Raith 4014 | Dundee 4015 | Aberdeen 4016 | Hamilton 4017 | Ross 4018 | Played 4019 | Newcastle 4020 | Alan 4021 | header 4022 | Tottenham 4023 | Gianluca 4024 | Arsenal 4025 | Villa 4026 | Southampton 4027 | Bolton 4028 | Norwich 4029 | Bromwich 4030 | Crystal 4031 | Ipswich 4032 | Reading 4033 | Southend 4034 | Birmingham 4035 | Stoke 4036 | Swindon 4037 | Vale 4038 | Wolverhampton 4039 | Barnsley 4040 | Huddersfield 4041 | Bristol 4042 | Blackpool 4043 | Chesterfield 4044 | Preston 4045 | Rovers 4046 | Swansea 4047 | Doncaster 4048 | SOUTH 4049 | Blacks 4050 | tries 4051 | Wilson 4052 | replacement 4053 | Zinzan 4054 | Brooke 4055 | Simon 4056 | Partizan 4057 | Spartak 4058 | Krakow 4059 | Wroclaw 4060 | Odra 4061 | GKS 4062 | Polonia 4063 | Lodz 4064 | BASKETBALL 4065 | KOREAN 4066 | Hyundai 4067 | Haitai 4068 | Hanwha 4069 | OB 4070 | Lotte 4071 | Ssangbangwool 4072 | .527 4073 | .538 4074 | Heerenveen 4075 | Veldman 4076 | goalkeeper 4077 | defenders 4078 | strikers 4079 | Groningen 4080 | Antwerp 4081 | Charleroi 4082 | LEADING 4083 | Bastia 4084 | Marseille 4085 | Lille 4086 | Monaco 4087 | Smicer 4088 | Lens 4089 | Christopher 4090 | Guingamp 4091 | Nantes 4092 | Nice 4093 | Rennes 4094 | Bordeaux 4095 | Auxerre 4096 | Metz 4097 | Strasbourg 4098 | Havre 4099 | Caen 4100 | 70th 4101 | Lyon 4102 | Montpellier 4103 | Cannes 4104 | Graafschap 4105 | Doetinchem 4106 | 
RKC 4107 | Waalwijk 4108 | Willem 4109 | Tilburg 4110 | Fortuna 4111 | Sittard 4112 | Rotterdam 4113 | Twente 4114 | Enschede 4115 | Vitesse 4116 | Arnhem 4117 | Utrecht 4118 | Feyenoord 4119 | Roda 4120 | JC 4121 | Kerkrade 4122 | Volendam 4123 | AZ 4124 | Alkmaar 4125 | Bochum 4126 | Arminia 4127 | Bielefeld 4128 | Moenchengladbach 4129 | Karlsruhe 4130 | 33rd 4131 | Werder 4132 | 1860 4133 | Bayer 4134 | Leverkusen 4135 | Duesseldorf 4136 | Freiburg 4137 | VfB 4138 | Bayern 4139 | MSV 4140 | Duisburg 4141 | FC 4142 | Linz 4143 | SV 4144 | one-day 4145 | Waugh 4146 | Tendulkar 4147 | Frederic 4148 | ransom 4149 | tanks 4150 | civilians 4151 | whom 4152 | spokeswoman 4153 | role 4154 | Doctors 4155 | Arabia 4156 | About 4157 | climb 4158 | racing 4159 | mountain 4160 | Team 4161 | walking 4162 | servants 4163 | Public 4164 | dismissed 4165 | Labour 4166 | Florence 4167 | firms 4168 | demands 4169 | wage 4170 | Opposition 4171 | unions 4172 | Civil 4173 | inflation 4174 | camps 4175 | genocide 4176 | fear 4177 | counterpart 4178 | skull 4179 | scientist 4180 | forest 4181 | arrests 4182 | Eritrea 4183 | alliance 4184 | accuses 4185 | accusing 4186 | congress 4187 | turn 4188 | electoral 4189 | class 4190 | stance 4191 | consensus 4192 | Nicaraguan 4193 | Nicaragua 4194 | Chamorro 4195 | column 4196 | wo 4197 | split 4198 | widespread 4199 | activists 4200 | communications 4201 | Revolutionary 4202 | site 4203 | executed 4204 | lb 4205 | Movement 4206 | MDS 4207 | sentences 4208 | judicial 4209 | withdrew 4210 | camp 4211 | west 4212 | independent 4213 | allegations 4214 | grenades 4215 | intends 4216 | gathering 4217 | Dole 4218 | Congress 4219 | sex 4220 | offers 4221 | again 4222 | heads 4223 | drive 4224 | happen 4225 | bill 4226 | remarkable 4227 | offering 4228 | demanding 4229 | stronger 4230 | families 4231 | fellow 4232 | attention 4233 | Hurricane 4234 | Edouard 4235 | winds 4236 | ETA 4237 | Basque 4238 | Defence 4239 | mandate 4240 | Bossi 4241 | ancient 4242 | Corsica 4243 | talking 4244 | bodies 4245 | Stevanin 4246 | unable 4247 | investigators 4248 | paedophile 4249 | scandal 4250 | kidnapping 4251 | throughout 4252 | Dutroux 4253 | managed 4254 | freed 4255 | abduction 4256 | imprisonment 4257 | Bourlet 4258 | Melissa 4259 | kidnapped 4260 | Eefje 4261 | unknown 4262 | Bratislava 4263 | Five 4264 | Among 4265 | information 4266 | theft 4267 | accepted 4268 | hear 4269 | inquiry 4270 | Rights 4271 | cannabis 4272 | 7.5 4273 | container 4274 | MADRID 4275 | supported 4276 | neither 4277 | margin 4278 | Pakistani 4279 | inches 4280 | rail 4281 | witnesses 4282 | Army 4283 | Internet 4284 | Startup 4285 | Java 4286 | 1996-08-25 4287 | JavaSoft 4288 | venture 4289 | acting 4290 | experience 4291 | aims 4292 | individual 4293 | marketing 4294 | WINS 4295 | style 4296 | easy 4297 | feet 4298 | 12th 4299 | defending 4300 | 6-0 4301 | LAKES 4302 | Celica 4303 | Escort 4304 | Eriksson 4305 | Honda 4306 | Kawasaki 4307 | Yamaha 4308 | placings 4309 | GTR 4310 | laps 4311 | 170 4312 | 162 4313 | Fastest 4314 | drivers 4315 | Takeda 4316 | Haga 4317 | Yoshikawa 4318 | Corser 4319 | Ducati 4320 | Kocinski 4321 | Slight 4322 | Fogarty 4323 | Aoki 4324 | names 4325 | strongly 4326 | riders 4327 | reaching 4328 | 521-8 4329 | PREMIER 4330 | 87 4331 | mate 4332 | forcing 4333 | ONE-DAY 4334 | Headley 4335 | Togo 4336 | BUCHAREST 4337 | Bucharest 4338 | Dinamo 4339 | 49 4340 | Brisbane 4341 | Queensland 4342 | Auckland 4343 | 10-15 4344 | .470 4345 | reliever 4346 | contact 4347 | acquired 
4348 | 6-5 4349 | O'Neill 4350 | Cardinals 4351 | Padres 4352 | Braves 4353 | Cubs 4354 | Colorado 4355 | Pirates 4356 | Marlins 4357 | Dodgers 4358 | Phillies 4359 | Expos 4360 | shut 4361 | Giants 4362 | Houston 4363 | Astros 4364 | Shane 4365 | equaliser 4366 | lack 4367 | knee 4368 | MILAN 4369 | Batistuta 4370 | Fiorentina 4371 | serie 4372 | 83rd 4373 | ITALIAN 4374 | Norwegian 4375 | Swiss 4376 | Neuchatel 4377 | TOUR 4378 | Montgomerie 4379 | CYCLING 4380 | cycling 4381 | Ferrigato 4382 | Lance 4383 | Armstrong 4384 | narrow 4385 | Briton 4386 | 199 4387 | Baker 4388 | FROM 4389 | Ronald 4390 | Club 4391 | Arnold 4392 | Blind 4393 | Azharuddin 4394 | Singer 4395 | 1996-08-26 4396 | choice 4397 | riding 4398 | supporters 4399 | Rugby 4400 | apartheid 4401 | commission 4402 | paint 4403 | amnesty 4404 | tell 4405 | abuses 4406 | achieve 4407 | 1948 4408 | express 4409 | Doboj 4410 | Moslems 4411 | Dayton 4412 | refugee 4413 | thrown 4414 | warning 4415 | pullout 4416 | Mexican 4417 | MEXICO 4418 | leftist 4419 | EPR 4420 | Guerrero 4421 | Commander 4422 | protesters 4423 | Ten 4424 | northwest 4425 | dragged 4426 | linking 4427 | allied 4428 | supplies 4429 | reveal 4430 | finding 4431 | democracy 4432 | Kabariti 4433 | speaker 4434 | Weizman 4435 | speak 4436 | Earlier 4437 | cool 4438 | education 4439 | establish 4440 | parliamentary 4441 | voters 4442 | thin 4443 | stations 4444 | employee 4445 | skilled 4446 | contribute 4447 | Sandra 4448 | O'Neal 4449 | customer 4450 | agenda 4451 | ask 4452 | rated 4453 | decisions 4454 | insurance 4455 | fraud 4456 | Salang 4457 | Supreme 4458 | postponement 4459 | Hasina 4460 | Bogra 4461 | ruling 4462 | Nearly 4463 | drought 4464 | regions 4465 | diesel 4466 | grain 4467 | sector 4468 | transition 4469 | reputation 4470 | dictator 4471 | quality 4472 | Wheat 4473 | Institute 4474 | Celsius 4475 | rainfall 4476 | farms 4477 | acres 4478 | 220 4479 | badly 4480 | autumn 4481 | negative 4482 | attributable 4483 | Gilbertson 4484 | rand 4485 | substantial 4486 | shutdown 4487 | rains 4488 | merger 4489 | Plc 4490 | portfolio 4491 | Advanced 4492 | Medical 4493 | IVAC 4494 | infusion 4495 | therapy 4496 | IMED 4497 | plants 4498 | 120 4499 | expects 4500 | delivery 4501 | Morris 4502 | boosted 4503 | bonds 4504 | Treasury 4505 | secured 4506 | hat-trick 4507 | decisive 4508 | Sharpe 4509 | Botham 4510 | Middlesex 4511 | BY 4512 | FRIENDLY 4513 | Rica 4514 | Mauritius 4515 | crude 4516 | Gabon 4517 | legs 4518 | postponed 4519 | swimming 4520 | Popov 4521 | titles 4522 | medicine 4523 | Olympics 4524 | Slovak 4525 | Hungarian 4526 | Petra 4527 | SK 4528 | Leon 4529 | catch 4530 | Sao 4531 | Rio 4532 | usual 4533 | hot 4534 | Perry 4535 | Larry 4536 | Magdalena 4537 | Maleeva 4538 | cement 4539 | tough 4540 | Olsza 4541 | Barbara 4542 | ranking 4543 | mind 4544 | original 4545 | MONDAY 4546 | Stephanie 4547 | Oncins 4548 | 6- 4549 | Elena 4550 | 3-2 4551 | .466 4552 | Sosa 4553 | Boer 4554 | Madrid 4555 | ZIMBABWE 4556 | Strang 4557 | Whittall 4558 | Brandes 4559 | Hogg 4560 | Reiffel 4561 | Flower 4562 | filed 4563 | existing 4564 | ferry 4565 | boats 4566 | voting 4567 | municipal 4568 | irregularities 4569 | Election 4570 | consider 4571 | 600,000 4572 | Ambassador 4573 | cast 4574 | ballot 4575 | voter 4576 | High 4577 | appointed 4578 | compromise 4579 | settlement 4580 | insistence 4581 | quit 4582 | Federation 4583 | recover 4584 | Zastava 4585 | factory 4586 | wages 4587 | settlements 4588 | 5.5 4589 | controllers 4590 | Raichev 4591 | 
Tass 4592 | eye 4593 | Mostostal 4594 | premium 4595 | 2.6 4596 | JOSE 4597 | kidnappers 4598 | tourist 4599 | photographs 4600 | Rican 4601 | peso 4602 | limit 4603 | Dealers 4604 | Hashimoto 4605 | credit 4606 | auctions 4607 | gunman 4608 | 1996-08-27 4609 | gun 4610 | Petroleum 4611 | contracts 4612 | Spot 4613 | liquidity 4614 | deposed 4615 | Specter 4616 | resign 4617 | Airways 4618 | Airbus 4619 | surrender 4620 | asylum 4621 | dozens 4622 | grade 4623 | Midwest 4624 | 6.0 4625 | 859 4626 | tree 4627 | bounce 4628 | fairly 4629 | hog 4630 | sparked 4631 | Gore 4632 | Gingrich 4633 | Convention 4634 | re-election 4635 | coalition 4636 | justice 4637 | painted 4638 | Republicans 4639 | praised 4640 | Reeve 4641 | politicians 4642 | Gephardt 4643 | Reagan 4644 | legislature 4645 | abortion 4646 | nomination 4647 | gas 4648 | Oklahoma 4649 | handgun 4650 | Karachi 4651 | fob 4652 | ARA 4653 | sentiment 4654 | bids 4655 | frost 4656 | 5.0 4657 | moisture 4658 | 32.0 4659 | 12.0 4660 | Brush 4661 | suits 4662 | Salomon 4663 | margins 4664 | versus 4665 | CALL 4666 | CONFIRMED 4667 | Tapie 4668 | instructions 4669 | procedure 4670 | francs 4671 | Rabobank 4672 | epidemic 4673 | debate 4674 | Banharn 4675 | fix 4676 | unq 4677 | Ekeus 4678 | Tamils 4679 | LTTE 4680 | Tankan 4681 | franc 4682 | Iraqis 4683 | diverted 4684 | Stansted 4685 | explosives 4686 | larger 4687 | festival 4688 | Ata-ur-Rehman 4689 | Cozma 4690 | lei 4691 | tunnel 4692 | 12-15 4693 | TUESDAY 4694 | 85th 4695 | Cyprien 4696 | Moura 4697 | STAGE 4698 | NETHERLANDS 4699 | kilometre 4700 | Colonna 4701 | Mapei 4702 | McEwen 4703 | Koerts 4704 | Palmans 4705 | Teutenberg 4706 | Postal 4707 | Aki 4708 | Capiot 4709 | Collstrop 4710 | Neths 4711 | TVM 4712 | Wolf 4713 | Motorola 4714 | Cofinec 4715 | MFS 4716 | certainly 4717 | Faulding 4718 | N.J. 
4719 | Kelly 4720 | killer 4721 | SPLA 4722 | representatives 4723 | weakening 4724 | Iliescu 4725 | PUNR 4726 | oppose 4727 | Meri 4728 | Ruutel 4729 | votes 4730 | college 4731 | Reform 4732 | MPs 4733 | acts 4734 | Gajdos 4735 | Interpol 4736 | films 4737 | offices 4738 | victims 4739 | plantation 4740 | note 4741 | CVG 4742 | privatization 4743 | Banespa 4744 | Globo 4745 | Bamerindus 4746 | loan 4747 | Lanusse 4748 | coup 4749 | collision 4750 | Seoul 4751 | remote 4752 | mainland 4753 | humanitarian 4754 | understanding 4755 | Democrat 4756 | bushel 4757 | crop 4758 | Chg 4759 | .02 4760 | unc 4761 | --- 4762 | dn 4763 | HRW 4764 | ABC 4765 | Perot 4766 | bulk 4767 | barley 4768 | Vermont 4769 | medals 4770 | recovery 4771 | Rifkind 4772 | normally 4773 | Hamas 4774 | Rohrabacher 4775 | Tehran 4776 | rupees 4777 | Soyoil 4778 | refined 4779 | Yellow 4780 | n.a 4781 | Helibor 4782 | Barrick 4783 | Arequipa 4784 | Insurance 4785 | Commercial 4786 | AOL 4787 | provinces 4788 | Nasdaq 4789 | Latest 4790 | CDU 4791 | CSU 4792 | SPD 4793 | FDP 4794 | PDS 4795 | Emnid 4796 | Elect 4797 | Res 4798 | Heathrow 4799 | Gatwick 4800 | birds 4801 | Drew 4802 | commentary 4803 | Sakai 4804 | Kan 4805 | shipments 4806 | inventories 4807 | Commodities 4808 | publicly 4809 | SEC 4810 | rules 4811 | 1996-08-28 4812 | Saskatchewan 4813 | units 4814 | Heeswijk 4815 | Telekom 4816 | Boardman 4817 | Collinelli 4818 | 4,000 4819 | Fowler 4820 | 56th 4821 | Albert 4822 | Parma 4823 | Genoa 4824 | Pires 4825 | Foe 4826 | PSG 4827 | relay 4828 | Owens 4829 | Jayasuriya 4830 | Kaluwitharana 4831 | Silva 4832 | airliner 4833 | Arch 4834 | Leduc 4835 | Reef 4836 | Biogen 4837 | Berlex 4838 | Avonex 4839 | Betaseron 4840 | Oct 4841 | noise 4842 | exclusive 4843 | Sierra 4844 | dissidents 4845 | Ivorian 4846 | Bedie 4847 | ADRs 4848 | Gazprom 4849 | Rosati 4850 | quake 4851 | 1996-08-29 4852 | Nov 4853 | endorse 4854 | marijuana 4855 | SER 4856 | Amoco 4857 | Yemen 4858 | defends 4859 | Bhutto 4860 | Sharif 4861 | ACC 4862 | Palkhivala 4863 | pipes 4864 | fishermen 4865 | ND 4866 | Evert 4867 | BOJ 4868 | Lamm 4869 | Names 4870 | gestures 4871 | prosecutors 4872 | GPG 4873 | bush 4874 | Khmer 4875 | Ieng 4876 | Pol 4877 | Pot 4878 | Kubo 4879 | Sakigake 4880 | Takemura 4881 | Hatoyama 4882 | Banisadr 4883 | Ballanger 4884 | 1996-08-30 4885 | Grobbelaar 4886 | SEP 4887 | Kornblum 4888 | roubles 4889 | virus 4890 | ICAC 4891 | Yassin 4892 | A-rated 4893 | muscle 4894 | soybeans 4895 | jewelry 4896 | Simpson 4897 | surgeon 4898 | meal 4899 | Gluten 4900 | Gas 4901 | KV 4902 | Pro 4903 | Slough 4904 | Warburg 4905 | kerosene 4906 | cargoes 4907 | OM 4908 | PULPEX 4909 | pulp 4910 | pesetas 4911 | trend 4912 | Hope 4913 | Daewoo 4914 | Dacom 4915 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tensorflow==1.13.1 2 | -------------------------------------------------------------------------------- /run_experiments_multi.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*- coding: utf-8 -*- 3 | 4 | 5 | 6 | from train_MLBiNet import train 7 | 8 | 9 | if __name__ == "__main__": 10 | niter = 10 11 | for i in range(niter): 12 | train(seed_id=i) 13 | -------------------------------------------------------------------------------- /train_MLBiNet.py: 
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #-*- coding: utf-8 -*-
3 | 
4 | import os
5 | import time
6 | import json
7 | import random
8 | import numpy as np
9 | import tensorflow as tf
10 | 
11 | tf.flags.DEFINE_integer('encode_h', 100, 'dim of encoding layer')
12 | tf.flags.DEFINE_integer('decode_h', 200, 'dim of decoding layer')
13 | tf.flags.DEFINE_integer('tag_dim', 100, 'dimension of tags')
14 | tf.flags.DEFINE_integer('event_info_h', 100, 'hidden size of sentence level information aggregation layer')
15 | tf.flags.DEFINE_integer('batch_size', 64, 'batch size')
16 | tf.flags.DEFINE_integer('max_doc_len', 8, 'max number of sentences in a document')
17 | tf.flags.DEFINE_integer('max_seq_len', 50, 'maximum length of sequence')
18 | tf.flags.DEFINE_integer('num_tag_layers', 2, 'number of tagging layers')
19 | tf.flags.DEFINE_integer('reverse_seq', 1, 'reverse the sequence when aggregating next-sentence information')
20 | tf.flags.DEFINE_string('tagging_mechanism', "backward_decoder", 'decoder mechanism')
21 | tf.flags.DEFINE_integer('ner_dim_1', 20, 'embedding size of level-1 NER')
22 | tf.flags.DEFINE_integer('ner_dim_2', 20, 'embedding size of level-2 NER')
23 | tf.flags.DEFINE_integer('self_att_not', 1, 'self attention or not')
24 | tf.flags.DEFINE_integer('context_info', 1,
25 |                         '0: single sentence information, 1: information of two neighbor sentences')
26 | tf.flags.DEFINE_float('penalty_coef', 2e-5, 'penalty coefficient')
27 | tf.flags.DEFINE_float('event_vector_trans', 1, 'nonlinear transformation for the event vector')
28 | 
29 | tf.flags.DEFINE_integer('num_epochs', 50, 'number of training epochs')
30 | tf.flags.DEFINE_integer('eval_every_steps', 100, 'evaluate on dev/test every this many steps')
31 | tf.flags.DEFINE_integer('num_epochs_warm', 0, 'number of warm-start epochs')
32 | tf.flags.DEFINE_integer('nconsect_epoch', 3, 'early-stopping patience (consecutive evaluations without improvement)')
33 | tf.flags.DEFINE_float('weight_decay', 1, 'weight decay of each tagging layer')
34 | 
35 | tf.flags.DEFINE_float('warm_learning_rate', 1e-5, 'warm-up learning rate')
36 | tf.flags.DEFINE_float('learning_rate', 5e-4, 'learning rate')
37 | tf.flags.DEFINE_float('decay_rate', 0.99, 'decay rate')
38 | 
39 | tf.flags.DEFINE_float('dropout_rate', 0.5, 'dropout rate')
40 | tf.flags.DEFINE_float('grad_clip', 10, 'gradient clipping threshold to prevent exploding gradients')
41 | tf.flags.DEFINE_float('positive_weights', 1, 'weight for positive samples')
42 | 
43 | tf.flags.DEFINE_string('train_file', './data-ACE/example_new.train', 'train file')
44 | tf.flags.DEFINE_string('dev_file', './data-ACE/example_new.dev', 'dev file')
45 | tf.flags.DEFINE_string('test_file', './data-ACE/example_new.test', 'test file')
46 | tf.flags.DEFINE_string('embedding_file', './embedding/embeddings.txt', 'pretrained embedding file')
47 | tf.flags.DEFINE_integer('word_emb_dim', 100, 'word embedding size')
48 | 
49 | tf.flags.DEFINE_string('NER_dict_file', './dict/event_types.txt', 'event type dict file')
50 | tf.flags.DEFINE_string('ner_1_dict_file', './dict/ner_1.txt', 'level-1 ner dict file')
51 | tf.flags.DEFINE_string('ner_2_dict_file', './dict/ner_2.txt', 'level-2 ner dict file')
52 | 
53 | FLAGS = tf.flags.FLAGS
54 | 
55 | lower_case = False
56 | 
57 | os.environ["CUDA_VISIBLE_DEVICES"] = "0"
58 | config_gpu = tf.ConfigProto()
59 | config_gpu.gpu_options.per_process_gpu_memory_fraction = 0.6
60 | 
61 | 
62 | def train(seed_id=1):
63 |     # set seed
64 |     tf.set_random_seed(seed_id)
65 | 
66 |     from MLBiNet import MLBiNet
67 | 
68 |     from utils_init import load_ED_data
69 |     from utils_init import data_transformation_doc
70 |     from utils_init import batch_generation_doc
71 | 
72 |     from utils_init import load_vocab
73 |     from utils_init import load_pretrain
74 | 
75 |     from ace_model_evaluation import write_2_file, ace_pred_result_stat
76 | 
77 |     with tf.Graph().as_default() as g:
78 |         # loading the embedding matrix
79 |         embedding_matrix, vocab_words, vocab_2_id, id_2_vocab = load_pretrain(FLAGS.embedding_file,
80 |                                                                               FLAGS.word_emb_dim)
81 |         print('shape of embedding_matrix is:', np.asmatrix(embedding_matrix).shape)
82 | 
83 |         # load train, dev, test data
84 |         sents_train, ners_train, ner_vocab, ner_1_train, ner_2_train, doc_file_to_sents_train = \
85 |             load_ED_data(FLAGS.train_file, lower_case=lower_case)
86 | 
87 |         # load the vocab of event type
88 |         _, ED_2_id = load_vocab(FLAGS.NER_dict_file)
89 |         print("ED_2_id is:\t", ED_2_id)
90 | 
91 |         sents_dev, ners_dev, _, ner_1_dev, ner_2_dev, doc_file_to_sents_dev = \
92 |             load_ED_data(FLAGS.dev_file, lower_case=lower_case)
93 |         sents_test, ners_test, _, ner_1_test, ner_2_test, doc_file_to_sents_test = \
94 |             load_ED_data(FLAGS.test_file, lower_case=lower_case)
95 |         print("load_ner_data finished!")
96 |         print("doc_file_to_sents_test:\t", doc_file_to_sents_test)
97 | 
98 |         # load NER label
99 |         ner_vocab_1, ner_to_id_1 = load_vocab(FLAGS.ner_1_dict_file)
100 |         ner_vocab_2, ner_to_id_2 = load_vocab(FLAGS.ner_2_dict_file)
101 |         print("NER vocab loaded!")
102 | 
103 |         # encoding the train, dev, test data; the OOV token literal was lost in extraction, '<unk>' is assumed below
104 |         encode_train = data_transformation_doc(sents_train, ner_1_train, ner_2_train, ners_train,
105 |                                                vocab_2_id, ED_2_id, vocab_2_id['<unk>'], ner_to_id_1, ner_to_id_2)
106 |         encode_dev = data_transformation_doc(sents_dev, ner_1_dev, ner_2_dev, ners_dev,
107 |                                              vocab_2_id, ED_2_id, vocab_2_id['<unk>'], ner_to_id_1, ner_to_id_2)
108 |         encode_test = data_transformation_doc(sents_test, ner_1_test, ner_2_test, ners_test,
109 |                                               vocab_2_id, ED_2_id, vocab_2_id['<unk>'], ner_to_id_1, ner_to_id_2)
110 |         print("Document data transformation finished!")
111 | 
112 |         # batch generating
113 |         train_batches = batch_generation_doc(doc_file_to_sents_train, encode_train, FLAGS.batch_size, FLAGS.max_doc_len,
114 |                                              FLAGS.max_seq_len, vocab_2_id, ED_2_id, num_epoches=FLAGS.num_epochs)
115 |         dev_batches = batch_generation_doc(doc_file_to_sents_dev, encode_dev, FLAGS.batch_size, FLAGS.max_doc_len,
116 |                                            FLAGS.max_seq_len, vocab_2_id, ED_2_id, num_epoches=1)
117 |         test_batches = batch_generation_doc(doc_file_to_sents_test, encode_test, FLAGS.batch_size, FLAGS.max_doc_len,
118 |                                             FLAGS.max_seq_len, vocab_2_id, ED_2_id, num_epoches=1)
119 |         print("batch_generation_doc finished!")
120 | 
121 |         print('Begin model initialization!')
122 |         with tf.Session(config=config_gpu) as sess:
123 |             model = MLBiNet(
124 |                 encode_h = FLAGS.encode_h,
125 |                 decode_h = FLAGS.decode_h,
126 |                 tag_dim = FLAGS.tag_dim,
127 |                 event_info_h = FLAGS.event_info_h,
128 |                 word_emb_mat = np.array(embedding_matrix),
129 |                 batch_size = FLAGS.batch_size,
130 |                 max_doc_len = FLAGS.max_doc_len,
131 |                 max_seq_len = FLAGS.max_seq_len,
132 |                 id_O = ED_2_id['O'],
133 |                 num_tag_layers = FLAGS.num_tag_layers,
134 |                 weight_decay = FLAGS.weight_decay,
135 |                 reverse_seq = FLAGS.reverse_seq,
136 |                 class_size = len(ED_2_id),
137 |                 tagging_mechanism = FLAGS.tagging_mechanism,
138 |                 ner_size_1 = len(ner_to_id_1),
139 |                 ner_dim_1 = FLAGS.ner_dim_1,
140 |                 ner_size_2 = len(ner_to_id_2),
141 |                 ner_dim_2 = FLAGS.ner_dim_2,
142 |                 self_att_not = FLAGS.self_att_not,
143 |                 context_info = FLAGS.context_info,
144 |                 event_vector_trans = FLAGS.event_vector_trans
145 |             )
146 |             print('encoder-decoder model initialized!')
147 | 
148 |             loss_ed = model.loss
149 |             for tvarsi in tf.trainable_variables():
150 |                 if tvarsi.name != 'word_emb_mat:0':
151 |                     loss_ed += FLAGS.penalty_coef * tf.reduce_sum(tvarsi ** 2)
152 |                 else:
153 |                     print("\n\n{} is not penalized!\n\n".format(tvarsi))
154 | 
155 |             with tf.name_scope('accuracy'):
156 |                 label_pred_naive = model.label_pred  # note: points to the same tensor as label_pred below
157 |                 label_pred = model.label_pred
158 |                 label_true = model.label_true
159 |                 acc_cnt_naive = tf.reduce_sum(tf.cast(tf.equal(label_pred_naive, label_true), dtype=tf.float32))
160 |                 acc_cnt = tf.reduce_sum(tf.cast(tf.equal(label_pred, label_true), dtype=tf.float32))
161 |                 cnt_all = tf.reduce_sum(tf.cast(tf.greater(label_true, -1), dtype=tf.float32))
162 |                 acc_rate = acc_cnt / cnt_all
163 | 
164 |             valid_len_final = model.valid_len_list
165 | 
166 |             timestamp = str(int(time.time()))
167 |             out_dir = os.path.join('./runs', timestamp)
168 |             checkpoint_dir = os.path.join(out_dir, "checkpoints")
169 |             if not os.path.exists(checkpoint_dir):
170 |                 os.makedirs(checkpoint_dir)
171 | 
172 |             param_dict = FLAGS.flag_values_dict()
173 |             param_dict['lower_case'] = lower_case
174 | 
175 |             with open(os.path.join(checkpoint_dir, 'config.json'), "w") as f:
176 |                 f.write(json.dumps(param_dict, indent=2, ensure_ascii=False))
177 | 
178 |             tvars = tf.trainable_variables()
179 |             for kk, tvarsi in enumerate(tvars):
180 |                 print('The %d-th trainable variable is %s' % (kk, tvarsi))
181 | 
182 |             global_step = tf.Variable(0, trainable=False)
183 | 
184 |             learning_rate = tf.train.exponential_decay(learning_rate=FLAGS.learning_rate,
185 |                                                        global_step=global_step,
186 |                                                        decay_steps=len(train_batches) // int(FLAGS.num_epochs),
187 |                                                        decay_rate=FLAGS.decay_rate)
188 |             tvars_no_emb = [x for x in tvars if 'word_emb_mat' not in x.name]
189 |             opt_ed_NO_emb_sent = tf.train.AdamOptimizer(learning_rate)
190 | 
191 |             grads_trig_sent_NO_EMB, _ = tf.clip_by_global_norm(tf.gradients(loss_ed, tvars_no_emb), FLAGS.grad_clip)
192 |             grads_and_vars_trig_sent_NO_EMB = tuple(zip(grads_trig_sent_NO_EMB, tvars_no_emb))
193 |             train_ed_NO_emb = opt_ed_NO_emb_sent.apply_gradients(grads_and_vars_trig_sent_NO_EMB, global_step=global_step)
194 |             sess.run(tf.global_variables_initializer())
195 | 
196 |             def train_step(train_batch, epoch):
197 |                 positive_weights = FLAGS.positive_weights
198 | 
199 |                 feed_dict = {
200 |                     model.dropout_rate: FLAGS.dropout_rate,
201 |                     model.input_docs: np.array(train_batch[0]),
202 |                     model.ner_docs_1: np.array(train_batch[1]),
203 |                     model.ner_docs_2: np.array(train_batch[2]),
204 |                     model.input_label_docs: np.array(train_batch[3]),
205 |                     model.valid_batch: train_batch[4],
206 |                     model.valid_sent_len: np.array(train_batch[5]),
207 |                     model.valid_words_len: np.array(train_batch[6]),
208 |                     model.positive_weights: positive_weights
209 |                 }
210 | 
211 |                 _, loss_trigger_tmp, acc_rate_tmp, step_curr = sess.run([train_ed_NO_emb, loss_ed, acc_rate, global_step],
212 |                                                                         feed_dict)
213 |                 return loss_trigger_tmp, step_curr, acc_rate_tmp
214 | 
215 | 
216 |             def dev_test_step(dev_batches):
217 |                 def dev_one_batch(dev_batch):
218 |                     feed_dict = {
219 |                         model.dropout_rate: 0,
220 |                         model.input_docs: np.array(dev_batch[0]),
221 |                         model.ner_docs_1: np.array(dev_batch[1]),
222 |                         model.ner_docs_2: np.array(dev_batch[2]),
223 |                         model.input_label_docs: np.array(dev_batch[3]),
224 |                         model.valid_batch: dev_batch[4],
225 |                         model.valid_sent_len: np.array(dev_batch[5]),
226 |                         model.valid_words_len: np.array(dev_batch[6]),
227 |                         model.positive_weights: 1.0
228 |                     }
229 |                     acc_cnt_tmp, cnt_all_tmp, acc_cnt_naive_tmp, valid_len_tmp,\
230 |                         label_pred_tmp, label_pred_naive_tmp, label_true_tmp, final_words_id_tmp, loss_tmp \
231 |                         = sess.run([acc_cnt, cnt_all, acc_cnt_naive, valid_len_final, label_pred, label_pred_naive,
232 |                                     label_true, model.final_words_id, loss_ed], feed_dict)
233 |                     return acc_cnt_tmp, cnt_all_tmp, acc_cnt_naive_tmp, valid_len_tmp, label_pred_tmp, \
234 |                            label_pred_naive_tmp, label_true_tmp, final_words_id_tmp, loss_tmp
235 | 
236 |                 acc_cnt_list, cnt_all_list = [], []
237 |                 acc_cnt_naive_list, cnt_all_naive_list = [], []
238 |                 label_pred_list, label_pred_naive_list, label_true_list = [], [], []
239 |                 valid_len_list = []
240 |                 words_sents = []
241 |                 loss_dev_test = 0
242 |                 len_seq_all = 0
243 |                 for dev_batchi in dev_batches:
244 |                     acc_cnt_tmp, cnt_all_tmp, acc_cnt_naive_tmp, valid_len_tmp,\
245 |                         label_pred_tmp, label_pred_naive_tmp, label_true_tmp, final_words_id_tmp, loss_tmp_i\
246 |                         = dev_one_batch(dev_batchi)
247 |                     acc_cnt_list.append(acc_cnt_tmp)
248 |                     cnt_all_list.append(cnt_all_tmp)
249 |                     acc_cnt_naive_list.append(acc_cnt_naive_tmp)
250 |                     label_pred_list.extend(label_pred_tmp)
251 |                     label_pred_naive_list.extend(label_pred_naive_tmp)
252 |                     label_true_list.extend(label_true_tmp)
253 |                     valid_len_list.extend(valid_len_tmp)
254 |                     words_sents.extend(final_words_id_tmp)
255 |                     loss_dev_test += loss_tmp_i * len(label_pred_naive_tmp)
256 |                     len_seq_all += len(label_pred_naive_tmp)
257 |                 loss_dev_test = loss_dev_test / (len_seq_all + 1e-8)
258 | 
259 |                 prec_dev = sum(acc_cnt_list) / sum(cnt_all_list)
260 |                 prec_dev_naive = sum(acc_cnt_naive_list) / sum(cnt_all_list)
261 |                 return prec_dev, prec_dev_naive, words_sents, label_pred_list,\
262 |                        label_true_list, valid_len_list, loss_dev_test
263 | 
264 |             print('Total train batch is:\t', len(train_batches), flush=True)
265 | 
266 |             prec_test_best = 0
267 |             loss_dev_best = 10000
268 |             loss_dev_second = 10000
269 |             loss_dev_list = []
270 |             nconsect = 0
271 |             print("total train steps:\t", len(train_batches))
272 |             for i, train_batchi in enumerate(train_batches):
273 |                 epoch = i // FLAGS.eval_every_steps  # counts evaluation rounds rather than true passes over the data
274 |                 loss_trigger_tmp, step_curr, acc_rate_tmp = train_step(train_batchi, epoch)
275 |                 if i % 10 == 0:
276 |                     print('epoch {}, step: {}, loss: {}, acc_rate: {}'.format(
277 |                         epoch, step_curr, loss_trigger_tmp, acc_rate_tmp), flush=True)
278 | 
279 |                 if i % FLAGS.eval_every_steps == 0 or i == len(train_batches) - 1:
280 |                     prec_dev, prec_dev_naive, words_sents, label_pred_list,\
281 |                         label_true_list, valid_len_list, loss_dev_ = dev_test_step(dev_batches)
282 |                     print('epoch {} prec_dev is: \n'.format(epoch), prec_dev, flush=True)
283 |                     if epoch == 0:
284 |                         os.makedirs(os.path.join(checkpoint_dir, 'dev'))
285 |                     filename_dev = os.path.join(checkpoint_dir, 'dev/test_result_{}.txt').format(step_curr)
286 |                     write_2_file(filename_dev, ED_2_id, label_true_list, valid_len_list,
287 |                                  words_sents, label_pred_list, id_2_vocab)
288 |                     prec_event_dev, recall_event_dev, f1_event_dev = ace_pred_result_stat(filename_dev)
289 |                     print('epoch: {}, loss_dev_: {}'.format(epoch, loss_dev_), flush=True)
290 |                     print('epoch: {}, prec_event_dev: {}, recall_event_dev: {}, f1_event_dev: {}'.format(
291 |                         epoch, prec_event_dev, recall_event_dev, f1_event_dev), flush=True)
292 | 
293 |                     loss_dev_list.append(loss_dev_)
294 |                     loss_dev_list = sorted(loss_dev_list)
295 |                     if len(loss_dev_list) > 2:
296 |                         loss_dev_second = loss_dev_list[2]  # third-smallest dev loss, used as the early-stopping threshold
297 |                     if loss_dev_ > loss_dev_best:
298 |                         if loss_dev_ > loss_dev_second:
299 |                             nconsect += 1
300 |                         else:
301 |                             nconsect = 0
302 |                     else:
303 |                         nconsect = 0
304 |                         loss_dev_best = loss_dev_
305 | 
306 |                     print('\n')
307 |                     prec_test, prec_test_naive, words_sents, label_pred_list,\
308 |                         label_true_list, valid_len_list, loss_test_ = dev_test_step(test_batches)
309 |                     print('epoch {} prec_test is: \n'.format(epoch), prec_test, flush=True)
310 |                     print('\n')
311 |                     # write to file
312 |                     if epoch == 0:
313 |                         os.makedirs(os.path.join(checkpoint_dir, 'test'))
314 |                     filename_test = os.path.join(checkpoint_dir, 'test/test_result_{}.txt').format(step_curr)
315 |                     write_2_file(filename_test, ED_2_id, label_true_list, valid_len_list,
316 |                                  words_sents, label_pred_list, id_2_vocab)
317 |                     prec_event_test, recall_event_test, f1_event_test = ace_pred_result_stat(filename_test)
318 |                     print('epoch: {}, prec_event_test: {}, recall_event_test: {}, f1_event_test: {}'.format(
319 |                         epoch, prec_event_test, recall_event_test, f1_event_test), flush=True)
320 | 
321 |                     if prec_test_best < f1_event_test:
322 |                         prec_test_best = f1_event_test
323 | 
324 |                     print('The best dev loss value is:\t', [loss_dev_best, nconsect])
325 |                     # print('The best dev f1 value is:\t', [prec_dev_best,nconsect])
326 |                     print('The best test f1 value is:\t', prec_test_best)
327 |                     with open(os.path.join(checkpoint_dir, 'test_result.txt'), encoding='utf-8', mode='a') as f:
328 |                         f.write('\t'.join([str(epoch), str(prec_event_test), str(recall_event_test),
329 |                                            str(f1_event_test), str(loss_dev_best), str(loss_dev_second), str(nconsect)]) + '\n')
330 | 
331 |                     if nconsect >= FLAGS.nconsect_epoch:
332 |                         break
333 |     tf.reset_default_graph()
334 | 
335 | if __name__ == "__main__":
336 |     # train()
337 |     pass
--------------------------------------------------------------------------------
/utils_init.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #-*- coding: utf-8 -*-
3 | 
4 | 
5 | import random
6 | import numpy as np
7 | 
8 | 
9 | def load_vocab(filename):
10 |     vocab = []
11 |     with open(filename, encoding='utf-8', mode='r') as f:
12 |         for line in f:
13 |             vocab.append(line.strip())
14 |     vocab_to_id = {u: i for i, u in enumerate(vocab)}
15 |     return vocab, vocab_to_id
16 | 
17 | 
18 | def load_pretrain(glove_file, word_emb_dim):
19 |     embedding_matrix, vocab = [], []
20 |     with open(glove_file, encoding='utf-8', mode='r') as f:
21 |         for i, line in enumerate(f):
22 |             if i % 1e5 == 0:
23 |                 print('Current index is %d' % i)
24 |             try:
25 |                 line_split = line.strip().split()
26 |                 if len(line_split) == word_emb_dim + 1:
27 |                     # if line_split[0] in vocab_set:
28 |                     vocab.append(line_split[0])
29 |                     embedding_matrix.append([float(x) for x in line_split[1:]])
30 |             except:  # skip malformed embedding lines
31 |                 pass
32 |     vocab_to_id = {u: i for i, u in enumerate(vocab)}
33 |     id_to_vocab = {v: u for u, v in vocab_to_id.items()}
34 |     return embedding_matrix, vocab, vocab_to_id, id_to_vocab
35 | 
36 | 
37 | def load_ED_data(filename, lower_case=False):
38 |     """
39 |     load event-detection data: sentences with their word-level event labels, NER labels and doc-to-sentence mapping
40 |     """
41 |     sents_all = []
42 |     ners_all = []
43 |     ner_1 = []
44 |     ner_2 = []
45 |     sent_tmp = []
46 |     ner_tmp = []
47 |     ner_1_tmp = []
48 |     ner_2_tmp = []
49 |     ner_vocab = set()
50 |     doc_file_to_sents = {}
51 |     with open(filename, encoding='utf-8', mode='r') as f:
52 |         w_last = ''
53 |         for line in f:
54 |             line = line.strip()
55 |             line_split = line.split(' ')
56 |             if len(line_split) == 5:
57 |                 doc_file = line_split[1]
58 |                 if lower_case:
59 |                     line_split[0] = line_split[0].lower()
60 |                 sent_tmp.append(line_split[0])
61 | 
62 |                 ner_tmp.append(line_split[-1])
63 |                 ner_vocab.add(line_split[-1])
64 |                 ner_1_tmp_tmp = line_split[2]
65 |                 ner_1_tmp_tmp = ner_1_tmp_tmp
66 |                 ner_1_tmp.append(ner_1_tmp_tmp)
67 |                 ner_2_tmp_tmp = line_split[3]
68 |                 ner_2_tmp_tmp = ner_2_tmp_tmp
69 |                 ner_2_tmp.append(ner_2_tmp_tmp)
70 |             else:
71 |                 if len(sent_tmp):
72 |                     sents_all.append(sent_tmp)
73 |                     ners_all.append(ner_tmp)
74 |                     ner_1.append(ner_1_tmp)
75 |                     ner_2.append(ner_2_tmp)
76 |                     sent_tmp = []
77 |                     ner_tmp = []
78 |                     ner_1_tmp = []
79 |                     ner_2_tmp = []
80 |                     if doc_file not in doc_file_to_sents:
81 |                         doc_file_to_sents[doc_file] = [len(sents_all) - 1]
82 |                     else:
83 |                         doc_file_to_sents[doc_file] += [len(sents_all) - 1]
84 |             w_last = line_split[0]
85 |         if len(sent_tmp) > 0:
86 |             sents_all.append(sent_tmp)
87 |             ners_all.append(ner_tmp)
88 |             ner_1.append(ner_1_tmp)
89 |             ner_2.append(ner_2_tmp)
90 |             if doc_file not in doc_file_to_sents:
91 |                 doc_file_to_sents[doc_file] = [len(sents_all) - 1]
92 |             else:
93 |                 doc_file_to_sents[doc_file] += [len(sents_all) - 1]
94 |     return sents_all, ners_all, ner_vocab, ner_1, ner_2, doc_file_to_sents
95 | 
96 | 
97 | def data_transformation_doc(sents_list, ner_1_list, ner_2_list, ner_list, vocab_2_id, ner_2_id, word_unk_id, ner_to_id_1, ner_to_id_2):
98 |     """
99 |     transform the raw tokens and labels into integer ids
100 |     """
101 |     encode_res = []
102 |     for i, senti in enumerate(sents_list):
103 |         neri = ner_list[i]
104 |         ner_1_i = ner_1_list[i]
105 |         ner_2_i = ner_2_list[i]
106 |         ner_tmp = []
107 |         sent_tmp = []
108 |         ner_1_tmp = []
109 |         ner_2_tmp = []
110 |         for k, wordk in enumerate(senti):
111 |             nerk = neri[k]
112 |             try:
113 |                 sent_tmp.append(vocab_2_id[wordk])
114 |             except:  # out-of-vocabulary word
115 |                 sent_tmp.append(word_unk_id)
116 |             ner_tmp.append(ner_2_id[nerk])
117 |             ner_1_tmp.append(ner_to_id_1[ner_1_i[k]])
118 |             ner_2_tmp.append(ner_to_id_2[ner_2_i[k]])
119 |         encode_res.append([sent_tmp, ner_1_tmp, ner_2_tmp, ner_tmp])
120 |     return encode_res
121 | 
122 | 
123 | def batch_generation_doc(doc_to_sents, enc_list, batch_size, max_doc_len, max_seq_len, vocab_2_id, ner_2_id, num_epoches=1):
124 |     # padding and trimming
125 |     ner_pad = ner_2_id['O']
126 |     word_pad = vocab_2_id['<unk>']  # pad token literal was lost in extraction; '<unk>' assumed
127 |     valid_len_list = []
128 |     for i, linei in enumerate(enc_list):
129 |         senti = linei[0]
130 |         ner_1_i = linei[1]
131 |         ner_2_i = linei[2]
132 |         neri = linei[3]
133 |         valid_len_list.append(min(len(senti), max_seq_len))
134 |         senti = senti[:max_seq_len]
135 |         senti = senti + [word_pad] * max(0, max_seq_len - len(senti))
136 |         neri = neri[:max_seq_len]
137 |         neri = neri + [ner_pad] * max(0, max_seq_len - len(neri))
138 |         ner_1_i = ner_1_i[:max_seq_len]
139 |         ner_1_i = ner_1_i + [0] * max(0, max_seq_len - len(ner_1_i))
140 |         ner_2_i = ner_2_i[:max_seq_len]
141 |         ner_2_i = ner_2_i + [0] * max(0, max_seq_len - len(ner_2_i))
142 |         enc_list[i] = [senti, ner_1_i, ner_2_i, neri]
143 | 
144 |     docs_all = []
145 |     for kk, dockk in enumerate(list(doc_to_sents.keys())):
146 |         sent_ids = doc_to_sents[dockk]
147 |         if len(sent_ids) <= max_doc_len:
148 |             sent_all = []
149 |             ner_1_all = []
150 |             ner_2_all = []
151 |             ner_all = []
152 |             valid_sents = len(sent_ids)
153 |             valid_words = []
154 |             for idi in sent_ids:
155 |                 sent_all.append(enc_list[idi][0])
156 |                 ner_1_all.append(enc_list[idi][1])
157 |                 ner_2_all.append(enc_list[idi][2])
158 |                 ner_all.append(enc_list[idi][3])
159 |                 valid_words.append(valid_len_list[idi])
160 |             for _ in range(max_doc_len - valid_sents):  # pad the document by repeating its last sentence
161 |                 sent_all.append(enc_list[idi][0])
162 |                 ner_1_all.append(enc_list[idi][1])
163 |                 ner_2_all.append(enc_list[idi][2])
164 |                 ner_all.append(enc_list[idi][3])
165 |                 valid_words.append(valid_len_list[idi])
166 |             docs_all.append([sent_all, ner_1_all, ner_2_all, ner_all, valid_sents, valid_words])
167 |         else:
168 |             len_all = len(sent_ids)
169 |             ndocs_mini = int(np.ceil(len_all / max_doc_len))
170 |             for kk in range(ndocs_mini):
171 |                 init_step = kk * max_doc_len
172 |                 end_step = kk * max_doc_len + max_doc_len
173 |                 ids_tmp = sent_ids[init_step:end_step]
174 |                 sent_all = []
175 |                 ner_1_all = []
176 |                 ner_2_all = []
177 |                 ner_all = []
178 |                 valid_sents = len(ids_tmp)
179 |                 valid_words = []
180 |                 for idi in ids_tmp:
181 |                     sent_all.append(enc_list[idi][0])
182 |                     ner_1_all.append(enc_list[idi][1])
183 |                     ner_2_all.append(enc_list[idi][2])
184 |                     ner_all.append(enc_list[idi][3])
185 |                     valid_words.append(valid_len_list[idi])
186 |                 for _ in range(max_doc_len - valid_sents):  # pad the document by repeating its last sentence
187 |                     sent_all.append(enc_list[idi][0])
188 |                     ner_1_all.append(enc_list[idi][1])
189 |                     ner_2_all.append(enc_list[idi][2])
190 |                     ner_all.append(enc_list[idi][3])
191 |                     valid_words.append(valid_len_list[idi])
192 |                 docs_all.append([sent_all, ner_1_all, ner_2_all, ner_all, valid_sents, valid_words])
193 |     random.shuffle(docs_all)
194 | 
195 |     batches_all = []
196 |     sent_alls = []
197 |     ner_1_alls = []
198 |     ner_2_alls = []
199 |     ner_alls = []
200 |     valid_sentss = []
201 |     valid_wordss = []
202 | 
203 |     docs_all = docs_all * num_epoches  # repeat the shuffled documents once per epoch (same order each epoch)
204 | 
205 |     for k, dock in enumerate(docs_all):
206 |         if k % batch_size == 0 and k > 0:
207 |             batches_all.append([sent_alls, ner_1_alls, ner_2_alls, ner_alls, batch_size, valid_sentss, valid_wordss])
208 |             sent_alls = []
209 |             ner_1_alls = []
210 |             ner_2_alls = []
211 |             ner_alls = []
212 |             valid_sentss = []
213 |             valid_wordss = []
214 |             sent_alls.append(dock[0])
215 |             ner_1_alls.append(dock[1])
216 |             ner_2_alls.append(dock[2])
217 |             ner_alls.append(dock[3])
218 |             valid_sentss.append(dock[4])
219 |             valid_wordss.append(dock[5])
220 |         else:
221 |             sent_alls.append(dock[0])
222 |             ner_1_alls.append(dock[1])
223 |             ner_2_alls.append(dock[2])
224 |             ner_alls.append(dock[3])
225 |             valid_sentss.append(dock[4])
226 |             valid_wordss.append(dock[5])
227 |     # flush the final (possibly partial) batch, padding it to batch_size with copies of the last document
228 |     len_valid = len(sent_alls)
229 |     if len_valid == batch_size:
230 |         batches_all.append([sent_alls, ner_1_alls, ner_2_alls, ner_alls, len_valid, valid_sentss, valid_wordss])
231 |     else:
232 |         sent_alls += [sent_alls[-1]] * (batch_size - len_valid)
233 |         ner_1_alls += [ner_1_alls[-1]] * (batch_size - len_valid)
234 |         ner_2_alls += [ner_2_alls[-1]] * (batch_size - len_valid)
235 |         ner_alls += [ner_alls[-1]] * (batch_size - len_valid)
236 |         valid_sentss += [valid_sentss[-1]] * (batch_size - len_valid)
237 |         valid_wordss += [valid_wordss[-1]] * (batch_size - len_valid)
238 |         batches_all.append([sent_alls, ner_1_alls, ner_2_alls, ner_alls, len_valid, valid_sentss, valid_wordss])
239 |     return batches_all
240 | 
241 | 
242 | 
243 | 
244 | if __name__ == "__main__":
245 |     pass
246 | 
247 | 
--------------------------------------------------------------------------------
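
For reference, load_ED_data expects a CoNLL-style file with five space-separated columns per token (word, source-document id, level-1 NER tag, level-2 NER tag, and the event trigger label), with sentences closed by any non-5-column line such as a blank line, and sentences grouped into documents by the id in column two. The sketch below walks a toy file through the full utils_init.py pipeline; the file contents, tag values and the '<unk>' vocabulary entry are illustrative assumptions, not data shipped with the repository.

#!/usr/bin/env python
# Toy walk-through of the utils_init.py pipeline. The rows, tag inventories
# and '<unk>' vocabulary entry below are illustrative assumptions, not data
# shipped with the repository.

from utils_init import load_ED_data, data_transformation_doc, batch_generation_doc

# Five space-separated columns: word, doc id, level-1 NER, level-2 NER, event label.
rows = [
    "Troops doc_001 B-ORG O O",
    "attacked doc_001 O O B-Conflict:Attack",
    "yesterday doc_001 O O O",
    "",  # a non-5-column line (e.g. blank) closes the sentence
    "Nobody doc_001 O O O",
    "died doc_001 O O B-Life:Die",
]
with open("toy.train", "w", encoding="utf-8") as f:
    f.write("\n".join(rows) + "\n")

sents, events, event_vocab, ner_1, ner_2, doc_to_sents = load_ED_data("toy.train")
# doc_to_sents maps each document id to the indices of its sentences.

vocab_2_id = {w: i for i, w in enumerate(["<unk>"] + sorted({w for s in sents for w in s}))}
ED_2_id = {t: i for i, t in enumerate(sorted(event_vocab))}
ner_to_id_1 = {t: i for i, t in enumerate(sorted({t for s in ner_1 for t in s}))}
ner_to_id_2 = {t: i for i, t in enumerate(sorted({t for s in ner_2 for t in s}))}

encoded = data_transformation_doc(sents, ner_1, ner_2, events, vocab_2_id, ED_2_id,
                                  vocab_2_id["<unk>"], ner_to_id_1, ner_to_id_2)
batches = batch_generation_doc(doc_to_sents, encoded, batch_size=2, max_doc_len=8,
                               max_seq_len=50, vocab_2_id=vocab_2_id, ner_2_id=ED_2_id,
                               num_epoches=1)
# Each batch holds word ids of shape [batch_size, max_doc_len, max_seq_len]; prints 1 2 8 50.
print(len(batches), len(batches[0][0]), len(batches[0][0][0]), len(batches[0][0][0][0]))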
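
The early-stopping rule in train_MLBiNet.py is easy to misread: loss_dev_second actually holds the third-smallest dev loss observed so far, and training stops after nconsect_epoch consecutive evaluations whose loss improves on neither the best loss nor that threshold. A standalone restatement of the rule, assuming the comparison logic in the training loop is the intended behaviour:

def early_stop_step(dev_losses, patience=3):
    # Mirrors the loop in train_MLBiNet.py: nconsect counts consecutive
    # evaluations whose dev loss improves on neither the best loss so far
    # nor the third-smallest loss so far; stop once it reaches `patience`.
    best = float("inf")
    threshold = float("inf")
    seen = []
    nconsect = 0
    for step, loss in enumerate(dev_losses):
        seen.append(loss)
        seen.sort()
        if len(seen) > 2:
            threshold = seen[2]  # third-smallest dev loss
        if loss > best:
            nconsect = nconsect + 1 if loss > threshold else 0
        else:
            nconsect = 0
            best = loss
        if nconsect >= patience:
            return step  # index of the evaluation that triggers the stop
    return None

# e.g. early_stop_step([1.0, 0.8, 0.9, 0.95, 0.97, 0.99, 1.01]) returns 6.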