├── BiDAF+Self Attention ├── dureader │ ├── SIF.py │ ├── dataset.py │ ├── json_to_sentence.py │ ├── layers │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── basic_rnn.cpython-36.pyc │ │ │ ├── cu_rnn.cpython-36.pyc │ │ │ ├── match_layer.cpython-36.pyc │ │ │ └── pointer_net.cpython-36.pyc │ │ ├── basic_rnn.py │ │ ├── cu_rnn.py │ │ ├── match_layer.py │ │ └── pointer_net.py │ ├── pretrain_embedding.py │ ├── rc_model.py │ ├── run.py │ └── vocab.py └── utils │ ├── __init__.py │ ├── baseline_eval.py │ ├── bleu.py │ ├── bleu_metric │ ├── __pycache__ │ │ ├── bleu.cpython-36.pyc │ │ └── bleu_score.cpython-36.pyc │ ├── bleu.py │ └── bleu_score.py │ ├── common.py │ ├── dureader_eval.py │ ├── get_vocab.py │ ├── mrc_eval.py │ ├── preprocess.py │ ├── rouge.py │ └── rouge_metric │ ├── __pycache__ │ └── rouge.cpython-36.pyc │ └── rouge.py ├── BiDAF_Origin ├── dureader │ ├── dataset.py │ ├── layers │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── basic_rnn.cpython-36.pyc │ │ │ ├── match_layer.cpython-36.pyc │ │ │ └── pointer_net.cpython-36.pyc │ │ ├── basic_rnn.py │ │ ├── match_layer.py │ │ └── pointer_net.py │ ├── rc_model.py │ ├── run.py │ └── vocab.py └── utils │ ├── __init__.py │ ├── baseline_eval.py │ ├── bleu_metric │ ├── __pycache__ │ │ ├── bleu.cpython-36.pyc │ │ └── bleu_score.cpython-36.pyc │ ├── bleu.py │ └── bleu_score.py │ ├── dureader_eval.py │ ├── get_vocab.py │ ├── json_to_sentence.py │ ├── preprocess.py │ ├── pretrain_embedding.py │ └── rouge_metric │ ├── __pycache__ │ └── rouge.cpython-36.pyc │ └── rouge.py ├── GatedRNN ├── GatedRNN.py ├── GatedRNN_prepro.py ├── GatedRNN_run.py ├── GatedRNN_util.py └── basic_rnn.py ├── R-Net ├── S_model.py ├── S_prepro.py ├── S_run.py ├── S_util.py ├── basic_rnn.py ├── bleu.py ├── common.py ├── mrc_eval.py └── rouge.py ├── README.md ├── data └── demo │ ├── README.md │ ├── devset │ └── search.dev.json │ ├── testset │ └── search.test.json │ └── trainset │ └── search.train.json └── 竞赛技术报告 ├── Final_Naturali-2018机器阅读理解技术竞赛系统报告.pptx ├── 东北大学-2018机器阅读理解竞赛报告.ppt └── 台达电子-Delta-MRC系統報告.pdf /BiDAF+Self Attention/dureader/SIF.py: -------------------------------------------------------------------------------- 1 | import pickle as pkl 2 | import os 3 | import sys 4 | import ujson as json 5 | import logging 6 | import numpy as np 7 | from gensim.models import word2vec 8 | from sklearn.decomposition import TruncatedSVD 9 | 10 | 11 | class SIFModel(object): 12 | def __init__(self, args, logger, pre_train, train_files=[], dev_files=[], test_files=[], a=1e-3, embed_dim=300): 13 | self.logger = logger 14 | self.segmented_dir = args.segmented_dir 15 | self.prepared_dir = args.prepared_dir 16 | self.a = a 17 | self.embed_dim = embed_dim 18 | self.weighted_word_dict = None 19 | self.pc = None 20 | self.train_set, self.dev_set, self.test_set = [], [], [] 21 | 22 | if pre_train: 23 | for train_file in train_files: 24 | self.train_set += self._load_dataset(train_file, train=True) 25 | self.train_set_seg = os.path.join(self.segmented_dir, 'train_set.seg') 26 | self.logger.info('Writing train_set.seg') 27 | self._write_data(self.train_set, self.train_set_seg) 28 | del self.train_set 29 | 30 | for dev_file in dev_files: 31 | self.dev_set += self._load_dataset(dev_file) 32 | self.dev_set_seg = os.path.join(self.segmented_dir, 'dev_set.seg') 33 | self.logger.info('Writing dev_set.seg') 34 | self._write_data(self.dev_set, self.dev_set_seg) 35 | del self.dev_set 36 | 37 | for test_file in test_files: 38 | 
self.test_set += self._load_dataset(test_file) 39 | self.test_set_seg = os.path.join(self.segmented_dir, 'test_set.seg') 40 | self.logger.info('Writing test_set.seg') 41 | self._write_data(self.test_set, self.test_set_seg) 42 | del self.test_set 43 | 44 | def _load_dataset(self, data_path, train=False): 45 | fin = open(data_path, 'r', encoding='utf8') 46 | data_set = [] 47 | for lidx, line in enumerate(fin): 48 | sample = json.loads(line.strip()) 49 | del sample['question'] 50 | if train: 51 | del sample['answers'] 52 | del sample['fake_answers'] 53 | del sample['segmented_answers'] 54 | sample['passages'] = [] 55 | for d_idx, doc in enumerate(sample['documents']): 56 | if train: 57 | most_related_para = doc['most_related_para'] 58 | sample['passages'].append({'passage_tokens': doc['segmented_paragraphs'][most_related_para]}) 59 | else: 60 | for segmented_paragraph in doc['segmented_paragraphs']: 61 | sample['passages'].append({'passage_tokens': segmented_paragraph}) 62 | del sample['documents'] 63 | data_set.append(sample) 64 | fin.close() 65 | return data_set 66 | 67 | def _write_data(self, data_set, tar_dir): 68 | with open(tar_dir, 'w', encoding='utf8') as f: 69 | for sample in data_set: 70 | f.write(' '.join(sample['segmented_question']) + '\n') 71 | for passage in sample['passages']: 72 | f.write(' '.join(passage['passage_tokens']) + '\n') 73 | del sample 74 | f.close() 75 | 76 | def train_embeddings(self): 77 | sys.path.append('..') 78 | logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s') 79 | logging.root.setLevel(level=logging.INFO) 80 | self.logger.info("running %s" % ' '.join(sys.argv)) 81 | 82 | model = word2vec.Word2Vec(word2vec.PathLineSentences(self.segmented_dir), size=300, min_count=2, workers=8, 83 | iter=15) 84 | w2v_dict = {} 85 | for word in model.wv.vocab: 86 | w2v_dict[word] = model[word] 87 | with open(os.path.join(self.prepared_dir, 'w2v_dic.pkl'), 'wb') as f: 88 | pkl.dump(w2v_dict, f) 89 | f.close() 90 | model.wv.save_word2vec_format(os.path.join(self.prepared_dir, 'w2v_model.bin'), binary=True) 91 | 92 | def get_dict_word_fre(self): 93 | word_all_num = 0 94 | dict_word_num = {} 95 | dict_word_fre = {} 96 | for root, dirs, files in os.walk(self.segmented_dir): 97 | for file_name in files: 98 | with open(os.path.join(self.segmented_dir, file_name), 'r', encoding='utf8') as f: 99 | for line in f.readlines(): 100 | line = line.replace('\n', '') 101 | words = line.split(' ') 102 | for word in words: 103 | word_all_num += 1 104 | if word in dict_word_num: 105 | dict_word_num[word] += 1 106 | else: 107 | dict_word_num[word] = 1 108 | f.close() 109 | for word in dict_word_num: 110 | dict_word_fre[word] = dict_word_num[word] / word_all_num 111 | return word_all_num, dict_word_fre 112 | 113 | def get_dict_word_weight(self): 114 | word_all_num, dict_word_fre = self.get_dict_word_fre() 115 | self.logger.info('Total words num is {}'.format(word_all_num)) 116 | if self.a <= 0: 117 | self.a = 1.0 118 | dict_word_weight = {} 119 | for word in dict_word_fre: 120 | dict_word_weight[word] = self.a / (self.a + dict_word_fre[word]) 121 | return dict_word_weight 122 | 123 | def load_model(self): 124 | with open(os.path.join(self.prepared_dir, 'weighted_word_dict.pkl'), 'rb') as fww: 125 | self.weighted_word_dict = pkl.load(fww) 126 | with open(os.path.join(self.prepared_dir, 'pc.pkl'), 'rb') as fpc: 127 | self.pc = pkl.load(fpc) 128 | 129 | def get_weighted_embedding(self, sentence): 130 | # init the sentence embedding 131 | weighted_embedding = np.array([0.0] * 
self.embed_dim) 132 | for word in sentence: 133 | # weighted_embedding += self.weighted_word_dict[word] 134 | if word in self.weighted_word_dict: 135 | weighted_embedding += self.weighted_word_dict[word] 136 | else: 137 | weighted_embedding += np.array([1.0] * self.embed_dim) * 0.001 138 | return weighted_embedding 139 | 140 | def get_weighted_embedding_list(self, dict_word_weight): 141 | weighted_embedding_list = [] 142 | weighted_word_dict = {} 143 | with open(os.path.join(self.prepared_dir, 'w2v_dic.pkl'), 'rb') as fin: 144 | w2v_model = pkl.load(fin) 145 | fin.close() 146 | for root, dirs, files in os.walk(self.segmented_dir): 147 | for file_name in files: 148 | with open(os.path.join(self.segmented_dir, file_name), 'r', encoding='utf8') as f: 149 | for line in f.readlines(): 150 | line = line.replace('\n', '') 151 | words = line.split(' ') 152 | weighted_embedding = np.array([0.0] * self.embed_dim) 153 | for word in words: 154 | if word not in weighted_word_dict: 155 | if word in w2v_model: 156 | weighted_word_embedding = w2v_model[word] * dict_word_weight[word] 157 | else: 158 | weighted_word_embedding = np.array([1.0] * self.embed_dim) * 0.001 159 | weighted_word_dict[word] = weighted_word_embedding 160 | weighted_embedding += weighted_word_dict[word] 161 | weighted_embedding_list.append(weighted_embedding) 162 | f.close() 163 | pkl.dump(weighted_word_dict, open(os.path.join(self.prepared_dir, 'weighted_word_dict.pkl'), 'wb')) 164 | return np.array(weighted_embedding_list) 165 | 166 | def compute_pc(self, x, npc=1): 167 | svd = TruncatedSVD(n_components=npc, n_iter=7, random_state=0) 168 | svd.fit(x) 169 | return svd.components_ 170 | 171 | def remove_pc(self, x, npc=1): 172 | """ 173 | Remove the projection on the principal components 174 | :param x: x[i,:] is a data point 175 | :param npc: number of principal components to remove 176 | :return: XX[i, :] is the data point after removing its projection 177 | """ 178 | pc = self.compute_pc(x, npc) 179 | if npc == 1: 180 | xx = x - x.dot(pc.transpose()) * pc 181 | else: 182 | xx = x - x.dot(pc.transpose()).dot(pc) 183 | return xx 184 | 185 | def build_pc_and_sif_embedding_list(self): 186 | dict_word_weight = self.get_dict_word_weight() 187 | # pkl.dump(dict_word_weight, open(os.path.join(self.prepared_dir, 'dict_word_weight.pkl'), 'wb')) 188 | weighted_embedding_list = self.get_weighted_embedding_list(dict_word_weight) 189 | self.logger.info('Finish building the weighted embedding list of sentence list') 190 | pc = self.compute_pc(weighted_embedding_list) 191 | pkl.dump(pc, open(os.path.join(self.prepared_dir, 'pc.pkl'), 'wb')) 192 | self.logger.info('Finish building the pc') 193 | # sif_embedding_list = self.remove_pc(weighted_embedding_list) 194 | # pickle.dump(sif_embedding_list, open(params.dump_sif_embedding_list_path, 'wb')) 195 | # self.logger.info('Finish building the sif_embedding') 196 | 197 | def get_sif_embedding(self, text): 198 | sentence_embedding = self.get_weighted_embedding(text) 199 | rmpc_sentence_embedding = sentence_embedding - sentence_embedding.dot(self.pc.transpose()) * self.pc 200 | return rmpc_sentence_embedding 201 | -------------------------------------------------------------------------------- /BiDAF+Self Attention/dureader/json_to_sentence.py: -------------------------------------------------------------------------------- 1 | # import sys 2 | import os 3 | # sys.path.append('..') 4 | 5 | 6 | def write_data(brc_data, tar_dir): 7 | # print('Converting ' + file) 8 | # fin = open(file, encoding='utf8') 9 | 
out_file = os.path.join(tar_dir, 'train_set.seg') 10 | with open(out_file, 'w', encoding='utf8') as ftrain: 11 | for sample in brc_data.train_set: 12 | ftrain.write(' '.join(sample['segmented_question']) + '\n') 13 | for passage in sample['passages']: 14 | ftrain.write(' '.join(passage['passage_tokens']) + '\n') 15 | del sample 16 | ftrain.close() 17 | 18 | out_file = os.path.join(tar_dir, 'dev_set.seg') 19 | with open(out_file, 'w', encoding='utf8') as fdev: 20 | for sample in brc_data.dev_set: 21 | fdev.write(' '.join(sample['segmented_question']) + '\n') 22 | for passage in sample['passages']: 23 | fdev.write(' '.join(passage['passage_tokens']) + '\n') 24 | del sample 25 | fdev.close() 26 | 27 | out_file = os.path.join(tar_dir, 'test_set.seg') 28 | with open(out_file, 'w', encoding='utf8') as ftest: 29 | for sample in brc_data.test_set: 30 | ftest.write(' '.join(sample['segmented_question']) + '\n') 31 | for passage in sample['passages']: 32 | ftest.write(' '.join(passage['passage_tokens']) + '\n') 33 | del sample 34 | ftest.close() 35 | -------------------------------------------------------------------------------- /BiDAF+Self Attention/dureader/layers/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf8 -*- 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # ============================================================================== 17 | """ 18 | Empty __init__.py file 19 | 20 | Authors: Yizhong Wang(wangyizhong01@baidu.com) 21 | Date: 2017/09/20 12:00:00 22 | """ 23 | -------------------------------------------------------------------------------- /BiDAF+Self Attention/dureader/layers/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shiningliang/MRC2018/9bf27efbbc028116eb01ea2df1bf0d063cb09ae9/BiDAF+Self Attention/dureader/layers/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /BiDAF+Self Attention/dureader/layers/__pycache__/basic_rnn.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shiningliang/MRC2018/9bf27efbbc028116eb01ea2df1bf0d063cb09ae9/BiDAF+Self Attention/dureader/layers/__pycache__/basic_rnn.cpython-36.pyc -------------------------------------------------------------------------------- /BiDAF+Self Attention/dureader/layers/__pycache__/cu_rnn.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shiningliang/MRC2018/9bf27efbbc028116eb01ea2df1bf0d063cb09ae9/BiDAF+Self Attention/dureader/layers/__pycache__/cu_rnn.cpython-36.pyc -------------------------------------------------------------------------------- /BiDAF+Self Attention/dureader/layers/__pycache__/match_layer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shiningliang/MRC2018/9bf27efbbc028116eb01ea2df1bf0d063cb09ae9/BiDAF+Self Attention/dureader/layers/__pycache__/match_layer.cpython-36.pyc -------------------------------------------------------------------------------- /BiDAF+Self Attention/dureader/layers/__pycache__/pointer_net.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shiningliang/MRC2018/9bf27efbbc028116eb01ea2df1bf0d063cb09ae9/BiDAF+Self Attention/dureader/layers/__pycache__/pointer_net.cpython-36.pyc -------------------------------------------------------------------------------- /BiDAF+Self Attention/dureader/layers/basic_rnn.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module provides wrappers for variants of RNN in Tensorflow 3 | """ 4 | 5 | import tensorflow as tf 6 | import tensorflow.contrib as tc 7 | 8 | 9 | def rnn(rnn_type, inputs, length, hidden_size, layer_num=1, dropout_keep_prob=None, concat=True): 10 | """ 11 | Implements (Bi-)LSTM, (Bi-)GRU and (Bi-)RNN 12 | Args: 13 | rnn_type: the type of rnn 14 | inputs: padded inputs into rnn 15 | length: the valid length of the inputs 16 | hidden_size: the size of hidden units 17 | layer_num: multiple rnn layer are stacked if layer_num > 1 18 | dropout_keep_prob: 19 | concat: When the rnn is bidirectional, the forward outputs and backward outputs are 20 | concatenated if this is True, else we add them. 
21 |     Returns:
22 |         RNN outputs (the final state is not returned)
23 |     """
24 |     if not rnn_type.startswith('bi'):
25 |         cells = tc.rnn.MultiRNNCell([get_cell(rnn_type, hidden_size, dropout_keep_prob=dropout_keep_prob) for _ in range(layer_num)],
26 |                                     state_is_tuple=True)
27 |         outputs, state = tf.nn.dynamic_rnn(cells, inputs, sequence_length=length, dtype=tf.float32)
28 |         if rnn_type.endswith('lstm'):
29 |             c, h = state
30 |             state = h
31 |     else:
32 |         if layer_num > 1:
33 |             cell_fw = [get_cell(rnn_type, hidden_size, dropout_keep_prob=dropout_keep_prob) for _ in range(layer_num)]
34 |             cell_bw = [get_cell(rnn_type, hidden_size, dropout_keep_prob=dropout_keep_prob) for _ in range(layer_num)]
35 |             outputs, state_fw, state_bw = tc.rnn.stack_bidirectional_dynamic_rnn(
36 |                 cell_fw, cell_bw, inputs, sequence_length=length, dtype=tf.float32
37 |             )
38 |         else:
39 |             cell_fw = get_cell(rnn_type, hidden_size, dropout_keep_prob=dropout_keep_prob)
40 |             cell_bw = get_cell(rnn_type, hidden_size, dropout_keep_prob=dropout_keep_prob)
41 |             outputs, state = tf.nn.bidirectional_dynamic_rnn(
42 |                 cell_fw, cell_bw, inputs, sequence_length=length, dtype=tf.float32
43 |             )
44 |     return outputs
45 | 
46 | 
47 | def get_cell(rnn_type, hidden_size, layer_num=1, dropout_keep_prob=None):
48 |     """
49 |     Gets the RNN Cell
50 |     Args:
51 |         rnn_type: 'lstm', 'gru' or 'rnn'
52 |         hidden_size: The size of hidden units
53 |         layer_num: a MultiRNNCell is used if layer_num > 1
54 |         dropout_keep_prob: dropout in RNN
55 |     Returns:
56 |         An RNN Cell
57 |     """
58 |     if rnn_type.endswith('lstm'):
59 |         cell = tc.rnn.LSTMCell(num_units=hidden_size, state_is_tuple=True)
60 |     elif rnn_type.endswith('gru'):
61 |         cell = tc.rnn.GRUCell(num_units=hidden_size)
62 |     elif rnn_type.endswith('rnn'):
63 |         cell = tc.rnn.BasicRNNCell(num_units=hidden_size)
64 |     elif rnn_type.endswith('sru'):
65 |         cell = tc.rnn.SRUCell(num_units=hidden_size)
66 |     elif rnn_type.endswith('indy'):
67 |         cell = tc.rnn.IndyGRUCell(num_units=hidden_size)
68 |     else:
69 |         raise NotImplementedError('Unsupported rnn type: {}'.format(rnn_type))
70 |     if dropout_keep_prob is not None:
71 |         cell = tc.rnn.DropoutWrapper(cell,
72 |                                      input_keep_prob=dropout_keep_prob,
73 |                                      output_keep_prob=dropout_keep_prob)
74 |     return cell
75 | 
76 | 
77 | 
--------------------------------------------------------------------------------
/BiDAF+Self Attention/dureader/layers/cu_rnn.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This module provides wrappers for variants of RNN in Tensorflow
 3 | """
 4 | 
 5 | import tensorflow as tf
 6 | from tensorflow.contrib import cudnn_rnn
 7 | 
 8 | 
 9 | def rnn(rnn_type, inputs, hidden_size, batch_size, training, layer_num=1, dropout_keep_prob=None):
10 |     """
11 |     Implements (Bi-)LSTM, (Bi-)GRU and (Bi-)RNN
12 |     Args:
13 |         rnn_type: the type of rnn
14 |         inputs: padded inputs into rnn
15 |         hidden_size: the size of hidden units
16 |         layer_num: multiple rnn layers are stacked if layer_num > 1
17 |         dropout_keep_prob:
18 |     Returns:
19 |         RNN outputs and final state
20 |     """
21 |     if not rnn_type.startswith('bi'):
22 |         cell = get_cell(rnn_type, hidden_size, layer_num, 'unidirectional')
23 |         inputs = tf.transpose(inputs, [1, 0, 2])
24 |         c = tf.zeros([layer_num, batch_size, hidden_size], tf.float32)
25 |         h = tf.zeros([layer_num, batch_size, hidden_size], tf.float32)
26 |         outputs, state = cell(inputs, (h, c), training=training)
27 |         if rnn_type.endswith('lstm'):
28 |             c, h = state
29 |             state = h
30 |     else:
31 |         cell = get_cell(rnn_type, hidden_size, layer_num, 'bidirectional')
32 |         inputs = tf.transpose(inputs, [1, 0, 2])
33 |         outputs, 
state = cell(inputs, training=training) 34 | # if rnn_type.endswith('lstm'): 35 | # state_h, state_c = state 36 | # h_fw, h_bw = state_h[0, :], state_h[1, :] 37 | # state_fw, state_bw = h_fw, h_bw 38 | # else: 39 | # state_fw, state_bw = state[0][0, :], state[0][1, :] 40 | # if concat: 41 | # state = tf.concat([state_fw, state_bw], 1) 42 | # else: 43 | # state = state_fw + state_bw 44 | outputs = tf.transpose(outputs, [1, 0, 2]) 45 | return outputs, state 46 | 47 | 48 | def get_cell(rnn_type, hidden_size, layer_num=1, direction='bidirectional'): 49 | if rnn_type.endswith('lstm'): 50 | cudnn_cell = cudnn_rnn.CudnnLSTM(num_layers=layer_num, num_units=hidden_size, direction=direction, 51 | dropout=0) 52 | elif rnn_type.endswith('gru'): 53 | cudnn_cell = cudnn_rnn.CudnnGRU(num_layers=layer_num, num_units=hidden_size, direction=direction, 54 | dropout=0) 55 | elif rnn_type.endswith('rnn'): 56 | cudnn_cell = cudnn_rnn.CudnnRNNTanh(num_layers=layer_num, num_units=hidden_size, direction=direction, 57 | dropout=0) 58 | else: 59 | raise NotImplementedError('Unsuported rnn type: {}'.format(rnn_type)) 60 | return cudnn_cell 61 | -------------------------------------------------------------------------------- /BiDAF+Self Attention/dureader/layers/match_layer.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow.contrib as tc 3 | 4 | 5 | class MatchLSTMAttnCell(tc.rnn.LSTMCell): 6 | """ 7 | Implements the Match-LSTM attention cell 8 | """ 9 | 10 | def __init__(self, num_units, context_to_attend): 11 | super(MatchLSTMAttnCell, self).__init__(num_units, state_is_tuple=True) 12 | self.context_to_attend = context_to_attend 13 | self.fc_context = tc.layers.fully_connected(self.context_to_attend, 14 | num_outputs=self._num_units, 15 | activation_fn=None) 16 | 17 | def __call__(self, inputs, state, scope=None): 18 | (c_prev, h_prev) = state 19 | with tf.variable_scope(scope or type(self).__name__): 20 | ref_vector = tf.concat([inputs, h_prev], -1) 21 | G = tf.tanh(self.fc_context 22 | + tf.expand_dims(tc.layers.fully_connected(ref_vector, 23 | num_outputs=self._num_units, 24 | activation_fn=None), 1)) 25 | logits = tc.layers.fully_connected(G, num_outputs=1, activation_fn=None) 26 | scores = tf.nn.softmax(logits, 1) 27 | attended_context = tf.reduce_sum(self.context_to_attend * scores, axis=1) 28 | new_inputs = tf.concat([inputs, attended_context, 29 | inputs - attended_context, inputs * attended_context], 30 | -1) 31 | return super(MatchLSTMAttnCell, self).__call__(new_inputs, state, scope) 32 | 33 | 34 | class MatchLSTMBlockAttnCell(tc.rnn.LSTMBlockCell): 35 | """ 36 | Implements the Match-LSTM attention cell 37 | """ 38 | 39 | def __init__(self, num_units, context_to_attend): 40 | super(MatchLSTMBlockAttnCell, self).__init__(num_units, reuse=tf.AUTO_REUSE) 41 | self.context_to_attend = context_to_attend 42 | self.fc_context = tc.layers.fully_connected(self.context_to_attend, 43 | num_outputs=self._num_units, 44 | activation_fn=None) 45 | 46 | def __call__(self, inputs, state, scope=None): 47 | (c_prev, h_prev) = state 48 | with tf.variable_scope(scope or type(self).__name__): 49 | ref_vector = tf.concat([inputs, h_prev], -1) 50 | G = tf.tanh(self.fc_context 51 | + tf.expand_dims(tc.layers.fully_connected(ref_vector, 52 | num_outputs=self._num_units, 53 | activation_fn=None), 1)) 54 | logits = tc.layers.fully_connected(G, num_outputs=1, activation_fn=None) 55 | scores = tf.nn.softmax(logits, 1) 56 | attended_context = 
tf.reduce_sum(self.context_to_attend * scores, axis=1) 57 | new_inputs = tf.concat([inputs, attended_context, 58 | inputs - attended_context, inputs * attended_context], 59 | -1) 60 | return super(MatchLSTMBlockAttnCell, self).__call__(new_inputs, state, scope) 61 | 62 | 63 | class MatchLSTMLayer(object): 64 | """ 65 | Implements the Match-LSTM layer, which attend to the question dynamically in a LSTM fashion. 66 | """ 67 | 68 | def __init__(self, hidden_size): 69 | self.hidden_size = hidden_size 70 | 71 | def match(self, passage_encodes, question_encodes, p_length, q_length): 72 | """ 73 | Match the passage_encodes with question_encodes using Match-LSTM algorithm 74 | """ 75 | with tf.variable_scope('match_lstm', reuse=tf.AUTO_REUSE): 76 | cell_fw = MatchLSTMBlockAttnCell(self.hidden_size, question_encodes) 77 | cell_bw = MatchLSTMBlockAttnCell(self.hidden_size, question_encodes) 78 | outputs, state = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, 79 | inputs=passage_encodes, 80 | sequence_length=p_length, 81 | dtype=tf.float32) 82 | match_outputs = tf.concat(outputs, 2) 83 | state_fw, state_bw = state 84 | c_fw, h_fw = state_fw 85 | c_bw, h_bw = state_bw 86 | match_state = tf.concat([h_fw, h_bw], 1) 87 | return match_outputs, match_state 88 | 89 | 90 | class AttentionFlowMatchLayer(object): 91 | """ 92 | Implements the Attention Flow layer, 93 | which computes Context-to-question Attention and question-to-context Attention 94 | """ 95 | 96 | def __init__(self, hidden_size): 97 | self.hidden_size = hidden_size 98 | 99 | def match(self, passage_encodes, question_encodes, p_length, q_length): 100 | """ 101 | Match the passage_encodes with question_encodes using Attention Flow Match algorithm 102 | """ 103 | with tf.variable_scope('bidaf', reuse=tf.AUTO_REUSE): 104 | sim_matrix_0 = tf.matmul(passage_encodes, question_encodes, transpose_b=True) 105 | context2question_attn = tf.matmul(tf.nn.softmax(sim_matrix_0, -1), question_encodes) 106 | b = tf.nn.softmax(tf.expand_dims(tf.reduce_max(sim_matrix_0, 2), 1), -1) 107 | question2context_attn = tf.tile(tf.matmul(b, passage_encodes), 108 | [1, tf.shape(passage_encodes)[1], 1]) 109 | sim_matrix_1 = tf.matmul(passage_encodes, passage_encodes, transpose_b=True) 110 | context2context_attn = tf.matmul(tf.nn.softmax(sim_matrix_1, -1), passage_encodes) 111 | concat_outputs = tf.concat([passage_encodes, context2question_attn, context2context_attn, 112 | passage_encodes * context2question_attn, 113 | passage_encodes * question2context_attn], -1) 114 | return concat_outputs, None 115 | -------------------------------------------------------------------------------- /BiDAF+Self Attention/dureader/layers/pointer_net.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow.contrib as tc 3 | 4 | 5 | def custom_dynamic_rnn(cell, inputs, inputs_len, initial_state=None): 6 | """ 7 | Implements a dynamic rnn that can store scores in the pointer network, 8 | the reason why we implements this is that the raw_rnn or dynamic_rnn function in Tensorflow 9 | seem to require the hidden unit and memory unit has the same dimension, and we cannot 10 | store the scores directly in the hidden unit. 
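    Instead, the per-step pointer scores are collected in a TensorArray inside the tf.while_loop body and stacked into the final output tensor.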
11 | Args: 12 | cell: RNN cell 13 | inputs: the input sequence to rnn 14 | inputs_len: valid length 15 | initial_state: initial_state of the cell 16 | Returns: 17 | outputs and state 18 | """ 19 | batch_size = tf.shape(inputs)[0] 20 | max_time = tf.shape(inputs)[1] 21 | 22 | inputs_ta = tf.TensorArray(dtype=tf.float32, size=max_time) 23 | inputs_ta = inputs_ta.unstack(tf.transpose(inputs, [1, 0, 2])) 24 | emit_ta = tf.TensorArray(dtype=tf.float32, dynamic_size=True, size=0) 25 | t0 = tf.constant(0, dtype=tf.int32) 26 | if initial_state is not None: 27 | s0 = initial_state 28 | else: 29 | s0 = cell.zero_state(batch_size, dtype=tf.float32) 30 | f0 = tf.zeros([batch_size], dtype=tf.bool) 31 | 32 | def loop_fn(t, prev_s, emit_ta, finished): 33 | """ 34 | the loop function of rnn 35 | """ 36 | cur_x = inputs_ta.read(t) 37 | scores, cur_state = cell(cur_x, prev_s) 38 | 39 | # copy through 40 | scores = tf.where(finished, tf.zeros_like(scores), scores) 41 | if isinstance(cell, tc.rnn.LSTMBlockCell): 42 | # if isinstance(cell, tc.rnn.LSTMCell): 43 | cur_c, cur_h = cur_state 44 | prev_c, prev_h = prev_s 45 | cur_state = tc.rnn.LSTMStateTuple(tf.where(finished, prev_c, cur_c), 46 | tf.where(finished, prev_h, cur_h)) 47 | else: 48 | cur_state = tf.where(finished, prev_s, cur_state) 49 | 50 | emit_ta = emit_ta.write(t, scores) 51 | finished = tf.greater_equal(t + 1, inputs_len) 52 | return [t + 1, cur_state, emit_ta, finished] 53 | 54 | _, state, emit_ta, _ = tf.while_loop( 55 | cond=lambda _1, _2, _3, finished: tf.logical_not(tf.reduce_all(finished)), 56 | body=loop_fn, 57 | loop_vars=(t0, s0, emit_ta, f0), 58 | parallel_iterations=32, 59 | swap_memory=False) 60 | 61 | outputs = tf.transpose(emit_ta.stack(), [1, 0, 2]) 62 | return outputs, state 63 | 64 | 65 | def attend_pooling(pooling_vectors, ref_vector, hidden_size, scope=None): 66 | """ 67 | Applies attend pooling to a set of vectors according to a reference vector. 
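    The attention scores are softmax(v^T tanh(W1 * pooling_vectors + W2 * ref_vector)) over the pooling dimension, and the pooled vector is the score-weighted sum of pooling_vectors.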
68 | Args: 69 | pooling_vectors: the vectors to pool 70 | ref_vector: the reference vector 71 | hidden_size: the hidden size for attention function 72 | scope: score name 73 | Returns: 74 | the pooled vector 75 | """ 76 | with tf.variable_scope(scope or 'attend_pooling', reuse=tf.AUTO_REUSE): 77 | U = tf.tanh(tc.layers.fully_connected(pooling_vectors, num_outputs=hidden_size, 78 | activation_fn=None, biases_initializer=None) 79 | + tc.layers.fully_connected(tf.expand_dims(ref_vector, 1), 80 | num_outputs=hidden_size, 81 | activation_fn=None)) 82 | logits = tc.layers.fully_connected(U, num_outputs=1, activation_fn=None) 83 | scores = tf.nn.softmax(logits, 1) 84 | pooled_vector = tf.reduce_sum(pooling_vectors * scores, axis=1) 85 | return pooled_vector 86 | 87 | 88 | class PointerNetLSTMCell(tc.rnn.LSTMCell): 89 | """ 90 | Implements the Pointer Network Cell 91 | """ 92 | 93 | def __init__(self, num_units, context_to_point): 94 | super(PointerNetLSTMCell, self).__init__(num_units, state_is_tuple=True) 95 | self.context_to_point = context_to_point 96 | self.fc_context = tc.layers.fully_connected(self.context_to_point, 97 | num_outputs=self._num_units, 98 | activation_fn=None) 99 | 100 | def __call__(self, inputs, state, scope=None): 101 | (c_prev, m_prev) = state 102 | with tf.variable_scope(scope or type(self).__name__, reuse=tf.AUTO_REUSE): 103 | U = tf.tanh(self.fc_context 104 | + tf.expand_dims(tc.layers.fully_connected(m_prev, 105 | num_outputs=self._num_units, 106 | activation_fn=None), 107 | 1)) 108 | logits = tc.layers.fully_connected(U, num_outputs=1, activation_fn=None) 109 | scores = tf.nn.softmax(logits, 1) 110 | attended_context = tf.reduce_sum(self.context_to_point * scores, axis=1) 111 | lstm_out, lstm_state = super(PointerNetLSTMCell, self).__call__(attended_context, state) 112 | return tf.squeeze(scores, -1), lstm_state 113 | 114 | 115 | class PointerNetLSTMBlockCell(tc.rnn.LSTMBlockCell): 116 | """ 117 | Implements the Pointer Network Cell 118 | """ 119 | 120 | def __init__(self, num_units, context_to_point): 121 | super(PointerNetLSTMBlockCell, self).__init__(num_units, reuse=tf.AUTO_REUSE) 122 | self.context_to_point = context_to_point 123 | self.fc_context = tc.layers.fully_connected(self.context_to_point, 124 | num_outputs=self._num_units, 125 | activation_fn=None) 126 | 127 | def __call__(self, inputs, state, scope=None): 128 | (c_prev, m_prev) = state 129 | with tf.variable_scope(scope or type(self).__name__, reuse=tf.AUTO_REUSE): 130 | U = tf.tanh(self.fc_context 131 | + tf.expand_dims(tc.layers.fully_connected(m_prev, 132 | num_outputs=self._num_units, 133 | activation_fn=None), 134 | 1)) 135 | logits = tc.layers.fully_connected(U, num_outputs=1, activation_fn=None) 136 | scores = tf.nn.softmax(logits, 1) 137 | attended_context = tf.reduce_sum(self.context_to_point * scores, axis=1) 138 | lstm_out, lstm_state = super(PointerNetLSTMBlockCell, self).__call__(attended_context, state) 139 | return tf.squeeze(scores, -1), lstm_state 140 | 141 | 142 | class PointerNetDecoder(object): 143 | """ 144 | Implements the Pointer Network 145 | """ 146 | 147 | def __init__(self, hidden_size): 148 | self.hidden_size = hidden_size 149 | 150 | def decode(self, passage_vectors, question_vectors, init_with_question=True): 151 | """ 152 | Use Pointer Network to compute the probabilities of each position 153 | to be start and end of the answer 154 | Args: 155 | passage_vectors: the encoded passage vectors 156 | question_vectors: the encoded question vectors 157 | init_with_question: if 
set to be true, 158 | we will use the question_vectors to init the state of Pointer Network 159 | Returns: 160 | the probs of evary position to be start and end of the answer 161 | """ 162 | with tf.variable_scope('pn_decoder', reuse=tf.AUTO_REUSE): 163 | fake_inputs = tf.zeros([tf.shape(passage_vectors)[0], 2, 1]) # not used 164 | sequence_len = tf.tile([2], [tf.shape(passage_vectors)[0]]) 165 | if init_with_question: 166 | random_attn_vector = tf.Variable(tf.random_normal([1, self.hidden_size]), 167 | trainable=True, name="random_attn_vector") 168 | pooled_question_rep = tc.layers.fully_connected( 169 | attend_pooling(question_vectors, random_attn_vector, self.hidden_size), 170 | num_outputs=self.hidden_size, activation_fn=None 171 | ) 172 | init_state = tc.rnn.LSTMStateTuple(pooled_question_rep, pooled_question_rep) 173 | else: 174 | init_state = None 175 | with tf.variable_scope('fw', reuse=tf.AUTO_REUSE): 176 | fw_cell = PointerNetLSTMBlockCell(self.hidden_size, passage_vectors) 177 | # fw_cell = PointerNetLSTMCell(self.hidden_size, passage_vectors) 178 | fw_outputs, _ = custom_dynamic_rnn(fw_cell, fake_inputs, sequence_len, init_state) 179 | with tf.variable_scope('bw', reuse=tf.AUTO_REUSE): 180 | bw_cell = PointerNetLSTMBlockCell(self.hidden_size, passage_vectors) 181 | # bw_cell = PointerNetLSTMCell(self.hidden_size, passage_vectors) 182 | bw_outputs, _ = custom_dynamic_rnn(bw_cell, fake_inputs, sequence_len, init_state) 183 | start_prob = (fw_outputs[0:, 0, 0:] + bw_outputs[0:, 1, 0:]) / 2 184 | end_prob = (fw_outputs[0:, 1, 0:] + bw_outputs[0:, 0, 0:]) / 2 185 | return start_prob, end_prob 186 | -------------------------------------------------------------------------------- /BiDAF+Self Attention/dureader/pretrain_embedding.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import logging 4 | # import argparse 5 | from gensim.models import word2vec 6 | 7 | 8 | def pre_train(segmented_dir): 9 | sys.path.append('..') 10 | 11 | program = os.path.basename(sys.argv[0]) 12 | logger = logging.getLogger(program) 13 | logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s') 14 | logging.root.setLevel(level=logging.INFO) 15 | logger.info("running %s" % ' '.join(sys.argv)) 16 | 17 | model = word2vec.Word2Vec(word2vec.PathLineSentences(segmented_dir), size=300, min_count=2, workers=8, iter=10) 18 | with open(os.path.join(segmented_dir, 'w2v_dic.data'), 'w', encoding='utf-8') as f: 19 | for word in model.wv.vocab: 20 | f.write(word + ' ') 21 | f.write(' '.join(list(map(str, model[word])))) 22 | f.write('\n') 23 | f.close() 24 | 25 | model.save_word2vec_format(os.path.join(segmented_dir, 'w2v_model.bin'), binary=True) 26 | 27 | 28 | def write_data(data_set, tar_dir): 29 | 30 | with open(tar_dir, 'w', encoding='utf8') as f: 31 | for sample in data_set: 32 | f.write(' '.join(sample['segmented_question']) + '\n') 33 | for passage in sample['passages']: 34 | f.write(' '.join(passage['passage_tokens']) + '\n') 35 | del sample 36 | f.close() 37 | -------------------------------------------------------------------------------- /BiDAF+Self Attention/dureader/vocab.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf8 -*- 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. 
All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | """ 18 | This module implements the Vocab class for converting string to id and back 19 | """ 20 | 21 | import numpy as np 22 | import pickle as pkl 23 | 24 | 25 | class Vocab(object): 26 | """ 27 | Implements a vocabulary to store the tokens in the data, with their corresponding embeddings. 28 | """ 29 | 30 | def __init__(self, embed_dim=300, filename=None, initial_tokens=None, lower=False): 31 | self.id2token = {} 32 | self.token2id = {} 33 | self.token_cnt = {} 34 | self.lower = lower 35 | 36 | self.embed_dim = embed_dim 37 | self.embeddings = None 38 | 39 | self.pad_token = '' 40 | self.unk_token = '' 41 | 42 | self.initial_tokens = initial_tokens if initial_tokens is not None else [] 43 | self.initial_tokens.extend([self.pad_token, self.unk_token]) 44 | for token in self.initial_tokens: 45 | self.add(token) 46 | 47 | if filename is not None: 48 | self.load_from_file(filename) 49 | 50 | def size(self): 51 | """ 52 | get the size of vocabulary 53 | Returns: 54 | an integer indicating the size 55 | """ 56 | return len(self.id2token) 57 | 58 | def load_from_file(self, file_path): 59 | """ 60 | loads the vocab from file_path 61 | Args: 62 | file_path: a file with a word in each line 63 | """ 64 | for line in open(file_path, 'r'): 65 | token = line.rstrip('\n') 66 | self.add(token) 67 | 68 | def get_id(self, token): 69 | """ 70 | gets the id of a token, returns the id of unk token if token is not in vocab 71 | Args: 72 | key: a string indicating the word 73 | Returns: 74 | an integer 75 | """ 76 | token = token.lower() if self.lower else token 77 | try: 78 | return self.token2id[token] 79 | except KeyError: 80 | return self.token2id[self.unk_token] 81 | 82 | def get_token(self, idx): 83 | """ 84 | gets the token corresponding to idx, returns unk token if idx is not in vocab 85 | Args: 86 | idx: an integer 87 | returns: 88 | a token string 89 | """ 90 | try: 91 | return self.id2token[idx] 92 | except KeyError: 93 | return self.unk_token 94 | 95 | def add(self, token, cnt=1): 96 | """ 97 | adds the token to vocab 98 | Args: 99 | token: a string 100 | cnt: a num indicating the count of the token to add, default is 1 101 | """ 102 | token = token.lower() if self.lower else token 103 | if token in self.token2id: 104 | idx = self.token2id[token] 105 | else: 106 | # vocab中无此token,则添加token2id id2token 107 | idx = len(self.id2token) 108 | self.id2token[idx] = token 109 | self.token2id[token] = idx 110 | if cnt > 0: 111 | if token in self.token_cnt: 112 | self.token_cnt[token] += cnt 113 | else: 114 | self.token_cnt[token] = cnt 115 | return idx 116 | 117 | def filter_tokens_by_cnt(self, min_cnt): 118 | """ 119 | filter the tokens in vocab by their count 120 | Args: 121 | min_cnt: tokens with frequency less than min_cnt is filtered 122 | """ 123 | filtered_tokens = [token for token in self.token2id if 
self.token_cnt[token] >= min_cnt] 124 | # rebuild the token x id map 125 | self.token2id = {} 126 | self.id2token = {} 127 | for token in self.initial_tokens: 128 | self.add(token, cnt=0) 129 | for token in filtered_tokens: 130 | self.add(token, cnt=0) 131 | 132 | def randomly_init_embeddings(self, embed_dim): 133 | """ 134 | randomly initializes the embeddings for each token 135 | Args: 136 | embed_dim: the size of the embedding for each token 137 | """ 138 | self.embed_dim = embed_dim 139 | self.embeddings = np.random.rand(self.size(), embed_dim) 140 | # 填充符号和未知词符号初始化为0 141 | for token in [self.pad_token, self.unk_token]: 142 | self.embeddings[self.get_id(token)] = np.zeros([self.embed_dim]) 143 | 144 | def load_pretrained_embeddings(self, embedding_path): 145 | """ 146 | loads the pretrained embeddings from embedding_path, 147 | tokens not in pretrained embeddings will be filtered 148 | Args: 149 | embedding_path: the path of the pretrained embedding file 150 | """ 151 | with open(embedding_path, 'rb') as fin: 152 | trained_embeddings = pkl.load(fin) 153 | fin.close() 154 | filtered_tokens = trained_embeddings.keys() 155 | # rebuild the token x id map 156 | self.token2id = {} 157 | self.id2token = {} 158 | for token in self.initial_tokens: 159 | self.add(token, cnt=0) 160 | for token in filtered_tokens: 161 | self.add(token, cnt=0) 162 | # load embeddings 163 | self.embeddings = np.zeros([self.size(), self.embed_dim]) 164 | for token in self.token2id.keys(): 165 | if token in trained_embeddings: 166 | self.embeddings[self.get_id(token)] = trained_embeddings[token] 167 | 168 | def convert_to_ids(self, tokens): 169 | """ 170 | Convert a list of tokens to ids, use unk_token if the token is not in vocab. 171 | Args: 172 | tokens: a list of token 173 | Returns: 174 | a list of ids 175 | """ 176 | vec = [self.get_id(label) for label in tokens] 177 | return vec 178 | 179 | def recover_from_ids(self, ids, stop_id=None): 180 | """ 181 | Convert a list of ids to tokens, stop converting if the stop_id is encountered 182 | Args: 183 | ids: a list of ids to convert 184 | stop_id: the stop id, default is None 185 | Returns: 186 | a list of tokens 187 | """ 188 | tokens = [] 189 | for i in ids: 190 | tokens += [self.get_token(i)] 191 | if stop_id is not None and i == stop_id: 192 | break 193 | return tokens 194 | -------------------------------------------------------------------------------- /BiDAF+Self Attention/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # coding:utf8 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | """ 18 | This package implements some utility functions shared by PaddlePaddle 19 | and Tensorflow model implementations. 
20 | 21 | Authors: liuyuan(liuyuan04@baidu.com) 22 | Date: 2017/10/06 18:23:06 23 | """ 24 | 25 | 26 | from .dureader_eval import compute_bleu_rouge 27 | from .dureader_eval import normalize 28 | from .preprocess import find_fake_answer 29 | from .preprocess import find_best_question_match 30 | 31 | __all__ = [ 32 | 'compute_bleu_rouge', 33 | 'normalize', 34 | 'find_fake_answer', 35 | 'find_best_question_match', 36 | ] 37 | -------------------------------------------------------------------------------- /BiDAF+Self Attention/utils/bleu.py: -------------------------------------------------------------------------------- 1 | # coding:utf8 2 | 3 | import math 4 | 5 | from utils import common 6 | 7 | 8 | class BLEU(object): 9 | def __init__(self, n_size): 10 | self.match_ngram = {} 11 | self.candi_ngram = {} 12 | self.bp_r = 0 13 | self.bp_c = 0 14 | self.n_size = n_size 15 | 16 | def add_inst(self, cand, ref_list): 17 | for n_size in range(self.n_size): 18 | self.count_ngram(cand, ref_list, n_size) 19 | self.count_bp(cand, ref_list) 20 | 21 | def count_ngram(self, cand, ref_list, n_size): 22 | cand_ngram = common.get_ngram(cand, n_size) 23 | refs_ngram = [] 24 | for ref in ref_list: 25 | refs_ngram.append(common.get_ngram(ref, n_size)) 26 | if n_size not in self.match_ngram: 27 | self.match_ngram[n_size] = 0 28 | self.candi_ngram[n_size] = 0 29 | match_size, cand_size = common.get_match_size(cand_ngram, refs_ngram) 30 | self.match_ngram[n_size] += match_size 31 | self.candi_ngram[n_size] += cand_size 32 | 33 | def count_bp(self, cand, ref_list): 34 | self.bp_c += len(cand) 35 | self.bp_r += min([ 36 | (abs(len(cand) - len(ref)), len(ref)) 37 | for ref in ref_list] 38 | )[1] 39 | 40 | def score(self): 41 | prob_list = [] 42 | for n_size in range(self.n_size): 43 | if float(self.candi_ngram[n_size]) == 0: 44 | prob_list.append(0) 45 | else: 46 | prob_list.append(self.match_ngram[n_size] / float(self.candi_ngram[n_size])) 47 | # prob_list = [ 48 | # self.match_ngram[n_size] / float(self.candi_ngram[n_size]) 49 | # for n_size in range(self.n_size) 50 | # ] 51 | bleu_list = [prob_list[0]] 52 | for n in range(1, self.n_size): 53 | bleu_list.append(bleu_list[-1] * prob_list[n]) 54 | for n in range(self.n_size): 55 | bleu_list[n] = bleu_list[n] ** (1. 
/ float(n + 1)) 56 | bp = math.exp(min(1 - self.bp_r / float(self.bp_c), 0)) 57 | for n in range(self.n_size): 58 | bleu_list[n] = bleu_list[n] * bp 59 | return bleu_list 60 | 61 | 62 | class BLEUWithBonus(BLEU): 63 | def __init__(self, n_size, alpha=1.0, beta=1.0): 64 | super(BLEUWithBonus, self).__init__(n_size) 65 | self.alpha = alpha 66 | self.beta = beta 67 | 68 | def add_inst(self, 69 | cand, 70 | ref_list, 71 | yn_label=None, yn_ref=None, entity_ref=None): 72 | # super(BLEUWithBonus, self).add_inst(cand, ref_list) 73 | BLEU.add_inst(self, cand, ref_list) 74 | if yn_label is not None and yn_ref is not None: 75 | self.add_yn_bonus(cand, ref_list, yn_label, yn_ref) 76 | elif entity_ref is not None: 77 | self.add_entity_bonus(cand, entity_ref) 78 | 79 | def add_yn_bonus(self, cand, ref_list, yn_label, yn_ref): 80 | for n_size in range(self.n_size): 81 | cand_ngram = common.get_ngram(cand, n_size, label=yn_label) 82 | ref_ngram = [] 83 | for ref_id, r in enumerate(yn_ref): 84 | ref_ngram.append(common.get_ngram(ref_list[ref_id], n_size, label=r)) 85 | match_size, cand_size = common.get_match_size(cand_ngram, ref_ngram) 86 | self.match_ngram[n_size] += self.alpha * match_size 87 | self.candi_ngram[n_size] += self.alpha * match_size 88 | 89 | def add_entity_bonus(self, cand, entity_ref): 90 | for n_size in range(self.n_size): 91 | cand_ngram = common.get_ngram(cand, n_size, label='ENTITY') 92 | ref_ngram = [] 93 | for reff_id, r in enumerate(entity_ref): 94 | ref_ngram.append(common.get_ngram(r, n_size, label='ENTITY')) 95 | match_size, cand_size = common.get_match_size(cand_ngram, ref_ngram) 96 | self.match_ngram[n_size] += self.beta * match_size 97 | self.candi_ngram[n_size] += self.beta * match_size 98 | -------------------------------------------------------------------------------- /BiDAF+Self Attention/utils/bleu_metric/__pycache__/bleu.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shiningliang/MRC2018/9bf27efbbc028116eb01ea2df1bf0d063cb09ae9/BiDAF+Self Attention/utils/bleu_metric/__pycache__/bleu.cpython-36.pyc -------------------------------------------------------------------------------- /BiDAF+Self Attention/utils/bleu_metric/__pycache__/bleu_score.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shiningliang/MRC2018/9bf27efbbc028116eb01ea2df1bf0d063cb09ae9/BiDAF+Self Attention/utils/bleu_metric/__pycache__/bleu_score.cpython-36.pyc -------------------------------------------------------------------------------- /BiDAF+Self Attention/utils/bleu_metric/bleu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : bleu.py 4 | # 5 | # Description : Wrapper for BLEU scorer. 6 | # 7 | # Creation Date : 06-01-2015 8 | # Last Modified : Thu 19 Mar 2015 09:13:28 PM PDT 9 | # Authors : Hao Fang and Tsung-Yi Lin 10 | 11 | from .bleu_score import BleuScorer 12 | 13 | 14 | class Bleu: 15 | def __init__(self, n=4): 16 | # default compute Blue score up to 4 17 | self._n = n 18 | self._hypo_for_image = {} 19 | self.ref_for_image = {} 20 | 21 | def compute_score(self, gts, res): 22 | 23 | assert(list(gts.keys()) == list(res.keys())) 24 | imgIds = list(gts.keys()) 25 | 26 | bleu_scorer = BleuScorer(n=self._n) 27 | for id in imgIds: 28 | hypo = res[id] 29 | ref = gts[id] 30 | 31 | # Sanity check. 
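            # Each hypothesis must be a single candidate string wrapped in a list; each reference entry must be a non-empty list of strings.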
32 | assert(type(hypo) is list) 33 | assert(len(hypo) == 1) 34 | assert(type(ref) is list) 35 | assert(len(ref) >= 1) 36 | 37 | bleu_scorer += (hypo[0], ref) 38 | 39 | #score, scores = bleu_scorer.compute_score(option='shortest') 40 | score, scores = bleu_scorer.compute_score(option='closest', verbose=1) 41 | #score, scores = bleu_scorer.compute_score(option='average', verbose=1) 42 | 43 | # return (bleu, bleu_info) 44 | return score, scores 45 | 46 | def method(self): 47 | return "Bleu" 48 | -------------------------------------------------------------------------------- /BiDAF+Self Attention/utils/bleu_metric/bleu_score.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # bleu_scorer.py 4 | # David Chiang 5 | 6 | # Copyright (c) 2004-2006 University of Maryland. All rights 7 | # reserved. Do not redistribute without permission from the 8 | # author. Not for commercial use. 9 | 10 | # Modified by: 11 | # Hao Fang 12 | # Tsung-Yi Lin 13 | 14 | '''Provides: 15 | cook_refs(refs, n=4): Transform a list of reference sentences as strings into a form usable by cook_test(). 16 | cook_test(test, refs, n=4): Transform a test sentence as a string (together with the cooked reference sentences) into a form usable by score_cooked(). 17 | ''' 18 | 19 | import copy 20 | # import sys, math, re 21 | import math 22 | from collections import defaultdict 23 | 24 | 25 | def precook(s, n=4, out=False): 26 | """Takes a string as input and returns an object that can be given to 27 | either cook_refs or cook_test. This is optional: cook_refs and cook_test 28 | can take string arguments as well.""" 29 | words = s.split() 30 | counts = defaultdict(int) 31 | for k in range(1, n + 1): 32 | for i in range(len(words) - k + 1): 33 | ngram = tuple(words[i:i + k]) 34 | counts[ngram] += 1 35 | return (len(words), counts) 36 | 37 | 38 | def cook_refs(refs, eff=None, n=4): ## lhuang: oracle will call with "average" 39 | '''Takes a list of reference sentences for a single segment 40 | and returns an object that encapsulates everything that BLEU 41 | needs to know about them.''' 42 | 43 | reflen = [] 44 | maxcounts = {} 45 | for ref in refs: 46 | rl, counts = precook(ref, n) 47 | reflen.append(rl) 48 | for (ngram, count) in counts.items(): 49 | maxcounts[ngram] = max(maxcounts.get(ngram, 0), count) 50 | 51 | # Calculate effective reference sentence length. 52 | if eff == "shortest": 53 | reflen = min(reflen) 54 | elif eff == "average": 55 | reflen = float(sum(reflen)) / len(reflen) 56 | 57 | ## lhuang: N.B.: leave reflen computaiton to the very end!! 58 | 59 | ## lhuang: N.B.: in case of "closest", keep a list of reflens!! (bad design) 60 | 61 | return (reflen, maxcounts) 62 | 63 | 64 | def cook_test(test, xxx_todo_changeme, eff=None, n=4): 65 | '''Takes a test sentence and returns an object that 66 | encapsulates everything that BLEU needs to know about it.''' 67 | (reflen, refmaxcounts) = xxx_todo_changeme 68 | testlen, counts = precook(test, n, True) 69 | 70 | result = {} 71 | 72 | # Calculate effective reference sentence length. 
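    # For "closest", the reference length nearest to the test sentence length is chosen below; for "shortest"/"average" the reference length was already reduced in cook_refs.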
73 | 74 | if eff == "closest": 75 | result["reflen"] = min((abs(l - testlen), l) for l in reflen)[1] 76 | else: ## i.e., "average" or "shortest" or None 77 | result["reflen"] = reflen 78 | 79 | result["testlen"] = testlen 80 | 81 | result["guess"] = [max(0, testlen - k + 1) for k in range(1, n + 1)] 82 | 83 | result['correct'] = [0] * n 84 | for (ngram, count) in counts.items(): 85 | result["correct"][len(ngram) - 1] += min(refmaxcounts.get(ngram, 0), count) 86 | 87 | return result 88 | 89 | 90 | class BleuScorer(object): 91 | """Bleu scorer. 92 | """ 93 | 94 | __slots__ = "n", "crefs", "ctest", "_score", "_ratio", "_testlen", "_reflen", "special_reflen" 95 | 96 | # special_reflen is used in oracle (proportional effective ref len for a node). 97 | 98 | def copy(self): 99 | ''' copy the refs.''' 100 | new = BleuScorer(n=self.n) 101 | new.ctest = copy.copy(self.ctest) 102 | new.crefs = copy.copy(self.crefs) 103 | new._score = None 104 | return new 105 | 106 | def __init__(self, test=None, refs=None, n=4, special_reflen=None): 107 | ''' singular instance ''' 108 | 109 | self.n = n 110 | self.crefs = [] 111 | self.ctest = [] 112 | self.cook_append(test, refs) 113 | self.special_reflen = special_reflen 114 | 115 | def cook_append(self, test, refs): 116 | '''called by constructor and __iadd__ to avoid creating new instances.''' 117 | 118 | if refs is not None: 119 | self.crefs.append(cook_refs(refs)) 120 | if test is not None: 121 | cooked_test = cook_test(test, self.crefs[-1]) 122 | self.ctest.append(cooked_test) ## N.B.: -1 123 | else: 124 | self.ctest.append(None) # lens of crefs and ctest have to match 125 | 126 | self._score = None ## need to recompute 127 | 128 | def ratio(self, option=None): 129 | self.compute_score(option=option) 130 | return self._ratio 131 | 132 | def score_ratio(self, option=None): 133 | '''return (bleu, len_ratio) pair''' 134 | return (self.fscore(option=option), self.ratio(option=option)) 135 | 136 | def score_ratio_str(self, option=None): 137 | return "%.4f (%.2f)" % self.score_ratio(option) 138 | 139 | def reflen(self, option=None): 140 | self.compute_score(option=option) 141 | return self._reflen 142 | 143 | def testlen(self, option=None): 144 | self.compute_score(option=option) 145 | return self._testlen 146 | 147 | def retest(self, new_test): 148 | if type(new_test) is str: 149 | new_test = [new_test] 150 | assert len(new_test) == len(self.crefs), new_test 151 | self.ctest = [] 152 | for t, rs in zip(new_test, self.crefs): 153 | self.ctest.append(cook_test(t, rs)) 154 | self._score = None 155 | 156 | return self 157 | 158 | def rescore(self, new_test): 159 | ''' replace test(s) with new test(s), and returns the new score.''' 160 | 161 | return self.retest(new_test).compute_score() 162 | 163 | def size(self): 164 | assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest)) 165 | return len(self.crefs) 166 | 167 | def __iadd__(self, other): 168 | '''add an instance (e.g., from another sentence).''' 169 | 170 | if type(other) is tuple: 171 | ## avoid creating new BleuScorer instances 172 | self.cook_append(other[0], other[1]) 173 | else: 174 | assert self.compatible(other), "incompatible BLEUs." 
175 | self.ctest.extend(other.ctest) 176 | self.crefs.extend(other.crefs) 177 | self._score = None ## need to recompute 178 | 179 | return self 180 | 181 | def compatible(self, other): 182 | return isinstance(other, BleuScorer) and self.n == other.n 183 | 184 | def single_reflen(self, option="average"): 185 | return self._single_reflen(self.crefs[0][0], option) 186 | 187 | def _single_reflen(self, reflens, option=None, testlen=None): 188 | 189 | if option == "shortest": 190 | reflen = min(reflens) 191 | elif option == "average": 192 | reflen = float(sum(reflens)) / len(reflens) 193 | elif option == "closest": 194 | reflen = min((abs(l - testlen), l) for l in reflens)[1] 195 | else: 196 | assert False, "unsupported reflen option %s" % option 197 | 198 | return reflen 199 | 200 | def recompute_score(self, option=None, verbose=0): 201 | self._score = None 202 | return self.compute_score(option, verbose) 203 | 204 | def compute_score(self, option=None, verbose=0): 205 | n = self.n 206 | small = 1e-9 207 | tiny = 1e-15 ## so that if guess is 0 still return 0 208 | bleu_list = [[] for _ in range(n)] 209 | 210 | if self._score is not None: 211 | return self._score 212 | 213 | if option is None: 214 | option = "average" if len(self.crefs) == 1 else "closest" 215 | 216 | self._testlen = 0 217 | self._reflen = 0 218 | totalcomps = {'testlen': 0, 'reflen': 0, 'guess': [0] * n, 'correct': [0] * n} 219 | 220 | # for each sentence 221 | for comps in self.ctest: 222 | testlen = comps['testlen'] 223 | self._testlen += testlen 224 | 225 | if self.special_reflen is None: ## need computation 226 | reflen = self._single_reflen(comps['reflen'], option, testlen) 227 | else: 228 | reflen = self.special_reflen 229 | 230 | self._reflen += reflen 231 | 232 | for key in ['guess', 'correct']: 233 | for k in range(n): 234 | totalcomps[key][k] += comps[key][k] 235 | 236 | # append per image bleu score 237 | bleu = 1. 238 | for k in range(n): 239 | bleu *= (float(comps['correct'][k]) + tiny) \ 240 | / (float(comps['guess'][k]) + small) 241 | bleu_list[k].append(bleu ** (1. / (k + 1))) 242 | ratio = (testlen + tiny) / (reflen + small) ## N.B.: avoid zero division 243 | if ratio < 1: 244 | for k in range(n): 245 | bleu_list[k][-1] *= math.exp(1 - 1 / ratio) 246 | 247 | if verbose > 1: 248 | print(comps, reflen) 249 | 250 | totalcomps['reflen'] = self._reflen 251 | totalcomps['testlen'] = self._testlen 252 | 253 | bleus = [] 254 | bleu = 1. 255 | for k in range(n): 256 | bleu *= float(totalcomps['correct'][k] + tiny) \ 257 | / (totalcomps['guess'][k] + small) 258 | bleus.append(bleu ** (1. 
/ (k + 1))) 259 | ratio = (self._testlen + tiny) / (self._reflen + small) ## N.B.: avoid zero division 260 | if ratio < 1: 261 | for k in range(n): 262 | bleus[k] *= math.exp(1 - 1 / ratio) 263 | 264 | if verbose > 0: 265 | print(totalcomps) 266 | print("ratio:", ratio) 267 | 268 | self._score = bleus 269 | return self._score, bleu_list 270 | -------------------------------------------------------------------------------- /BiDAF+Self Attention/utils/common.py: -------------------------------------------------------------------------------- 1 | # coding:utf8 2 | from functools import reduce 3 | import math 4 | import ujson as json 5 | from collections import defaultdict 6 | import sys 7 | 8 | 9 | def get_match_size(cand_ngram, refs_ngram): 10 | ref_set = defaultdict(int) 11 | for ref_ngram in refs_ngram: 12 | tmp_ref_set = defaultdict(int) 13 | for ngram in ref_ngram: 14 | tmp_ref_set[ngram] += 1 15 | for ngram, count in tmp_ref_set.items(): 16 | ref_set[ngram] = max(ref_set[ngram], count) 17 | cand_set = defaultdict(int) 18 | for ngram in cand_ngram: 19 | cand_set[ngram] += 1 20 | match_size = 0 21 | for ngram, count in cand_set.items(): 22 | match_size += min(count, ref_set.get(ngram, 0)) 23 | cand_size = len(cand_ngram) 24 | return match_size, cand_size 25 | 26 | 27 | def get_ngram(sent, n_size, label=None): 28 | def _ngram(sent, n_size): 29 | ngram_list = [] 30 | for left in range(len(sent) - n_size): 31 | ngram_list.append(sent[left: left + n_size + 1]) 32 | return ngram_list 33 | 34 | ngram_list = _ngram(sent, n_size) 35 | if label is not None: 36 | ngram_list = [ngram + '_' + label for ngram in ngram_list] 37 | return ngram_list 38 | 39 | 40 | def word2char(str_in): 41 | str_out = str_in.replace(' ', '') 42 | return ''.join(str_out.split()) 43 | -------------------------------------------------------------------------------- /BiDAF+Self Attention/utils/get_vocab.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf8 -*- 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | """ 18 | Utility function to generate vocabulary file. 19 | """ 20 | 21 | 22 | import argparse 23 | import sys 24 | import json 25 | 26 | from itertools import chain 27 | 28 | 29 | def get_vocab(files, vocab_file): 30 | """ 31 | Builds vocabulary file from field 'segmented_paragraphs' 32 | and 'segmented_question'. 33 | 34 | Args: 35 | files: A list of file names. 36 | vocab_file: The file that stores the vocabulary. 
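    Example (illustrative; the path is a placeholder for a preprocessed
    DuReader file with one JSON object per line, each carrying
    'documents' and 'segmented_question'):
        get_vocab(['search.train.json'], 'vocab.txt')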
37 | """ 38 | vocab = {} 39 | for f in files: 40 | with open(f, 'r') as fin: 41 | for line in fin: 42 | obj = json.loads(line.strip()) 43 | paras = [ 44 | chain(*d['segmented_paragraphs']) 45 | for d in obj['documents']] 46 | doc_tokens = chain(*paras) 47 | question_tokens = obj['segmented_question'] 48 | for t in list(doc_tokens) + question_tokens: 49 | vocab[t] = vocab.get(t, 0) + 1 50 | # output 51 | sorted_vocab = sorted([(v, c) for v, c in vocab.items()], 52 | key=lambda x: x[1], 53 | reverse=True) 54 | with open(vocab_file, 'w') as outf: 55 | for w, c in sorted_vocab: 56 | print >> outf, '{}\t{}'.format(w.encode('utf8'), c) 57 | 58 | 59 | if __name__ == '__main__': 60 | parser = argparse.ArgumentParser() 61 | parser.add_argument('--files', nargs='+', required=True, 62 | help='file list to count vocab from.') 63 | parser.add_argument('--vocab', required=True, 64 | help='file to store counted vocab.') 65 | args = parser.parse_args() 66 | get_vocab(args.files, args.vocab) 67 | 68 | -------------------------------------------------------------------------------- /BiDAF+Self Attention/utils/mrc_eval.py: -------------------------------------------------------------------------------- 1 | # coding:utf8 2 | """ 3 | This module computes evaluation metrics for DuReader dataset. 4 | """ 5 | 6 | import argparse 7 | import itertools 8 | import ujson as json 9 | import sys 10 | import zipfile 11 | 12 | from collections import Counter 13 | from .bleu import BLEUWithBonus 14 | from .rouge import RougeLWithBonus 15 | 16 | EMPTY = '' 17 | YESNO_LABELS = set(['Yes', 'No', 'Depends']) 18 | 19 | 20 | def normalize(s): 21 | """ 22 | Normalize strings to space joined chars. 23 | Args: 24 | s: a list of strings. 25 | Returns: 26 | A list of normalized strings. 27 | """ 28 | if not s: 29 | return s 30 | normalized = [] 31 | for ss in s: 32 | tokens = [c for c in list(ss) if len(c.strip()) != 0] 33 | normalized.append(''.join(tokens)) 34 | return normalized 35 | 36 | 37 | def data_check(obj): 38 | """ 39 | Check data. 40 | 41 | Raises: 42 | Raises AssertionError when data is not legal. 43 | """ 44 | # 判断是否有answer_id 45 | assert 'question_id' in obj, "Missing 'question_id' field." 46 | # assert 'yesno_answers' in obj, \ 47 | # "Missing 'yesno_answers' field. question_id: {}".format(obj['question_id']) 48 | # 如果包含yesno_answers,那么格式必须为list 49 | if "yesno_answers" in obj: 50 | assert isinstance(obj['yesno_answers'], list), \ 51 | r"""'yesno_answers' field must be a list, if the 'question_type' is not 52 | 'YES_NO', then this field should be an empty list. 53 | question_id: {}""".format(obj['question_id']) 54 | else: 55 | obj["yesno_answers"] = [] 56 | if "entity_answers" not in obj: 57 | obj["entity_answers"] = [] 58 | 59 | 60 | def read_file(file_name, is_ref=False): 61 | """ 62 | Read predict answers or reference answers from file. 63 | 64 | Args: 65 | file_name: the name of the file containing predict result or reference 66 | result. 67 | 68 | Returns: 69 | A dictionary mapping question_id to the result information. The result 70 | information itself is also a dictionary with has four keys: 71 | - question_type: type of the query. 72 | - yesno_answers: A list of yesno answers corresponding to 'answers'. 73 | - answers: A list of predicted answers. 74 | - entity_answers: A list, each element is also a list containing the entities 75 | tagged out from the corresponding answer string. 
76 | """ 77 | 78 | def _open(file_name, mode, zip_obj=None): 79 | if zip_obj is not None: 80 | return zip_obj.open(file_name, mode) 81 | return open(file_name, mode) 82 | 83 | results = {} 84 | # 是否是参考答案 85 | if is_ref: 86 | keys = ['source', 'answers', 'yesno_answers', 'entity_answers', 'question_type'] 87 | else: 88 | keys = ['answers', 'yesno_answers'] 89 | # 如果是zip文件则以zip方式读取 90 | zf = zipfile.ZipFile(file_name, 'r') if file_name.endswith('.zip') else None 91 | # zip包中文件列表 92 | file_list = [file_name] if zf is None else zf.namelist() 93 | 94 | for fn in file_list: 95 | for line in _open(fn, 'r', zip_obj=zf): 96 | try: 97 | obj = json.loads(line.strip()) 98 | except ValueError: 99 | raise ValueError("Every line of data should be legal json") 100 | data_check(obj) 101 | qid = obj['question_id'] 102 | # 必须有question id 103 | assert qid not in results, "Duplicate question_id: {}".format(qid) 104 | results[qid] = {} 105 | for k in keys: 106 | if k == 'answers': 107 | results[qid][k] = normalize(obj[k]) 108 | else: 109 | results[qid][k] = obj[k] 110 | if is_ref: 111 | for i, e in enumerate(results[qid]['entity_answers']): 112 | results[qid]['entity_answers'][i] = normalize(e) 113 | return results 114 | 115 | 116 | def calc_metrics(pred_result, ref_result, bleu_eval, rouge_eval): 117 | """Computes bleu-4 and rouge-l. 118 | 119 | Args: 120 | - pred_result: Refer to the returned dict of `read_file` with 121 | 'is_ref=False'. 122 | - ref_result: Refer to the returned dict of `ref_file` with 123 | 'is_ref=True'. 124 | - bleu_result: A BleuWithBonus object. 125 | - rouge_result: A RougeLWithBonus object. 126 | Returns: 127 | bleu-4 and rouge-l values as a tuple of float values. 128 | """ 129 | for qid, results in ref_result.items(): 130 | # 根据question id从预测结果中选择答案 131 | cand_result = pred_result.get(qid, {}) 132 | pred_answers = cand_result.get('answers', []) 133 | if not pred_answers: 134 | pred_answers = EMPTY 135 | else: 136 | pred_answers = pred_answers[0] 137 | pred_yn_label = None 138 | ref_entities = None 139 | ref_answers = results.get('answers', []) 140 | if not ref_answers: 141 | continue 142 | if results['question_type'] == 'ENTITY': 143 | ref_entities = set( 144 | itertools.chain(*results.get('entity_answers', [[]]))) 145 | if not ref_entities: 146 | ref_entities = None 147 | if results['question_type'] == 'YES_NO': 148 | cand_yesno = cand_result.get('yesno_answers', []) 149 | pred_yn_label = None if len(cand_yesno) == 0 \ 150 | else cand_yesno[0] 151 | bleu_eval.add_inst( 152 | pred_answers, 153 | ref_answers, 154 | yn_label=pred_yn_label, 155 | yn_ref=results['yesno_answers'], 156 | entity_ref=ref_entities) 157 | rouge_eval.add_inst( 158 | pred_answers, 159 | ref_answers, 160 | yn_label=pred_yn_label, 161 | yn_ref=results['yesno_answers'], 162 | entity_ref=ref_entities) 163 | bleu4 = bleu_eval.score()[-1] 164 | rouge_l = rouge_eval.score() 165 | return bleu4, rouge_l 166 | 167 | 168 | def main(args): 169 | err = None 170 | metrics = {} 171 | bleu4, rouge_l = 0.0, 0.0 172 | alpha = args.alpha # default 1.0 173 | beta = args.beta # default 1.0 174 | bleu_eval = BLEUWithBonus(4, alpha=alpha, beta=beta) 175 | rouge_eval = RougeLWithBonus(alpha=alpha, beta=beta, gamma=1.2) 176 | # 载入answer文件 格式dict question_id: {answers:[], yesno_answers:[]} 177 | pred_result = read_file(args.pred_file) 178 | ref_result = read_file(args.ref_file, is_ref=True) 179 | bleu4, rouge_l = calc_metrics(pred_result, 180 | ref_result, 181 | bleu_eval, 182 | rouge_eval) 183 | metrics = { 184 | 'ROUGE-L': 
round(rouge_l * 100, 2), 185 | 'BLEU-4': round(bleu4 * 100, 2), 186 | } 187 | print(json.dumps(metrics, ensure_ascii=False)) 188 | 189 | 190 | if __name__ == '__main__': 191 | parser = argparse.ArgumentParser() 192 | parser.add_argument('--pred_file', help='predict file') 193 | parser.add_argument('--ref_file', help='reference file') 194 | parser.add_argument('--alpha', type=float, default=1.0, 195 | help='common value of alpha') 196 | parser.add_argument('--beta', type=float, default=1.0, 197 | help='common value of beta') 198 | args = parser.parse_args() 199 | main(args) 200 | -------------------------------------------------------------------------------- /BiDAF+Self Attention/utils/preprocess.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | """ 18 | This module finds the most related paragraph of each document according to recall.
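Typical usage (illustrative; the output file name is a placeholder): the module reads
one JSON sample per line from stdin, annotates each sample in place and prints it back out, e.g.
    cat search.train.json | python preprocess.py > search.train.preprocessed.json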
19 | """ 20 | 21 | import sys 22 | # reload(sys) 23 | # sys.setdefaultencoding('utf8') 24 | import json 25 | from collections import Counter 26 | 27 | 28 | def precision_recall_f1(prediction, ground_truth): 29 | """ 30 | This function calculates and returns the precision, recall and f1-score 31 | Args: 32 | prediction: prediction string or list to be matched 33 | ground_truth: golden string or list reference 34 | Returns: 35 | floats of (p, r, f1) 36 | Raises: 37 | None 38 | """ 39 | if not isinstance(prediction, list): 40 | prediction_tokens = prediction.split() 41 | else: 42 | prediction_tokens = prediction 43 | if not isinstance(ground_truth, list): 44 | ground_truth_tokens = ground_truth.split() 45 | else: 46 | ground_truth_tokens = ground_truth 47 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens) 48 | num_same = sum(common.values()) 49 | if num_same == 0: 50 | return 0, 0, 0 51 | p = 1.0 * num_same / len(prediction_tokens) 52 | r = 1.0 * num_same / len(ground_truth_tokens) 53 | f1 = (2 * p * r) / (p + r) 54 | return p, r, f1 55 | 56 | 57 | def recall(prediction, ground_truth): 58 | """ 59 | This function calculates and returns the recall 60 | Args: 61 | prediction: prediction string or list to be matched 62 | ground_truth: golden string or list reference 63 | Returns: 64 | floats of recall 65 | Raises: 66 | None 67 | """ 68 | return precision_recall_f1(prediction, ground_truth)[1] 69 | 70 | 71 | def f1_score(prediction, ground_truth): 72 | """ 73 | This function calculates and returns the f1-score 74 | Args: 75 | prediction: prediction string or list to be matched 76 | ground_truth: golden string or list reference 77 | Returns: 78 | floats of f1 79 | Raises: 80 | None 81 | """ 82 | return precision_recall_f1(prediction, ground_truth)[2] 83 | 84 | 85 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): 86 | """ 87 | This function calculates and returns the precision, recall and f1-score 88 | Args: 89 | metric_fn: metric function pointer which calculates scores according to corresponding logic. 90 | prediction: prediction string or list to be matched 91 | ground_truth: golden string or list reference 92 | Returns: 93 | floats of (p, r, f1) 94 | Raises: 95 | None 96 | """ 97 | scores_for_ground_truths = [] 98 | for ground_truth in ground_truths: 99 | score = metric_fn(prediction, ground_truth) 100 | scores_for_ground_truths.append(score) 101 | return max(scores_for_ground_truths) 102 | 103 | 104 | def find_best_question_match(doc, question, with_score=False): 105 | """ 106 | For each docment, find the paragraph that matches best to the question. 107 | Args: 108 | doc: The document object. 109 | question: The question tokens. 110 | with_score: If True then the match score will be returned, 111 | otherwise False. 112 | Returns: 113 | The index of the best match paragraph, if with_score=False, 114 | otherwise returns a tuple of the index of the best match paragraph 115 | and the match score of that paragraph. 
116 | """ 117 | most_related_para = -1 118 | max_related_score = 0 119 | most_related_para_len = 0 120 | for p_idx, para_tokens in enumerate(doc['segmented_paragraphs']): 121 | if len(question) > 0: 122 | related_score = metric_max_over_ground_truths(recall, 123 | para_tokens, 124 | question) 125 | else: 126 | related_score = 0 127 | 128 | if related_score > max_related_score \ 129 | or (related_score == max_related_score \ 130 | and len(para_tokens) < most_related_para_len): 131 | most_related_para = p_idx 132 | max_related_score = related_score 133 | most_related_para_len = len(para_tokens) 134 | if most_related_para == -1: 135 | most_related_para = 0 136 | if with_score: 137 | return most_related_para, max_related_score 138 | return most_related_para 139 | 140 | 141 | def find_fake_answer(sample): 142 | """ 143 | For each document, finds the most related paragraph based on recall, 144 | then finds a span that maximize the f1_score compared with the gold answers 145 | and uses this span as a fake answer span 146 | Args: 147 | sample: a sample in the dataset 148 | Returns: 149 | None 150 | Raises: 151 | None 152 | """ 153 | for doc in sample['documents']: 154 | most_related_para = -1 155 | most_related_para_len = 999999 156 | max_related_score = 0 157 | for p_idx, para_tokens in enumerate(doc['segmented_paragraphs']): 158 | if len(sample['segmented_answers']) > 0: 159 | related_score = metric_max_over_ground_truths(recall, 160 | para_tokens, 161 | sample['segmented_answers']) 162 | else: 163 | continue 164 | if related_score > max_related_score \ 165 | or (related_score == max_related_score 166 | and len(para_tokens) < most_related_para_len): 167 | most_related_para = p_idx 168 | most_related_para_len = len(para_tokens) 169 | max_related_score = related_score 170 | doc['most_related_para'] = most_related_para 171 | 172 | sample['answer_docs'] = [] 173 | sample['answer_spans'] = [] 174 | sample['fake_answers'] = [] 175 | sample['match_scores'] = [] 176 | 177 | best_match_score = 0 178 | best_match_d_idx, best_match_span = -1, [-1, -1] 179 | best_fake_answer = None 180 | answer_tokens = set() 181 | for segmented_answer in sample['segmented_answers']: 182 | answer_tokens = answer_tokens | set([token for token in segmented_answer]) 183 | for d_idx, doc in enumerate(sample['documents']): 184 | if not doc['is_selected']: 185 | continue 186 | if doc['most_related_para'] == -1: 187 | doc['most_related_para'] = 0 188 | most_related_para_tokens = doc['segmented_paragraphs'][doc['most_related_para']][:1000] 189 | for start_tidx in range(len(most_related_para_tokens)): 190 | if most_related_para_tokens[start_tidx] not in answer_tokens: 191 | continue 192 | for end_tidx in range(len(most_related_para_tokens) - 1, start_tidx - 1, -1): 193 | span_tokens = most_related_para_tokens[start_tidx: end_tidx + 1] 194 | if len(sample['segmented_answers']) > 0: 195 | match_score = metric_max_over_ground_truths(f1_score, span_tokens, 196 | sample['segmented_answers']) 197 | else: 198 | match_score = 0 199 | if match_score == 0: 200 | break 201 | if match_score > best_match_score: 202 | best_match_d_idx = d_idx 203 | best_match_span = [start_tidx, end_tidx] 204 | best_match_score = match_score 205 | best_fake_answer = ''.join(span_tokens) 206 | if best_match_score > 0: 207 | sample['answer_docs'].append(best_match_d_idx) 208 | sample['answer_spans'].append(best_match_span) 209 | sample['fake_answers'].append(best_fake_answer) 210 | sample['match_scores'].append(best_match_score) 211 | 212 | 213 | if __name__ == 
'__main__': 214 | for line in sys.stdin: 215 | sample = json.loads(line) 216 | find_fake_answer(sample) 217 | print(json.dumps(sample, encoding='utf8', ensure_ascii=False)) 218 | -------------------------------------------------------------------------------- /BiDAF+Self Attention/utils/rouge.py: -------------------------------------------------------------------------------- 1 | # coding:utf8 2 | 3 | from functools import reduce 4 | import math 5 | import json 6 | import numpy as np 7 | from collections import defaultdict 8 | import sys 9 | 10 | # reload(sys) 11 | # sys.setdefaultencoding("utf-8") 12 | 13 | 14 | class RougeLWithBonus(object): 15 | def __init__(self, alpha=1.0, beta=1.0, gamma=1.2): 16 | self.alpha = alpha 17 | self.beta = beta 18 | self.gamma = gamma 19 | self.inst_scores = [] 20 | 21 | def lcs(self, string, sub): 22 | if len(string) < len(sub): 23 | sub, string = string, sub 24 | lengths = np.zeros((len(string) + 1, len(sub) + 1)) 25 | for j in range(1, len(sub) + 1): 26 | for i in range(1, len(string) + 1): 27 | if string[i - 1] == sub[j - 1]: 28 | lengths[i][j] = lengths[i - 1][j - 1] + 1 29 | else: 30 | lengths[i][j] = max(lengths[i - 1][j], lengths[i][j - 1]) 31 | return lengths[len(string)][len(sub)] 32 | 33 | def add_inst(self, 34 | cand, 35 | ref_list, 36 | yn_label=None, yn_ref=None, entity_ref=None): 37 | precs, recalls = [], [] 38 | for i, ref in enumerate(ref_list): 39 | basic_lcs = self.lcs(cand, ref) 40 | yn_bonus, entity_bonus = 0.0, 0.0 41 | if yn_ref is not None and yn_label is not None: 42 | yn_bonus = self.add_yn_bonus(cand, ref, yn_label, yn_ref[i]) 43 | elif entity_ref is not None: 44 | entity_bonus = self.add_entity_bonus(cand, entity_ref) 45 | p_denom = len(cand) + self.alpha * yn_bonus + self.beta * entity_bonus 46 | r_denom = len(ref) + self.alpha * yn_bonus + self.beta * entity_bonus 47 | prec = (basic_lcs + self.alpha * yn_bonus + self.beta * entity_bonus) \ 48 | / p_denom if p_denom > 0. else 0. 49 | rec = (basic_lcs + self.alpha * yn_bonus + self.beta * entity_bonus) \ 50 | / r_denom if r_denom > 0. else 0. 51 | precs.append(prec) 52 | recalls.append(rec) 53 | 54 | prec_max = max(precs) 55 | rec_max = max(recalls) 56 | if prec_max != 0 and rec_max != 0: 57 | score = ((1 + self.gamma ** 2) * prec_max * rec_max) / \ 58 | float(rec_max + self.gamma ** 2 * prec_max) 59 | else: 60 | score = 0.0 61 | self.inst_scores.append(score) 62 | 63 | def add_yn_bonus(self, cand, ref, yn_label, yn_ref): 64 | if yn_label != yn_ref: 65 | return 0.0 66 | lcs_ = self.lcs(cand, ref) 67 | return lcs_ 68 | 69 | def add_entity_bonus(self, cand, entity_ref): 70 | lcs_ = 0.0 71 | for ent in entity_ref: 72 | if ent in cand: 73 | lcs_ += len(ent) 74 | return lcs_ 75 | 76 | def score(self): 77 | return 1. 
* sum(self.inst_scores) / len(self.inst_scores) 78 | -------------------------------------------------------------------------------- /BiDAF+Self Attention/utils/rouge_metric/__pycache__/rouge.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shiningliang/MRC2018/9bf27efbbc028116eb01ea2df1bf0d063cb09ae9/BiDAF+Self Attention/utils/rouge_metric/__pycache__/rouge.cpython-36.pyc -------------------------------------------------------------------------------- /BiDAF+Self Attention/utils/rouge_metric/rouge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : rouge.py 4 | # 5 | # Description : Computes ROUGE-L metric as described by Lin and Hovey (2004) 6 | # 7 | # Creation Date : 2015-01-07 06:03 8 | # Author : Ramakrishna Vedantam 9 | 10 | import numpy as np 11 | 12 | 13 | # import pdb 14 | 15 | 16 | def my_lcs(string, sub): 17 | """ 18 | Calculates longest common subsequence for a pair of tokenized strings 19 | :param string : list of str : tokens from a string split using whitespace 20 | :param sub : list of str : shorter string, also split using whitespace 21 | :returns: length (list of int): length of the longest common subsequence between the two strings 22 | 23 | Note: my_lcs only gives length of the longest common subsequence, not the actual LCS 24 | """ 25 | if (len(string) < len(sub)): 26 | sub, string = string, sub 27 | 28 | lengths = [[0 for i in range(0, len(sub) + 1)] for j in range(0, len(string) + 1)] 29 | 30 | for j in range(1, len(sub) + 1): 31 | for i in range(1, len(string) + 1): 32 | if (string[i - 1] == sub[j - 1]): 33 | lengths[i][j] = lengths[i - 1][j - 1] + 1 34 | else: 35 | lengths[i][j] = max(lengths[i - 1][j], lengths[i][j - 1]) 36 | 37 | return lengths[len(string)][len(sub)] 38 | 39 | 40 | class Rouge(): 41 | ''' 42 | Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set 43 | 44 | ''' 45 | 46 | def __init__(self): 47 | # vrama91: updated the value below based on discussion with Hovey 48 | self.beta = 1.2 49 | 50 | def calc_score(self, candidate, refs): 51 | """ 52 | Compute ROUGE-L score given one candidate and references for an image 53 | :param candidate: str : candidate sentence to be evaluated 54 | :param refs: list of str : COCO reference sentences for the particular image to be evaluated 55 | :returns score: int (ROUGE-L score for the candidate evaluated against references) 56 | """ 57 | assert (len(candidate) == 1) 58 | assert (len(refs) > 0) 59 | prec = [] 60 | rec = [] 61 | 62 | # split into tokens 63 | token_c = candidate[0].split(" ") 64 | 65 | for reference in refs: 66 | # split into tokens 67 | token_r = reference.split(" ") 68 | # compute the longest common subsequence 69 | lcs = my_lcs(token_r, token_c) 70 | prec.append(lcs / float(len(token_c))) 71 | rec.append(lcs / float(len(token_r))) 72 | 73 | prec_max = max(prec) 74 | rec_max = max(rec) 75 | 76 | if (prec_max != 0 and rec_max != 0): 77 | score = ((1 + self.beta ** 2) * prec_max * rec_max) / float(rec_max + self.beta ** 2 * prec_max) 78 | else: 79 | score = 0.0 80 | return score 81 | 82 | def compute_score(self, gts, res): 83 | """ 84 | Computes Rouge-L score given a set of reference and candidate sentences for the dataset 85 | Invoked by evaluate_captions.py 86 | :param hypo_for_image: dict : candidate / test sentences with "image name" key and "tokenized sentences" as values 87 | :param 
ref_for_image: dict : reference MS-COCO sentences with "image name" key and "tokenized sentences" as values 88 | :returns: average_score: float (mean ROUGE-L score computed by averaging scores for all the images) 89 | """ 90 | assert (list(gts.keys()) == list(res.keys())) 91 | imgIds = list(gts.keys()) 92 | 93 | score = [] 94 | for id in imgIds: 95 | hypo = res[id] 96 | ref = gts[id] 97 | 98 | score.append(self.calc_score(hypo, ref)) 99 | 100 | # Sanity check. 101 | assert (type(hypo) is list) 102 | assert (len(hypo) == 1) 103 | assert (type(ref) is list) 104 | assert (len(ref) > 0) 105 | 106 | average_score = np.mean(np.array(score)) 107 | return average_score, np.array(score) 108 | 109 | def method(self): 110 | return "Rouge" 111 | -------------------------------------------------------------------------------- /BiDAF_Origin/dureader/layers/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf8 -*- 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | """ 18 | Empty __init__.py file 19 | 20 | Authors: Yizhong Wang(wangyizhong01@baidu.com) 21 | Date: 2017/09/20 12:00:00 22 | """ 23 | -------------------------------------------------------------------------------- /BiDAF_Origin/dureader/layers/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shiningliang/MRC2018/9bf27efbbc028116eb01ea2df1bf0d063cb09ae9/BiDAF_Origin/dureader/layers/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /BiDAF_Origin/dureader/layers/__pycache__/basic_rnn.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shiningliang/MRC2018/9bf27efbbc028116eb01ea2df1bf0d063cb09ae9/BiDAF_Origin/dureader/layers/__pycache__/basic_rnn.cpython-36.pyc -------------------------------------------------------------------------------- /BiDAF_Origin/dureader/layers/__pycache__/match_layer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shiningliang/MRC2018/9bf27efbbc028116eb01ea2df1bf0d063cb09ae9/BiDAF_Origin/dureader/layers/__pycache__/match_layer.cpython-36.pyc -------------------------------------------------------------------------------- /BiDAF_Origin/dureader/layers/__pycache__/pointer_net.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shiningliang/MRC2018/9bf27efbbc028116eb01ea2df1bf0d063cb09ae9/BiDAF_Origin/dureader/layers/__pycache__/pointer_net.cpython-36.pyc 
-------------------------------------------------------------------------------- /BiDAF_Origin/dureader/layers/basic_rnn.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf8 -*- 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | """ 18 | This module provides wrappers for variants of RNN in Tensorflow 19 | """ 20 | 21 | import tensorflow as tf 22 | import tensorflow.contrib as tc 23 | 24 | 25 | def rnn(rnn_type, inputs, length, hidden_size, layer_num=1, dropout_keep_prob=None, concat=True): 26 | """ 27 | Implements (Bi-)LSTM, (Bi-)GRU and (Bi-)RNN 28 | Args: 29 | rnn_type: the type of rnn 30 | inputs: padded inputs into rnn 31 | length: the valid length of the inputs 32 | hidden_size: the size of hidden units 33 | layer_num: multiple rnn layer are stacked if layer_num > 1 34 | dropout_keep_prob: 35 | concat: When the rnn is bidirectional, the forward outputs and backward outputs are 36 | concatenated if this is True, else we add them. 
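        Example (illustrative; `p_emb` is a padded [batch, time, dim] float tensor
        and `p_length` holds the valid sequence lengths):
            outputs, state = rnn('bi-lstm', p_emb, p_length, hidden_size=150)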
37 | Returns: 38 | RNN outputs and final state 39 | """ 40 | if not rnn_type.startswith('bi'): 41 | cell = get_cell(rnn_type, hidden_size, layer_num, dropout_keep_prob) 42 | outputs, state = tf.nn.dynamic_rnn(cell, inputs, sequence_length=length, dtype=tf.float32) 43 | if rnn_type.endswith('lstm'): 44 | c, h = state 45 | state = h 46 | else: 47 | cell_fw = get_cell(rnn_type, hidden_size, layer_num, dropout_keep_prob) 48 | cell_bw = get_cell(rnn_type, hidden_size, layer_num, dropout_keep_prob) 49 | outputs, state = tf.nn.bidirectional_dynamic_rnn( 50 | cell_bw, cell_fw, inputs, sequence_length=length, dtype=tf.float32 51 | ) 52 | state_fw, state_bw = state 53 | if rnn_type.endswith('lstm'): 54 | c_fw, h_fw = state_fw 55 | c_bw, h_bw = state_bw 56 | state_fw, state_bw = h_fw, h_bw 57 | if concat: 58 | outputs = tf.concat(outputs, 2) 59 | state = tf.concat([state_fw, state_bw], 1) 60 | else: 61 | outputs = outputs[0] + outputs[1] 62 | state = state_fw + state_bw 63 | return outputs, state 64 | 65 | 66 | def get_cell(rnn_type, hidden_size, layer_num=1, dropout_keep_prob=None): 67 | """ 68 | Gets the RNN Cell 69 | Args: 70 | rnn_type: 'lstm', 'gru' or 'rnn' 71 | hidden_size: The size of hidden units 72 | layer_num: MultiRNNCell are used if layer_num > 1 73 | dropout_keep_prob: dropout in RNN 74 | Returns: 75 | An RNN Cell 76 | """ 77 | if rnn_type.endswith('lstm'): 78 | cell = tc.rnn.LSTMCell(num_units=hidden_size, state_is_tuple=True) 79 | elif rnn_type.endswith('gru'): 80 | cell = tc.rnn.GRUCell(num_units=hidden_size) 81 | elif rnn_type.endswith('rnn'): 82 | cell = tc.rnn.BasicRNNCell(num_units=hidden_size) 83 | else: 84 | raise NotImplementedError('Unsuported rnn type: {}'.format(rnn_type)) 85 | if dropout_keep_prob is not None: 86 | cell = tc.rnn.DropoutWrapper(cell, 87 | input_keep_prob=dropout_keep_prob, 88 | output_keep_prob=dropout_keep_prob) 89 | if layer_num > 1: 90 | cell = tc.rnn.MultiRNNCell([cell]*layer_num, state_is_tuple=True) 91 | return cell 92 | 93 | 94 | -------------------------------------------------------------------------------- /BiDAF_Origin/dureader/layers/match_layer.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf8 -*- 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # ============================================================================== 17 | """ 18 | This module implements the core layer of Match-LSTM and BiDAF 19 | """ 20 | 21 | import tensorflow as tf 22 | import tensorflow.contrib as tc 23 | 24 | 25 | class MatchLSTMAttnCell(tc.rnn.LSTMCell): 26 | """ 27 | Implements the Match-LSTM attention cell 28 | """ 29 | def __init__(self, num_units, context_to_attend): 30 | super(MatchLSTMAttnCell, self).__init__(num_units, state_is_tuple=True) 31 | self.context_to_attend = context_to_attend 32 | self.fc_context = tc.layers.fully_connected(self.context_to_attend, 33 | num_outputs=self._num_units, 34 | activation_fn=None) 35 | 36 | def __call__(self, inputs, state, scope=None): 37 | (c_prev, h_prev) = state 38 | with tf.variable_scope(scope or type(self).__name__): 39 | ref_vector = tf.concat([inputs, h_prev], -1) 40 | G = tf.tanh(self.fc_context 41 | + tf.expand_dims(tc.layers.fully_connected(ref_vector, 42 | num_outputs=self._num_units, 43 | activation_fn=None), 1)) 44 | logits = tc.layers.fully_connected(G, num_outputs=1, activation_fn=None) 45 | scores = tf.nn.softmax(logits, 1) 46 | attended_context = tf.reduce_sum(self.context_to_attend * scores, axis=1) 47 | new_inputs = tf.concat([inputs, attended_context, 48 | inputs - attended_context, inputs * attended_context], 49 | -1) 50 | return super(MatchLSTMAttnCell, self).__call__(new_inputs, state, scope) 51 | 52 | 53 | class MatchLSTMLayer(object): 54 | """ 55 | Implements the Match-LSTM layer, which attend to the question dynamically in a LSTM fashion. 56 | """ 57 | def __init__(self, hidden_size): 58 | self.hidden_size = hidden_size 59 | 60 | def match(self, passage_encodes, question_encodes, p_length, q_length): 61 | """ 62 | Match the passage_encodes with question_encodes using Match-LSTM algorithm 63 | """ 64 | with tf.variable_scope('match_lstm', reuse=tf.AUTO_REUSE): 65 | cell_fw = MatchLSTMAttnCell(self.hidden_size, question_encodes) 66 | cell_bw = MatchLSTMAttnCell(self.hidden_size, question_encodes) 67 | outputs, state = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, 68 | inputs=passage_encodes, 69 | sequence_length=p_length, 70 | dtype=tf.float32) 71 | match_outputs = tf.concat(outputs, 2) 72 | state_fw, state_bw = state 73 | c_fw, h_fw = state_fw 74 | c_bw, h_bw = state_bw 75 | match_state = tf.concat([h_fw, h_bw], 1) 76 | return match_outputs, match_state 77 | 78 | 79 | class AttentionFlowMatchLayer(object): 80 | """ 81 | Implements the Attention Flow layer, 82 | which computes Context-to-question Attention and question-to-context Attention 83 | """ 84 | def __init__(self, hidden_size): 85 | self.hidden_size = hidden_size 86 | 87 | def match(self, passage_encodes, question_encodes, p_length, q_length): 88 | """ 89 | Match the passage_encodes with question_encodes using Attention Flow Match algorithm 90 | """ 91 | with tf.variable_scope('bidaf', reuse=tf.AUTO_REUSE): 92 | sim_matrix = tf.matmul(passage_encodes, question_encodes, transpose_b=True) 93 | context2question_attn = tf.matmul(tf.nn.softmax(sim_matrix, -1), question_encodes) 94 | b = tf.nn.softmax(tf.expand_dims(tf.reduce_max(sim_matrix, 2), 1), -1) 95 | question2context_attn = tf.tile(tf.matmul(b, passage_encodes), 96 | [1, tf.shape(passage_encodes)[1], 1]) 97 | concat_outputs = tf.concat([passage_encodes, context2question_attn, 98 | passage_encodes * context2question_attn, 99 | passage_encodes * question2context_attn], -1) 100 | return concat_outputs, None 101 | 
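# Illustrative wiring sketch (tensor names below are placeholders). Both classes
# expose the same match() interface, so a model can swap one for the other:
#
#     match_layer = AttentionFlowMatchLayer(hidden_size)  # or MatchLSTMLayer(hidden_size)
#     match_p_encodes, _ = match_layer.match(passage_encodes, question_encodes,
#                                            p_length, q_length)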
-------------------------------------------------------------------------------- /BiDAF_Origin/dureader/layers/pointer_net.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf8 -*- 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | """ 18 | This module implements the Pointer Network for selecting answer spans, as described in: 19 | https://openreview.net/pdf?id=B1-q5Pqxl 20 | """ 21 | 22 | import tensorflow as tf 23 | import tensorflow.contrib as tc 24 | 25 | 26 | def custom_dynamic_rnn(cell, inputs, inputs_len, initial_state=None): 27 | """ 28 | Implements a dynamic rnn that can store scores in the pointer network, 29 | the reason why we implements this is that the raw_rnn or dynamic_rnn function in Tensorflow 30 | seem to require the hidden unit and memory unit has the same dimension, and we cannot 31 | store the scores directly in the hidden unit. 32 | Args: 33 | cell: RNN cell 34 | inputs: the input sequence to rnn 35 | inputs_len: valid length 36 | initial_state: initial_state of the cell 37 | Returns: 38 | outputs and state 39 | """ 40 | batch_size = tf.shape(inputs)[0] 41 | max_time = tf.shape(inputs)[1] 42 | 43 | inputs_ta = tf.TensorArray(dtype=tf.float32, size=max_time) 44 | inputs_ta = inputs_ta.unstack(tf.transpose(inputs, [1, 0, 2])) 45 | emit_ta = tf.TensorArray(dtype=tf.float32, dynamic_size=True, size=0) 46 | t0 = tf.constant(0, dtype=tf.int32) 47 | if initial_state is not None: 48 | s0 = initial_state 49 | else: 50 | s0 = cell.zero_state(batch_size, dtype=tf.float32) 51 | f0 = tf.zeros([batch_size], dtype=tf.bool) 52 | 53 | def loop_fn(t, prev_s, emit_ta, finished): 54 | """ 55 | the loop function of rnn 56 | """ 57 | cur_x = inputs_ta.read(t) 58 | scores, cur_state = cell(cur_x, prev_s) 59 | 60 | # copy through 61 | scores = tf.where(finished, tf.zeros_like(scores), scores) 62 | 63 | if isinstance(cell, tc.rnn.LSTMCell): 64 | cur_c, cur_h = cur_state 65 | prev_c, prev_h = prev_s 66 | cur_state = tc.rnn.LSTMStateTuple(tf.where(finished, prev_c, cur_c), 67 | tf.where(finished, prev_h, cur_h)) 68 | else: 69 | cur_state = tf.where(finished, prev_s, cur_state) 70 | 71 | emit_ta = emit_ta.write(t, scores) 72 | finished = tf.greater_equal(t + 1, inputs_len) 73 | return [t + 1, cur_state, emit_ta, finished] 74 | 75 | _, state, emit_ta, _ = tf.while_loop( 76 | cond=lambda _1, _2, _3, finished: tf.logical_not(tf.reduce_all(finished)), 77 | body=loop_fn, 78 | loop_vars=(t0, s0, emit_ta, f0), 79 | parallel_iterations=32, 80 | swap_memory=False) 81 | 82 | outputs = tf.transpose(emit_ta.stack(), [1, 0, 2]) 83 | return outputs, state 84 | 85 | 86 | def attend_pooling(pooling_vectors, ref_vector, hidden_size, scope=None): 87 | """ 88 | Applies attend pooling to a set of 
vectors according to a reference vector. 89 | Args: 90 | pooling_vectors: the vectors to pool 91 | ref_vector: the reference vector 92 | hidden_size: the hidden size for attention function 93 | scope: score name 94 | Returns: 95 | the pooled vector 96 | """ 97 | with tf.variable_scope(scope or 'attend_pooling', reuse=tf.AUTO_REUSE): 98 | U = tf.tanh(tc.layers.fully_connected(pooling_vectors, num_outputs=hidden_size, 99 | activation_fn=None, biases_initializer=None) 100 | + tc.layers.fully_connected(tf.expand_dims(ref_vector, 1), 101 | num_outputs=hidden_size, 102 | activation_fn=None)) 103 | logits = tc.layers.fully_connected(U, num_outputs=1, activation_fn=None) 104 | scores = tf.nn.softmax(logits, 1) 105 | pooled_vector = tf.reduce_sum(pooling_vectors * scores, axis=1) 106 | return pooled_vector 107 | 108 | 109 | class PointerNetLSTMCell(tc.rnn.LSTMCell): 110 | """ 111 | Implements the Pointer Network Cell 112 | """ 113 | 114 | def __init__(self, num_units, context_to_point): 115 | super(PointerNetLSTMCell, self).__init__(num_units, state_is_tuple=True) 116 | self.context_to_point = context_to_point 117 | self.fc_context = tc.layers.fully_connected(self.context_to_point, 118 | num_outputs=self._num_units, 119 | activation_fn=None) 120 | 121 | def __call__(self, inputs, state, scope=None): 122 | (c_prev, m_prev) = state 123 | with tf.variable_scope(scope or type(self).__name__, reuse=tf.AUTO_REUSE): 124 | U = tf.tanh(self.fc_context 125 | + tf.expand_dims(tc.layers.fully_connected(m_prev, 126 | num_outputs=self._num_units, 127 | activation_fn=None), 128 | 1)) 129 | logits = tc.layers.fully_connected(U, num_outputs=1, activation_fn=None) 130 | scores = tf.nn.softmax(logits, 1) 131 | attended_context = tf.reduce_sum(self.context_to_point * scores, axis=1) 132 | lstm_out, lstm_state = super(PointerNetLSTMCell, self).__call__(attended_context, state) 133 | return tf.squeeze(scores, -1), lstm_state 134 | 135 | 136 | class PointerNetDecoder(object): 137 | """ 138 | Implements the Pointer Network 139 | """ 140 | 141 | def __init__(self, hidden_size): 142 | self.hidden_size = hidden_size 143 | 144 | def decode(self, passage_vectors, question_vectors, init_with_question=True): 145 | """ 146 | Use Pointer Network to compute the probabilities of each position 147 | to be start and end of the answer 148 | Args: 149 | passage_vectors: the encoded passage vectors 150 | question_vectors: the encoded question vectors 151 | init_with_question: if set to be true, 152 | we will use the question_vectors to init the state of Pointer Network 153 | Returns: 154 | the probs of evary position to be start and end of the answer 155 | """ 156 | with tf.variable_scope('pn_decoder', reuse=tf.AUTO_REUSE): 157 | fake_inputs = tf.zeros([tf.shape(passage_vectors)[0], 2, 1]) # not used 158 | sequence_len = tf.tile([2], [tf.shape(passage_vectors)[0]]) 159 | if init_with_question: 160 | random_attn_vector = tf.Variable(tf.random_normal([1, self.hidden_size]), 161 | trainable=True, name="random_attn_vector") 162 | pooled_question_rep = tc.layers.fully_connected( 163 | attend_pooling(question_vectors, random_attn_vector, self.hidden_size), 164 | num_outputs=self.hidden_size, activation_fn=None 165 | ) 166 | init_state = tc.rnn.LSTMStateTuple(pooled_question_rep, pooled_question_rep) 167 | else: 168 | init_state = None 169 | with tf.variable_scope('fw', reuse=tf.AUTO_REUSE): 170 | fw_cell = PointerNetLSTMCell(self.hidden_size, passage_vectors) 171 | fw_outputs, _ = custom_dynamic_rnn(fw_cell, fake_inputs, sequence_len, 
init_state) 172 | with tf.variable_scope('bw', reuse=tf.AUTO_REUSE): 173 | bw_cell = PointerNetLSTMCell(self.hidden_size, passage_vectors) 174 | bw_outputs, _ = custom_dynamic_rnn(bw_cell, fake_inputs, sequence_len, init_state) 175 | start_prob = (fw_outputs[0:, 0, 0:] + bw_outputs[0:, 1, 0:]) / 2 176 | end_prob = (fw_outputs[0:, 1, 0:] + bw_outputs[0:, 0, 0:]) / 2 177 | return start_prob, end_prob 178 | -------------------------------------------------------------------------------- /BiDAF_Origin/dureader/vocab.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf8 -*- 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | """ 18 | This module implements the Vocab class for converting string to id and back 19 | """ 20 | 21 | import numpy as np 22 | 23 | 24 | class Vocab(object): 25 | """ 26 | Implements a vocabulary to store the tokens in the data, with their corresponding embeddings. 27 | """ 28 | def __init__(self, filename=None, initial_tokens=None, lower=False): 29 | self.id2token = {} 30 | self.token2id = {} 31 | self.token_cnt = {} 32 | self.lower = lower 33 | 34 | self.embed_dim = None 35 | self.embeddings = None 36 | 37 | self.pad_token = '' 38 | self.unk_token = '' 39 | 40 | self.initial_tokens = initial_tokens if initial_tokens is not None else [] 41 | self.initial_tokens.extend([self.pad_token, self.unk_token]) 42 | for token in self.initial_tokens: 43 | self.add(token) 44 | 45 | if filename is not None: 46 | self.load_from_file(filename) 47 | 48 | def size(self): 49 | """ 50 | get the size of vocabulary 51 | Returns: 52 | an integer indicating the size 53 | """ 54 | return len(self.id2token) 55 | 56 | def load_from_file(self, file_path): 57 | """ 58 | loads the vocab from file_path 59 | Args: 60 | file_path: a file with a word in each line 61 | """ 62 | for line in open(file_path, 'r'): 63 | token = line.rstrip('\n') 64 | self.add(token) 65 | 66 | def get_id(self, token): 67 | """ 68 | gets the id of a token, returns the id of unk token if token is not in vocab 69 | Args: 70 | key: a string indicating the word 71 | Returns: 72 | an integer 73 | """ 74 | token = token.lower() if self.lower else token 75 | try: 76 | return self.token2id[token] 77 | except KeyError: 78 | return self.token2id[self.unk_token] 79 | 80 | def get_token(self, idx): 81 | """ 82 | gets the token corresponding to idx, returns unk token if idx is not in vocab 83 | Args: 84 | idx: an integer 85 | returns: 86 | a token string 87 | """ 88 | try: 89 | return self.id2token[idx] 90 | except KeyError: 91 | return self.unk_token 92 | 93 | def add(self, token, cnt=1): 94 | """ 95 | adds the token to vocab 96 | Args: 97 | token: a string 98 | cnt: a num indicating the count of the token to add, default is 
1 99 | """ 100 | token = token.lower() if self.lower else token 101 | if token in self.token2id: 102 | idx = self.token2id[token] 103 | else: 104 | # vocab中无此token,则添加token2id id2token 105 | idx = len(self.id2token) 106 | self.id2token[idx] = token 107 | self.token2id[token] = idx 108 | if cnt > 0: 109 | if token in self.token_cnt: 110 | self.token_cnt[token] += cnt 111 | else: 112 | self.token_cnt[token] = cnt 113 | return idx 114 | 115 | def filter_tokens_by_cnt(self, min_cnt): 116 | """ 117 | filter the tokens in vocab by their count 118 | Args: 119 | min_cnt: tokens with frequency less than min_cnt is filtered 120 | """ 121 | filtered_tokens = [token for token in self.token2id if self.token_cnt[token] >= min_cnt] 122 | # rebuild the token x id map 123 | self.token2id = {} 124 | self.id2token = {} 125 | for token in self.initial_tokens: 126 | self.add(token, cnt=0) 127 | for token in filtered_tokens: 128 | self.add(token, cnt=0) 129 | 130 | def randomly_init_embeddings(self, embed_dim): 131 | """ 132 | randomly initializes the embeddings for each token 133 | Args: 134 | embed_dim: the size of the embedding for each token 135 | """ 136 | self.embed_dim = embed_dim 137 | self.embeddings = np.random.rand(self.size(), embed_dim) 138 | # 填充符号和未知词符号初始化为0 139 | for token in [self.pad_token, self.unk_token]: 140 | self.embeddings[self.get_id(token)] = np.zeros([self.embed_dim]) 141 | 142 | def load_pretrained_embeddings(self, embedding_path): 143 | """ 144 | loads the pretrained embeddings from embedding_path, 145 | tokens not in pretrained embeddings will be filtered 146 | Args: 147 | embedding_path: the path of the pretrained embedding file 148 | """ 149 | trained_embeddings = {} 150 | with open(embedding_path, 'r', encoding='utf8') as fin: 151 | for line in fin: 152 | contents = line.strip().split() 153 | token = contents[0] 154 | if token not in self.token2id: 155 | continue 156 | trained_embeddings[token] = list(map(float, contents[1:])) 157 | if self.embed_dim is None: 158 | self.embed_dim = len(contents) - 1 159 | fin.close() 160 | filtered_tokens = trained_embeddings.keys() 161 | # rebuild the token x id map 162 | self.token2id = {} 163 | self.id2token = {} 164 | for token in self.initial_tokens: 165 | self.add(token, cnt=0) 166 | for token in filtered_tokens: 167 | self.add(token, cnt=0) 168 | # load embeddings 169 | self.embeddings = np.zeros([self.size(), self.embed_dim]) 170 | for token in self.token2id.keys(): 171 | if token in trained_embeddings: 172 | self.embeddings[self.get_id(token)] = trained_embeddings[token] 173 | 174 | def convert_to_ids(self, tokens): 175 | """ 176 | Convert a list of tokens to ids, use unk_token if the token is not in vocab. 
177 | Args: 178 | tokens: a list of token 179 | Returns: 180 | a list of ids 181 | """ 182 | vec = [self.get_id(label) for label in tokens] 183 | return vec 184 | 185 | def recover_from_ids(self, ids, stop_id=None): 186 | """ 187 | Convert a list of ids to tokens, stop converting if the stop_id is encountered 188 | Args: 189 | ids: a list of ids to convert 190 | stop_id: the stop id, default is None 191 | Returns: 192 | a list of tokens 193 | """ 194 | tokens = [] 195 | for i in ids: 196 | tokens += [self.get_token(i)] 197 | if stop_id is not None and i == stop_id: 198 | break 199 | return tokens 200 | -------------------------------------------------------------------------------- /BiDAF_Origin/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # coding:utf8 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | """ 18 | This package implements some utility functions shared by PaddlePaddle 19 | and Tensorflow model implementations. 20 | 21 | Authors: liuyuan(liuyuan04@baidu.com) 22 | Date: 2017/10/06 18:23:06 23 | """ 24 | 25 | 26 | from .dureader_eval import compute_bleu_rouge 27 | from .dureader_eval import normalize 28 | from .preprocess import find_fake_answer 29 | from .preprocess import find_best_question_match 30 | 31 | __all__ = [ 32 | 'compute_bleu_rouge', 33 | 'normalize', 34 | 'find_fake_answer', 35 | 'find_best_question_match', 36 | ] 37 | -------------------------------------------------------------------------------- /BiDAF_Origin/utils/bleu_metric/__pycache__/bleu.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shiningliang/MRC2018/9bf27efbbc028116eb01ea2df1bf0d063cb09ae9/BiDAF_Origin/utils/bleu_metric/__pycache__/bleu.cpython-36.pyc -------------------------------------------------------------------------------- /BiDAF_Origin/utils/bleu_metric/__pycache__/bleu_score.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shiningliang/MRC2018/9bf27efbbc028116eb01ea2df1bf0d063cb09ae9/BiDAF_Origin/utils/bleu_metric/__pycache__/bleu_score.cpython-36.pyc -------------------------------------------------------------------------------- /BiDAF_Origin/utils/bleu_metric/bleu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : bleu.py 4 | # 5 | # Description : Wrapper for BLEU scorer. 
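# Usage (illustrative): Bleu(n=4).compute_score(gts, res), where `gts` and `res`
# map the same ids to lists of space-joined sentences and `res` holds exactly one
# hypothesis per id; returns the corpus BLEU-1..4 scores and per-sentence score lists.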
6 | # 7 | # Creation Date : 06-01-2015 8 | # Last Modified : Thu 19 Mar 2015 09:13:28 PM PDT 9 | # Authors : Hao Fang and Tsung-Yi Lin 10 | 11 | from .bleu_score import BleuScorer 12 | 13 | 14 | class Bleu: 15 | def __init__(self, n=4): 16 | # default compute Blue score up to 4 17 | self._n = n 18 | self._hypo_for_image = {} 19 | self.ref_for_image = {} 20 | 21 | def compute_score(self, gts, res): 22 | 23 | assert(list(gts.keys()) == list(res.keys())) 24 | imgIds = list(gts.keys()) 25 | 26 | bleu_scorer = BleuScorer(n=self._n) 27 | for id in imgIds: 28 | hypo = res[id] 29 | ref = gts[id] 30 | 31 | # Sanity check. 32 | assert(type(hypo) is list) 33 | assert(len(hypo) == 1) 34 | assert(type(ref) is list) 35 | assert(len(ref) >= 1) 36 | 37 | bleu_scorer += (hypo[0], ref) 38 | 39 | #score, scores = bleu_scorer.compute_score(option='shortest') 40 | score, scores = bleu_scorer.compute_score(option='closest', verbose=1) 41 | #score, scores = bleu_scorer.compute_score(option='average', verbose=1) 42 | 43 | # return (bleu, bleu_info) 44 | return score, scores 45 | 46 | def method(self): 47 | return "Bleu" 48 | -------------------------------------------------------------------------------- /BiDAF_Origin/utils/bleu_metric/bleu_score.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # bleu_scorer.py 4 | # David Chiang 5 | 6 | # Copyright (c) 2004-2006 University of Maryland. All rights 7 | # reserved. Do not redistribute without permission from the 8 | # author. Not for commercial use. 9 | 10 | # Modified by: 11 | # Hao Fang 12 | # Tsung-Yi Lin 13 | 14 | '''Provides: 15 | cook_refs(refs, n=4): Transform a list of reference sentences as strings into a form usable by cook_test(). 16 | cook_test(test, refs, n=4): Transform a test sentence as a string (together with the cooked reference sentences) into a form usable by score_cooked(). 17 | ''' 18 | 19 | import copy 20 | # import sys, math, re 21 | import math 22 | from collections import defaultdict 23 | 24 | 25 | def precook(s, n=4, out=False): 26 | """Takes a string as input and returns an object that can be given to 27 | either cook_refs or cook_test. This is optional: cook_refs and cook_test 28 | can take string arguments as well.""" 29 | words = s.split() 30 | counts = defaultdict(int) 31 | for k in range(1, n + 1): 32 | for i in range(len(words) - k + 1): 33 | ngram = tuple(words[i:i + k]) 34 | counts[ngram] += 1 35 | return (len(words), counts) 36 | 37 | 38 | def cook_refs(refs, eff=None, n=4): ## lhuang: oracle will call with "average" 39 | '''Takes a list of reference sentences for a single segment 40 | and returns an object that encapsulates everything that BLEU 41 | needs to know about them.''' 42 | 43 | reflen = [] 44 | maxcounts = {} 45 | for ref in refs: 46 | rl, counts = precook(ref, n) 47 | reflen.append(rl) 48 | for (ngram, count) in counts.items(): 49 | maxcounts[ngram] = max(maxcounts.get(ngram, 0), count) 50 | 51 | # Calculate effective reference sentence length. 52 | if eff == "shortest": 53 | reflen = min(reflen) 54 | elif eff == "average": 55 | reflen = float(sum(reflen)) / len(reflen) 56 | 57 | ## lhuang: N.B.: leave reflen computaiton to the very end!! 58 | 59 | ## lhuang: N.B.: in case of "closest", keep a list of reflens!! 
(bad design) 60 | 61 | return (reflen, maxcounts) 62 | 63 | 64 | def cook_test(test, xxx_todo_changeme, eff=None, n=4): 65 | '''Takes a test sentence and returns an object that 66 | encapsulates everything that BLEU needs to know about it.''' 67 | (reflen, refmaxcounts) = xxx_todo_changeme 68 | testlen, counts = precook(test, n, True) 69 | 70 | result = {} 71 | 72 | # Calculate effective reference sentence length. 73 | 74 | if eff == "closest": 75 | result["reflen"] = min((abs(l - testlen), l) for l in reflen)[1] 76 | else: ## i.e., "average" or "shortest" or None 77 | result["reflen"] = reflen 78 | 79 | result["testlen"] = testlen 80 | 81 | result["guess"] = [max(0, testlen - k + 1) for k in range(1, n + 1)] 82 | 83 | result['correct'] = [0] * n 84 | for (ngram, count) in counts.items(): 85 | result["correct"][len(ngram) - 1] += min(refmaxcounts.get(ngram, 0), count) 86 | 87 | return result 88 | 89 | 90 | class BleuScorer(object): 91 | """Bleu scorer. 92 | """ 93 | 94 | __slots__ = "n", "crefs", "ctest", "_score", "_ratio", "_testlen", "_reflen", "special_reflen" 95 | 96 | # special_reflen is used in oracle (proportional effective ref len for a node). 97 | 98 | def copy(self): 99 | ''' copy the refs.''' 100 | new = BleuScorer(n=self.n) 101 | new.ctest = copy.copy(self.ctest) 102 | new.crefs = copy.copy(self.crefs) 103 | new._score = None 104 | return new 105 | 106 | def __init__(self, test=None, refs=None, n=4, special_reflen=None): 107 | ''' singular instance ''' 108 | 109 | self.n = n 110 | self.crefs = [] 111 | self.ctest = [] 112 | self.cook_append(test, refs) 113 | self.special_reflen = special_reflen 114 | 115 | def cook_append(self, test, refs): 116 | '''called by constructor and __iadd__ to avoid creating new instances.''' 117 | 118 | if refs is not None: 119 | self.crefs.append(cook_refs(refs)) 120 | if test is not None: 121 | cooked_test = cook_test(test, self.crefs[-1]) 122 | self.ctest.append(cooked_test) ## N.B.: -1 123 | else: 124 | self.ctest.append(None) # lens of crefs and ctest have to match 125 | 126 | self._score = None ## need to recompute 127 | 128 | def ratio(self, option=None): 129 | self.compute_score(option=option) 130 | return self._ratio 131 | 132 | def score_ratio(self, option=None): 133 | '''return (bleu, len_ratio) pair''' 134 | return (self.fscore(option=option), self.ratio(option=option)) 135 | 136 | def score_ratio_str(self, option=None): 137 | return "%.4f (%.2f)" % self.score_ratio(option) 138 | 139 | def reflen(self, option=None): 140 | self.compute_score(option=option) 141 | return self._reflen 142 | 143 | def testlen(self, option=None): 144 | self.compute_score(option=option) 145 | return self._testlen 146 | 147 | def retest(self, new_test): 148 | if type(new_test) is str: 149 | new_test = [new_test] 150 | assert len(new_test) == len(self.crefs), new_test 151 | self.ctest = [] 152 | for t, rs in zip(new_test, self.crefs): 153 | self.ctest.append(cook_test(t, rs)) 154 | self._score = None 155 | 156 | return self 157 | 158 | def rescore(self, new_test): 159 | ''' replace test(s) with new test(s), and returns the new score.''' 160 | 161 | return self.retest(new_test).compute_score() 162 | 163 | def size(self): 164 | assert len(self.crefs) == len(self.ctest), "refs/test mismatch! 
%d<>%d" % (len(self.crefs), len(self.ctest)) 165 | return len(self.crefs) 166 | 167 | def __iadd__(self, other): 168 | '''add an instance (e.g., from another sentence).''' 169 | 170 | if type(other) is tuple: 171 | ## avoid creating new BleuScorer instances 172 | self.cook_append(other[0], other[1]) 173 | else: 174 | assert self.compatible(other), "incompatible BLEUs." 175 | self.ctest.extend(other.ctest) 176 | self.crefs.extend(other.crefs) 177 | self._score = None ## need to recompute 178 | 179 | return self 180 | 181 | def compatible(self, other): 182 | return isinstance(other, BleuScorer) and self.n == other.n 183 | 184 | def single_reflen(self, option="average"): 185 | return self._single_reflen(self.crefs[0][0], option) 186 | 187 | def _single_reflen(self, reflens, option=None, testlen=None): 188 | 189 | if option == "shortest": 190 | reflen = min(reflens) 191 | elif option == "average": 192 | reflen = float(sum(reflens)) / len(reflens) 193 | elif option == "closest": 194 | reflen = min((abs(l - testlen), l) for l in reflens)[1] 195 | else: 196 | assert False, "unsupported reflen option %s" % option 197 | 198 | return reflen 199 | 200 | def recompute_score(self, option=None, verbose=0): 201 | self._score = None 202 | return self.compute_score(option, verbose) 203 | 204 | def compute_score(self, option=None, verbose=0): 205 | n = self.n 206 | small = 1e-9 207 | tiny = 1e-15 ## so that if guess is 0 still return 0 208 | bleu_list = [[] for _ in range(n)] 209 | 210 | if self._score is not None: 211 | return self._score 212 | 213 | if option is None: 214 | option = "average" if len(self.crefs) == 1 else "closest" 215 | 216 | self._testlen = 0 217 | self._reflen = 0 218 | totalcomps = {'testlen': 0, 'reflen': 0, 'guess': [0] * n, 'correct': [0] * n} 219 | 220 | # for each sentence 221 | for comps in self.ctest: 222 | testlen = comps['testlen'] 223 | self._testlen += testlen 224 | 225 | if self.special_reflen is None: ## need computation 226 | reflen = self._single_reflen(comps['reflen'], option, testlen) 227 | else: 228 | reflen = self.special_reflen 229 | 230 | self._reflen += reflen 231 | 232 | for key in ['guess', 'correct']: 233 | for k in range(n): 234 | totalcomps[key][k] += comps[key][k] 235 | 236 | # append per image bleu score 237 | bleu = 1. 238 | for k in range(n): 239 | bleu *= (float(comps['correct'][k]) + tiny) \ 240 | / (float(comps['guess'][k]) + small) 241 | bleu_list[k].append(bleu ** (1. / (k + 1))) 242 | ratio = (testlen + tiny) / (reflen + small) ## N.B.: avoid zero division 243 | if ratio < 1: 244 | for k in range(n): 245 | bleu_list[k][-1] *= math.exp(1 - 1 / ratio) 246 | 247 | if verbose > 1: 248 | print(comps, reflen) 249 | 250 | totalcomps['reflen'] = self._reflen 251 | totalcomps['testlen'] = self._testlen 252 | 253 | bleus = [] 254 | bleu = 1. 255 | for k in range(n): 256 | bleu *= float(totalcomps['correct'][k] + tiny) \ 257 | / (totalcomps['guess'][k] + small) 258 | bleus.append(bleu ** (1. 
/ (k + 1))) 259 | ratio = (self._testlen + tiny) / (self._reflen + small) ## N.B.: avoid zero division 260 | if ratio < 1: 261 | for k in range(n): 262 | bleus[k] *= math.exp(1 - 1 / ratio) 263 | 264 | if verbose > 0: 265 | print(totalcomps) 266 | print("ratio:", ratio) 267 | 268 | self._score = bleus 269 | return self._score, bleu_list 270 | -------------------------------------------------------------------------------- /BiDAF_Origin/utils/get_vocab.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf8 -*- 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | """ 18 | Utility function to generate vocabulary file. 19 | """ 20 | 21 | 22 | import argparse 23 | import sys 24 | import json 25 | 26 | from itertools import chain 27 | 28 | 29 | def get_vocab(files, vocab_file): 30 | """ 31 | Builds vocabulary file from field 'segmented_paragraphs' 32 | and 'segmented_question'. 33 | 34 | Args: 35 | files: A list of file names. 36 | vocab_file: The file that stores the vocabulary. 
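        Example (hypothetical invocation, assuming a preprocessed DuReader file):
            python get_vocab.py --files search.train.json --vocab vocab.txt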
37 | """ 38 | vocab = {} 39 | for f in files: 40 | with open(f, 'r') as fin: 41 | for line in fin: 42 | obj = json.loads(line.strip()) 43 | paras = [ 44 | chain(*d['segmented_paragraphs']) 45 | for d in obj['documents']] 46 | doc_tokens = chain(*paras) 47 | question_tokens = obj['segmented_question'] 48 | for t in list(doc_tokens) + question_tokens: 49 | vocab[t] = vocab.get(t, 0) + 1 50 | # output 51 | sorted_vocab = sorted([(v, c) for v, c in vocab.items()], 52 | key=lambda x: x[1], 53 | reverse=True) 54 | with open(vocab_file, 'w') as outf: 55 | for w, c in sorted_vocab: 56 | print >> outf, '{}\t{}'.format(w.encode('utf8'), c) 57 | 58 | 59 | if __name__ == '__main__': 60 | parser = argparse.ArgumentParser() 61 | parser.add_argument('--files', nargs='+', required=True, 62 | help='file list to count vocab from.') 63 | parser.add_argument('--vocab', required=True, 64 | help='file to store counted vocab.') 65 | args = parser.parse_args() 66 | get_vocab(args.files, args.vocab) 67 | 68 | -------------------------------------------------------------------------------- /BiDAF_Origin/utils/json_to_sentence.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def load_data(brc_data, tar_dir): 5 | # print('Converting ' + file) 6 | # fin = open(file, encoding='utf8') 7 | out_file = os.path.join(tar_dir, 'train_set.seg') 8 | with open(out_file, 'w', encoding='utf8') as ftrain: 9 | for sample in brc_data.train_set: 10 | ftrain.write(' '.join(sample['segmented_question']) + '\n') 11 | for passage in sample['passages']: 12 | ftrain.write(' '.join(passage['passage_tokens']) + '\n') 13 | del sample 14 | ftrain.close() 15 | 16 | out_file = os.path.join(tar_dir, 'dev_set.seg') 17 | with open(out_file, 'w', encoding='utf8') as fdev: 18 | for sample in brc_data.dev_set: 19 | fdev.write(' '.join(sample['segmented_question']) + '\n') 20 | for passage in sample['passages']: 21 | fdev.write(' '.join(passage['passage_tokens']) + '\n') 22 | del sample 23 | fdev.close() 24 | 25 | out_file = os.path.join(tar_dir, 'test_set.seg') 26 | with open(out_file, 'w', encoding='utf8') as ftest: 27 | for sample in brc_data.test_set: 28 | ftest.write(' '.join(sample['segmented_question']) + '\n') 29 | for passage in sample['passages']: 30 | ftest.write(' '.join(passage['passage_tokens']) + '\n') 31 | del sample 32 | ftest.close() 33 | -------------------------------------------------------------------------------- /BiDAF_Origin/utils/preprocess.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # ============================================================================== 17 | """ 18 | This module finds the most related paragraph of each document according to recall. 19 | """ 20 | 21 | import sys 22 | # reload(sys) 23 | # sys.setdefaultencoding('utf8') 24 | import json 25 | from collections import Counter 26 | 27 | 28 | def precision_recall_f1(prediction, ground_truth): 29 | """ 30 | This function calculates and returns the precision, recall and f1-score 31 | Args: 32 | prediction: prediction string or list to be matched 33 | ground_truth: golden string or list reference 34 | Returns: 35 | floats of (p, r, f1) 36 | Raises: 37 | None 38 | """ 39 | if not isinstance(prediction, list): 40 | prediction_tokens = prediction.split() 41 | else: 42 | prediction_tokens = prediction 43 | if not isinstance(ground_truth, list): 44 | ground_truth_tokens = ground_truth.split() 45 | else: 46 | ground_truth_tokens = ground_truth 47 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens) 48 | num_same = sum(common.values()) 49 | if num_same == 0: 50 | return 0, 0, 0 51 | p = 1.0 * num_same / len(prediction_tokens) 52 | r = 1.0 * num_same / len(ground_truth_tokens) 53 | f1 = (2 * p * r) / (p + r) 54 | return p, r, f1 55 | 56 | 57 | def recall(prediction, ground_truth): 58 | """ 59 | This function calculates and returns the recall 60 | Args: 61 | prediction: prediction string or list to be matched 62 | ground_truth: golden string or list reference 63 | Returns: 64 | floats of recall 65 | Raises: 66 | None 67 | """ 68 | return precision_recall_f1(prediction, ground_truth)[1] 69 | 70 | 71 | def f1_score(prediction, ground_truth): 72 | """ 73 | This function calculates and returns the f1-score 74 | Args: 75 | prediction: prediction string or list to be matched 76 | ground_truth: golden string or list reference 77 | Returns: 78 | floats of f1 79 | Raises: 80 | None 81 | """ 82 | return precision_recall_f1(prediction, ground_truth)[2] 83 | 84 | 85 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): 86 | """ 87 | This function calculates and returns the precision, recall and f1-score 88 | Args: 89 | metric_fn: metric function pointer which calculates scores according to corresponding logic. 90 | prediction: prediction string or list to be matched 91 | ground_truth: golden string or list reference 92 | Returns: 93 | floats of (p, r, f1) 94 | Raises: 95 | None 96 | """ 97 | scores_for_ground_truths = [] 98 | for ground_truth in ground_truths: 99 | score = metric_fn(prediction, ground_truth) 100 | scores_for_ground_truths.append(score) 101 | return max(scores_for_ground_truths) 102 | 103 | 104 | def find_best_question_match(doc, question, with_score=False): 105 | """ 106 | For each docment, find the paragraph that matches best to the question. 107 | Args: 108 | doc: The document object. 109 | question: The question tokens. 110 | with_score: If True then the match score will be returned, 111 | otherwise False. 112 | Returns: 113 | The index of the best match paragraph, if with_score=False, 114 | otherwise returns a tuple of the index of the best match paragraph 115 | and the match score of that paragraph. 
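        Note: when several paragraphs tie on the recall score, the shorter
        paragraph is preferred (see the tie-breaking condition in the loop below).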
116 | """ 117 | most_related_para = -1 118 | max_related_score = 0 119 | most_related_para_len = 0 120 | for p_idx, para_tokens in enumerate(doc['segmented_paragraphs']): 121 | if len(question) > 0: 122 | related_score = metric_max_over_ground_truths(recall, 123 | para_tokens, 124 | question) 125 | else: 126 | related_score = 0 127 | 128 | if related_score > max_related_score \ 129 | or (related_score == max_related_score \ 130 | and len(para_tokens) < most_related_para_len): 131 | most_related_para = p_idx 132 | max_related_score = related_score 133 | most_related_para_len = len(para_tokens) 134 | if most_related_para == -1: 135 | most_related_para = 0 136 | if with_score: 137 | return most_related_para, max_related_score 138 | return most_related_para 139 | 140 | 141 | def find_fake_answer(sample): 142 | """ 143 | For each document, finds the most related paragraph based on recall, 144 | then finds a span that maximize the f1_score compared with the gold answers 145 | and uses this span as a fake answer span 146 | Args: 147 | sample: a sample in the dataset 148 | Returns: 149 | None 150 | Raises: 151 | None 152 | """ 153 | for doc in sample['documents']: 154 | most_related_para = -1 155 | most_related_para_len = 999999 156 | max_related_score = 0 157 | for p_idx, para_tokens in enumerate(doc['segmented_paragraphs']): 158 | if len(sample['segmented_answers']) > 0: 159 | related_score = metric_max_over_ground_truths(recall, 160 | para_tokens, 161 | sample['segmented_answers']) 162 | else: 163 | continue 164 | if related_score > max_related_score \ 165 | or (related_score == max_related_score 166 | and len(para_tokens) < most_related_para_len): 167 | most_related_para = p_idx 168 | most_related_para_len = len(para_tokens) 169 | max_related_score = related_score 170 | doc['most_related_para'] = most_related_para 171 | 172 | sample['answer_docs'] = [] 173 | sample['answer_spans'] = [] 174 | sample['fake_answers'] = [] 175 | sample['match_scores'] = [] 176 | 177 | best_match_score = 0 178 | best_match_d_idx, best_match_span = -1, [-1, -1] 179 | best_fake_answer = None 180 | answer_tokens = set() 181 | for segmented_answer in sample['segmented_answers']: 182 | answer_tokens = answer_tokens | set([token for token in segmented_answer]) 183 | for d_idx, doc in enumerate(sample['documents']): 184 | if not doc['is_selected']: 185 | continue 186 | if doc['most_related_para'] == -1: 187 | doc['most_related_para'] = 0 188 | most_related_para_tokens = doc['segmented_paragraphs'][doc['most_related_para']][:1000] 189 | for start_tidx in range(len(most_related_para_tokens)): 190 | if most_related_para_tokens[start_tidx] not in answer_tokens: 191 | continue 192 | for end_tidx in range(len(most_related_para_tokens) - 1, start_tidx - 1, -1): 193 | span_tokens = most_related_para_tokens[start_tidx: end_tidx + 1] 194 | if len(sample['segmented_answers']) > 0: 195 | match_score = metric_max_over_ground_truths(f1_score, span_tokens, 196 | sample['segmented_answers']) 197 | else: 198 | match_score = 0 199 | if match_score == 0: 200 | break 201 | if match_score > best_match_score: 202 | best_match_d_idx = d_idx 203 | best_match_span = [start_tidx, end_tidx] 204 | best_match_score = match_score 205 | best_fake_answer = ''.join(span_tokens) 206 | if best_match_score > 0: 207 | sample['answer_docs'].append(best_match_d_idx) 208 | sample['answer_spans'].append(best_match_span) 209 | sample['fake_answers'].append(best_fake_answer) 210 | sample['match_scores'].append(best_match_score) 211 | 212 | 213 | if __name__ == 
'__main__': 214 | for line in sys.stdin: 215 | sample = json.loads(line) 216 | find_fake_answer(sample) 217 | print(json.dumps(sample, encoding='utf8', ensure_ascii=False)) 218 | -------------------------------------------------------------------------------- /BiDAF_Origin/utils/pretrain_embedding.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import logging 4 | from gensim.models import word2vec 5 | from .json_to_sentence import load_data 6 | 7 | 8 | def pre_train(brc_data, segmented_dir): 9 | # parser = argparse.ArgumentParser('Reading Comprehension on BaiduRC dataset') 10 | # path_settings = parser.add_argument_group('path settings') 11 | # path_settings.add_argument('--train_files', nargs='+', 12 | # default=['../data/trainset/search.train.json'], 13 | # help='list of files that contain the preprocessed train data') 14 | # path_settings.add_argument('--dev_files', nargs='+', 15 | # default=['../data/devset/search.dev.json'], 16 | # help='list of files that contain the preprocessed dev data') 17 | # path_settings.add_argument('--test_files', nargs='+', 18 | # default=['../data/testset/search.test.json'], 19 | # help='list of files that contain the preprocessed test data') 20 | # path_settings.add_argument('--segmented_dir', default='../data/segmented', 21 | # help='the dir to store segmented sentences') 22 | 23 | sys.path.append('..') 24 | # args = parser.parse_args() 25 | # for files in args.train_files + args.dev_files + args.test_files: 26 | # json_to_sentence.load_data(files, args.segmented_dir) 27 | load_data(brc_data, segmented_dir) 28 | 29 | program = os.path.basename(sys.argv[0]) 30 | logger = logging.getLogger(program) 31 | logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s') 32 | logging.root.setLevel(level=logging.INFO) 33 | logger.info("running %s" % ' '.join(sys.argv)) 34 | 35 | model = word2vec.Word2Vec(word2vec.PathLineSentences(segmented_dir), size=300, min_count=2, workers=8, iter=10) 36 | with open(os.path.join(segmented_dir, 'w2v_dic.data'), 'w', encoding='utf-8') as f: 37 | for word in model.wv.vocab: 38 | f.write(word + ' ') 39 | f.write(' '.join(list(map(str, model[word])))) 40 | f.write('\n') 41 | f.close() 42 | -------------------------------------------------------------------------------- /BiDAF_Origin/utils/rouge_metric/__pycache__/rouge.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shiningliang/MRC2018/9bf27efbbc028116eb01ea2df1bf0d063cb09ae9/BiDAF_Origin/utils/rouge_metric/__pycache__/rouge.cpython-36.pyc -------------------------------------------------------------------------------- /BiDAF_Origin/utils/rouge_metric/rouge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : rouge.py 4 | # 5 | # Description : Computes ROUGE-L metric as described by Lin and Hovey (2004) 6 | # 7 | # Creation Date : 2015-01-07 06:03 8 | # Author : Ramakrishna Vedantam 9 | 10 | import numpy as np 11 | 12 | 13 | # import pdb 14 | 15 | 16 | def my_lcs(string, sub): 17 | """ 18 | Calculates longest common subsequence for a pair of tokenized strings 19 | :param string : list of str : tokens from a string split using whitespace 20 | :param sub : list of str : shorter string, also split using whitespace 21 | :returns: length (list of int): length of the longest common subsequence between the two strings 22 | 23 | Note: my_lcs only 
gives length of the longest common subsequence, not the actual LCS 24 | """ 25 | if (len(string) < len(sub)): 26 | sub, string = string, sub 27 | 28 | lengths = [[0 for i in range(0, len(sub) + 1)] for j in range(0, len(string) + 1)] 29 | 30 | for j in range(1, len(sub) + 1): 31 | for i in range(1, len(string) + 1): 32 | if (string[i - 1] == sub[j - 1]): 33 | lengths[i][j] = lengths[i - 1][j - 1] + 1 34 | else: 35 | lengths[i][j] = max(lengths[i - 1][j], lengths[i][j - 1]) 36 | 37 | return lengths[len(string)][len(sub)] 38 | 39 | 40 | class Rouge(): 41 | ''' 42 | Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set 43 | 44 | ''' 45 | 46 | def __init__(self): 47 | # vrama91: updated the value below based on discussion with Hovey 48 | self.beta = 1.2 49 | 50 | def calc_score(self, candidate, refs): 51 | """ 52 | Compute ROUGE-L score given one candidate and references for an image 53 | :param candidate: str : candidate sentence to be evaluated 54 | :param refs: list of str : COCO reference sentences for the particular image to be evaluated 55 | :returns score: int (ROUGE-L score for the candidate evaluated against references) 56 | """ 57 | assert (len(candidate) == 1) 58 | assert (len(refs) > 0) 59 | prec = [] 60 | rec = [] 61 | 62 | # split into tokens 63 | token_c = candidate[0].split(" ") 64 | 65 | for reference in refs: 66 | # split into tokens 67 | token_r = reference.split(" ") 68 | # compute the longest common subsequence 69 | lcs = my_lcs(token_r, token_c) 70 | prec.append(lcs / float(len(token_c))) 71 | rec.append(lcs / float(len(token_r))) 72 | 73 | prec_max = max(prec) 74 | rec_max = max(rec) 75 | 76 | if (prec_max != 0 and rec_max != 0): 77 | score = ((1 + self.beta ** 2) * prec_max * rec_max) / float(rec_max + self.beta ** 2 * prec_max) 78 | else: 79 | score = 0.0 80 | return score 81 | 82 | def compute_score(self, gts, res): 83 | """ 84 | Computes Rouge-L score given a set of reference and candidate sentences for the dataset 85 | Invoked by evaluate_captions.py 86 | :param hypo_for_image: dict : candidate / test sentences with "image name" key and "tokenized sentences" as values 87 | :param ref_for_image: dict : reference MS-COCO sentences with "image name" key and "tokenized sentences" as values 88 | :returns: average_score: float (mean ROUGE-L score computed by averaging scores for all the images) 89 | """ 90 | assert (list(gts.keys()) == list(res.keys())) 91 | imgIds = list(gts.keys()) 92 | 93 | score = [] 94 | for id in imgIds: 95 | hypo = res[id] 96 | ref = gts[id] 97 | 98 | score.append(self.calc_score(hypo, ref)) 99 | 100 | # Sanity check. 
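        # Each candidate must be a list containing exactly one sentence string and
        # each reference list must be non-empty, mirroring the checks in calc_score().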
101 | assert (type(hypo) is list) 102 | assert (len(hypo) == 1) 103 | assert (type(ref) is list) 104 | assert (len(ref) > 0) 105 | 106 | average_score = np.mean(np.array(score)) 107 | return average_score, np.array(score) 108 | 109 | def method(self): 110 | return "Rouge" 111 | -------------------------------------------------------------------------------- /GatedRNN/GatedRNN.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import logging 3 | import time 4 | import os 5 | from basic_rnn import dot_attention, dense, cudnn_gru 6 | 7 | 8 | class GatedRNN(object): 9 | def __init__(self, args, batch, token_embeddings=None, trainable=True, opt=True): 10 | # logging 11 | self.logger = logging.getLogger("brc") 12 | # basic config 13 | self.batch_size = args.batch_size 14 | self.hidden_size = args.hidden_size 15 | self.output_size = 3 16 | # self.d_a = 300 17 | # self.r = 64 18 | # self.p_coef = 1 19 | self.layer_num = args.layer_num 20 | self.optim_type = args.optim 21 | self.weight_decay = args.weight_decay 22 | self.dropout_keep_prob = args.dropout_keep_prob 23 | self.trainable = trainable 24 | # length limit 25 | self.max_q_len = args.max_q_len 26 | self.max_a_len = args.max_a_len 27 | # session info 28 | sess_config = tf.ConfigProto() 29 | sess_config.gpu_options.allow_growth = True 30 | self.sess = tf.Session(config=sess_config) 31 | 32 | self.a, self.q, self.answers_type, self.qa_id = batch.get_next() 33 | self.lr = tf.get_variable('lr', shape=[], dtype=tf.float32, trainable=False) 34 | self.is_train = tf.get_variable('is_train', shape=[], dtype=tf.bool, trainable=False) 35 | self.a_mask = tf.cast(self.a, tf.bool) 36 | self.q_mask = tf.cast(self.q, tf.bool) 37 | self.a_len = tf.reduce_sum(tf.cast(self.a_mask, tf.int32), axis=1) 38 | self.q_len = tf.reduce_sum(tf.cast(self.q_mask, tf.int32), axis=1) 39 | self.N = tf.shape(self.qa_id)[0] 40 | 41 | self._build_graph(token_embeddings) 42 | 43 | def _build_graph(self, token_embeddings): 44 | start_t = time.time() 45 | self._embed(token_embeddings) 46 | self._encode() 47 | self._gated_attention() 48 | self._self_attention() 49 | # self._annotation() 50 | self._predict() 51 | self._compute_loss() 52 | if self.trainable: 53 | self._create_train_op() 54 | self.logger.info('Time to build graph: {} s'.format(time.time() - start_t)) 55 | 56 | def _embed(self, token_embeddings): 57 | with tf.device('/cpu:0'), tf.variable_scope('word_embedding', reuse=tf.AUTO_REUSE): 58 | word_embeddings = tf.get_variable('word_embeddings', 59 | initializer=tf.constant(token_embeddings, dtype=tf.float32), 60 | trainable=False) 61 | self.a_emb = tf.nn.embedding_lookup(word_embeddings, self.a) 62 | self.q_emb = tf.nn.embedding_lookup(word_embeddings, self.q) 63 | 64 | def _encode(self): 65 | with tf.variable_scope('answer_encoding', reuse=tf.AUTO_REUSE): 66 | a_rnn = cudnn_gru(num_layers=2 * self.layer_num, num_units=self.hidden_size, batch_size=self.N, 67 | input_size=self.a_emb.get_shape().as_list()[-1], 68 | keep_prob=self.dropout_keep_prob, is_train=self.is_train) 69 | self.a_encodes = a_rnn(self.a_emb, seq_len=self.a_len) 70 | with tf.variable_scope('question_encoding', reuse=tf.AUTO_REUSE): 71 | q_rnn = cudnn_gru(num_layers=2 * self.layer_num, num_units=self.hidden_size, batch_size=self.N, 72 | input_size=self.q_emb.get_shape().as_list()[-1], 73 | keep_prob=self.dropout_keep_prob, is_train=self.is_train) 74 | self.q_encodes = q_rnn(self.q_emb, seq_len=self.q_len) 75 | 76 | def _gated_attention(self): 
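        # Question-aware answer encoding: dot_attention lets each answer position
        # attend over the question encodings (masked by q_mask), and the attended
        # representation is re-encoded with a GRU. dot_attention comes from
        # basic_rnn.py and is assumed here to apply the R-Net style attention gate.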
77 | with tf.variable_scope('gated_attention', reuse=tf.AUTO_REUSE): 78 | self.qa_att = dot_attention(self.a_encodes, self.q_encodes, mask=self.q_mask, 79 | hidden=self.hidden_size, keep_prob=self.dropout_keep_prob, 80 | is_train=self.is_train) 81 | gated_rnn = cudnn_gru(num_layers=self.layer_num, num_units=self.hidden_size, batch_size=self.N, 82 | input_size=self.qa_att.get_shape().as_list()[-1], keep_prob=self.dropout_keep_prob, 83 | is_train=self.is_train) 84 | self.gated_att = gated_rnn(self.qa_att, self.a_len) 85 | 86 | def _self_attention(self): 87 | with tf.variable_scope('self_attention', reuse=tf.AUTO_REUSE): 88 | self.aa_att = dot_attention(self.gated_att, self.gated_att, mask=self.a_mask, 89 | hidden=self.hidden_size, keep_prob=self.dropout_keep_prob, 90 | is_train=self.is_train) 91 | self_rnn = cudnn_gru(num_layers=self.layer_num, num_units=self.hidden_size, batch_size=self.N, 92 | input_size=self.aa_att.get_shape().as_list()[-1], keep_prob=self.dropout_keep_prob, 93 | is_train=self.is_train) 94 | self.self_att = self_rnn(self.aa_att, self.a_len) 95 | 96 | def _annotation(self): 97 | # shape(W_s1) = d_a * 2u 98 | self.W_s1 = tf.get_variable('W_s1', shape=[self.d_a, 2 * self.hidden_size], 99 | initializer=tf.contrib.layers.xavier_initializer()) 100 | # shape(W_s2) = r * d_a 101 | self.W_s2 = tf.get_variable('W_s2', shape=[self.r, self.d_a], 102 | initializer=tf.contrib.layers.xavier_initializer()) 103 | self.A = tf.nn.softmax(tf.map_fn( 104 | lambda x: tf.matmul(self.W_s2, x), 105 | tf.tanh(tf.map_fn(lambda x: tf.matmul(self.W_s1, tf.transpose(x)), 106 | self.gated_att)))) 107 | self.M = tf.matmul(self.A, self.gated_att) 108 | self.A_T = tf.transpose(self.A, perm=[0, 2, 1]) 109 | tile_eye = tf.tile(tf.eye(self.r), [self.N, 1]) 110 | tile_eye = tf.reshape(tile_eye, [-1, self.r, self.r]) 111 | self.AA_T = tf.matmul(self.A, self.A_T) - tile_eye 112 | self.P = tf.square(tf.norm(self.AA_T, axis=[-2, -1], ord='fro')) 113 | 114 | def _predict(self): 115 | with tf.variable_scope('predict', reuse=tf.AUTO_REUSE): 116 | self.att = tf.reshape(self.self_att, shape=[self.N, 2 * self.max_a_len * self.hidden_size]) 117 | self.mlp = tf.nn.relu(dense(self.att, hidden=4 * self.hidden_size, scope='dense_0')) 118 | if self.is_train: 119 | self.mlp = tf.nn.dropout(self.mlp, self.dropout_keep_prob) 120 | self.mlp = tf.nn.relu(dense(self.mlp, hidden=2 * self.hidden_size, scope='dense_1')) 121 | if self.is_train: 122 | self.mlp = tf.nn.dropout(self.mlp, self.dropout_keep_prob) 123 | self.outputs = dense(self.mlp, hidden=self.output_size, scope='output') 124 | 125 | def _compute_loss(self): 126 | self.pre_labels = tf.argmax(self.outputs, axis=1) 127 | self.loss = tf.reduce_mean(tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.outputs, 128 | labels=tf.stop_gradient( 129 | self.answers_type)))) 130 | 131 | def _create_train_op(self): 132 | """ 133 | Selects the training algorithm and creates a train operation with it 134 | """ 135 | with tf.variable_scope('optimizer', reuse=tf.AUTO_REUSE): 136 | if self.optim_type == 'adadelta': 137 | self.optimizer = tf.train.AdadeltaOptimizer(self.lr) 138 | elif self.optim_type == 'adam': 139 | self.optimizer = tf.train.AdamOptimizer(self.lr) 140 | elif self.optim_type == 'rprop': 141 | self.optimizer = tf.train.RMSPropOptimizer(self.lr) 142 | elif self.optim_type == 'sgd': 143 | self.optimizer = tf.train.GradientDescentOptimizer(self.lr) 144 | else: 145 | raise NotImplementedError('Unsupported optimizer: {}'.format(self.optim_type)) 146 | 
self.train_op = self.optimizer.minimize(self.loss) 147 | -------------------------------------------------------------------------------- /GatedRNN/GatedRNN_prepro.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import pickle as pkl 3 | from tqdm import tqdm 4 | import ujson as json 5 | import numpy as np 6 | import jieba 7 | import os 8 | 9 | TYPE = {'Yes': 0, 'No': 1, 'Depends': 2, 'No_Opinion': 1} 10 | 11 | 12 | def split_answers(answers): 13 | tokens = jieba.cut(answers) 14 | return [token for token in tokens] 15 | 16 | 17 | def filter_questions(filenames): 18 | questions = {} 19 | for filename in filenames: 20 | with open(filename, 'r', encoding='utf8') as fh: 21 | for line in fh: 22 | source = json.loads(line.strip()) 23 | if source['question_type'] != 'YES_NO': 24 | continue 25 | questions[source['question_id']] = source['segmented_question'] 26 | print("{} questions in total".format(len(questions))) 27 | return questions 28 | 29 | 30 | def process_test_file(filename, questions, max_p_len=500): 31 | print("Generating test examples...") 32 | total = 0 33 | examples = [] 34 | other_examples = [] 35 | eval_examples = {} 36 | with open(filename, 'r', encoding='utf8') as fh: 37 | for line in fh: 38 | source = json.loads(line.strip()) 39 | if source['question_type'] != 'YES_NO': 40 | other_examples.append(source) 41 | continue 42 | total += 1 43 | answer_type = -1 44 | example = {'question_tokens': questions[str(source['question_id'])], 45 | 'answer_tokens': split_answers(source['answers'][0]), 46 | 'answer_type': answer_type, 47 | 'id': total} 48 | eval_examples[str(total)] = {'question_id': source['question_id'], 49 | 'answers': source['answers']} 50 | examples.append(example) 51 | # random.shuffle(examples) 52 | print("{} questions in total".format(len(examples))) 53 | return examples, eval_examples, other_examples 54 | 55 | 56 | def process_file(filenames, data_type, max_p_len=500): 57 | print("Generating {} examples...".format(data_type)) 58 | total = 0 59 | examples = [] 60 | eval_examples = {} 61 | for filename in filenames: 62 | with open(filename, 'r', encoding='utf8') as fh: 63 | for line in fh: 64 | source = json.loads(line.strip()) 65 | if source['question_type'] != 'YES_NO': 66 | continue 67 | if len(source['answer_spans']) == 0: 68 | continue 69 | if source['answer_spans'][0][1] >= max_p_len: 70 | continue 71 | question_tokens = source['segmented_question'] 72 | for idx, answer_tokens in enumerate(source['segmented_answers']): 73 | total += 1 74 | answer_type = TYPE[source['yesno_answers'][idx]] if len(source['yesno_answers']) else -1 75 | example = {'question_tokens': question_tokens, 76 | 'answer_tokens': answer_tokens, 77 | 'answer_type': answer_type, 78 | 'id': total} 79 | eval_examples[str(total)] = {'question_id': source['question_id'], 80 | 'answer_type': answer_type} 81 | examples.append(example) 82 | # random.shuffle(examples) 83 | print("{} questions in total".format(len(examples))) 84 | return examples, eval_examples 85 | 86 | 87 | def build_features(config, examples, data_type, out_file, word2id): 88 | ans_limit = config.max_a_len 89 | ques_limit = config.max_q_len 90 | 91 | print("Processing {} examples...".format(data_type)) 92 | writer = tf.python_io.TFRecordWriter(out_file) 93 | total = 0 94 | meta = {} 95 | for example in tqdm(examples): 96 | total += 1 97 | answer_token_ids = np.zeros([ans_limit], dtype=np.int32) 98 | question_token_ids = np.zeros([ques_limit], dtype=np.int32) 99 | 
answer_type = np.zeros([3], dtype=np.int32) 100 | answer_type[example['answer_type']] = 1 101 | 102 | def _get_word(word): 103 | for each in (word, word.lower(), word.capitalize(), word.upper()): 104 | if each in word2id: 105 | return word2id[each] 106 | return 1 107 | 108 | answers_token_num = min(len(example['answer_tokens']), ques_limit) 109 | for i in range(answers_token_num): 110 | answer_token_ids[i] = _get_word(example['answer_tokens'][i]) 111 | question_token_num = min(len(example['question_tokens']), ques_limit) 112 | for j in range(question_token_num): 113 | question_token_ids[j] = _get_word(example['question_tokens'][j]) 114 | 115 | record = tf.train.Example(features=tf.train.Features( 116 | feature={ 117 | 'answer_token_ids': tf.train.Feature( 118 | bytes_list=tf.train.BytesList(value=[answer_token_ids.tostring()])), 119 | 'question_token_ids': tf.train.Feature( 120 | bytes_list=tf.train.BytesList(value=[question_token_ids.tostring()])), 121 | 'answer_type': tf.train.Feature( 122 | bytes_list=tf.train.BytesList(value=[answer_type.tostring()])), 123 | 'id': tf.train.Feature(int64_list=tf.train.Int64List(value=[example['id']])) 124 | })) 125 | writer.write(record.SerializeToString()) 126 | print("Build {} instances of features in total".format(total)) 127 | meta["total"] = total 128 | writer.close() 129 | return meta 130 | 131 | 132 | def save(filename, obj, message=None): 133 | if message is not None: 134 | print("Saving {}...".format(message)) 135 | with open(filename, "w") as fh: 136 | json.dump(obj, fh) 137 | 138 | 139 | def prepro(config, flags): 140 | token2id = None 141 | if os.path.isfile(flags.token2id_file): 142 | with open(flags.token2id_file, 'r') as fh: 143 | token2id = json.load(fh) 144 | # train_examples, _ = process_file(config.train_files, 'train') 145 | # train_meta = build_features(config, train_examples, 'train', flags.train_record_file, token2id) 146 | # save(flags.train_meta, train_meta, message='train meta') 147 | # del train_examples, train_meta 148 | # 149 | # dev_examples, dev_eval = process_file(config.dev_files, "dev") 150 | # # 创建dev TFRecord文件 151 | # dev_meta = build_features(config, dev_examples, "dev", flags.dev_record_file, token2id) 152 | # save(flags.dev_eval_file, dev_eval, message="dev eval") 153 | # save(flags.dev_meta, dev_meta, message="dev meta") 154 | # del dev_examples, dev_eval, dev_meta 155 | 156 | # filtered_questions = filter_questions(config.test_files) 157 | # save(flags.filtered_questions, filtered_questions, message='filtered questions') 158 | filtered_questions = None 159 | if os.path.isfile(flags.token2id_file): 160 | with open(flags.filtered_questions, 'r') as fh: 161 | filtered_questions = json.load(fh) 162 | test_examples, test_eval, other_examples = process_test_file(flags.predicted_answers, filtered_questions) 163 | # 创建test TFRecord文件 164 | test_meta = build_features(config, test_examples, "test", flags.test_record_file, token2id) 165 | save(flags.test_eval_file, test_eval, message="test eval") 166 | save(flags.final_file, other_examples, message="test final") 167 | with open(flags.final_file, 'w') as fout: 168 | for example in other_examples: 169 | fout.write(json.dumps(example, ensure_ascii=False) + '\n') 170 | fout.close() 171 | save(flags.test_meta, test_meta, message="test meta") 172 | del test_examples, test_meta, test_eval, other_examples 173 | -------------------------------------------------------------------------------- /GatedRNN/GatedRNN_util.py: 
-------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import os 4 | import ujson as json 5 | from sklearn.metrics import accuracy_score 6 | TYPE = {0: 'Yes', 1: 'No', 2: 'Depends'} 7 | 8 | 9 | def get_record_parser(config): 10 | def parse(example): 11 | ans_limit = config.max_a_len 12 | ques_limit = config.max_q_len 13 | features = tf.parse_single_example(example, 14 | features={ 15 | 'answer_token_ids': tf.FixedLenFeature([], tf.string), 16 | 'question_token_ids': tf.FixedLenFeature([], tf.string), 17 | 'answer_type': tf.FixedLenFeature([], tf.string), 18 | 'id': tf.FixedLenFeature([], tf.int64) 19 | }) 20 | answer_token_ids = tf.reshape(tf.decode_raw(features['answer_token_ids'], tf.int32), [ans_limit]) 21 | question_token_ids = tf.reshape(tf.decode_raw(features['question_token_ids'], tf.int32), [ques_limit]) 22 | answer_type = tf.reshape(tf.decode_raw(features['answer_type'], tf.int32), [3]) 23 | qa_id = features['id'] 24 | return answer_token_ids, question_token_ids, answer_type, qa_id 25 | 26 | return parse 27 | 28 | 29 | def get_batch_dataset(record_file, parser, config): 30 | num_threads = tf.constant(config.num_threads, dtype=tf.int32) 31 | dataset = tf.data.TFRecordDataset(record_file).map(parser, num_parallel_calls=num_threads).shuffle( 32 | config.capacity).batch(config.batch_size).repeat(config.epochs) 33 | return dataset 34 | 35 | 36 | def get_dataset(record_file, parser, config): 37 | num_threads = tf.constant(config.num_threads, dtype=tf.int32) 38 | dataset = tf.data.TFRecordDataset(record_file).map( 39 | parser, num_parallel_calls=num_threads).batch(config.batch_size).repeat() 40 | return dataset 41 | 42 | 43 | def evaluate_batch(model, num_batches, eval_file, sess, data_type, handle, str_handle): 44 | losses = [] 45 | # pred_answers = [] 46 | pre_ans_types, ref_ans_types = [], [] 47 | for i in range(num_batches): 48 | qa_ids, loss, pre_labels = sess.run([model.qa_id, model.loss, model.pre_labels], 49 | feed_dict={handle: str_handle} if handle is not None else None) 50 | losses.append(loss) 51 | for qa_id, pre_label in zip(qa_ids, pre_labels): 52 | sample = eval_file[str(qa_id)] 53 | pre_ans_types.append(pre_label) 54 | ref_ans_types.append(sample['answer_type']) 55 | 56 | avg_loss = np.mean(losses) 57 | avg_acc = accuracy_score(y_true=ref_ans_types, y_pred=pre_ans_types) 58 | 59 | loss_sum = tf.Summary(value=[tf.Summary.Value(tag="{}/loss".format(data_type), simple_value=avg_loss), ]) 60 | acc_sum = tf.Summary(value=[tf.Summary.Value(tag="{}/f1".format(data_type), simple_value=avg_acc), ]) 61 | return avg_loss, avg_acc, [loss_sum, acc_sum] 62 | 63 | 64 | def predict_batch(model, num_batches, eval_file, sess, data_type, final_file, logger): 65 | pred_answers = [] 66 | for i in range(num_batches): 67 | qa_ids, pre_labels = sess.run([model.qa_id, model.pre_labels]) 68 | for qa_id, pre_label in zip(qa_ids, pre_labels): 69 | sample = eval_file[str(qa_id)] 70 | pred_answers.append({'question_id': sample['question_id'], 71 | 'question_type': 'YES_NO', 72 | 'answers': sample['answers'], 73 | 'entity_answers': [[]], 74 | 'yesno_answers': [TYPE[pre_label]]}) 75 | 76 | logger.info('{} questions'.format(len(pred_answers))) 77 | with open(final_file, 'a') as fout: 78 | for pred_answer in pred_answers: 79 | fout.write(json.dumps(pred_answer, ensure_ascii=False) + '\n') 80 | fout.close() 81 | logger.info('Saving classification results') 82 | 
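# ---------------------------------------------------------------------------
# Illustrative sketch only (an assumption, not part of the original file): how
# the helpers above are typically wired into a feedable TF1 input pipeline
# before a batch iterator is handed to GatedRNN via `batch.get_next()`. The
# actual wiring lives in GatedRNN_run.py, which is not shown in this listing.
# ---------------------------------------------------------------------------
def example_input_pipeline(config, flags):
    # Parse TFRecords written by GatedRNN_prepro.build_features.
    parser = get_record_parser(config)
    train_dataset = get_batch_dataset(flags.train_record_file, parser, config)
    dev_dataset = get_dataset(flags.dev_record_file, parser, config)
    # A string-handle iterator lets one graph switch between train/dev batches.
    handle = tf.placeholder(tf.string, shape=[])
    iterator = tf.data.Iterator.from_string_handle(
        handle, train_dataset.output_types, train_dataset.output_shapes)
    train_iterator = train_dataset.make_one_shot_iterator()
    dev_iterator = dev_dataset.make_one_shot_iterator()
    return iterator, handle, train_iterator, dev_iterator
# Usage (inside a tf.Session): feed {handle: sess.run(train_iterator.string_handle())}
# to pull training batches, or the dev iterator's handle to pull dev batches.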
-------------------------------------------------------------------------------- /R-Net/S_model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import logging 3 | import time 4 | from basic_rnn import cudnn_gru, dot_attention, ptr_net, summ 5 | 6 | 7 | class Model(object): 8 | def __init__(self, args, batch, token_embeddings=None, trainable=True, opt=True): 9 | self.logger = logging.getLogger('brc') 10 | self.hidden_size = args.hidden_size 11 | self.batch_size = args.batch_size 12 | self.layer_num = args.layer_num 13 | self.optim_type = args.optim 14 | self.dropout_keep_prob = args.dropout_keep_prob 15 | self.learning_rate = args.learning_rate 16 | self.weight_decay = args.weight_decay 17 | self.trainable = trainable 18 | # length limit 19 | self.max_p_num = args.max_p_num 20 | self.max_p_len = args.max_p_len 21 | self.max_q_len = args.max_q_len 22 | self.max_a_len = args.max_a_len 23 | 24 | self.p, self.q, self.start_id, self.end_id, self.qa_id = batch.get_next() 25 | self.lr = tf.get_variable('lr', shape=[], dtype=tf.float32, trainable=False) 26 | self.is_train = tf.get_variable('is_train', shape=[], dtype=tf.bool, trainable=False) 27 | self.p_mask = tf.cast(self.p, tf.bool) 28 | self.q_mask = tf.cast(self.q, tf.bool) 29 | # passage的真实长度 30 | self.p_len = tf.reduce_sum(tf.cast(self.p_mask, tf.int32), axis=1) 31 | # self.p = tf.boolean_mask(self.p, mask=self.p_len) 32 | # question的真实长度 33 | self.q_len = tf.reduce_sum(tf.cast(self.q_mask, tf.int32), axis=1) 34 | # self.q = tf.boolean_mask(self.q, mask=self.q_len) 35 | 36 | if opt: 37 | self.N = tf.shape(self.start_id)[0] 38 | # 当前batch中passage的最大长度 39 | self.p_maxlen = tf.reduce_max(self.p_len) 40 | # 当前batch中question的最大长度 41 | self.q_maxlen = tf.reduce_max(self.q_len) 42 | self.p = tf.slice(self.p, [0, 0], [self.N, self.p_maxlen]) 43 | self.q = tf.slice(self.q, [0, 0], [self.N, self.q_maxlen]) 44 | self.p_mask = tf.slice(self.p_mask, [0, 0], [self.N, self.p_maxlen]) 45 | self.q_mask = tf.slice(self.q_mask, [0, 0], [self.N, self.q_maxlen]) 46 | else: 47 | self.p_maxlen, self.q_maxlen = self.max_p_len, self.max_q_len 48 | 49 | self._build_graph(token_embeddings) 50 | 51 | def _build_graph(self, token_embeddings): 52 | """ 53 | Builds the computation graph with Tensorflow 54 | """ 55 | start_t = time.time() 56 | # 对paragraph question做embedding 57 | self._embed(token_embeddings) 58 | # 对paragraph question分别用Bi-LSTM编码 59 | self._encode() 60 | # 基于question-aware的passage编码 61 | self._gated_attention() 62 | self._self_attention() 63 | self._pointer() 64 | # self._predict() 65 | # 对数似然损失,start end两部分损失取平均 66 | self._compute_loss() 67 | if self.trainable: 68 | # 选择优化算法 69 | self._create_train_op() 70 | self.logger.info('Time to build graph: {} s'.format(time.time() - start_t)) 71 | 72 | def _embed(self, token_embeddings): 73 | with tf.device('/cpu:0'), tf.variable_scope('word_embedding', reuse=tf.AUTO_REUSE): 74 | word_embeddings = tf.get_variable('word_embeddings', 75 | initializer=tf.constant(token_embeddings, dtype=tf.float32), 76 | trainable=False) 77 | self.p_emb = tf.nn.embedding_lookup(word_embeddings, self.p) 78 | self.q_emb = tf.nn.embedding_lookup(word_embeddings, self.q) 79 | 80 | def _encode(self): 81 | with tf.variable_scope('passage_encoding', reuse=tf.AUTO_REUSE): 82 | self.p_rnn = cudnn_gru(num_layers=2*self.layer_num, num_units=self.hidden_size, batch_size=self.N, 83 | input_size=self.p_emb.get_shape().as_list()[-1], 84 | keep_prob=self.dropout_keep_prob, 
is_train=self.is_train) 85 | self.p_encodes = self.p_rnn(self.p_emb, seq_len=self.p_len) 86 | with tf.variable_scope('question_encoding', reuse=tf.AUTO_REUSE): 87 | self.q_rnn = cudnn_gru(num_layers=2*self.layer_num, num_units=self.hidden_size, batch_size=self.N, 88 | input_size=self.q_emb.get_shape().as_list()[-1], 89 | keep_prob=self.dropout_keep_prob, is_train=self.is_train) 90 | self.q_encodes = self.q_rnn(self.q_emb, seq_len=self.q_len) 91 | 92 | def _gated_attention(self): 93 | with tf.variable_scope('gated_attention', reuse=tf.AUTO_REUSE): 94 | self.qp_att = dot_attention(self.p_encodes, self.q_encodes, mask=self.q_mask, 95 | hidden=self.hidden_size, keep_prob=self.dropout_keep_prob, 96 | is_train=self.is_train) 97 | gated_rnn = cudnn_gru(num_layers=self.layer_num, num_units=self.hidden_size, batch_size=self.N, 98 | input_size=self.qp_att.get_shape().as_list()[-1], keep_prob=self.dropout_keep_prob, 99 | is_train=self.is_train) 100 | self.gated_att = gated_rnn(self.qp_att, self.p_len) # v_Pt 101 | 102 | def _self_attention(self): 103 | with tf.variable_scope('self_attention', reuse=tf.AUTO_REUSE): 104 | self.pp_att = dot_attention(self.gated_att, self.gated_att, mask=self.p_mask, 105 | hidden=self.hidden_size, keep_prob=self.dropout_keep_prob, 106 | is_train=self.is_train) 107 | self_rnn = cudnn_gru(num_layers=self.layer_num, num_units=self.hidden_size, batch_size=self.N, 108 | input_size=self.pp_att.get_shape().as_list()[-1], keep_prob=self.dropout_keep_prob, 109 | is_train=self.is_train) 110 | self.self_att = self_rnn(self.pp_att, self.p_len) 111 | 112 | def _pointer(self): 113 | with tf.variable_scope('pointer', reuse=tf.AUTO_REUSE): 114 | self.ques_vec = summ(self.q_encodes[:, :, -2 * self.hidden_size:], self.hidden_size, mask=self.q_mask, 115 | keep_prob=self.dropout_keep_prob, is_train=self.is_train) # r_Q 116 | pointer = ptr_net(batch=self.N, hidden=self.ques_vec.get_shape().as_list()[-1], 117 | keep_prob=self.dropout_keep_prob, is_train=self.is_train) 118 | self.logits1, self.logits2 = pointer(self.ques_vec, self.self_att, self.hidden_size, self.p_mask) 119 | 120 | def _predict(self): 121 | with tf.variable_scope("predict"): 122 | outer = tf.matmul(tf.expand_dims(tf.nn.softmax(self.logits1), axis=2), 123 | tf.expand_dims(tf.nn.softmax(self.logits2), axis=1)) 124 | self.outer = tf.matrix_band_part(outer, 0, self.max_a_len) 125 | self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1) 126 | self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1) 127 | self.start_loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.logits1, labels=tf.stop_gradient( 128 | tf.one_hot(self.start_id, tf.shape(self.logits1)[1], axis=1))) 129 | self.end_loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.logits2, labels=tf.stop_gradient( 130 | tf.one_hot(self.start_id, tf.shape(self.logits2)[1], axis=1))) 131 | self.loss = tf.reduce_mean(self.start_loss + self.end_loss) 132 | 133 | def _compute_loss(self): 134 | def sparse_nll_loss(probs, labels, epsilon=1e-9, scope=None): 135 | with tf.name_scope(scope, "log_loss"): 136 | labels = tf.one_hot(labels, tf.shape(probs)[1], axis=1) 137 | losses = - tf.reduce_sum(labels * tf.log(probs + epsilon), 1) 138 | return losses 139 | self.logits1 = tf.nn.softmax(self.logits1) 140 | self.logits2 = tf.nn.softmax(self.logits2) 141 | self.start_loss = sparse_nll_loss(probs=self.logits1, labels=self.start_id) 142 | self.end_loss = sparse_nll_loss(probs=self.logits2, labels=self.end_id) 143 | self.all_params = tf.trainable_variables() 144 | 
self.loss = tf.reduce_mean(tf.add(self.start_loss, self.end_loss)) 145 | if self.weight_decay > 0: 146 | with tf.variable_scope('l2_loss'): 147 | l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in self.all_params]) 148 | self.loss += self.weight_decay * l2_loss 149 | 150 | def _create_train_op(self): 151 | with tf.variable_scope('optimizer', reuse=tf.AUTO_REUSE): 152 | if self.optim_type == 'adadelta': 153 | self.optimizer = tf.train.AdadeltaOptimizer(self.lr) 154 | elif self.optim_type == 'adam': 155 | self.optimizer = tf.train.AdamOptimizer(self.lr) 156 | elif self.optim_type == 'rprop': 157 | self.optimizer = tf.train.RMSPropOptimizer(self.lr) 158 | elif self.optim_type == 'sgd': 159 | self.optimizer = tf.train.GradientDescentOptimizer(self.lr) 160 | else: 161 | raise NotImplementedError('Unsupported optimizer: {}'.format(self.optim_type)) 162 | self.train_op = self.optimizer.minimize(self.loss) 163 | -------------------------------------------------------------------------------- /R-Net/S_prepro.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import pickle as pkl 3 | import os 4 | from tqdm import tqdm 5 | import ujson as json 6 | from collections import Counter 7 | import numpy as np 8 | 9 | 10 | def process_file(filenames, data_type, max_p_len=500): 11 | print("Generating {} examples...".format(data_type)) 12 | is_train = False 13 | if data_type == 'train': 14 | is_train = True 15 | examples = [] 16 | eval_examples = {} 17 | total = 0 18 | for filename in filenames: 19 | with open(filename, 'r', encoding='utf8') as fh: 20 | for line in fh: 21 | source = json.loads(line.strip()) 22 | if is_train: 23 | if len(source['answer_spans']) == 0: 24 | continue 25 | if source['answer_spans'][0][1] >= max_p_len: 26 | continue 27 | total += 1 28 | answers = [] 29 | if 'answer_docs' in source: 30 | del source['fake_answers'] 31 | del source['segmented_answers'] 32 | answers = source['answers'] 33 | question_tokens = source['segmented_question'] 34 | passages = [] 35 | passages_len = [] 36 | start, end, answer_passages = 0, 0, 0 37 | if 'answer_docs' in source and len(source['answer_docs']): 38 | start = source['answer_spans'][0][0] 39 | end = source['answer_spans'][0][1] 40 | answer_passages = source['answer_docs'][0] 41 | for idx, doc in enumerate(source['documents']): 42 | del doc['paragraphs'] 43 | para_len = 0 44 | if is_train: 45 | para_len = min(len(doc['segmented_paragraphs'][doc['most_related_para']]), max_p_len) 46 | passages += doc['segmented_paragraphs'][doc['most_related_para']][:para_len] 47 | else: 48 | para_infos = [] 49 | for para_tokens in doc['segmented_paragraphs']: 50 | # para_tokens 每篇文档分词后的段落,question_tokens 问题分词 51 | common_with_question = Counter(para_tokens) & Counter(question_tokens) 52 | correct_preds = sum(common_with_question.values()) 53 | if correct_preds == 0: 54 | recall_wrt_question = 0 55 | else: 56 | recall_wrt_question = float(correct_preds) / len(question_tokens) 57 | para_infos.append((para_tokens, recall_wrt_question, len(para_tokens))) 58 | # 排序 选出与question匹配recall最高的para_tokens 59 | para_infos.sort(key=lambda x: (-x[1], x[2])) 60 | fake_passage_tokens = [] 61 | for para_info in para_infos[:1]: 62 | fake_passage_tokens += para_info[0] 63 | para_len = min(len(fake_passage_tokens), max_p_len) 64 | passages += fake_passage_tokens[:para_len] 65 | if idx < answer_passages: 66 | start += para_len 67 | end += para_len 68 | passages_len.append(para_len) 69 | example = {'passages': passages, 70 | 
'question_tokens': question_tokens, 71 | 'answer_passages': answer_passages, 72 | 'start_id': start, 73 | 'end_id': end, 74 | 'id': total} 75 | if not is_train: 76 | eval_examples[str(total)] = {'passages': passages, 77 | 'passages_len': passages_len, 78 | 'answers': answers, 79 | 'answer_passages': answer_passages, 80 | 'question': source['segmented_question'], 81 | 'question_id': source['question_id'], 82 | 'question_type': source['question_type']} 83 | examples.append(example) 84 | # random.shuffle(examples) 85 | print("{} questions in total".format(len(examples))) 86 | return examples, eval_examples 87 | 88 | 89 | def get_embedding(data_type, emb_file=None, vec_size=None, token2id_dict=None): 90 | print("Generating {} embedding...".format(data_type)) 91 | filtered_tokens = {} 92 | if emb_file is not None: 93 | assert vec_size is not None 94 | with open(emb_file, 'rb') as fin: 95 | trained_embeddings = pkl.load(fin) 96 | fin.close() 97 | filtered_tokens = trained_embeddings.keys() 98 | 99 | NULL = "" 100 | OOV = "" 101 | # token2id 102 | token2id = {token: idx for idx, token in 103 | enumerate(filtered_tokens, 2)} if token2id_dict is None else token2id_dict 104 | id2token = {idx: token for idx, token in enumerate(filtered_tokens, 2)} 105 | token2id[NULL] = 0 106 | token2id[OOV] = 1 107 | id2token['0'] = NULL 108 | id2token['1'] = OOV 109 | embedding_mat = np.zeros([len(token2id), vec_size]) 110 | # idx2emb = {idx: embedding_mat[token] for token, idx in token2id.items()} 111 | # embedding_mat = [idx2emb[idx] for idx in range(len(idx2emb))] 112 | for token in filtered_tokens: 113 | # if token in trained_embeddings: 114 | embedding_mat[token2id[token]] = trained_embeddings[token] 115 | return embedding_mat, token2id, id2token 116 | 117 | 118 | def build_features(config, examples, data_type, out_file, word2id): 119 | para_limit = config.max_p_len 120 | ques_limit = config.max_q_len 121 | 122 | print("Processing {} examples...".format(data_type)) 123 | writer = tf.python_io.TFRecordWriter(out_file) 124 | total = 0 125 | meta = {} 126 | for example in tqdm(examples): 127 | total += 1 128 | passages_token_ids = np.zeros([config.max_p_num * para_limit], dtype=np.int32) 129 | question_token_ids = np.zeros([ques_limit], dtype=np.int32) 130 | 131 | def _get_word(word): 132 | for each in (word, word.lower(), word.capitalize(), word.upper()): 133 | if each in word2id: 134 | return word2id[each] 135 | return 1 136 | 137 | # passages token转id 138 | idx = 0 139 | for pdx, passage_token in enumerate(example['passages']): 140 | passages_token_ids[pdx] = _get_word(passage_token) 141 | # 问题token转id 142 | question_token_num = min(len(example['question_tokens']), ques_limit) 143 | for i in range(question_token_num): 144 | question_token_ids[i] = _get_word(example['question_tokens'][i]) 145 | 146 | record = tf.train.Example(features=tf.train.Features(feature={ 147 | "passages_token_ids": tf.train.Feature( 148 | bytes_list=tf.train.BytesList(value=[passages_token_ids.tostring()])), 149 | "question_token_ids": tf.train.Feature( 150 | bytes_list=tf.train.BytesList(value=[question_token_ids.tostring()])), 151 | "start_id": tf.train.Feature(int64_list=tf.train.Int64List(value=[example['start_id']])), 152 | "end_id": tf.train.Feature(int64_list=tf.train.Int64List(value=[example['end_id']])), 153 | "id": tf.train.Feature(int64_list=tf.train.Int64List(value=[example["id"]])) 154 | })) 155 | writer.write(record.SerializeToString()) 156 | print("Build {} instances of features in total".format(total)) 157 | 
meta["total"] = total 158 | writer.close() 159 | return meta 160 | 161 | 162 | def save(filename, obj, message=None): 163 | if message is not None: 164 | print("Saving {}...".format(message)) 165 | with open(filename, "w") as fh: 166 | json.dump(obj, fh) 167 | 168 | 169 | def prepro(config, flags): 170 | token2id = None 171 | if os.path.isfile(flags.token2id_file): 172 | with open(flags.token2id_file, 'r') as fh: 173 | token2id = json.load(fh) 174 | 175 | train_examples, train_eval = process_file(config.train_files, "train", config.max_p_len) 176 | # 创建train TFRecord文件 177 | train_meta = build_features(config, train_examples, "train", flags.train_record_file, token2id) 178 | save(flags.train_eval_file, train_eval, message="train eval") 179 | save(flags.train_meta, train_meta, message="dev meta") 180 | del train_examples, train_eval, train_meta 181 | 182 | dev_examples, dev_eval = process_file(config.dev_files, "dev", config.max_p_len) 183 | # 创建dev TFRecord文件 184 | dev_meta = build_features(config, dev_examples, "dev", flags.dev_record_file, token2id) 185 | save(flags.dev_eval_file, dev_eval, message="dev eval") 186 | save(flags.dev_meta, dev_meta, message="dev meta") 187 | del dev_examples, dev_eval, dev_meta 188 | 189 | test_examples, test_eval = process_file(config.test_files, "test", config.max_p_len) 190 | # # 创建test TFRecord文件 191 | test_meta = build_features(config, test_examples, "test", flags.test_record_file, token2id) 192 | save(flags.test_eval_file, test_eval, message="test eval") 193 | save(flags.test_meta, test_meta, message="test meta") 194 | del test_examples, test_eval, test_meta 195 | 196 | # save(flags.token2id_file, token2id, message="word2idx") 197 | 198 | # def draw_hist(x, bins, label): 199 | # plt.hist(x=x, bins=bins) 200 | # plt.xlabel(label) 201 | # plt.ylabel('Num') 202 | # plt.show() 203 | -------------------------------------------------------------------------------- /R-Net/S_util.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import os 4 | import ujson as json 5 | import mrc_eval 6 | from bleu import BLEUWithBonus 7 | from rouge import RougeLWithBonus 8 | 9 | 10 | def get_record_parser(config): 11 | def parse(example): 12 | features = tf.parse_single_example(example, 13 | features={ 14 | 'passages_token_ids': tf.FixedLenFeature([], tf.string), 15 | 'question_token_ids': tf.FixedLenFeature([], tf.string), 16 | 'start_id': tf.FixedLenFeature([], tf.int64), 17 | 'end_id': tf.FixedLenFeature([], tf.int64), 18 | 'id': tf.FixedLenFeature([], tf.int64) 19 | }) 20 | passages_token_ids = tf.reshape(tf.decode_raw(features["passages_token_ids"], tf.int32), 21 | [config.max_p_num * config.max_p_len]) 22 | question_token_ids = tf.reshape(tf.decode_raw(features["question_token_ids"], tf.int32), 23 | [config.max_q_len]) 24 | start_id = features['start_id'] 25 | end_id = features['end_id'] 26 | qa_id = features['id'] 27 | return passages_token_ids, question_token_ids, start_id, end_id, qa_id 28 | 29 | return parse 30 | 31 | 32 | def get_batch_dataset(record_file, parser, config): 33 | num_threads = tf.constant(config.num_threads, dtype=tf.int32) 34 | dataset = tf.data.TFRecordDataset(record_file).map(parser, num_parallel_calls=num_threads).shuffle( 35 | config.capacity).batch(config.batch_size).repeat(config.epochs) 36 | # if config.is_bucket: 37 | # buckets = [tf.constant(num) for num in range(*config.bucket_range)] 38 | # 39 | # def key_func(context_idxs, ques_idxs, 
context_char_idxs, ques_char_idxs, y1, y2, qa_id): 40 | # c_len = tf.reduce_sum( 41 | # tf.cast(tf.cast(context_idxs, tf.bool), tf.int32)) 42 | # buckets_min = [np.iinfo(np.int32).min] + buckets 43 | # buckets_max = buckets + [np.iinfo(np.int32).max] 44 | # conditions_c = tf.logical_and( 45 | # tf.less(buckets_min, c_len), tf.less_equal(c_len, buckets_max)) 46 | # bucket_id = tf.reduce_min(tf.where(conditions_c)) 47 | # return bucket_id 48 | # 49 | # def reduce_func(key, elements): 50 | # return elements.batch(config.batch_size) 51 | # 52 | # dataset = dataset.apply( 53 | # tf.contrib.data.group_by_window(key_func, reduce_func, window_size=5 * config.batch_size)).shuffle( 54 | # len(buckets) * 25) 55 | # else: 56 | return dataset 57 | 58 | 59 | def get_dataset(record_file, parser, config): 60 | num_threads = tf.constant(config.num_threads, dtype=tf.int32) 61 | dataset = tf.data.TFRecordDataset(record_file).map( 62 | parser, num_parallel_calls=num_threads).batch(config.batch_size).repeat() 63 | return dataset 64 | 65 | 66 | def evaluate_batch(model, num_batches, eval_file, sess, data_type, handle, str_handle, args, logger, result_prefix=None): 67 | losses = [] 68 | pred_answers, ref_answers = [], [] 69 | padded_p_len = args.max_p_len 70 | for i in range(num_batches): 71 | qa_id, loss, start_probs, end_probs = sess.run([model.qa_id, model.loss, model.logits1, model.logits2], 72 | feed_dict={handle: str_handle} if handle is not None else None) 73 | losses.append(loss) 74 | start, end = 0, 0 75 | for id, start_prob, end_prob in zip(qa_id, start_probs, end_probs): 76 | best_p_idx, best_span, best_score = None, None, 0 77 | sample = eval_file[str(id)] 78 | for p_idx, passage_len in enumerate(sample['passages_len']): 79 | if p_idx >= args.max_p_num: 80 | continue 81 | # 为每个passage找到best answer 82 | end = start + passage_len 83 | answer_span, score = find_best_answer_for_passage(start_prob[start: end], end_prob[start: end], 84 | passage_len, args.max_a_len) 85 | answer_span[0] += start 86 | answer_span[1] += start 87 | # 各passage间最大score 88 | if score > best_score: 89 | best_score = score 90 | best_p_idx = p_idx 91 | best_span = answer_span 92 | end = start 93 | # best_span = [start_prob, end_prob] 94 | # best_answer = sample['passages'][best_span[0]: best_span[1] + 1] 95 | # 根据span找到token 96 | if best_p_idx is None or best_span is None: 97 | best_answer = '' 98 | else: 99 | best_answer = ''.join(sample['passages'][best_span[0]: best_span[1] + 1]) 100 | # TODO 加入question tokens 101 | pred_answers.append({'question_id': sample['question_id'], 102 | 'question_type': sample['question_type'], 103 | 'answers': [best_answer], 104 | 'yesno_answers': []}) 105 | # 标准答案 106 | # if 'answers' in sample and len(sample['answers']) > 0: 107 | if 'answers' in sample: 108 | ref_answers.append({'question_id': sample['question_id'], 109 | 'question_type': sample['question_type'], 110 | 'answers': sample['answers'], 111 | 'yesno_answers': []}) 112 | 113 | if result_prefix is not None: 114 | result_file = os.path.join(args.result_dir, result_prefix + '.json') 115 | with open(result_file, 'w') as fout: 116 | for pred_answer in pred_answers: 117 | fout.write(json.dumps(pred_answer, ensure_ascii=False) + '\n') 118 | logger.info('Saving {} results to {}'.format(result_prefix, result_file)) 119 | 120 | avg_loss = np.mean(losses) 121 | bleu4, rouge_l = 0, 0 122 | if len(ref_answers) > 0: 123 | # K-V 问题ID-答案 124 | pred_dict, ref_dict, bleu_rouge = {}, {}, {} 125 | for pred, ref in zip(pred_answers, ref_answers): 126 | 
question_id = ref['question_id'] 127 | if len(ref['answers']) > 0: 128 | # 将answer tokens转换为由空格连接的一句话 129 | pred_dict[question_id] = {'answers': mrc_eval.normalize(pred['answers']), 130 | 'yesno_answers': []} 131 | ref_dict[question_id] = {'question_type': ref['question_type'], 132 | 'answers': mrc_eval.normalize(ref['answers']), 133 | 'yesno_answers': []} 134 | bleu_eval = BLEUWithBonus(4, alpha=1.0, beta=1.0) 135 | rouge_eval = RougeLWithBonus(alpha=1.0, beta=1.0, gamma=1.2) 136 | bleu4, rouge_l = mrc_eval.calc_metrics(pred_dict, 137 | ref_dict, 138 | bleu_eval, 139 | rouge_eval) 140 | bleu_rouge['Bleu-4'] = bleu4 141 | bleu_rouge['Rouge-L'] = rouge_l 142 | else: 143 | bleu_rouge = None 144 | 145 | loss_sum = tf.Summary(value=[tf.Summary.Value(tag="{}/loss".format(data_type), simple_value=avg_loss), ]) 146 | bleu_sum = tf.Summary(value=[tf.Summary.Value(tag="{}/f1".format(data_type), simple_value=bleu4), ]) 147 | rouge_sum = tf.Summary(value=[tf.Summary.Value(tag="{}/em".format(data_type), simple_value=rouge_l), ]) 148 | return avg_loss, bleu_rouge, [loss_sum, bleu_sum, rouge_sum] 149 | 150 | 151 | def find_best_answer_for_passage(start_probs, end_probs, passage_len=None, max_a_len=None): 152 | """ 153 | Finds the best answer with the maximum start_prob * end_prob from a single passage 154 | """ 155 | if passage_len is None: 156 | passage_len = len(start_probs) 157 | else: 158 | passage_len = min(len(start_probs), passage_len) 159 | best_start, best_end, max_prob = -1, -1, 0 160 | # 从头扫描passage 161 | for start_idx in range(passage_len): 162 | for ans_len in range(max_a_len): 163 | end_idx = start_idx + ans_len 164 | if end_idx >= passage_len: 165 | continue 166 | prob = start_probs[start_idx] * end_probs[end_idx] 167 | if prob > max_prob: 168 | best_start = start_idx 169 | best_end = end_idx 170 | max_prob = prob 171 | return [best_start, best_end], max_prob 172 | -------------------------------------------------------------------------------- /R-Net/bleu.py: -------------------------------------------------------------------------------- 1 | # coding:utf8 2 | 3 | import math 4 | 5 | import common 6 | 7 | 8 | class BLEU(object): 9 | def __init__(self, n_size): 10 | self.match_ngram = {} 11 | self.candi_ngram = {} 12 | self.bp_r = 0 13 | self.bp_c = 0 14 | self.n_size = n_size 15 | 16 | def add_inst(self, cand, ref_list): 17 | for n_size in range(self.n_size): 18 | self.count_ngram(cand, ref_list, n_size) 19 | self.count_bp(cand, ref_list) 20 | 21 | def count_ngram(self, cand, ref_list, n_size): 22 | cand_ngram = common.get_ngram(cand, n_size) 23 | refs_ngram = [] 24 | for ref in ref_list: 25 | refs_ngram.append(common.get_ngram(ref, n_size)) 26 | if n_size not in self.match_ngram: 27 | self.match_ngram[n_size] = 0 28 | self.candi_ngram[n_size] = 0 29 | match_size, cand_size = common.get_match_size(cand_ngram, refs_ngram) 30 | self.match_ngram[n_size] += match_size 31 | self.candi_ngram[n_size] += cand_size 32 | 33 | def count_bp(self, cand, ref_list): 34 | self.bp_c += len(cand) 35 | self.bp_r += min([ 36 | (abs(len(cand) - len(ref)), len(ref)) 37 | for ref in ref_list] 38 | )[1] 39 | 40 | def score(self): 41 | prob_list = [] 42 | for n_size in range(self.n_size): 43 | if float(self.candi_ngram[n_size]) == 0: 44 | prob_list.append(0) 45 | else: 46 | prob_list.append(self.match_ngram[n_size] / float(self.candi_ngram[n_size])) 47 | # prob_list = [ 48 | # self.match_ngram[n_size] / float(self.candi_ngram[n_size]) 49 | # for n_size in range(self.n_size) 50 | # ] 51 | bleu_list = 
[prob_list[0]] 52 | for n in range(1, self.n_size): 53 | bleu_list.append(bleu_list[-1] * prob_list[n]) 54 | for n in range(self.n_size): 55 | bleu_list[n] = bleu_list[n] ** (1. / float(n + 1)) 56 | bp = math.exp(min(1 - self.bp_r / float(self.bp_c), 0)) 57 | for n in range(self.n_size): 58 | bleu_list[n] = bleu_list[n] * bp 59 | return bleu_list 60 | 61 | 62 | class BLEUWithBonus(BLEU): 63 | def __init__(self, n_size, alpha=1.0, beta=1.0): 64 | super(BLEUWithBonus, self).__init__(n_size) 65 | self.alpha = alpha 66 | self.beta = beta 67 | 68 | def add_inst(self, 69 | cand, 70 | ref_list, 71 | yn_label=None, yn_ref=None, entity_ref=None): 72 | # super(BLEUWithBonus, self).add_inst(cand, ref_list) 73 | BLEU.add_inst(self, cand, ref_list) 74 | if yn_label is not None and yn_ref is not None: 75 | self.add_yn_bonus(cand, ref_list, yn_label, yn_ref) 76 | elif entity_ref is not None: 77 | self.add_entity_bonus(cand, entity_ref) 78 | 79 | def add_yn_bonus(self, cand, ref_list, yn_label, yn_ref): 80 | for n_size in range(self.n_size): 81 | cand_ngram = common.get_ngram(cand, n_size, label=yn_label) 82 | ref_ngram = [] 83 | for ref_id, r in enumerate(yn_ref): 84 | ref_ngram.append(common.get_ngram(ref_list[ref_id], n_size, label=r)) 85 | match_size, cand_size = common.get_match_size(cand_ngram, ref_ngram) 86 | self.match_ngram[n_size] += self.alpha * match_size 87 | self.candi_ngram[n_size] += self.alpha * match_size 88 | 89 | def add_entity_bonus(self, cand, entity_ref): 90 | for n_size in range(self.n_size): 91 | cand_ngram = common.get_ngram(cand, n_size, label='ENTITY') 92 | ref_ngram = [] 93 | for reff_id, r in enumerate(entity_ref): 94 | ref_ngram.append(common.get_ngram(r, n_size, label='ENTITY')) 95 | match_size, cand_size = common.get_match_size(cand_ngram, ref_ngram) 96 | self.match_ngram[n_size] += self.beta * match_size 97 | self.candi_ngram[n_size] += self.beta * match_size 98 | -------------------------------------------------------------------------------- /R-Net/common.py: -------------------------------------------------------------------------------- 1 | # coding:utf8 2 | from functools import reduce 3 | import math 4 | import ujson as json 5 | from collections import defaultdict 6 | import sys 7 | 8 | 9 | def get_match_size(cand_ngram, refs_ngram): 10 | ref_set = defaultdict(int) 11 | for ref_ngram in refs_ngram: 12 | tmp_ref_set = defaultdict(int) 13 | for ngram in ref_ngram: 14 | tmp_ref_set[ngram] += 1 15 | for ngram, count in tmp_ref_set.items(): 16 | ref_set[ngram] = max(ref_set[ngram], count) 17 | cand_set = defaultdict(int) 18 | for ngram in cand_ngram: 19 | cand_set[ngram] += 1 20 | match_size = 0 21 | for ngram, count in cand_set.items(): 22 | match_size += min(count, ref_set.get(ngram, 0)) 23 | cand_size = len(cand_ngram) 24 | return match_size, cand_size 25 | 26 | 27 | def get_ngram(sent, n_size, label=None): 28 | def _ngram(sent, n_size): 29 | ngram_list = [] 30 | for left in range(len(sent) - n_size): 31 | ngram_list.append(sent[left: left + n_size + 1]) 32 | return ngram_list 33 | 34 | ngram_list = _ngram(sent, n_size) 35 | if label is not None: 36 | ngram_list = [ngram + '_' + label for ngram in ngram_list] 37 | return ngram_list 38 | 39 | 40 | def word2char(str_in): 41 | str_out = str_in.replace(' ', '') 42 | return ''.join(str_out.split()) 43 | -------------------------------------------------------------------------------- /R-Net/mrc_eval.py: -------------------------------------------------------------------------------- 1 | # coding:utf8 2 | """ 3 | This module 
computes evaluation metrics for DuReader dataset. 4 | """ 5 | 6 | import argparse 7 | import itertools 8 | import ujson as json 9 | import zipfile 10 | from bleu import BLEUWithBonus 11 | from rouge import RougeLWithBonus 12 | 13 | EMPTY = '' 14 | YESNO_LABELS = set(['Yes', 'No', 'Depends']) 15 | 16 | 17 | def normalize(s): 18 | """ 19 | Normalize strings to space joined chars. 20 | Args: 21 | s: a list of strings. 22 | Returns: 23 | A list of normalized strings. 24 | """ 25 | if not s: 26 | return s 27 | normalized = [] 28 | for ss in s: 29 | tokens = [c for c in list(ss) if len(c.strip()) != 0] 30 | normalized.append(''.join(tokens)) 31 | return normalized 32 | 33 | 34 | def data_check(obj): 35 | """ 36 | Check data. 37 | 38 | Raises: 39 | Raises AssertionError when data is not legal. 40 | """ 41 | # 判断是否有answer_id 42 | assert 'question_id' in obj, "Missing 'question_id' field." 43 | # assert 'yesno_answers' in obj, \ 44 | # "Missing 'yesno_answers' field. question_id: {}".format(obj['question_id']) 45 | # 如果包含yesno_answers,那么格式必须为list 46 | if "yesno_answers" in obj: 47 | assert isinstance(obj['yesno_answers'], list), \ 48 | r"""'yesno_answers' field must be a list, if the 'question_type' is not 49 | 'YES_NO', then this field should be an empty list. 50 | question_id: {}""".format(obj['question_id']) 51 | else: 52 | obj["yesno_answers"] = [] 53 | if "entity_answers" not in obj: 54 | obj["entity_answers"] = [] 55 | 56 | 57 | def read_file(file_name, is_ref=False): 58 | """ 59 | Read predict answers or reference answers from file. 60 | 61 | Args: 62 | file_name: the name of the file containing predict result or reference 63 | result. 64 | 65 | Returns: 66 | A dictionary mapping question_id to the result information. The result 67 | information itself is also a dictionary with has four keys: 68 | - question_type: type of the query. 69 | - yesno_answers: A list of yesno answers corresponding to 'answers'. 70 | - answers: A list of predicted answers. 71 | - entity_answers: A list, each element is also a list containing the entities 72 | tagged out from the corresponding answer string. 73 | """ 74 | 75 | def _open(file_name, mode, zip_obj=None): 76 | if zip_obj is not None: 77 | return zip_obj.open(file_name, mode) 78 | return open(file_name, mode) 79 | 80 | results = {} 81 | # 是否是参考答案 82 | if is_ref: 83 | keys = ['source', 'answers', 'yesno_answers', 'entity_answers', 'question_type'] 84 | else: 85 | keys = ['answers', 'yesno_answers'] 86 | # 如果是zip文件则以zip方式读取 87 | zf = zipfile.ZipFile(file_name, 'r') if file_name.endswith('.zip') else None 88 | # zip包中文件列表 89 | file_list = [file_name] if zf is None else zf.namelist() 90 | 91 | for fn in file_list: 92 | for line in _open(fn, 'r', zip_obj=zf): 93 | try: 94 | obj = json.loads(line.strip()) 95 | except ValueError: 96 | raise ValueError("Every line of data should be legal json") 97 | data_check(obj) 98 | qid = obj['question_id'] 99 | # 必须有question id 100 | assert qid not in results, "Duplicate question_id: {}".format(qid) 101 | results[qid] = {} 102 | for k in keys: 103 | if k == 'answers': 104 | results[qid][k] = normalize(obj[k]) 105 | else: 106 | results[qid][k] = obj[k] 107 | if is_ref: 108 | for i, e in enumerate(results[qid]['entity_answers']): 109 | results[qid]['entity_answers'][i] = normalize(e) 110 | return results 111 | 112 | 113 | def calc_metrics(pred_result, ref_result, bleu_eval, rouge_eval): 114 | """Computes bleu-4 and rouge-l. 
115 | 116 | Args: 117 | - pred_result: Refer to the returned dict of `read_file` with 118 | 'is_ref=False'. 119 | - ref_result: Refer to the returned dict of `ref_file` with 120 | 'is_ref=True'. 121 | - bleu_result: A BleuWithBonus object. 122 | - rouge_result: A RougeLWithBonus object. 123 | Returns: 124 | bleu-4 and rouge-l values as a tuple of float values. 125 | """ 126 | for qid, results in ref_result.items(): 127 | # 根据question id从预测结果中选择答案 128 | cand_result = pred_result.get(qid, {}) 129 | pred_answers = cand_result.get('answers', []) 130 | if not pred_answers: 131 | pred_answers = EMPTY 132 | else: 133 | pred_answers = pred_answers[0] 134 | pred_yn_label = None 135 | ref_entities = None 136 | ref_answers = results.get('answers', []) 137 | if not ref_answers: 138 | continue 139 | if results['question_type'] == 'ENTITY': 140 | ref_entities = set( 141 | itertools.chain(*results.get('entity_answers', [[]]))) 142 | if not ref_entities: 143 | ref_entities = None 144 | if results['question_type'] == 'YES_NO': 145 | cand_yesno = cand_result.get('yesno_answers', []) 146 | pred_yn_label = None if len(cand_yesno) == 0 \ 147 | else cand_yesno[0] 148 | bleu_eval.add_inst( 149 | pred_answers, 150 | ref_answers, 151 | yn_label=pred_yn_label, 152 | yn_ref=results['yesno_answers'], 153 | entity_ref=ref_entities) 154 | rouge_eval.add_inst( 155 | pred_answers, 156 | ref_answers, 157 | yn_label=pred_yn_label, 158 | yn_ref=results['yesno_answers'], 159 | entity_ref=ref_entities) 160 | bleu4 = bleu_eval.score()[-1] 161 | rouge_l = rouge_eval.score() 162 | return bleu4, rouge_l 163 | 164 | 165 | def main(args): 166 | err = None 167 | metrics = {} 168 | bleu4, rouge_l = 0.0, 0.0 169 | alpha = args.alpha # default 1.0 170 | beta = args.beta # default 1.0 171 | bleu_eval = BLEUWithBonus(4, alpha=alpha, beta=beta) 172 | rouge_eval = RougeLWithBonus(alpha=alpha, beta=beta, gamma=1.2) 173 | # 载入answer文件 格式dict question_id: {answers:[], yesno_answers:[]} 174 | pred_result = read_file(args.pred_file) 175 | ref_result = read_file(args.ref_file, is_ref=True) 176 | bleu4, rouge_l = calc_metrics(pred_result, 177 | ref_result, 178 | bleu_eval, 179 | rouge_eval) 180 | metrics = { 181 | 'ROUGE-L': round(rouge_l * 100, 2), 182 | 'BLEU-4': round(bleu4 * 100, 2), 183 | } 184 | print(json.dumps(metrics, ensure_ascii=False).encode('utf8')) 185 | 186 | 187 | if __name__ == '__main__': 188 | parser = argparse.ArgumentParser() 189 | parser.add_argument('--pred_file', help='predict file') 190 | parser.add_argument('--ref_file', help='reference file') 191 | parser.add_argument('--alpha', type=float, default=1.0, 192 | help='common value of alpha') 193 | parser.add_argument('--beta', type=float, default=1.0, 194 | help='common value of beta') 195 | args = parser.parse_args() 196 | main(args) 197 | -------------------------------------------------------------------------------- /R-Net/rouge.py: -------------------------------------------------------------------------------- 1 | # coding:utf8 2 | 3 | from functools import reduce 4 | import math 5 | import json 6 | import numpy as np 7 | from collections import defaultdict 8 | import sys 9 | 10 | # reload(sys) 11 | # sys.setdefaultencoding("utf-8") 12 | 13 | 14 | class RougeLWithBonus(object): 15 | def __init__(self, alpha=1.0, beta=1.0, gamma=1.2): 16 | self.alpha = alpha 17 | self.beta = beta 18 | self.gamma = gamma 19 | self.inst_scores = [] 20 | 21 | def lcs(self, string, sub): 22 | if len(string) < len(sub): 23 | sub, string = string, sub 24 | lengths = np.zeros((len(string) + 
1, len(sub) + 1))
25 |         for j in range(1, len(sub) + 1):
26 |             for i in range(1, len(string) + 1):
27 |                 if string[i - 1] == sub[j - 1]:
28 |                     lengths[i][j] = lengths[i - 1][j - 1] + 1
29 |                 else:
30 |                     lengths[i][j] = max(lengths[i - 1][j], lengths[i][j - 1])
31 |         return lengths[len(string)][len(sub)]
32 | 
33 |     def add_inst(self,
34 |                  cand,
35 |                  ref_list,
36 |                  yn_label=None, yn_ref=None, entity_ref=None):
37 |         precs, recalls = [], []
38 |         for i, ref in enumerate(ref_list):
39 |             basic_lcs = self.lcs(cand, ref)
40 |             yn_bonus, entity_bonus = 0.0, 0.0
41 |             if yn_ref is not None and yn_label is not None:
42 |                 yn_bonus = self.add_yn_bonus(cand, ref, yn_label, yn_ref[i])
43 |             elif entity_ref is not None:
44 |                 entity_bonus = self.add_entity_bonus(cand, entity_ref)
45 |             p_denom = len(cand) + self.alpha * yn_bonus + self.beta * entity_bonus
46 |             r_denom = len(ref) + self.alpha * yn_bonus + self.beta * entity_bonus
47 |             prec = (basic_lcs + self.alpha * yn_bonus + self.beta * entity_bonus) \
48 |                 / p_denom if p_denom > 0. else 0.
49 |             rec = (basic_lcs + self.alpha * yn_bonus + self.beta * entity_bonus) \
50 |                 / r_denom if r_denom > 0. else 0.
51 |             precs.append(prec)
52 |             recalls.append(rec)
53 | 
54 |         prec_max = max(precs)
55 |         rec_max = max(recalls)
56 |         if prec_max != 0 and rec_max != 0:
57 |             score = ((1 + self.gamma ** 2) * prec_max * rec_max) / \
58 |                 float(rec_max + self.gamma ** 2 * prec_max)
59 |         else:
60 |             score = 0.0
61 |         self.inst_scores.append(score)
62 | 
63 |     def add_yn_bonus(self, cand, ref, yn_label, yn_ref):
64 |         if yn_label != yn_ref:
65 |             return 0.0
66 |         lcs_ = self.lcs(cand, ref)
67 |         return lcs_
68 | 
69 |     def add_entity_bonus(self, cand, entity_ref):
70 |         lcs_ = 0.0
71 |         for ent in entity_ref:
72 |             if ent in cand:
73 |                 lcs_ += len(ent)
74 |         return lcs_
75 | 
76 |     def score(self):
77 |         return 1. * sum(self.inst_scores) / len(self.inst_scores)
78 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MRC2018
2 | - 2018 Machine Reading Comprehension Technology Competition [competition site](http://mrc2018.cipsc.org.cn/)
3 | - Competition model: BiDAF+Self Attention+Pre (single model)
4 | - Final ranking: 28/105 (first-time participant)
5 | 
6 | ## Recent Updates
7 | - Update 2018/09/11: the AAAI deadline rush is over -- thanks for all the stars.
8 |   1. basic_rnn.py now supports multi-layer RNNCells (in TensorFlow, single-layer and multi-layer RNNs are built quite differently; see the sketch right after this list) and adds the recent SRU and IndyRNN cells
9 |   2. rc_model.py replaces Adam with the faster LazyAdam, which requires tf >= 1.9
10 | - Update 2018/08/20: a quick outline of the training and modification workflow (using BiDAF+Self Attention as the example; a detailed PDF write-up will follow):
11 |   1. /dureader/run.py --prepare (data preprocessing), --train (training and prediction)
12 |   2. /dureader/rc_model.py the model itself (start here if you want to modify it)
13 |   3. /dureader/layers the individual layers (pointer net, match layer, cuDNN RNN)
14 |   4. /dureader/json_to_sentence.py extracts the raw text from the original JSON files
15 |   5. /dureader/pretrain_embedding.py pre-trains the word embeddings
16 |   6. /dureader/SIF.py follows the paper "A Simple but Tough-to-Beat Baseline for Sentence Embeddings", but it did not help much
17 |   7. /utils the evaluation metrics
18 | - Update 2018/08/20: the best score was obtained with the [default hyperparameters](https://github.com/shiningliang/MRC2018/blob/master/BiDAF%2BSelf%20Attention/dureader/run.py#L22) in BiDAF+Self Attention/run.py
19 | - Update 2018/08/06: at the award ceremony held at the [Language and Intelligence Summit](http://www.cipsc.org.cn/lis2018/index.html), the large gains of the top teams came mostly from up-front feature engineering rather than from novel models; if the top teams' technical reports become available, they will be posted here
20 | - Update 2018/08/06: Baidu has released the full dataset (the links in the dataset table below have been updated), the competition results will also be posted, and the leaderboard stays open for new submissions. The organizers announced that the competition will be held again next year.
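To make the two notes above concrete (multi-layer RNNCell construction and the LazyAdam swap), here is a minimal TF 1.x sketch. It is an illustration only, not the actual code in basic_rnn.py or rc_model.py, and the names (`stacked_gru`, `hidden_size`, the dummy placeholders) are invented for this example:

```python
import tensorflow as tf

def stacked_gru(inputs, seq_len, hidden_size, num_layers, keep_prob=1.0):
    """Multi-layer GRU. A single cell can be handed to dynamic_rnn directly,
    but a multi-layer RNN needs one fresh cell per layer wrapped in
    MultiRNNCell -- reusing the same cell object across layers does not work."""
    def make_cell():
        cell = tf.nn.rnn_cell.GRUCell(hidden_size)
        return tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=keep_prob)

    multi_cell = tf.nn.rnn_cell.MultiRNNCell([make_cell() for _ in range(num_layers)])
    outputs, state = tf.nn.dynamic_rnn(multi_cell, inputs, sequence_length=seq_len,
                                       dtype=tf.float32)
    return outputs, state

# Dummy usage: a batch of 100-step, 300-dim embedded sequences.
x = tf.placeholder(tf.float32, [None, 100, 300])
lens = tf.placeholder(tf.int32, [None])
enc, _ = stacked_gru(x, lens, hidden_size=150, num_layers=2, keep_prob=0.7)

# LazyAdam (tf.contrib.opt, TF >= 1.9) applies lazy sparse updates, i.e. it only
# updates optimizer slots for the embedding rows seen in the current batch,
# which is what makes it faster than plain Adam here:
# train_op = tf.contrib.opt.LazyAdamOptimizer(learning_rate=1e-3).minimize(loss)
```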
21 | 
22 | ## Reference Models
23 | - [R-Net](https://www.microsoft.com/en-us/research/wp-content/uploads/2017/05/r-net.pdf)
24 | - [BiDAF](https://allenai.github.io/bi-att-flow/)
25 | 
26 | ## Reference Code
27 | - [HKUST](https://github.com/HKUST-KnowComp/R-Net)
28 | - [DuReader](https://github.com/baidu/DuReader)
29 | 
30 | ## Requirements
31 | ### General
32 | - Python >= 3.4
33 | - numpy
34 | 
35 | ### Python Packages
36 | - tensorflow-gpu >= 1.9.0
37 | - ujson
38 | - pickle
39 | - tqdm
40 | 
41 | ### Data
42 | 
43 | Type | train | dev | test
44 | ---|---|---|---
45 | [Competition](http://ai.baidu.com/broad/download?dataset=dureader) | 270K | 10K | 20K
46 | [Open release](http://ai.baidu.com/broad/download) | 200K | 10K | 10K
47 | 
48 | ## Performance
49 | ### Score (Public Board)
50 | 
51 | Model | Rouge-L | Bleu-4
52 | ---|---|---
53 | BiDAF (cuDNN based) | 46.56 | 40.95
54 | R-Net | 42.09 | 41.1
55 | BiDAF+Self Attention | 47.28 | 41.3
56 | BiDAF+Self Attention+Gated RNN | 47.71 | 41.75
57 | 
58 | ### Memory and Time
59 | i7-7700k + 32G RAM + GTX1080Ti
60 | batch size = 32, dropout = 0.7
61 | 
62 | Model | GPU Memory | Time (50 batches) | word embedding trainable
63 | ---|---|---|---
64 | BiDAF (origin) | 8431M | 47s | false
65 | MLSTM | 10655M | 1min27s | false
66 | R-Net | 4295M | 23s | false
67 | BiDAF+Self Attention (cuDNN based) | 8431M | 22s | false
68 | BiDAF+Self Attention+Gated RNN (Pre) | N/A | N/A | false
69 | 
70 | ## Known Bugs
71 | 1. The BiDAF+Self Attention model cannot be reloaded after saving: TensorFlow's cuDNN LSTM is extremely fast but very awkward to work with
72 | 2. R-Net's locally computed metrics are very poor, although the submitted results look normal
73 | 
74 | ## Other
75 | - There is also a BiDAF variant based on the HKUST code; it needs slightly less memory and time than R-Net, but scores about 2 points lower than BiDAF (origin), probably because it uses GRU cells
76 | - Since the best model could not be saved during training, prediction had to be run once for the current best epoch, which is very time-consuming
77 | - In this repo, Self Attention is applied inside the match layer; the cs224n approach applies Self Attention to the output of the match layer, which would probably work better
78 | 
--------------------------------------------------------------------------------
/data/demo/README.md:
--------------------------------------------------------------------------------
1 | # Data Preprocessing Strategy
2 | 
3 | Here is an example of preprocessed data:
4 | ```
5 | {
6 |     "question_id": 186358,
7 |     "question_type": "YES_NO",
8 |     "question": "上海迪士尼可以带吃的进去吗",
9 |     "segmented_question": ["上海", "迪士尼", "可以", "带", "吃的", "进去", "吗"],
10 |     "documents": [
11 |         {
12 |             "paragraphs": ["text paragraph 1", "text paragraph 2"],
13 |             "segmented_paragraphs": [["tokens of paragraph1"], ["tokens of paragraph2"]],
14 |             "title": "上海迪士尼可以带吃的进去吗",
15 |             "segmented_title": ["上海", "迪士尼", "可以", "带", "吃的", "进去", "吗"],
16 |             "bs_rank_pos": 1,
17 |             "is_selected": True,
18 |             "most_related_para": 0,
19 |         },
20 |         # ...
21 |     ],
22 |     "answers": [
23 |         "完全密封的可以,其它不可以。",  # answer1
24 |         "可以的,不限制的。只要不是易燃易爆的危险物品,一般都可以带进去的。",  # answer2
25 |         "罐装婴儿食品、包装完好的果汁、水等饮料及包装完好的食物都可以带进乐园,但游客自己在家制作的食品是不能入园,因为自制食品有一定的安全隐患。"  # answer3
26 |     ],
27 |     "answer_docs": [0],
28 |     "answer_spans": [[0, 15]],
29 |     "fake_answers": ["完全密封的可以,其他不可以。"],
30 |     "match_scores": [1.00],
31 |     "segmented_answers": [
32 |         ["完全", "密封", "的", "可以", ",", "其它", "不可以", "。"],
33 |         ["tokens for answer2"],
34 |         ["tokens for answer3"],
35 |     ],
36 |     "yesno_answers": [
37 |         "Depends",  # corresponding to answer 1
38 |         "Yes",      # corresponding to answer 2
39 |         "Depends"   # corresponding to answer 3
40 |     ]
41 | }
42 | ```
43 | 
44 | To make it easier for researchers to use the DuReader dataset, we also release the preprocessed data. The preprocessing mainly does the following things:
45 | 1. Word segmentation. 
We segment all questions, answers, document titles and paragraphs into Chinese words, and the results are stored in new fields named by prefixing the corresponding field name with "segmented_". For example, the segmented question is stored in "segmented_question".
46 | 2. Answer paragraph targeting. In the DuReader dataset, each question has up to 5 related documents, and the average document length is 394. Feeding all 5 documents into popular RC models is too expensive, so for each document we first find the most answer-related paragraph, i.e. the paragraph most likely to contain an answer, and our baseline models use these paragraphs in place of the original documents. For each document, the most related paragraph is the one with the highest recall of the answer tokens, and its index is stored in "most_related_para".
47 | 3. Locating the answer span. Most popular RC models require an answer span for training. Since the original DuReader dataset does not provide one, our preprocessing code offers a simple, optional answer-span locating strategy: we match the real answers against each document, search for the substring with the maximum F1-score against the real answers, and take the span of that substring as the candidate answer span (a simplified sketch of this idea appears at the very end of this document). For each question we keep a single candidate span, stored in the "answer_spans" field; the substring covered by the span is stored in "fake_answers", the recall of the span against the real answer is stored in "match_scores", and the index of the document containing the span is stored in "answer_docs".
48 | 
49 | Except for word segmentation, the rest of the preprocessing strategy is implemented in `utils/preprocess.py`.
50 | 
51 | 数据预处理主要包含以下过程:
52 | 1. 分词。我们对所有的问题,答案,文档的标题和段落进行分词,将结果存储在以"segmented_"为前缀的新域中。例如,分词的问题被存储在"segmented_question"
53 | 2. 答案目标段落。在DuReader数据集中,每个问题至多有5篇相关文档,文档的平均长度为394。由于将5篇文档全部输入RC模型计算量过大,我们预先在每篇文档中找出可能包含答案的与答案最相关的段落。在baseline模型中用最相关段落代替原文档。根据每篇文档的答案tokens的最高召回率选择最相关段落,被选出的段落的下标存储在"most_related_para"
54 | 3. 定位answer span。对于大多数流行的RC模型,训练时需要answer span。因为原始数据集中没有提供answer span,为了方便,在预处理代码中,我们提供了一个简单的answer span定位策略。在该策略中,我们将真实答案与每篇文档匹配,然后搜索真实答案获得最大F1-score的子串,并将子串的span作为候选answer span。对每个问题,我们找出一个span作为候选,存储在"answer_span"域,与answer span对应的子串存储在"fake_answers",真实答案的answer span的召回率存储在"match_scores",answer span的文档下标存储在"answer_docs"
--------------------------------------------------------------------------------
/竞赛技术报告/Final_Naturali-2018机器阅读理解技术竞赛系统报告.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shiningliang/MRC2018/9bf27efbbc028116eb01ea2df1bf0d063cb09ae9/竞赛技术报告/Final_Naturali-2018机器阅读理解技术竞赛系统报告.pptx
--------------------------------------------------------------------------------
/竞赛技术报告/东北大学-2018机器阅读理解竞赛报告.ppt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shiningliang/MRC2018/9bf27efbbc028116eb01ea2df1bf0d063cb09ae9/竞赛技术报告/东北大学-2018机器阅读理解竞赛报告.ppt
--------------------------------------------------------------------------------
/竞赛技术报告/台达电子-Delta-MRC系統報告.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shiningliang/MRC2018/9bf27efbbc028116eb01ea2df1bf0d063cb09ae9/竞赛技术报告/台达电子-Delta-MRC系統報告.pdf
--------------------------------------------------------------------------------
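As a supplement to item 3 of the data preprocessing notes above, here is a simplified, self-contained sketch of locating an answer span by maximizing token-level F1 against a reference answer. It only illustrates the idea; it is not the implementation in `utils/preprocess.py`, and the function names (`token_f1`, `locate_answer_span`) are invented for this example.

```python
from collections import Counter

def token_f1(candidate, reference):
    """Token-level F1 between a candidate span and a reference answer."""
    common = Counter(candidate) & Counter(reference)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(candidate)
    recall = num_same / len(reference)
    return 2 * precision * recall / (precision + recall)

def locate_answer_span(doc_tokens, answer_tokens, max_span_len=None):
    """Scan all spans (up to max_span_len tokens) and return the (start, end)
    span with the highest F1 against the reference answer, plus its score."""
    if max_span_len is None:
        max_span_len = len(doc_tokens)
    best_span, best_score = (0, 0), 0.0
    for start in range(len(doc_tokens)):
        for end in range(start, min(start + max_span_len, len(doc_tokens))):
            score = token_f1(doc_tokens[start:end + 1], answer_tokens)
            if score > best_score:
                best_span, best_score = (start, end), score
    return best_span, best_score

# Tiny example with segmented tokens like those in the demo above:
doc = ["完全", "密封", "的", "可以", ",", "其它", "不可以", "。"]
ans = ["完全", "密封", "的", "可以", "。"]
print(locate_answer_span(doc, ans))  # ((0, 3), 0.888...)
```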