├── BiDAF+Self Attention ├── dureader │ ├── SIF.py │ ├── dataset.py │ ├── json_to_sentence.py │ ├── layers │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── basic_rnn.cpython-36.pyc │ │ │ ├── cu_rnn.cpython-36.pyc │ │ │ ├── match_layer.cpython-36.pyc │ │ │ └── pointer_net.cpython-36.pyc │ │ ├── basic_rnn.py │ │ ├── cu_rnn.py │ │ ├── match_layer.py │ │ └── pointer_net.py │ ├── pretrain_embedding.py │ ├── rc_model.py │ ├── run.py │ └── vocab.py └── utils │ ├── __init__.py │ ├── baseline_eval.py │ ├── bleu.py │ ├── bleu_metric │ ├── __pycache__ │ │ ├── bleu.cpython-36.pyc │ │ └── bleu_score.cpython-36.pyc │ ├── bleu.py │ └── bleu_score.py │ ├── common.py │ ├── dureader_eval.py │ ├── get_vocab.py │ ├── mrc_eval.py │ ├── preprocess.py │ ├── rouge.py │ └── rouge_metric │ ├── __pycache__ │ └── rouge.cpython-36.pyc │ └── rouge.py ├── BiDAF_Origin ├── dureader │ ├── dataset.py │ ├── layers │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── basic_rnn.cpython-36.pyc │ │ │ ├── match_layer.cpython-36.pyc │ │ │ └── pointer_net.cpython-36.pyc │ │ ├── basic_rnn.py │ │ ├── match_layer.py │ │ └── pointer_net.py │ ├── rc_model.py │ ├── run.py │ └── vocab.py └── utils │ ├── __init__.py │ ├── baseline_eval.py │ ├── bleu_metric │ ├── __pycache__ │ │ ├── bleu.cpython-36.pyc │ │ └── bleu_score.cpython-36.pyc │ ├── bleu.py │ └── bleu_score.py │ ├── dureader_eval.py │ ├── get_vocab.py │ ├── json_to_sentence.py │ ├── preprocess.py │ ├── pretrain_embedding.py │ └── rouge_metric │ ├── __pycache__ │ └── rouge.cpython-36.pyc │ └── rouge.py ├── GatedRNN ├── GatedRNN.py ├── GatedRNN_prepro.py ├── GatedRNN_run.py ├── GatedRNN_util.py └── basic_rnn.py ├── R-Net ├── S_model.py ├── S_prepro.py ├── S_run.py ├── S_util.py ├── basic_rnn.py ├── bleu.py ├── common.py ├── mrc_eval.py └── rouge.py ├── README.md ├── data └── demo │ ├── README.md │ ├── devset │ └── search.dev.json │ ├── testset │ └── search.test.json │ └── trainset │ └── search.train.json └── 竞赛技术报告 ├── Final_Naturali-2018机器阅读理解技术竞赛系统报告.pptx ├── 东北大学-2018机器阅读理解竞赛报告.ppt └── 台达电子-Delta-MRC系統報告.pdf /BiDAF+Self Attention/dureader/SIF.py: -------------------------------------------------------------------------------- 1 | import pickle as pkl 2 | import os 3 | import sys 4 | import ujson as json 5 | import logging 6 | import numpy as np 7 | from gensim.models import word2vec 8 | from sklearn.decomposition import TruncatedSVD 9 | 10 | 11 | class SIFModel(object): 12 | def __init__(self, args, logger, pre_train, train_files=[], dev_files=[], test_files=[], a=1e-3, embed_dim=300): 13 | self.logger = logger 14 | self.segmented_dir = args.segmented_dir 15 | self.prepared_dir = args.prepared_dir 16 | self.a = a 17 | self.embed_dim = embed_dim 18 | self.weighted_word_dict = None 19 | self.pc = None 20 | self.train_set, self.dev_set, self.test_set = [], [], [] 21 | 22 | if pre_train: 23 | for train_file in train_files: 24 | self.train_set += self._load_dataset(train_file, train=True) 25 | self.train_set_seg = os.path.join(self.segmented_dir, 'train_set.seg') 26 | self.logger.info('Writing train_set.seg') 27 | self._write_data(self.train_set, self.train_set_seg) 28 | del self.train_set 29 | 30 | for dev_file in dev_files: 31 | self.dev_set += self._load_dataset(dev_file) 32 | self.dev_set_seg = os.path.join(self.segmented_dir, 'dev_set.seg') 33 | self.logger.info('Writing dev_set.seg') 34 | self._write_data(self.dev_set, self.dev_set_seg) 35 | del self.dev_set 36 | 37 | for test_file in test_files: 38 | 
self.test_set += self._load_dataset(test_file) 39 | self.test_set_seg = os.path.join(self.segmented_dir, 'test_set.seg') 40 | self.logger.info('Writing test_set.seg') 41 | self._write_data(self.test_set, self.test_set_seg) 42 | del self.test_set 43 | 44 | def _load_dataset(self, data_path, train=False): 45 | fin = open(data_path, 'r', encoding='utf8') 46 | data_set = [] 47 | for lidx, line in enumerate(fin): 48 | sample = json.loads(line.strip()) 49 | del sample['question'] 50 | if train: 51 | del sample['answers'] 52 | del sample['fake_answers'] 53 | del sample['segmented_answers'] 54 | sample['passages'] = [] 55 | for d_idx, doc in enumerate(sample['documents']): 56 | if train: 57 | most_related_para = doc['most_related_para'] 58 | sample['passages'].append({'passage_tokens': doc['segmented_paragraphs'][most_related_para]}) 59 | else: 60 | for segmented_paragraph in doc['segmented_paragraphs']: 61 | sample['passages'].append({'passage_tokens': segmented_paragraph}) 62 | del sample['documents'] 63 | data_set.append(sample) 64 | fin.close() 65 | return data_set 66 | 67 | def _write_data(self, data_set, tar_dir): 68 | with open(tar_dir, 'w', encoding='utf8') as f: 69 | for sample in data_set: 70 | f.write(' '.join(sample['segmented_question']) + '\n') 71 | for passage in sample['passages']: 72 | f.write(' '.join(passage['passage_tokens']) + '\n') 73 | del sample 74 | f.close() 75 | 76 | def train_embeddings(self): 77 | sys.path.append('..') 78 | logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s') 79 | logging.root.setLevel(level=logging.INFO) 80 | self.logger.info("running %s" % ' '.join(sys.argv)) 81 | 82 | model = word2vec.Word2Vec(word2vec.PathLineSentences(self.segmented_dir), size=300, min_count=2, workers=8, 83 | iter=15) 84 | w2v_dict = {} 85 | for word in model.wv.vocab: 86 | w2v_dict[word] = model[word] 87 | with open(os.path.join(self.prepared_dir, 'w2v_dic.pkl'), 'wb') as f: 88 | pkl.dump(w2v_dict, f) 89 | f.close() 90 | model.wv.save_word2vec_format(os.path.join(self.prepared_dir, 'w2v_model.bin'), binary=True) 91 | 92 | def get_dict_word_fre(self): 93 | word_all_num = 0 94 | dict_word_num = {} 95 | dict_word_fre = {} 96 | for root, dirs, files in os.walk(self.segmented_dir): 97 | for file_name in files: 98 | with open(os.path.join(self.segmented_dir, file_name), 'r', encoding='utf8') as f: 99 | for line in f.readlines(): 100 | line = line.replace('\n', '') 101 | words = line.split(' ') 102 | for word in words: 103 | word_all_num += 1 104 | if word in dict_word_num: 105 | dict_word_num[word] += 1 106 | else: 107 | dict_word_num[word] = 1 108 | f.close() 109 | for word in dict_word_num: 110 | dict_word_fre[word] = dict_word_num[word] / word_all_num 111 | return word_all_num, dict_word_fre 112 | 113 | def get_dict_word_weight(self): 114 | word_all_num, dict_word_fre = self.get_dict_word_fre() 115 | self.logger.info('Total words num is {}'.format(word_all_num)) 116 | if self.a <= 0: 117 | self.a = 1.0 118 | dict_word_weight = {} 119 | for word in dict_word_fre: 120 | dict_word_weight[word] = self.a / (self.a + dict_word_fre[word]) 121 | return dict_word_weight 122 | 123 | def load_model(self): 124 | with open(os.path.join(self.prepared_dir, 'weighted_word_dict.pkl'), 'rb') as fww: 125 | self.weighted_word_dict = pkl.load(fww) 126 | with open(os.path.join(self.prepared_dir, 'pc.pkl'), 'rb') as fpc: 127 | self.pc = pkl.load(fpc) 128 | 129 | def get_weighted_embedding(self, sentence): 130 | # init the sentence embedding 131 | weighted_embedding = np.array([0.0] * 
self.embed_dim) 132 | for word in sentence: 133 | # weighted_embedding += self.weighted_word_dict[word] 134 | if word in self.weighted_word_dict: 135 | weighted_embedding += self.weighted_word_dict[word] 136 | else: 137 | weighted_embedding += np.array([1.0] * self.embed_dim) * 0.001 138 | return weighted_embedding 139 | 140 | def get_weighted_embedding_list(self, dict_word_weight): 141 | weighted_embedding_list = [] 142 | weighted_word_dict = {} 143 | with open(os.path.join(self.prepared_dir, 'w2v_dic.pkl'), 'rb') as fin: 144 | w2v_model = pkl.load(fin) 145 | fin.close() 146 | for root, dirs, files in os.walk(self.segmented_dir): 147 | for file_name in files: 148 | with open(os.path.join(self.segmented_dir, file_name), 'r', encoding='utf8') as f: 149 | for line in f.readlines(): 150 | line = line.replace('\n', '') 151 | words = line.split(' ') 152 | weighted_embedding = np.array([0.0] * self.embed_dim) 153 | for word in words: 154 | if word not in weighted_word_dict: 155 | if word in w2v_model: 156 | weighted_word_embedding = w2v_model[word] * dict_word_weight[word] 157 | else: 158 | weighted_word_embedding = np.array([1.0] * self.embed_dim) * 0.001 159 | weighted_word_dict[word] = weighted_word_embedding 160 | weighted_embedding += weighted_word_dict[word] 161 | weighted_embedding_list.append(weighted_embedding) 162 | f.close() 163 | pkl.dump(weighted_word_dict, open(os.path.join(self.prepared_dir, 'weighted_word_dict.pkl'), 'wb')) 164 | return np.array(weighted_embedding_list) 165 | 166 | def compute_pc(self, x, npc=1): 167 | svd = TruncatedSVD(n_components=npc, n_iter=7, random_state=0) 168 | svd.fit(x) 169 | return svd.components_ 170 | 171 | def remove_pc(self, x, npc=1): 172 | """ 173 | Remove the projection on the principal components 174 | :param x: x[i,:] is a data point 175 | :param npc: number of principal components to remove 176 | :return: XX[i, :] is the data point after removing its projection 177 | """ 178 | pc = self.compute_pc(x, npc) 179 | if npc == 1: 180 | xx = x - x.dot(pc.transpose()) * pc 181 | else: 182 | xx = x - x.dot(pc.transpose()).dot(pc) 183 | return xx 184 | 185 | def build_pc_and_sif_embedding_list(self): 186 | dict_word_weight = self.get_dict_word_weight() 187 | # pkl.dump(dict_word_weight, open(os.path.join(self.prepared_dir, 'dict_word_weight.pkl'), 'wb')) 188 | weighted_embedding_list = self.get_weighted_embedding_list(dict_word_weight) 189 | self.logger.info('Finish building the weighted embedding list of sentence list') 190 | pc = self.compute_pc(weighted_embedding_list) 191 | pkl.dump(pc, open(os.path.join(self.prepared_dir, 'pc.pkl'), 'wb')) 192 | self.logger.info('Finish building the pc') 193 | # sif_embedding_list = self.remove_pc(weighted_embedding_list) 194 | # pickle.dump(sif_embedding_list, open(params.dump_sif_embedding_list_path, 'wb')) 195 | # self.logger.info('Finish building the sif_embedding') 196 | 197 | def get_sif_embedding(self, text): 198 | sentence_embedding = self.get_weighted_embedding(text) 199 | rmpc_sentence_embedding = sentence_embedding - sentence_embedding.dot(self.pc.transpose()) * self.pc 200 | return rmpc_sentence_embedding 201 | -------------------------------------------------------------------------------- /BiDAF+Self Attention/dureader/json_to_sentence.py: -------------------------------------------------------------------------------- 1 | # import sys 2 | import os 3 | # sys.path.append('..') 4 | 5 | 6 | def write_data(brc_data, tar_dir): 7 | # print('Converting ' + file) 8 | # fin = open(file, encoding='utf8') 9 | 
out_file = os.path.join(tar_dir, 'train_set.seg') 10 | with open(out_file, 'w', encoding='utf8') as ftrain: 11 | for sample in brc_data.train_set: 12 | ftrain.write(' '.join(sample['segmented_question']) + '\n') 13 | for passage in sample['passages']: 14 | ftrain.write(' '.join(passage['passage_tokens']) + '\n') 15 | del sample 16 | ftrain.close() 17 | 18 | out_file = os.path.join(tar_dir, 'dev_set.seg') 19 | with open(out_file, 'w', encoding='utf8') as fdev: 20 | for sample in brc_data.dev_set: 21 | fdev.write(' '.join(sample['segmented_question']) + '\n') 22 | for passage in sample['passages']: 23 | fdev.write(' '.join(passage['passage_tokens']) + '\n') 24 | del sample 25 | fdev.close() 26 | 27 | out_file = os.path.join(tar_dir, 'test_set.seg') 28 | with open(out_file, 'w', encoding='utf8') as ftest: 29 | for sample in brc_data.test_set: 30 | ftest.write(' '.join(sample['segmented_question']) + '\n') 31 | for passage in sample['passages']: 32 | ftest.write(' '.join(passage['passage_tokens']) + '\n') 33 | del sample 34 | ftest.close() 35 | -------------------------------------------------------------------------------- /BiDAF+Self Attention/dureader/layers/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf8 -*- 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # ============================================================================== 17 | """ 18 | Empty __init__.py file 19 | 20 | Authors: Yizhong Wang(wangyizhong01@baidu.com) 21 | Date: 2017/09/20 12:00:00 22 | """ 23 | -------------------------------------------------------------------------------- /BiDAF+Self Attention/dureader/layers/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shiningliang/MRC2018/9bf27efbbc028116eb01ea2df1bf0d063cb09ae9/BiDAF+Self Attention/dureader/layers/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /BiDAF+Self Attention/dureader/layers/__pycache__/basic_rnn.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shiningliang/MRC2018/9bf27efbbc028116eb01ea2df1bf0d063cb09ae9/BiDAF+Self Attention/dureader/layers/__pycache__/basic_rnn.cpython-36.pyc -------------------------------------------------------------------------------- /BiDAF+Self Attention/dureader/layers/__pycache__/cu_rnn.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shiningliang/MRC2018/9bf27efbbc028116eb01ea2df1bf0d063cb09ae9/BiDAF+Self Attention/dureader/layers/__pycache__/cu_rnn.cpython-36.pyc -------------------------------------------------------------------------------- /BiDAF+Self Attention/dureader/layers/__pycache__/match_layer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shiningliang/MRC2018/9bf27efbbc028116eb01ea2df1bf0d063cb09ae9/BiDAF+Self Attention/dureader/layers/__pycache__/match_layer.cpython-36.pyc -------------------------------------------------------------------------------- /BiDAF+Self Attention/dureader/layers/__pycache__/pointer_net.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shiningliang/MRC2018/9bf27efbbc028116eb01ea2df1bf0d063cb09ae9/BiDAF+Self Attention/dureader/layers/__pycache__/pointer_net.cpython-36.pyc -------------------------------------------------------------------------------- /BiDAF+Self Attention/dureader/layers/basic_rnn.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module provides wrappers for variants of RNN in Tensorflow 3 | """ 4 | 5 | import tensorflow as tf 6 | import tensorflow.contrib as tc 7 | 8 | 9 | def rnn(rnn_type, inputs, length, hidden_size, layer_num=1, dropout_keep_prob=None, concat=True): 10 | """ 11 | Implements (Bi-)LSTM, (Bi-)GRU and (Bi-)RNN 12 | Args: 13 | rnn_type: the type of rnn 14 | inputs: padded inputs into rnn 15 | length: the valid length of the inputs 16 | hidden_size: the size of hidden units 17 | layer_num: multiple rnn layer are stacked if layer_num > 1 18 | dropout_keep_prob: 19 | concat: When the rnn is bidirectional, the forward outputs and backward outputs are 20 | concatenated if this is True, else we add them. 
21 |     Returns:
22 |         RNN outputs (the final state is not returned)
23 |     """
24 |     if not rnn_type.startswith('bi'):
25 |         cells = tc.rnn.MultiRNNCell([get_cell(rnn_type, hidden_size, dropout_keep_prob=dropout_keep_prob) for _ in range(layer_num)],
26 |                                     state_is_tuple=True)
27 |         outputs, state = tf.nn.dynamic_rnn(cells, inputs, sequence_length=length, dtype=tf.float32)
28 |         if rnn_type.endswith('lstm'):
29 |             c, h = state
30 |             state = h
31 |     else:
32 |         if layer_num > 1:
33 |             cell_fw = [get_cell(rnn_type, hidden_size, dropout_keep_prob=dropout_keep_prob) for _ in range(layer_num)]
34 |             cell_bw = [get_cell(rnn_type, hidden_size, dropout_keep_prob=dropout_keep_prob) for _ in range(layer_num)]
35 |             outputs, state_fw, state_bw = tc.rnn.stack_bidirectional_dynamic_rnn(
36 |                 cell_fw, cell_bw, inputs, sequence_length=length, dtype=tf.float32
37 |             )
38 |         else:
39 |             cell_fw = get_cell(rnn_type, hidden_size, dropout_keep_prob=dropout_keep_prob)
40 |             cell_bw = get_cell(rnn_type, hidden_size, dropout_keep_prob=dropout_keep_prob)
41 |             outputs, state = tf.nn.bidirectional_dynamic_rnn(
42 |                 cell_fw, cell_bw, inputs, sequence_length=length, dtype=tf.float32
43 |             )
44 |     return outputs
45 | 
46 | 
47 | def get_cell(rnn_type, hidden_size, layer_num=1, dropout_keep_prob=None):
48 |     """
49 |     Gets the RNN Cell
50 |     Args:
51 |         rnn_type: 'lstm', 'gru' or 'rnn'
52 |         hidden_size: The size of hidden units
53 |         layer_num: a MultiRNNCell is used if layer_num > 1
54 |         dropout_keep_prob: dropout in RNN
55 |     Returns:
56 |         An RNN Cell
57 |     """
58 |     if rnn_type.endswith('lstm'):
59 |         cell = tc.rnn.LSTMCell(num_units=hidden_size, state_is_tuple=True)
60 |     elif rnn_type.endswith('gru'):
61 |         cell = tc.rnn.GRUCell(num_units=hidden_size)
62 |     elif rnn_type.endswith('rnn'):
63 |         cell = tc.rnn.BasicRNNCell(num_units=hidden_size)
64 |     elif rnn_type.endswith('sru'):
65 |         cell = tc.rnn.SRUCell(num_units=hidden_size)
66 |     elif rnn_type.endswith('indy'):
67 |         cell = tc.rnn.IndyGRUCell(num_units=hidden_size)
68 |     else:
69 |         raise NotImplementedError('Unsupported rnn type: {}'.format(rnn_type))
70 |     if dropout_keep_prob is not None:
71 |         cell = tc.rnn.DropoutWrapper(cell,
72 |                                      input_keep_prob=dropout_keep_prob,
73 |                                      output_keep_prob=dropout_keep_prob)
74 |     return cell
75 | 
76 | 
77 | 
--------------------------------------------------------------------------------
/BiDAF+Self Attention/dureader/layers/cu_rnn.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This module provides wrappers for variants of RNN in Tensorflow
 3 | """
 4 | 
 5 | import tensorflow as tf
 6 | from tensorflow.contrib import cudnn_rnn
 7 | 
 8 | 
 9 | def rnn(rnn_type, inputs, hidden_size, batch_size, training, layer_num=1, dropout_keep_prob=None):
10 |     """
11 |     Implements (Bi-)LSTM, (Bi-)GRU and (Bi-)RNN
12 |     Args:
13 |         rnn_type: the type of rnn
14 |         inputs: padded inputs into rnn
15 |         hidden_size: the size of hidden units
16 |         layer_num: multiple rnn layers are stacked if layer_num > 1
17 |         dropout_keep_prob:
18 |     Returns:
19 |         RNN outputs and final state
20 |     """
21 |     if not rnn_type.startswith('bi'):
22 |         cell = get_cell(rnn_type, hidden_size, layer_num, 'unidirectional')
23 |         inputs = tf.transpose(inputs, [1, 0, 2])
24 |         c = tf.zeros([layer_num, batch_size, hidden_size], tf.float32)
25 |         h = tf.zeros([layer_num, batch_size, hidden_size], tf.float32)
26 |         outputs, state = cell(inputs, (h, c), training=training)
27 |         if rnn_type.endswith('lstm'):
28 |             c, h = state
29 |             state = h
30 |     else:
31 |         cell = get_cell(rnn_type, hidden_size, layer_num, 'bidirectional')
32 |         inputs = tf.transpose(inputs, [1, 0, 2])
33 |         outputs, 
state = cell(inputs, training=training) 34 | # if rnn_type.endswith('lstm'): 35 | # state_h, state_c = state 36 | # h_fw, h_bw = state_h[0, :], state_h[1, :] 37 | # state_fw, state_bw = h_fw, h_bw 38 | # else: 39 | # state_fw, state_bw = state[0][0, :], state[0][1, :] 40 | # if concat: 41 | # state = tf.concat([state_fw, state_bw], 1) 42 | # else: 43 | # state = state_fw + state_bw 44 | outputs = tf.transpose(outputs, [1, 0, 2]) 45 | return outputs, state 46 | 47 | 48 | def get_cell(rnn_type, hidden_size, layer_num=1, direction='bidirectional'): 49 | if rnn_type.endswith('lstm'): 50 | cudnn_cell = cudnn_rnn.CudnnLSTM(num_layers=layer_num, num_units=hidden_size, direction=direction, 51 | dropout=0) 52 | elif rnn_type.endswith('gru'): 53 | cudnn_cell = cudnn_rnn.CudnnGRU(num_layers=layer_num, num_units=hidden_size, direction=direction, 54 | dropout=0) 55 | elif rnn_type.endswith('rnn'): 56 | cudnn_cell = cudnn_rnn.CudnnRNNTanh(num_layers=layer_num, num_units=hidden_size, direction=direction, 57 | dropout=0) 58 | else: 59 | raise NotImplementedError('Unsuported rnn type: {}'.format(rnn_type)) 60 | return cudnn_cell 61 | -------------------------------------------------------------------------------- /BiDAF+Self Attention/dureader/layers/match_layer.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow.contrib as tc 3 | 4 | 5 | class MatchLSTMAttnCell(tc.rnn.LSTMCell): 6 | """ 7 | Implements the Match-LSTM attention cell 8 | """ 9 | 10 | def __init__(self, num_units, context_to_attend): 11 | super(MatchLSTMAttnCell, self).__init__(num_units, state_is_tuple=True) 12 | self.context_to_attend = context_to_attend 13 | self.fc_context = tc.layers.fully_connected(self.context_to_attend, 14 | num_outputs=self._num_units, 15 | activation_fn=None) 16 | 17 | def __call__(self, inputs, state, scope=None): 18 | (c_prev, h_prev) = state 19 | with tf.variable_scope(scope or type(self).__name__): 20 | ref_vector = tf.concat([inputs, h_prev], -1) 21 | G = tf.tanh(self.fc_context 22 | + tf.expand_dims(tc.layers.fully_connected(ref_vector, 23 | num_outputs=self._num_units, 24 | activation_fn=None), 1)) 25 | logits = tc.layers.fully_connected(G, num_outputs=1, activation_fn=None) 26 | scores = tf.nn.softmax(logits, 1) 27 | attended_context = tf.reduce_sum(self.context_to_attend * scores, axis=1) 28 | new_inputs = tf.concat([inputs, attended_context, 29 | inputs - attended_context, inputs * attended_context], 30 | -1) 31 | return super(MatchLSTMAttnCell, self).__call__(new_inputs, state, scope) 32 | 33 | 34 | class MatchLSTMBlockAttnCell(tc.rnn.LSTMBlockCell): 35 | """ 36 | Implements the Match-LSTM attention cell 37 | """ 38 | 39 | def __init__(self, num_units, context_to_attend): 40 | super(MatchLSTMBlockAttnCell, self).__init__(num_units, reuse=tf.AUTO_REUSE) 41 | self.context_to_attend = context_to_attend 42 | self.fc_context = tc.layers.fully_connected(self.context_to_attend, 43 | num_outputs=self._num_units, 44 | activation_fn=None) 45 | 46 | def __call__(self, inputs, state, scope=None): 47 | (c_prev, h_prev) = state 48 | with tf.variable_scope(scope or type(self).__name__): 49 | ref_vector = tf.concat([inputs, h_prev], -1) 50 | G = tf.tanh(self.fc_context 51 | + tf.expand_dims(tc.layers.fully_connected(ref_vector, 52 | num_outputs=self._num_units, 53 | activation_fn=None), 1)) 54 | logits = tc.layers.fully_connected(G, num_outputs=1, activation_fn=None) 55 | scores = tf.nn.softmax(logits, 1) 56 | attended_context = 
tf.reduce_sum(self.context_to_attend * scores, axis=1) 57 | new_inputs = tf.concat([inputs, attended_context, 58 | inputs - attended_context, inputs * attended_context], 59 | -1) 60 | return super(MatchLSTMBlockAttnCell, self).__call__(new_inputs, state, scope) 61 | 62 | 63 | class MatchLSTMLayer(object): 64 | """ 65 | Implements the Match-LSTM layer, which attend to the question dynamically in a LSTM fashion. 66 | """ 67 | 68 | def __init__(self, hidden_size): 69 | self.hidden_size = hidden_size 70 | 71 | def match(self, passage_encodes, question_encodes, p_length, q_length): 72 | """ 73 | Match the passage_encodes with question_encodes using Match-LSTM algorithm 74 | """ 75 | with tf.variable_scope('match_lstm', reuse=tf.AUTO_REUSE): 76 | cell_fw = MatchLSTMBlockAttnCell(self.hidden_size, question_encodes) 77 | cell_bw = MatchLSTMBlockAttnCell(self.hidden_size, question_encodes) 78 | outputs, state = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, 79 | inputs=passage_encodes, 80 | sequence_length=p_length, 81 | dtype=tf.float32) 82 | match_outputs = tf.concat(outputs, 2) 83 | state_fw, state_bw = state 84 | c_fw, h_fw = state_fw 85 | c_bw, h_bw = state_bw 86 | match_state = tf.concat([h_fw, h_bw], 1) 87 | return match_outputs, match_state 88 | 89 | 90 | class AttentionFlowMatchLayer(object): 91 | """ 92 | Implements the Attention Flow layer, 93 | which computes Context-to-question Attention and question-to-context Attention 94 | """ 95 | 96 | def __init__(self, hidden_size): 97 | self.hidden_size = hidden_size 98 | 99 | def match(self, passage_encodes, question_encodes, p_length, q_length): 100 | """ 101 | Match the passage_encodes with question_encodes using Attention Flow Match algorithm 102 | """ 103 | with tf.variable_scope('bidaf', reuse=tf.AUTO_REUSE): 104 | sim_matrix_0 = tf.matmul(passage_encodes, question_encodes, transpose_b=True) 105 | context2question_attn = tf.matmul(tf.nn.softmax(sim_matrix_0, -1), question_encodes) 106 | b = tf.nn.softmax(tf.expand_dims(tf.reduce_max(sim_matrix_0, 2), 1), -1) 107 | question2context_attn = tf.tile(tf.matmul(b, passage_encodes), 108 | [1, tf.shape(passage_encodes)[1], 1]) 109 | sim_matrix_1 = tf.matmul(passage_encodes, passage_encodes, transpose_b=True) 110 | context2context_attn = tf.matmul(tf.nn.softmax(sim_matrix_1, -1), passage_encodes) 111 | concat_outputs = tf.concat([passage_encodes, context2question_attn, context2context_attn, 112 | passage_encodes * context2question_attn, 113 | passage_encodes * question2context_attn], -1) 114 | return concat_outputs, None 115 | -------------------------------------------------------------------------------- /BiDAF+Self Attention/dureader/layers/pointer_net.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tensorflow.contrib as tc 3 | 4 | 5 | def custom_dynamic_rnn(cell, inputs, inputs_len, initial_state=None): 6 | """ 7 | Implements a dynamic rnn that can store scores in the pointer network, 8 | the reason why we implements this is that the raw_rnn or dynamic_rnn function in Tensorflow 9 | seem to require the hidden unit and memory unit has the same dimension, and we cannot 10 | store the scores directly in the hidden unit. 
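    Instead, the per-step pointer scores are collected in a TensorArray inside the tf.while_loop body and stacked into the final output tensor.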
11 | Args: 12 | cell: RNN cell 13 | inputs: the input sequence to rnn 14 | inputs_len: valid length 15 | initial_state: initial_state of the cell 16 | Returns: 17 | outputs and state 18 | """ 19 | batch_size = tf.shape(inputs)[0] 20 | max_time = tf.shape(inputs)[1] 21 | 22 | inputs_ta = tf.TensorArray(dtype=tf.float32, size=max_time) 23 | inputs_ta = inputs_ta.unstack(tf.transpose(inputs, [1, 0, 2])) 24 | emit_ta = tf.TensorArray(dtype=tf.float32, dynamic_size=True, size=0) 25 | t0 = tf.constant(0, dtype=tf.int32) 26 | if initial_state is not None: 27 | s0 = initial_state 28 | else: 29 | s0 = cell.zero_state(batch_size, dtype=tf.float32) 30 | f0 = tf.zeros([batch_size], dtype=tf.bool) 31 | 32 | def loop_fn(t, prev_s, emit_ta, finished): 33 | """ 34 | the loop function of rnn 35 | """ 36 | cur_x = inputs_ta.read(t) 37 | scores, cur_state = cell(cur_x, prev_s) 38 | 39 | # copy through 40 | scores = tf.where(finished, tf.zeros_like(scores), scores) 41 | if isinstance(cell, tc.rnn.LSTMBlockCell): 42 | # if isinstance(cell, tc.rnn.LSTMCell): 43 | cur_c, cur_h = cur_state 44 | prev_c, prev_h = prev_s 45 | cur_state = tc.rnn.LSTMStateTuple(tf.where(finished, prev_c, cur_c), 46 | tf.where(finished, prev_h, cur_h)) 47 | else: 48 | cur_state = tf.where(finished, prev_s, cur_state) 49 | 50 | emit_ta = emit_ta.write(t, scores) 51 | finished = tf.greater_equal(t + 1, inputs_len) 52 | return [t + 1, cur_state, emit_ta, finished] 53 | 54 | _, state, emit_ta, _ = tf.while_loop( 55 | cond=lambda _1, _2, _3, finished: tf.logical_not(tf.reduce_all(finished)), 56 | body=loop_fn, 57 | loop_vars=(t0, s0, emit_ta, f0), 58 | parallel_iterations=32, 59 | swap_memory=False) 60 | 61 | outputs = tf.transpose(emit_ta.stack(), [1, 0, 2]) 62 | return outputs, state 63 | 64 | 65 | def attend_pooling(pooling_vectors, ref_vector, hidden_size, scope=None): 66 | """ 67 | Applies attend pooling to a set of vectors according to a reference vector. 
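    The attention scores are softmax(v^T tanh(W1 * pooling_vectors + W2 * ref_vector)) over the pooling dimension, and the pooled vector is the score-weighted sum of pooling_vectors.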
68 | Args: 69 | pooling_vectors: the vectors to pool 70 | ref_vector: the reference vector 71 | hidden_size: the hidden size for attention function 72 | scope: score name 73 | Returns: 74 | the pooled vector 75 | """ 76 | with tf.variable_scope(scope or 'attend_pooling', reuse=tf.AUTO_REUSE): 77 | U = tf.tanh(tc.layers.fully_connected(pooling_vectors, num_outputs=hidden_size, 78 | activation_fn=None, biases_initializer=None) 79 | + tc.layers.fully_connected(tf.expand_dims(ref_vector, 1), 80 | num_outputs=hidden_size, 81 | activation_fn=None)) 82 | logits = tc.layers.fully_connected(U, num_outputs=1, activation_fn=None) 83 | scores = tf.nn.softmax(logits, 1) 84 | pooled_vector = tf.reduce_sum(pooling_vectors * scores, axis=1) 85 | return pooled_vector 86 | 87 | 88 | class PointerNetLSTMCell(tc.rnn.LSTMCell): 89 | """ 90 | Implements the Pointer Network Cell 91 | """ 92 | 93 | def __init__(self, num_units, context_to_point): 94 | super(PointerNetLSTMCell, self).__init__(num_units, state_is_tuple=True) 95 | self.context_to_point = context_to_point 96 | self.fc_context = tc.layers.fully_connected(self.context_to_point, 97 | num_outputs=self._num_units, 98 | activation_fn=None) 99 | 100 | def __call__(self, inputs, state, scope=None): 101 | (c_prev, m_prev) = state 102 | with tf.variable_scope(scope or type(self).__name__, reuse=tf.AUTO_REUSE): 103 | U = tf.tanh(self.fc_context 104 | + tf.expand_dims(tc.layers.fully_connected(m_prev, 105 | num_outputs=self._num_units, 106 | activation_fn=None), 107 | 1)) 108 | logits = tc.layers.fully_connected(U, num_outputs=1, activation_fn=None) 109 | scores = tf.nn.softmax(logits, 1) 110 | attended_context = tf.reduce_sum(self.context_to_point * scores, axis=1) 111 | lstm_out, lstm_state = super(PointerNetLSTMCell, self).__call__(attended_context, state) 112 | return tf.squeeze(scores, -1), lstm_state 113 | 114 | 115 | class PointerNetLSTMBlockCell(tc.rnn.LSTMBlockCell): 116 | """ 117 | Implements the Pointer Network Cell 118 | """ 119 | 120 | def __init__(self, num_units, context_to_point): 121 | super(PointerNetLSTMBlockCell, self).__init__(num_units, reuse=tf.AUTO_REUSE) 122 | self.context_to_point = context_to_point 123 | self.fc_context = tc.layers.fully_connected(self.context_to_point, 124 | num_outputs=self._num_units, 125 | activation_fn=None) 126 | 127 | def __call__(self, inputs, state, scope=None): 128 | (c_prev, m_prev) = state 129 | with tf.variable_scope(scope or type(self).__name__, reuse=tf.AUTO_REUSE): 130 | U = tf.tanh(self.fc_context 131 | + tf.expand_dims(tc.layers.fully_connected(m_prev, 132 | num_outputs=self._num_units, 133 | activation_fn=None), 134 | 1)) 135 | logits = tc.layers.fully_connected(U, num_outputs=1, activation_fn=None) 136 | scores = tf.nn.softmax(logits, 1) 137 | attended_context = tf.reduce_sum(self.context_to_point * scores, axis=1) 138 | lstm_out, lstm_state = super(PointerNetLSTMBlockCell, self).__call__(attended_context, state) 139 | return tf.squeeze(scores, -1), lstm_state 140 | 141 | 142 | class PointerNetDecoder(object): 143 | """ 144 | Implements the Pointer Network 145 | """ 146 | 147 | def __init__(self, hidden_size): 148 | self.hidden_size = hidden_size 149 | 150 | def decode(self, passage_vectors, question_vectors, init_with_question=True): 151 | """ 152 | Use Pointer Network to compute the probabilities of each position 153 | to be start and end of the answer 154 | Args: 155 | passage_vectors: the encoded passage vectors 156 | question_vectors: the encoded question vectors 157 | init_with_question: if 
set to be true, 158 | we will use the question_vectors to init the state of Pointer Network 159 | Returns: 160 | the probs of evary position to be start and end of the answer 161 | """ 162 | with tf.variable_scope('pn_decoder', reuse=tf.AUTO_REUSE): 163 | fake_inputs = tf.zeros([tf.shape(passage_vectors)[0], 2, 1]) # not used 164 | sequence_len = tf.tile([2], [tf.shape(passage_vectors)[0]]) 165 | if init_with_question: 166 | random_attn_vector = tf.Variable(tf.random_normal([1, self.hidden_size]), 167 | trainable=True, name="random_attn_vector") 168 | pooled_question_rep = tc.layers.fully_connected( 169 | attend_pooling(question_vectors, random_attn_vector, self.hidden_size), 170 | num_outputs=self.hidden_size, activation_fn=None 171 | ) 172 | init_state = tc.rnn.LSTMStateTuple(pooled_question_rep, pooled_question_rep) 173 | else: 174 | init_state = None 175 | with tf.variable_scope('fw', reuse=tf.AUTO_REUSE): 176 | fw_cell = PointerNetLSTMBlockCell(self.hidden_size, passage_vectors) 177 | # fw_cell = PointerNetLSTMCell(self.hidden_size, passage_vectors) 178 | fw_outputs, _ = custom_dynamic_rnn(fw_cell, fake_inputs, sequence_len, init_state) 179 | with tf.variable_scope('bw', reuse=tf.AUTO_REUSE): 180 | bw_cell = PointerNetLSTMBlockCell(self.hidden_size, passage_vectors) 181 | # bw_cell = PointerNetLSTMCell(self.hidden_size, passage_vectors) 182 | bw_outputs, _ = custom_dynamic_rnn(bw_cell, fake_inputs, sequence_len, init_state) 183 | start_prob = (fw_outputs[0:, 0, 0:] + bw_outputs[0:, 1, 0:]) / 2 184 | end_prob = (fw_outputs[0:, 1, 0:] + bw_outputs[0:, 0, 0:]) / 2 185 | return start_prob, end_prob 186 | -------------------------------------------------------------------------------- /BiDAF+Self Attention/dureader/pretrain_embedding.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import logging 4 | # import argparse 5 | from gensim.models import word2vec 6 | 7 | 8 | def pre_train(segmented_dir): 9 | sys.path.append('..') 10 | 11 | program = os.path.basename(sys.argv[0]) 12 | logger = logging.getLogger(program) 13 | logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s') 14 | logging.root.setLevel(level=logging.INFO) 15 | logger.info("running %s" % ' '.join(sys.argv)) 16 | 17 | model = word2vec.Word2Vec(word2vec.PathLineSentences(segmented_dir), size=300, min_count=2, workers=8, iter=10) 18 | with open(os.path.join(segmented_dir, 'w2v_dic.data'), 'w', encoding='utf-8') as f: 19 | for word in model.wv.vocab: 20 | f.write(word + ' ') 21 | f.write(' '.join(list(map(str, model[word])))) 22 | f.write('\n') 23 | f.close() 24 | 25 | model.save_word2vec_format(os.path.join(segmented_dir, 'w2v_model.bin'), binary=True) 26 | 27 | 28 | def write_data(data_set, tar_dir): 29 | 30 | with open(tar_dir, 'w', encoding='utf8') as f: 31 | for sample in data_set: 32 | f.write(' '.join(sample['segmented_question']) + '\n') 33 | for passage in sample['passages']: 34 | f.write(' '.join(passage['passage_tokens']) + '\n') 35 | del sample 36 | f.close() 37 | -------------------------------------------------------------------------------- /BiDAF+Self Attention/dureader/vocab.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf8 -*- 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. 
All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | """ 18 | This module implements the Vocab class for converting string to id and back 19 | """ 20 | 21 | import numpy as np 22 | import pickle as pkl 23 | 24 | 25 | class Vocab(object): 26 | """ 27 | Implements a vocabulary to store the tokens in the data, with their corresponding embeddings. 28 | """ 29 | 30 | def __init__(self, embed_dim=300, filename=None, initial_tokens=None, lower=False): 31 | self.id2token = {} 32 | self.token2id = {} 33 | self.token_cnt = {} 34 | self.lower = lower 35 | 36 | self.embed_dim = embed_dim 37 | self.embeddings = None 38 | 39 | self.pad_token = '' 40 | self.unk_token = '' 41 | 42 | self.initial_tokens = initial_tokens if initial_tokens is not None else [] 43 | self.initial_tokens.extend([self.pad_token, self.unk_token]) 44 | for token in self.initial_tokens: 45 | self.add(token) 46 | 47 | if filename is not None: 48 | self.load_from_file(filename) 49 | 50 | def size(self): 51 | """ 52 | get the size of vocabulary 53 | Returns: 54 | an integer indicating the size 55 | """ 56 | return len(self.id2token) 57 | 58 | def load_from_file(self, file_path): 59 | """ 60 | loads the vocab from file_path 61 | Args: 62 | file_path: a file with a word in each line 63 | """ 64 | for line in open(file_path, 'r'): 65 | token = line.rstrip('\n') 66 | self.add(token) 67 | 68 | def get_id(self, token): 69 | """ 70 | gets the id of a token, returns the id of unk token if token is not in vocab 71 | Args: 72 | key: a string indicating the word 73 | Returns: 74 | an integer 75 | """ 76 | token = token.lower() if self.lower else token 77 | try: 78 | return self.token2id[token] 79 | except KeyError: 80 | return self.token2id[self.unk_token] 81 | 82 | def get_token(self, idx): 83 | """ 84 | gets the token corresponding to idx, returns unk token if idx is not in vocab 85 | Args: 86 | idx: an integer 87 | returns: 88 | a token string 89 | """ 90 | try: 91 | return self.id2token[idx] 92 | except KeyError: 93 | return self.unk_token 94 | 95 | def add(self, token, cnt=1): 96 | """ 97 | adds the token to vocab 98 | Args: 99 | token: a string 100 | cnt: a num indicating the count of the token to add, default is 1 101 | """ 102 | token = token.lower() if self.lower else token 103 | if token in self.token2id: 104 | idx = self.token2id[token] 105 | else: 106 | # vocab中无此token,则添加token2id id2token 107 | idx = len(self.id2token) 108 | self.id2token[idx] = token 109 | self.token2id[token] = idx 110 | if cnt > 0: 111 | if token in self.token_cnt: 112 | self.token_cnt[token] += cnt 113 | else: 114 | self.token_cnt[token] = cnt 115 | return idx 116 | 117 | def filter_tokens_by_cnt(self, min_cnt): 118 | """ 119 | filter the tokens in vocab by their count 120 | Args: 121 | min_cnt: tokens with frequency less than min_cnt is filtered 122 | """ 123 | filtered_tokens = [token for token in self.token2id if 
self.token_cnt[token] >= min_cnt] 124 | # rebuild the token x id map 125 | self.token2id = {} 126 | self.id2token = {} 127 | for token in self.initial_tokens: 128 | self.add(token, cnt=0) 129 | for token in filtered_tokens: 130 | self.add(token, cnt=0) 131 | 132 | def randomly_init_embeddings(self, embed_dim): 133 | """ 134 | randomly initializes the embeddings for each token 135 | Args: 136 | embed_dim: the size of the embedding for each token 137 | """ 138 | self.embed_dim = embed_dim 139 | self.embeddings = np.random.rand(self.size(), embed_dim) 140 | # 填充符号和未知词符号初始化为0 141 | for token in [self.pad_token, self.unk_token]: 142 | self.embeddings[self.get_id(token)] = np.zeros([self.embed_dim]) 143 | 144 | def load_pretrained_embeddings(self, embedding_path): 145 | """ 146 | loads the pretrained embeddings from embedding_path, 147 | tokens not in pretrained embeddings will be filtered 148 | Args: 149 | embedding_path: the path of the pretrained embedding file 150 | """ 151 | with open(embedding_path, 'rb') as fin: 152 | trained_embeddings = pkl.load(fin) 153 | fin.close() 154 | filtered_tokens = trained_embeddings.keys() 155 | # rebuild the token x id map 156 | self.token2id = {} 157 | self.id2token = {} 158 | for token in self.initial_tokens: 159 | self.add(token, cnt=0) 160 | for token in filtered_tokens: 161 | self.add(token, cnt=0) 162 | # load embeddings 163 | self.embeddings = np.zeros([self.size(), self.embed_dim]) 164 | for token in self.token2id.keys(): 165 | if token in trained_embeddings: 166 | self.embeddings[self.get_id(token)] = trained_embeddings[token] 167 | 168 | def convert_to_ids(self, tokens): 169 | """ 170 | Convert a list of tokens to ids, use unk_token if the token is not in vocab. 171 | Args: 172 | tokens: a list of token 173 | Returns: 174 | a list of ids 175 | """ 176 | vec = [self.get_id(label) for label in tokens] 177 | return vec 178 | 179 | def recover_from_ids(self, ids, stop_id=None): 180 | """ 181 | Convert a list of ids to tokens, stop converting if the stop_id is encountered 182 | Args: 183 | ids: a list of ids to convert 184 | stop_id: the stop id, default is None 185 | Returns: 186 | a list of tokens 187 | """ 188 | tokens = [] 189 | for i in ids: 190 | tokens += [self.get_token(i)] 191 | if stop_id is not None and i == stop_id: 192 | break 193 | return tokens 194 | -------------------------------------------------------------------------------- /BiDAF+Self Attention/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # coding:utf8 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | """ 18 | This package implements some utility functions shared by PaddlePaddle 19 | and Tensorflow model implementations. 
20 | 21 | Authors: liuyuan(liuyuan04@baidu.com) 22 | Date: 2017/10/06 18:23:06 23 | """ 24 | 25 | 26 | from .dureader_eval import compute_bleu_rouge 27 | from .dureader_eval import normalize 28 | from .preprocess import find_fake_answer 29 | from .preprocess import find_best_question_match 30 | 31 | __all__ = [ 32 | 'compute_bleu_rouge', 33 | 'normalize', 34 | 'find_fake_answer', 35 | 'find_best_question_match', 36 | ] 37 | -------------------------------------------------------------------------------- /BiDAF+Self Attention/utils/bleu.py: -------------------------------------------------------------------------------- 1 | # coding:utf8 2 | 3 | import math 4 | 5 | from utils import common 6 | 7 | 8 | class BLEU(object): 9 | def __init__(self, n_size): 10 | self.match_ngram = {} 11 | self.candi_ngram = {} 12 | self.bp_r = 0 13 | self.bp_c = 0 14 | self.n_size = n_size 15 | 16 | def add_inst(self, cand, ref_list): 17 | for n_size in range(self.n_size): 18 | self.count_ngram(cand, ref_list, n_size) 19 | self.count_bp(cand, ref_list) 20 | 21 | def count_ngram(self, cand, ref_list, n_size): 22 | cand_ngram = common.get_ngram(cand, n_size) 23 | refs_ngram = [] 24 | for ref in ref_list: 25 | refs_ngram.append(common.get_ngram(ref, n_size)) 26 | if n_size not in self.match_ngram: 27 | self.match_ngram[n_size] = 0 28 | self.candi_ngram[n_size] = 0 29 | match_size, cand_size = common.get_match_size(cand_ngram, refs_ngram) 30 | self.match_ngram[n_size] += match_size 31 | self.candi_ngram[n_size] += cand_size 32 | 33 | def count_bp(self, cand, ref_list): 34 | self.bp_c += len(cand) 35 | self.bp_r += min([ 36 | (abs(len(cand) - len(ref)), len(ref)) 37 | for ref in ref_list] 38 | )[1] 39 | 40 | def score(self): 41 | prob_list = [] 42 | for n_size in range(self.n_size): 43 | if float(self.candi_ngram[n_size]) == 0: 44 | prob_list.append(0) 45 | else: 46 | prob_list.append(self.match_ngram[n_size] / float(self.candi_ngram[n_size])) 47 | # prob_list = [ 48 | # self.match_ngram[n_size] / float(self.candi_ngram[n_size]) 49 | # for n_size in range(self.n_size) 50 | # ] 51 | bleu_list = [prob_list[0]] 52 | for n in range(1, self.n_size): 53 | bleu_list.append(bleu_list[-1] * prob_list[n]) 54 | for n in range(self.n_size): 55 | bleu_list[n] = bleu_list[n] ** (1. 
/ float(n + 1)) 56 | bp = math.exp(min(1 - self.bp_r / float(self.bp_c), 0)) 57 | for n in range(self.n_size): 58 | bleu_list[n] = bleu_list[n] * bp 59 | return bleu_list 60 | 61 | 62 | class BLEUWithBonus(BLEU): 63 | def __init__(self, n_size, alpha=1.0, beta=1.0): 64 | super(BLEUWithBonus, self).__init__(n_size) 65 | self.alpha = alpha 66 | self.beta = beta 67 | 68 | def add_inst(self, 69 | cand, 70 | ref_list, 71 | yn_label=None, yn_ref=None, entity_ref=None): 72 | # super(BLEUWithBonus, self).add_inst(cand, ref_list) 73 | BLEU.add_inst(self, cand, ref_list) 74 | if yn_label is not None and yn_ref is not None: 75 | self.add_yn_bonus(cand, ref_list, yn_label, yn_ref) 76 | elif entity_ref is not None: 77 | self.add_entity_bonus(cand, entity_ref) 78 | 79 | def add_yn_bonus(self, cand, ref_list, yn_label, yn_ref): 80 | for n_size in range(self.n_size): 81 | cand_ngram = common.get_ngram(cand, n_size, label=yn_label) 82 | ref_ngram = [] 83 | for ref_id, r in enumerate(yn_ref): 84 | ref_ngram.append(common.get_ngram(ref_list[ref_id], n_size, label=r)) 85 | match_size, cand_size = common.get_match_size(cand_ngram, ref_ngram) 86 | self.match_ngram[n_size] += self.alpha * match_size 87 | self.candi_ngram[n_size] += self.alpha * match_size 88 | 89 | def add_entity_bonus(self, cand, entity_ref): 90 | for n_size in range(self.n_size): 91 | cand_ngram = common.get_ngram(cand, n_size, label='ENTITY') 92 | ref_ngram = [] 93 | for reff_id, r in enumerate(entity_ref): 94 | ref_ngram.append(common.get_ngram(r, n_size, label='ENTITY')) 95 | match_size, cand_size = common.get_match_size(cand_ngram, ref_ngram) 96 | self.match_ngram[n_size] += self.beta * match_size 97 | self.candi_ngram[n_size] += self.beta * match_size 98 | -------------------------------------------------------------------------------- /BiDAF+Self Attention/utils/bleu_metric/__pycache__/bleu.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shiningliang/MRC2018/9bf27efbbc028116eb01ea2df1bf0d063cb09ae9/BiDAF+Self Attention/utils/bleu_metric/__pycache__/bleu.cpython-36.pyc -------------------------------------------------------------------------------- /BiDAF+Self Attention/utils/bleu_metric/__pycache__/bleu_score.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shiningliang/MRC2018/9bf27efbbc028116eb01ea2df1bf0d063cb09ae9/BiDAF+Self Attention/utils/bleu_metric/__pycache__/bleu_score.cpython-36.pyc -------------------------------------------------------------------------------- /BiDAF+Self Attention/utils/bleu_metric/bleu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : bleu.py 4 | # 5 | # Description : Wrapper for BLEU scorer. 6 | # 7 | # Creation Date : 06-01-2015 8 | # Last Modified : Thu 19 Mar 2015 09:13:28 PM PDT 9 | # Authors : Hao Fang and Tsung-Yi Lin 10 | 11 | from .bleu_score import BleuScorer 12 | 13 | 14 | class Bleu: 15 | def __init__(self, n=4): 16 | # default compute Blue score up to 4 17 | self._n = n 18 | self._hypo_for_image = {} 19 | self.ref_for_image = {} 20 | 21 | def compute_score(self, gts, res): 22 | 23 | assert(list(gts.keys()) == list(res.keys())) 24 | imgIds = list(gts.keys()) 25 | 26 | bleu_scorer = BleuScorer(n=self._n) 27 | for id in imgIds: 28 | hypo = res[id] 29 | ref = gts[id] 30 | 31 | # Sanity check. 
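            # Each hypothesis must be a single candidate string wrapped in a list; each reference entry must be a non-empty list of strings.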
32 | assert(type(hypo) is list) 33 | assert(len(hypo) == 1) 34 | assert(type(ref) is list) 35 | assert(len(ref) >= 1) 36 | 37 | bleu_scorer += (hypo[0], ref) 38 | 39 | #score, scores = bleu_scorer.compute_score(option='shortest') 40 | score, scores = bleu_scorer.compute_score(option='closest', verbose=1) 41 | #score, scores = bleu_scorer.compute_score(option='average', verbose=1) 42 | 43 | # return (bleu, bleu_info) 44 | return score, scores 45 | 46 | def method(self): 47 | return "Bleu" 48 | -------------------------------------------------------------------------------- /BiDAF+Self Attention/utils/bleu_metric/bleu_score.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # bleu_scorer.py 4 | # David Chiang 5 | 6 | # Copyright (c) 2004-2006 University of Maryland. All rights 7 | # reserved. Do not redistribute without permission from the 8 | # author. Not for commercial use. 9 | 10 | # Modified by: 11 | # Hao Fang 12 | # Tsung-Yi Lin 13 | 14 | '''Provides: 15 | cook_refs(refs, n=4): Transform a list of reference sentences as strings into a form usable by cook_test(). 16 | cook_test(test, refs, n=4): Transform a test sentence as a string (together with the cooked reference sentences) into a form usable by score_cooked(). 17 | ''' 18 | 19 | import copy 20 | # import sys, math, re 21 | import math 22 | from collections import defaultdict 23 | 24 | 25 | def precook(s, n=4, out=False): 26 | """Takes a string as input and returns an object that can be given to 27 | either cook_refs or cook_test. This is optional: cook_refs and cook_test 28 | can take string arguments as well.""" 29 | words = s.split() 30 | counts = defaultdict(int) 31 | for k in range(1, n + 1): 32 | for i in range(len(words) - k + 1): 33 | ngram = tuple(words[i:i + k]) 34 | counts[ngram] += 1 35 | return (len(words), counts) 36 | 37 | 38 | def cook_refs(refs, eff=None, n=4): ## lhuang: oracle will call with "average" 39 | '''Takes a list of reference sentences for a single segment 40 | and returns an object that encapsulates everything that BLEU 41 | needs to know about them.''' 42 | 43 | reflen = [] 44 | maxcounts = {} 45 | for ref in refs: 46 | rl, counts = precook(ref, n) 47 | reflen.append(rl) 48 | for (ngram, count) in counts.items(): 49 | maxcounts[ngram] = max(maxcounts.get(ngram, 0), count) 50 | 51 | # Calculate effective reference sentence length. 52 | if eff == "shortest": 53 | reflen = min(reflen) 54 | elif eff == "average": 55 | reflen = float(sum(reflen)) / len(reflen) 56 | 57 | ## lhuang: N.B.: leave reflen computaiton to the very end!! 58 | 59 | ## lhuang: N.B.: in case of "closest", keep a list of reflens!! (bad design) 60 | 61 | return (reflen, maxcounts) 62 | 63 | 64 | def cook_test(test, xxx_todo_changeme, eff=None, n=4): 65 | '''Takes a test sentence and returns an object that 66 | encapsulates everything that BLEU needs to know about it.''' 67 | (reflen, refmaxcounts) = xxx_todo_changeme 68 | testlen, counts = precook(test, n, True) 69 | 70 | result = {} 71 | 72 | # Calculate effective reference sentence length. 
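    # For "closest", the reference length nearest to the test sentence length is chosen below; for "shortest"/"average" the reference length was already reduced in cook_refs.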
73 | 74 | if eff == "closest": 75 | result["reflen"] = min((abs(l - testlen), l) for l in reflen)[1] 76 | else: ## i.e., "average" or "shortest" or None 77 | result["reflen"] = reflen 78 | 79 | result["testlen"] = testlen 80 | 81 | result["guess"] = [max(0, testlen - k + 1) for k in range(1, n + 1)] 82 | 83 | result['correct'] = [0] * n 84 | for (ngram, count) in counts.items(): 85 | result["correct"][len(ngram) - 1] += min(refmaxcounts.get(ngram, 0), count) 86 | 87 | return result 88 | 89 | 90 | class BleuScorer(object): 91 | """Bleu scorer. 92 | """ 93 | 94 | __slots__ = "n", "crefs", "ctest", "_score", "_ratio", "_testlen", "_reflen", "special_reflen" 95 | 96 | # special_reflen is used in oracle (proportional effective ref len for a node). 97 | 98 | def copy(self): 99 | ''' copy the refs.''' 100 | new = BleuScorer(n=self.n) 101 | new.ctest = copy.copy(self.ctest) 102 | new.crefs = copy.copy(self.crefs) 103 | new._score = None 104 | return new 105 | 106 | def __init__(self, test=None, refs=None, n=4, special_reflen=None): 107 | ''' singular instance ''' 108 | 109 | self.n = n 110 | self.crefs = [] 111 | self.ctest = [] 112 | self.cook_append(test, refs) 113 | self.special_reflen = special_reflen 114 | 115 | def cook_append(self, test, refs): 116 | '''called by constructor and __iadd__ to avoid creating new instances.''' 117 | 118 | if refs is not None: 119 | self.crefs.append(cook_refs(refs)) 120 | if test is not None: 121 | cooked_test = cook_test(test, self.crefs[-1]) 122 | self.ctest.append(cooked_test) ## N.B.: -1 123 | else: 124 | self.ctest.append(None) # lens of crefs and ctest have to match 125 | 126 | self._score = None ## need to recompute 127 | 128 | def ratio(self, option=None): 129 | self.compute_score(option=option) 130 | return self._ratio 131 | 132 | def score_ratio(self, option=None): 133 | '''return (bleu, len_ratio) pair''' 134 | return (self.fscore(option=option), self.ratio(option=option)) 135 | 136 | def score_ratio_str(self, option=None): 137 | return "%.4f (%.2f)" % self.score_ratio(option) 138 | 139 | def reflen(self, option=None): 140 | self.compute_score(option=option) 141 | return self._reflen 142 | 143 | def testlen(self, option=None): 144 | self.compute_score(option=option) 145 | return self._testlen 146 | 147 | def retest(self, new_test): 148 | if type(new_test) is str: 149 | new_test = [new_test] 150 | assert len(new_test) == len(self.crefs), new_test 151 | self.ctest = [] 152 | for t, rs in zip(new_test, self.crefs): 153 | self.ctest.append(cook_test(t, rs)) 154 | self._score = None 155 | 156 | return self 157 | 158 | def rescore(self, new_test): 159 | ''' replace test(s) with new test(s), and returns the new score.''' 160 | 161 | return self.retest(new_test).compute_score() 162 | 163 | def size(self): 164 | assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest)) 165 | return len(self.crefs) 166 | 167 | def __iadd__(self, other): 168 | '''add an instance (e.g., from another sentence).''' 169 | 170 | if type(other) is tuple: 171 | ## avoid creating new BleuScorer instances 172 | self.cook_append(other[0], other[1]) 173 | else: 174 | assert self.compatible(other), "incompatible BLEUs." 
175 | self.ctest.extend(other.ctest) 176 | self.crefs.extend(other.crefs) 177 | self._score = None ## need to recompute 178 | 179 | return self 180 | 181 | def compatible(self, other): 182 | return isinstance(other, BleuScorer) and self.n == other.n 183 | 184 | def single_reflen(self, option="average"): 185 | return self._single_reflen(self.crefs[0][0], option) 186 | 187 | def _single_reflen(self, reflens, option=None, testlen=None): 188 | 189 | if option == "shortest": 190 | reflen = min(reflens) 191 | elif option == "average": 192 | reflen = float(sum(reflens)) / len(reflens) 193 | elif option == "closest": 194 | reflen = min((abs(l - testlen), l) for l in reflens)[1] 195 | else: 196 | assert False, "unsupported reflen option %s" % option 197 | 198 | return reflen 199 | 200 | def recompute_score(self, option=None, verbose=0): 201 | self._score = None 202 | return self.compute_score(option, verbose) 203 | 204 | def compute_score(self, option=None, verbose=0): 205 | n = self.n 206 | small = 1e-9 207 | tiny = 1e-15 ## so that if guess is 0 still return 0 208 | bleu_list = [[] for _ in range(n)] 209 | 210 | if self._score is not None: 211 | return self._score 212 | 213 | if option is None: 214 | option = "average" if len(self.crefs) == 1 else "closest" 215 | 216 | self._testlen = 0 217 | self._reflen = 0 218 | totalcomps = {'testlen': 0, 'reflen': 0, 'guess': [0] * n, 'correct': [0] * n} 219 | 220 | # for each sentence 221 | for comps in self.ctest: 222 | testlen = comps['testlen'] 223 | self._testlen += testlen 224 | 225 | if self.special_reflen is None: ## need computation 226 | reflen = self._single_reflen(comps['reflen'], option, testlen) 227 | else: 228 | reflen = self.special_reflen 229 | 230 | self._reflen += reflen 231 | 232 | for key in ['guess', 'correct']: 233 | for k in range(n): 234 | totalcomps[key][k] += comps[key][k] 235 | 236 | # append per image bleu score 237 | bleu = 1. 238 | for k in range(n): 239 | bleu *= (float(comps['correct'][k]) + tiny) \ 240 | / (float(comps['guess'][k]) + small) 241 | bleu_list[k].append(bleu ** (1. / (k + 1))) 242 | ratio = (testlen + tiny) / (reflen + small) ## N.B.: avoid zero division 243 | if ratio < 1: 244 | for k in range(n): 245 | bleu_list[k][-1] *= math.exp(1 - 1 / ratio) 246 | 247 | if verbose > 1: 248 | print(comps, reflen) 249 | 250 | totalcomps['reflen'] = self._reflen 251 | totalcomps['testlen'] = self._testlen 252 | 253 | bleus = [] 254 | bleu = 1. 255 | for k in range(n): 256 | bleu *= float(totalcomps['correct'][k] + tiny) \ 257 | / (totalcomps['guess'][k] + small) 258 | bleus.append(bleu ** (1. 
/ (k + 1))) 259 | ratio = (self._testlen + tiny) / (self._reflen + small) ## N.B.: avoid zero division 260 | if ratio < 1: 261 | for k in range(n): 262 | bleus[k] *= math.exp(1 - 1 / ratio) 263 | 264 | if verbose > 0: 265 | print(totalcomps) 266 | print("ratio:", ratio) 267 | 268 | self._score = bleus 269 | return self._score, bleu_list 270 | -------------------------------------------------------------------------------- /BiDAF+Self Attention/utils/common.py: -------------------------------------------------------------------------------- 1 | # coding:utf8 2 | from functools import reduce 3 | import math 4 | import ujson as json 5 | from collections import defaultdict 6 | import sys 7 | 8 | 9 | def get_match_size(cand_ngram, refs_ngram): 10 | ref_set = defaultdict(int) 11 | for ref_ngram in refs_ngram: 12 | tmp_ref_set = defaultdict(int) 13 | for ngram in ref_ngram: 14 | tmp_ref_set[ngram] += 1 15 | for ngram, count in tmp_ref_set.items(): 16 | ref_set[ngram] = max(ref_set[ngram], count) 17 | cand_set = defaultdict(int) 18 | for ngram in cand_ngram: 19 | cand_set[ngram] += 1 20 | match_size = 0 21 | for ngram, count in cand_set.items(): 22 | match_size += min(count, ref_set.get(ngram, 0)) 23 | cand_size = len(cand_ngram) 24 | return match_size, cand_size 25 | 26 | 27 | def get_ngram(sent, n_size, label=None): 28 | def _ngram(sent, n_size): 29 | ngram_list = [] 30 | for left in range(len(sent) - n_size): 31 | ngram_list.append(sent[left: left + n_size + 1]) 32 | return ngram_list 33 | 34 | ngram_list = _ngram(sent, n_size) 35 | if label is not None: 36 | ngram_list = [ngram + '_' + label for ngram in ngram_list] 37 | return ngram_list 38 | 39 | 40 | def word2char(str_in): 41 | str_out = str_in.replace(' ', '') 42 | return ''.join(str_out.split()) 43 | -------------------------------------------------------------------------------- /BiDAF+Self Attention/utils/get_vocab.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf8 -*- 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | """ 18 | Utility function to generate vocabulary file. 19 | """ 20 | 21 | 22 | import argparse 23 | import sys 24 | import json 25 | 26 | from itertools import chain 27 | 28 | 29 | def get_vocab(files, vocab_file): 30 | """ 31 | Builds vocabulary file from field 'segmented_paragraphs' 32 | and 'segmented_question'. 33 | 34 | Args: 35 | files: A list of file names. 36 | vocab_file: The file that stores the vocabulary. 
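    Example (illustrative; the path is a placeholder for a preprocessed
    DuReader file with one JSON object per line, each carrying
    'documents' and 'segmented_question'):
        get_vocab(['search.train.json'], 'vocab.txt')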
37 | """ 38 | vocab = {} 39 | for f in files: 40 | with open(f, 'r') as fin: 41 | for line in fin: 42 | obj = json.loads(line.strip()) 43 | paras = [ 44 | chain(*d['segmented_paragraphs']) 45 | for d in obj['documents']] 46 | doc_tokens = chain(*paras) 47 | question_tokens = obj['segmented_question'] 48 | for t in list(doc_tokens) + question_tokens: 49 | vocab[t] = vocab.get(t, 0) + 1 50 | # output 51 | sorted_vocab = sorted([(v, c) for v, c in vocab.items()], 52 | key=lambda x: x[1], 53 | reverse=True) 54 | with open(vocab_file, 'w') as outf: 55 | for w, c in sorted_vocab: 56 | print >> outf, '{}\t{}'.format(w.encode('utf8'), c) 57 | 58 | 59 | if __name__ == '__main__': 60 | parser = argparse.ArgumentParser() 61 | parser.add_argument('--files', nargs='+', required=True, 62 | help='file list to count vocab from.') 63 | parser.add_argument('--vocab', required=True, 64 | help='file to store counted vocab.') 65 | args = parser.parse_args() 66 | get_vocab(args.files, args.vocab) 67 | 68 | -------------------------------------------------------------------------------- /BiDAF+Self Attention/utils/mrc_eval.py: -------------------------------------------------------------------------------- 1 | # coding:utf8 2 | """ 3 | This module computes evaluation metrics for DuReader dataset. 4 | """ 5 | 6 | import argparse 7 | import itertools 8 | import ujson as json 9 | import sys 10 | import zipfile 11 | 12 | from collections import Counter 13 | from .bleu import BLEUWithBonus 14 | from .rouge import RougeLWithBonus 15 | 16 | EMPTY = '' 17 | YESNO_LABELS = set(['Yes', 'No', 'Depends']) 18 | 19 | 20 | def normalize(s): 21 | """ 22 | Normalize strings to space joined chars. 23 | Args: 24 | s: a list of strings. 25 | Returns: 26 | A list of normalized strings. 27 | """ 28 | if not s: 29 | return s 30 | normalized = [] 31 | for ss in s: 32 | tokens = [c for c in list(ss) if len(c.strip()) != 0] 33 | normalized.append(''.join(tokens)) 34 | return normalized 35 | 36 | 37 | def data_check(obj): 38 | """ 39 | Check data. 40 | 41 | Raises: 42 | Raises AssertionError when data is not legal. 43 | """ 44 | # 判断是否有answer_id 45 | assert 'question_id' in obj, "Missing 'question_id' field." 46 | # assert 'yesno_answers' in obj, \ 47 | # "Missing 'yesno_answers' field. question_id: {}".format(obj['question_id']) 48 | # 如果包含yesno_answers,那么格式必须为list 49 | if "yesno_answers" in obj: 50 | assert isinstance(obj['yesno_answers'], list), \ 51 | r"""'yesno_answers' field must be a list, if the 'question_type' is not 52 | 'YES_NO', then this field should be an empty list. 53 | question_id: {}""".format(obj['question_id']) 54 | else: 55 | obj["yesno_answers"] = [] 56 | if "entity_answers" not in obj: 57 | obj["entity_answers"] = [] 58 | 59 | 60 | def read_file(file_name, is_ref=False): 61 | """ 62 | Read predict answers or reference answers from file. 63 | 64 | Args: 65 | file_name: the name of the file containing predict result or reference 66 | result. 67 | 68 | Returns: 69 | A dictionary mapping question_id to the result information. The result 70 | information itself is also a dictionary with has four keys: 71 | - question_type: type of the query. 72 | - yesno_answers: A list of yesno answers corresponding to 'answers'. 73 | - answers: A list of predicted answers. 74 | - entity_answers: A list, each element is also a list containing the entities 75 | tagged out from the corresponding answer string. 
76 | """ 77 | 78 | def _open(file_name, mode, zip_obj=None): 79 | if zip_obj is not None: 80 | return zip_obj.open(file_name, mode) 81 | return open(file_name, mode) 82 | 83 | results = {} 84 | # 是否是参考答案 85 | if is_ref: 86 | keys = ['source', 'answers', 'yesno_answers', 'entity_answers', 'question_type'] 87 | else: 88 | keys = ['answers', 'yesno_answers'] 89 | # 如果是zip文件则以zip方式读取 90 | zf = zipfile.ZipFile(file_name, 'r') if file_name.endswith('.zip') else None 91 | # zip包中文件列表 92 | file_list = [file_name] if zf is None else zf.namelist() 93 | 94 | for fn in file_list: 95 | for line in _open(fn, 'r', zip_obj=zf): 96 | try: 97 | obj = json.loads(line.strip()) 98 | except ValueError: 99 | raise ValueError("Every line of data should be legal json") 100 | data_check(obj) 101 | qid = obj['question_id'] 102 | # 必须有question id 103 | assert qid not in results, "Duplicate question_id: {}".format(qid) 104 | results[qid] = {} 105 | for k in keys: 106 | if k == 'answers': 107 | results[qid][k] = normalize(obj[k]) 108 | else: 109 | results[qid][k] = obj[k] 110 | if is_ref: 111 | for i, e in enumerate(results[qid]['entity_answers']): 112 | results[qid]['entity_answers'][i] = normalize(e) 113 | return results 114 | 115 | 116 | def calc_metrics(pred_result, ref_result, bleu_eval, rouge_eval): 117 | """Computes bleu-4 and rouge-l. 118 | 119 | Args: 120 | - pred_result: Refer to the returned dict of `read_file` with 121 | 'is_ref=False'. 122 | - ref_result: Refer to the returned dict of `ref_file` with 123 | 'is_ref=True'. 124 | - bleu_result: A BleuWithBonus object. 125 | - rouge_result: A RougeLWithBonus object. 126 | Returns: 127 | bleu-4 and rouge-l values as a tuple of float values. 128 | """ 129 | for qid, results in ref_result.items(): 130 | # 根据question id从预测结果中选择答案 131 | cand_result = pred_result.get(qid, {}) 132 | pred_answers = cand_result.get('answers', []) 133 | if not pred_answers: 134 | pred_answers = EMPTY 135 | else: 136 | pred_answers = pred_answers[0] 137 | pred_yn_label = None 138 | ref_entities = None 139 | ref_answers = results.get('answers', []) 140 | if not ref_answers: 141 | continue 142 | if results['question_type'] == 'ENTITY': 143 | ref_entities = set( 144 | itertools.chain(*results.get('entity_answers', [[]]))) 145 | if not ref_entities: 146 | ref_entities = None 147 | if results['question_type'] == 'YES_NO': 148 | cand_yesno = cand_result.get('yesno_answers', []) 149 | pred_yn_label = None if len(cand_yesno) == 0 \ 150 | else cand_yesno[0] 151 | bleu_eval.add_inst( 152 | pred_answers, 153 | ref_answers, 154 | yn_label=pred_yn_label, 155 | yn_ref=results['yesno_answers'], 156 | entity_ref=ref_entities) 157 | rouge_eval.add_inst( 158 | pred_answers, 159 | ref_answers, 160 | yn_label=pred_yn_label, 161 | yn_ref=results['yesno_answers'], 162 | entity_ref=ref_entities) 163 | bleu4 = bleu_eval.score()[-1] 164 | rouge_l = rouge_eval.score() 165 | return bleu4, rouge_l 166 | 167 | 168 | def main(args): 169 | err = None 170 | metrics = {} 171 | bleu4, rouge_l = 0.0, 0.0 172 | alpha = args.alpha # default 1.0 173 | beta = args.beta # default 1.0 174 | bleu_eval = BLEUWithBonus(4, alpha=alpha, beta=beta) 175 | rouge_eval = RougeLWithBonus(alpha=alpha, beta=beta, gamma=1.2) 176 | # 载入answer文件 格式dict question_id: {answers:[], yesno_answers:[]} 177 | pred_result = read_file(args.pred_file) 178 | ref_result = read_file(args.ref_file, is_ref=True) 179 | bleu4, rouge_l = calc_metrics(pred_result, 180 | ref_result, 181 | bleu_eval, 182 | rouge_eval) 183 | metrics = { 184 | 'ROUGE-L': 
round(rouge_l * 100, 2), 185 | 'BLEU-4': round(bleu4 * 100, 2), 186 | } 187 | print(json.dumps(metrics, ensure_ascii=False)) 188 | 189 | 190 | if __name__ == '__main__': 191 | parser = argparse.ArgumentParser() 192 | parser.add_argument('--pred_file', help='predict file') 193 | parser.add_argument('--ref_file', help='reference file') 194 | parser.add_argument('--alpha', type=float, default=1.0, 195 | help='common value of alpha') 196 | parser.add_argument('--beta', type=float, default=1.0, 197 | help='common value of beta') 198 | args = parser.parse_args() 199 | main(args) 200 | -------------------------------------------------------------------------------- /BiDAF+Self Attention/utils/preprocess.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | """ 18 | This module finds the most related paragraph of each document according to recall.
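Typical usage (illustrative; the output file name is a placeholder): the module reads
one JSON sample per line from stdin, annotates each sample in place and prints it back out, e.g.
    cat search.train.json | python preprocess.py > search.train.preprocessed.json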
19 | """ 20 | 21 | import sys 22 | # reload(sys) 23 | # sys.setdefaultencoding('utf8') 24 | import json 25 | from collections import Counter 26 | 27 | 28 | def precision_recall_f1(prediction, ground_truth): 29 | """ 30 | This function calculates and returns the precision, recall and f1-score 31 | Args: 32 | prediction: prediction string or list to be matched 33 | ground_truth: golden string or list reference 34 | Returns: 35 | floats of (p, r, f1) 36 | Raises: 37 | None 38 | """ 39 | if not isinstance(prediction, list): 40 | prediction_tokens = prediction.split() 41 | else: 42 | prediction_tokens = prediction 43 | if not isinstance(ground_truth, list): 44 | ground_truth_tokens = ground_truth.split() 45 | else: 46 | ground_truth_tokens = ground_truth 47 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens) 48 | num_same = sum(common.values()) 49 | if num_same == 0: 50 | return 0, 0, 0 51 | p = 1.0 * num_same / len(prediction_tokens) 52 | r = 1.0 * num_same / len(ground_truth_tokens) 53 | f1 = (2 * p * r) / (p + r) 54 | return p, r, f1 55 | 56 | 57 | def recall(prediction, ground_truth): 58 | """ 59 | This function calculates and returns the recall 60 | Args: 61 | prediction: prediction string or list to be matched 62 | ground_truth: golden string or list reference 63 | Returns: 64 | floats of recall 65 | Raises: 66 | None 67 | """ 68 | return precision_recall_f1(prediction, ground_truth)[1] 69 | 70 | 71 | def f1_score(prediction, ground_truth): 72 | """ 73 | This function calculates and returns the f1-score 74 | Args: 75 | prediction: prediction string or list to be matched 76 | ground_truth: golden string or list reference 77 | Returns: 78 | floats of f1 79 | Raises: 80 | None 81 | """ 82 | return precision_recall_f1(prediction, ground_truth)[2] 83 | 84 | 85 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): 86 | """ 87 | This function calculates and returns the precision, recall and f1-score 88 | Args: 89 | metric_fn: metric function pointer which calculates scores according to corresponding logic. 90 | prediction: prediction string or list to be matched 91 | ground_truth: golden string or list reference 92 | Returns: 93 | floats of (p, r, f1) 94 | Raises: 95 | None 96 | """ 97 | scores_for_ground_truths = [] 98 | for ground_truth in ground_truths: 99 | score = metric_fn(prediction, ground_truth) 100 | scores_for_ground_truths.append(score) 101 | return max(scores_for_ground_truths) 102 | 103 | 104 | def find_best_question_match(doc, question, with_score=False): 105 | """ 106 | For each docment, find the paragraph that matches best to the question. 107 | Args: 108 | doc: The document object. 109 | question: The question tokens. 110 | with_score: If True then the match score will be returned, 111 | otherwise False. 112 | Returns: 113 | The index of the best match paragraph, if with_score=False, 114 | otherwise returns a tuple of the index of the best match paragraph 115 | and the match score of that paragraph. 
116 | """ 117 | most_related_para = -1 118 | max_related_score = 0 119 | most_related_para_len = 0 120 | for p_idx, para_tokens in enumerate(doc['segmented_paragraphs']): 121 | if len(question) > 0: 122 | related_score = metric_max_over_ground_truths(recall, 123 | para_tokens, 124 | question) 125 | else: 126 | related_score = 0 127 | 128 | if related_score > max_related_score \ 129 | or (related_score == max_related_score \ 130 | and len(para_tokens) < most_related_para_len): 131 | most_related_para = p_idx 132 | max_related_score = related_score 133 | most_related_para_len = len(para_tokens) 134 | if most_related_para == -1: 135 | most_related_para = 0 136 | if with_score: 137 | return most_related_para, max_related_score 138 | return most_related_para 139 | 140 | 141 | def find_fake_answer(sample): 142 | """ 143 | For each document, finds the most related paragraph based on recall, 144 | then finds a span that maximize the f1_score compared with the gold answers 145 | and uses this span as a fake answer span 146 | Args: 147 | sample: a sample in the dataset 148 | Returns: 149 | None 150 | Raises: 151 | None 152 | """ 153 | for doc in sample['documents']: 154 | most_related_para = -1 155 | most_related_para_len = 999999 156 | max_related_score = 0 157 | for p_idx, para_tokens in enumerate(doc['segmented_paragraphs']): 158 | if len(sample['segmented_answers']) > 0: 159 | related_score = metric_max_over_ground_truths(recall, 160 | para_tokens, 161 | sample['segmented_answers']) 162 | else: 163 | continue 164 | if related_score > max_related_score \ 165 | or (related_score == max_related_score 166 | and len(para_tokens) < most_related_para_len): 167 | most_related_para = p_idx 168 | most_related_para_len = len(para_tokens) 169 | max_related_score = related_score 170 | doc['most_related_para'] = most_related_para 171 | 172 | sample['answer_docs'] = [] 173 | sample['answer_spans'] = [] 174 | sample['fake_answers'] = [] 175 | sample['match_scores'] = [] 176 | 177 | best_match_score = 0 178 | best_match_d_idx, best_match_span = -1, [-1, -1] 179 | best_fake_answer = None 180 | answer_tokens = set() 181 | for segmented_answer in sample['segmented_answers']: 182 | answer_tokens = answer_tokens | set([token for token in segmented_answer]) 183 | for d_idx, doc in enumerate(sample['documents']): 184 | if not doc['is_selected']: 185 | continue 186 | if doc['most_related_para'] == -1: 187 | doc['most_related_para'] = 0 188 | most_related_para_tokens = doc['segmented_paragraphs'][doc['most_related_para']][:1000] 189 | for start_tidx in range(len(most_related_para_tokens)): 190 | if most_related_para_tokens[start_tidx] not in answer_tokens: 191 | continue 192 | for end_tidx in range(len(most_related_para_tokens) - 1, start_tidx - 1, -1): 193 | span_tokens = most_related_para_tokens[start_tidx: end_tidx + 1] 194 | if len(sample['segmented_answers']) > 0: 195 | match_score = metric_max_over_ground_truths(f1_score, span_tokens, 196 | sample['segmented_answers']) 197 | else: 198 | match_score = 0 199 | if match_score == 0: 200 | break 201 | if match_score > best_match_score: 202 | best_match_d_idx = d_idx 203 | best_match_span = [start_tidx, end_tidx] 204 | best_match_score = match_score 205 | best_fake_answer = ''.join(span_tokens) 206 | if best_match_score > 0: 207 | sample['answer_docs'].append(best_match_d_idx) 208 | sample['answer_spans'].append(best_match_span) 209 | sample['fake_answers'].append(best_fake_answer) 210 | sample['match_scores'].append(best_match_score) 211 | 212 | 213 | if __name__ == 
'__main__': 214 | for line in sys.stdin: 215 | sample = json.loads(line) 216 | find_fake_answer(sample) 217 | print(json.dumps(sample, encoding='utf8', ensure_ascii=False)) 218 | -------------------------------------------------------------------------------- /BiDAF+Self Attention/utils/rouge.py: -------------------------------------------------------------------------------- 1 | # coding:utf8 2 | 3 | from functools import reduce 4 | import math 5 | import json 6 | import numpy as np 7 | from collections import defaultdict 8 | import sys 9 | 10 | # reload(sys) 11 | # sys.setdefaultencoding("utf-8") 12 | 13 | 14 | class RougeLWithBonus(object): 15 | def __init__(self, alpha=1.0, beta=1.0, gamma=1.2): 16 | self.alpha = alpha 17 | self.beta = beta 18 | self.gamma = gamma 19 | self.inst_scores = [] 20 | 21 | def lcs(self, string, sub): 22 | if len(string) < len(sub): 23 | sub, string = string, sub 24 | lengths = np.zeros((len(string) + 1, len(sub) + 1)) 25 | for j in range(1, len(sub) + 1): 26 | for i in range(1, len(string) + 1): 27 | if string[i - 1] == sub[j - 1]: 28 | lengths[i][j] = lengths[i - 1][j - 1] + 1 29 | else: 30 | lengths[i][j] = max(lengths[i - 1][j], lengths[i][j - 1]) 31 | return lengths[len(string)][len(sub)] 32 | 33 | def add_inst(self, 34 | cand, 35 | ref_list, 36 | yn_label=None, yn_ref=None, entity_ref=None): 37 | precs, recalls = [], [] 38 | for i, ref in enumerate(ref_list): 39 | basic_lcs = self.lcs(cand, ref) 40 | yn_bonus, entity_bonus = 0.0, 0.0 41 | if yn_ref is not None and yn_label is not None: 42 | yn_bonus = self.add_yn_bonus(cand, ref, yn_label, yn_ref[i]) 43 | elif entity_ref is not None: 44 | entity_bonus = self.add_entity_bonus(cand, entity_ref) 45 | p_denom = len(cand) + self.alpha * yn_bonus + self.beta * entity_bonus 46 | r_denom = len(ref) + self.alpha * yn_bonus + self.beta * entity_bonus 47 | prec = (basic_lcs + self.alpha * yn_bonus + self.beta * entity_bonus) \ 48 | / p_denom if p_denom > 0. else 0. 49 | rec = (basic_lcs + self.alpha * yn_bonus + self.beta * entity_bonus) \ 50 | / r_denom if r_denom > 0. else 0. 51 | precs.append(prec) 52 | recalls.append(rec) 53 | 54 | prec_max = max(precs) 55 | rec_max = max(recalls) 56 | if prec_max != 0 and rec_max != 0: 57 | score = ((1 + self.gamma ** 2) * prec_max * rec_max) / \ 58 | float(rec_max + self.gamma ** 2 * prec_max) 59 | else: 60 | score = 0.0 61 | self.inst_scores.append(score) 62 | 63 | def add_yn_bonus(self, cand, ref, yn_label, yn_ref): 64 | if yn_label != yn_ref: 65 | return 0.0 66 | lcs_ = self.lcs(cand, ref) 67 | return lcs_ 68 | 69 | def add_entity_bonus(self, cand, entity_ref): 70 | lcs_ = 0.0 71 | for ent in entity_ref: 72 | if ent in cand: 73 | lcs_ += len(ent) 74 | return lcs_ 75 | 76 | def score(self): 77 | return 1. 
* sum(self.inst_scores) / len(self.inst_scores) 78 | -------------------------------------------------------------------------------- /BiDAF+Self Attention/utils/rouge_metric/__pycache__/rouge.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shiningliang/MRC2018/9bf27efbbc028116eb01ea2df1bf0d063cb09ae9/BiDAF+Self Attention/utils/rouge_metric/__pycache__/rouge.cpython-36.pyc -------------------------------------------------------------------------------- /BiDAF+Self Attention/utils/rouge_metric/rouge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : rouge.py 4 | # 5 | # Description : Computes ROUGE-L metric as described by Lin and Hovey (2004) 6 | # 7 | # Creation Date : 2015-01-07 06:03 8 | # Author : Ramakrishna Vedantam 9 | 10 | import numpy as np 11 | 12 | 13 | # import pdb 14 | 15 | 16 | def my_lcs(string, sub): 17 | """ 18 | Calculates longest common subsequence for a pair of tokenized strings 19 | :param string : list of str : tokens from a string split using whitespace 20 | :param sub : list of str : shorter string, also split using whitespace 21 | :returns: length (list of int): length of the longest common subsequence between the two strings 22 | 23 | Note: my_lcs only gives length of the longest common subsequence, not the actual LCS 24 | """ 25 | if (len(string) < len(sub)): 26 | sub, string = string, sub 27 | 28 | lengths = [[0 for i in range(0, len(sub) + 1)] for j in range(0, len(string) + 1)] 29 | 30 | for j in range(1, len(sub) + 1): 31 | for i in range(1, len(string) + 1): 32 | if (string[i - 1] == sub[j - 1]): 33 | lengths[i][j] = lengths[i - 1][j - 1] + 1 34 | else: 35 | lengths[i][j] = max(lengths[i - 1][j], lengths[i][j - 1]) 36 | 37 | return lengths[len(string)][len(sub)] 38 | 39 | 40 | class Rouge(): 41 | ''' 42 | Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set 43 | 44 | ''' 45 | 46 | def __init__(self): 47 | # vrama91: updated the value below based on discussion with Hovey 48 | self.beta = 1.2 49 | 50 | def calc_score(self, candidate, refs): 51 | """ 52 | Compute ROUGE-L score given one candidate and references for an image 53 | :param candidate: str : candidate sentence to be evaluated 54 | :param refs: list of str : COCO reference sentences for the particular image to be evaluated 55 | :returns score: int (ROUGE-L score for the candidate evaluated against references) 56 | """ 57 | assert (len(candidate) == 1) 58 | assert (len(refs) > 0) 59 | prec = [] 60 | rec = [] 61 | 62 | # split into tokens 63 | token_c = candidate[0].split(" ") 64 | 65 | for reference in refs: 66 | # split into tokens 67 | token_r = reference.split(" ") 68 | # compute the longest common subsequence 69 | lcs = my_lcs(token_r, token_c) 70 | prec.append(lcs / float(len(token_c))) 71 | rec.append(lcs / float(len(token_r))) 72 | 73 | prec_max = max(prec) 74 | rec_max = max(rec) 75 | 76 | if (prec_max != 0 and rec_max != 0): 77 | score = ((1 + self.beta ** 2) * prec_max * rec_max) / float(rec_max + self.beta ** 2 * prec_max) 78 | else: 79 | score = 0.0 80 | return score 81 | 82 | def compute_score(self, gts, res): 83 | """ 84 | Computes Rouge-L score given a set of reference and candidate sentences for the dataset 85 | Invoked by evaluate_captions.py 86 | :param hypo_for_image: dict : candidate / test sentences with "image name" key and "tokenized sentences" as values 87 | :param 
ref_for_image: dict : reference MS-COCO sentences with "image name" key and "tokenized sentences" as values 88 | :returns: average_score: float (mean ROUGE-L score computed by averaging scores for all the images) 89 | """ 90 | assert (list(gts.keys()) == list(res.keys())) 91 | imgIds = list(gts.keys()) 92 | 93 | score = [] 94 | for id in imgIds: 95 | hypo = res[id] 96 | ref = gts[id] 97 | 98 | score.append(self.calc_score(hypo, ref)) 99 | 100 | # Sanity check. 101 | assert (type(hypo) is list) 102 | assert (len(hypo) == 1) 103 | assert (type(ref) is list) 104 | assert (len(ref) > 0) 105 | 106 | average_score = np.mean(np.array(score)) 107 | return average_score, np.array(score) 108 | 109 | def method(self): 110 | return "Rouge" 111 | -------------------------------------------------------------------------------- /BiDAF_Origin/dureader/layers/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf8 -*- 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | """ 18 | Empty __init__.py file 19 | 20 | Authors: Yizhong Wang(wangyizhong01@baidu.com) 21 | Date: 2017/09/20 12:00:00 22 | """ 23 | -------------------------------------------------------------------------------- /BiDAF_Origin/dureader/layers/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shiningliang/MRC2018/9bf27efbbc028116eb01ea2df1bf0d063cb09ae9/BiDAF_Origin/dureader/layers/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /BiDAF_Origin/dureader/layers/__pycache__/basic_rnn.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shiningliang/MRC2018/9bf27efbbc028116eb01ea2df1bf0d063cb09ae9/BiDAF_Origin/dureader/layers/__pycache__/basic_rnn.cpython-36.pyc -------------------------------------------------------------------------------- /BiDAF_Origin/dureader/layers/__pycache__/match_layer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shiningliang/MRC2018/9bf27efbbc028116eb01ea2df1bf0d063cb09ae9/BiDAF_Origin/dureader/layers/__pycache__/match_layer.cpython-36.pyc -------------------------------------------------------------------------------- /BiDAF_Origin/dureader/layers/__pycache__/pointer_net.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shiningliang/MRC2018/9bf27efbbc028116eb01ea2df1bf0d063cb09ae9/BiDAF_Origin/dureader/layers/__pycache__/pointer_net.cpython-36.pyc 
-------------------------------------------------------------------------------- /BiDAF_Origin/dureader/layers/basic_rnn.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf8 -*- 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | """ 18 | This module provides wrappers for variants of RNN in Tensorflow 19 | """ 20 | 21 | import tensorflow as tf 22 | import tensorflow.contrib as tc 23 | 24 | 25 | def rnn(rnn_type, inputs, length, hidden_size, layer_num=1, dropout_keep_prob=None, concat=True): 26 | """ 27 | Implements (Bi-)LSTM, (Bi-)GRU and (Bi-)RNN 28 | Args: 29 | rnn_type: the type of rnn 30 | inputs: padded inputs into rnn 31 | length: the valid length of the inputs 32 | hidden_size: the size of hidden units 33 | layer_num: multiple rnn layer are stacked if layer_num > 1 34 | dropout_keep_prob: 35 | concat: When the rnn is bidirectional, the forward outputs and backward outputs are 36 | concatenated if this is True, else we add them. 
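        Example (illustrative; `p_emb` is a padded [batch, time, dim] float tensor
        and `p_length` holds the valid sequence lengths):
            outputs, state = rnn('bi-lstm', p_emb, p_length, hidden_size=150)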
37 | Returns: 38 | RNN outputs and final state 39 | """ 40 | if not rnn_type.startswith('bi'): 41 | cell = get_cell(rnn_type, hidden_size, layer_num, dropout_keep_prob) 42 | outputs, state = tf.nn.dynamic_rnn(cell, inputs, sequence_length=length, dtype=tf.float32) 43 | if rnn_type.endswith('lstm'): 44 | c, h = state 45 | state = h 46 | else: 47 | cell_fw = get_cell(rnn_type, hidden_size, layer_num, dropout_keep_prob) 48 | cell_bw = get_cell(rnn_type, hidden_size, layer_num, dropout_keep_prob) 49 | outputs, state = tf.nn.bidirectional_dynamic_rnn( 50 | cell_bw, cell_fw, inputs, sequence_length=length, dtype=tf.float32 51 | ) 52 | state_fw, state_bw = state 53 | if rnn_type.endswith('lstm'): 54 | c_fw, h_fw = state_fw 55 | c_bw, h_bw = state_bw 56 | state_fw, state_bw = h_fw, h_bw 57 | if concat: 58 | outputs = tf.concat(outputs, 2) 59 | state = tf.concat([state_fw, state_bw], 1) 60 | else: 61 | outputs = outputs[0] + outputs[1] 62 | state = state_fw + state_bw 63 | return outputs, state 64 | 65 | 66 | def get_cell(rnn_type, hidden_size, layer_num=1, dropout_keep_prob=None): 67 | """ 68 | Gets the RNN Cell 69 | Args: 70 | rnn_type: 'lstm', 'gru' or 'rnn' 71 | hidden_size: The size of hidden units 72 | layer_num: MultiRNNCell are used if layer_num > 1 73 | dropout_keep_prob: dropout in RNN 74 | Returns: 75 | An RNN Cell 76 | """ 77 | if rnn_type.endswith('lstm'): 78 | cell = tc.rnn.LSTMCell(num_units=hidden_size, state_is_tuple=True) 79 | elif rnn_type.endswith('gru'): 80 | cell = tc.rnn.GRUCell(num_units=hidden_size) 81 | elif rnn_type.endswith('rnn'): 82 | cell = tc.rnn.BasicRNNCell(num_units=hidden_size) 83 | else: 84 | raise NotImplementedError('Unsuported rnn type: {}'.format(rnn_type)) 85 | if dropout_keep_prob is not None: 86 | cell = tc.rnn.DropoutWrapper(cell, 87 | input_keep_prob=dropout_keep_prob, 88 | output_keep_prob=dropout_keep_prob) 89 | if layer_num > 1: 90 | cell = tc.rnn.MultiRNNCell([cell]*layer_num, state_is_tuple=True) 91 | return cell 92 | 93 | 94 | -------------------------------------------------------------------------------- /BiDAF_Origin/dureader/layers/match_layer.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf8 -*- 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # ============================================================================== 17 | """ 18 | This module implements the core layer of Match-LSTM and BiDAF 19 | """ 20 | 21 | import tensorflow as tf 22 | import tensorflow.contrib as tc 23 | 24 | 25 | class MatchLSTMAttnCell(tc.rnn.LSTMCell): 26 | """ 27 | Implements the Match-LSTM attention cell 28 | """ 29 | def __init__(self, num_units, context_to_attend): 30 | super(MatchLSTMAttnCell, self).__init__(num_units, state_is_tuple=True) 31 | self.context_to_attend = context_to_attend 32 | self.fc_context = tc.layers.fully_connected(self.context_to_attend, 33 | num_outputs=self._num_units, 34 | activation_fn=None) 35 | 36 | def __call__(self, inputs, state, scope=None): 37 | (c_prev, h_prev) = state 38 | with tf.variable_scope(scope or type(self).__name__): 39 | ref_vector = tf.concat([inputs, h_prev], -1) 40 | G = tf.tanh(self.fc_context 41 | + tf.expand_dims(tc.layers.fully_connected(ref_vector, 42 | num_outputs=self._num_units, 43 | activation_fn=None), 1)) 44 | logits = tc.layers.fully_connected(G, num_outputs=1, activation_fn=None) 45 | scores = tf.nn.softmax(logits, 1) 46 | attended_context = tf.reduce_sum(self.context_to_attend * scores, axis=1) 47 | new_inputs = tf.concat([inputs, attended_context, 48 | inputs - attended_context, inputs * attended_context], 49 | -1) 50 | return super(MatchLSTMAttnCell, self).__call__(new_inputs, state, scope) 51 | 52 | 53 | class MatchLSTMLayer(object): 54 | """ 55 | Implements the Match-LSTM layer, which attend to the question dynamically in a LSTM fashion. 56 | """ 57 | def __init__(self, hidden_size): 58 | self.hidden_size = hidden_size 59 | 60 | def match(self, passage_encodes, question_encodes, p_length, q_length): 61 | """ 62 | Match the passage_encodes with question_encodes using Match-LSTM algorithm 63 | """ 64 | with tf.variable_scope('match_lstm', reuse=tf.AUTO_REUSE): 65 | cell_fw = MatchLSTMAttnCell(self.hidden_size, question_encodes) 66 | cell_bw = MatchLSTMAttnCell(self.hidden_size, question_encodes) 67 | outputs, state = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, 68 | inputs=passage_encodes, 69 | sequence_length=p_length, 70 | dtype=tf.float32) 71 | match_outputs = tf.concat(outputs, 2) 72 | state_fw, state_bw = state 73 | c_fw, h_fw = state_fw 74 | c_bw, h_bw = state_bw 75 | match_state = tf.concat([h_fw, h_bw], 1) 76 | return match_outputs, match_state 77 | 78 | 79 | class AttentionFlowMatchLayer(object): 80 | """ 81 | Implements the Attention Flow layer, 82 | which computes Context-to-question Attention and question-to-context Attention 83 | """ 84 | def __init__(self, hidden_size): 85 | self.hidden_size = hidden_size 86 | 87 | def match(self, passage_encodes, question_encodes, p_length, q_length): 88 | """ 89 | Match the passage_encodes with question_encodes using Attention Flow Match algorithm 90 | """ 91 | with tf.variable_scope('bidaf', reuse=tf.AUTO_REUSE): 92 | sim_matrix = tf.matmul(passage_encodes, question_encodes, transpose_b=True) 93 | context2question_attn = tf.matmul(tf.nn.softmax(sim_matrix, -1), question_encodes) 94 | b = tf.nn.softmax(tf.expand_dims(tf.reduce_max(sim_matrix, 2), 1), -1) 95 | question2context_attn = tf.tile(tf.matmul(b, passage_encodes), 96 | [1, tf.shape(passage_encodes)[1], 1]) 97 | concat_outputs = tf.concat([passage_encodes, context2question_attn, 98 | passage_encodes * context2question_attn, 99 | passage_encodes * question2context_attn], -1) 100 | return concat_outputs, None 101 | 
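# Illustrative wiring sketch (tensor names below are placeholders). Both classes
# expose the same match() interface, so a model can swap one for the other:
#
#     match_layer = AttentionFlowMatchLayer(hidden_size)  # or MatchLSTMLayer(hidden_size)
#     match_p_encodes, _ = match_layer.match(passage_encodes, question_encodes,
#                                            p_length, q_length)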
-------------------------------------------------------------------------------- /BiDAF_Origin/dureader/layers/pointer_net.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf8 -*- 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | """ 18 | This module implements the Pointer Network for selecting answer spans, as described in: 19 | https://openreview.net/pdf?id=B1-q5Pqxl 20 | """ 21 | 22 | import tensorflow as tf 23 | import tensorflow.contrib as tc 24 | 25 | 26 | def custom_dynamic_rnn(cell, inputs, inputs_len, initial_state=None): 27 | """ 28 | Implements a dynamic rnn that can store scores in the pointer network, 29 | the reason why we implements this is that the raw_rnn or dynamic_rnn function in Tensorflow 30 | seem to require the hidden unit and memory unit has the same dimension, and we cannot 31 | store the scores directly in the hidden unit. 32 | Args: 33 | cell: RNN cell 34 | inputs: the input sequence to rnn 35 | inputs_len: valid length 36 | initial_state: initial_state of the cell 37 | Returns: 38 | outputs and state 39 | """ 40 | batch_size = tf.shape(inputs)[0] 41 | max_time = tf.shape(inputs)[1] 42 | 43 | inputs_ta = tf.TensorArray(dtype=tf.float32, size=max_time) 44 | inputs_ta = inputs_ta.unstack(tf.transpose(inputs, [1, 0, 2])) 45 | emit_ta = tf.TensorArray(dtype=tf.float32, dynamic_size=True, size=0) 46 | t0 = tf.constant(0, dtype=tf.int32) 47 | if initial_state is not None: 48 | s0 = initial_state 49 | else: 50 | s0 = cell.zero_state(batch_size, dtype=tf.float32) 51 | f0 = tf.zeros([batch_size], dtype=tf.bool) 52 | 53 | def loop_fn(t, prev_s, emit_ta, finished): 54 | """ 55 | the loop function of rnn 56 | """ 57 | cur_x = inputs_ta.read(t) 58 | scores, cur_state = cell(cur_x, prev_s) 59 | 60 | # copy through 61 | scores = tf.where(finished, tf.zeros_like(scores), scores) 62 | 63 | if isinstance(cell, tc.rnn.LSTMCell): 64 | cur_c, cur_h = cur_state 65 | prev_c, prev_h = prev_s 66 | cur_state = tc.rnn.LSTMStateTuple(tf.where(finished, prev_c, cur_c), 67 | tf.where(finished, prev_h, cur_h)) 68 | else: 69 | cur_state = tf.where(finished, prev_s, cur_state) 70 | 71 | emit_ta = emit_ta.write(t, scores) 72 | finished = tf.greater_equal(t + 1, inputs_len) 73 | return [t + 1, cur_state, emit_ta, finished] 74 | 75 | _, state, emit_ta, _ = tf.while_loop( 76 | cond=lambda _1, _2, _3, finished: tf.logical_not(tf.reduce_all(finished)), 77 | body=loop_fn, 78 | loop_vars=(t0, s0, emit_ta, f0), 79 | parallel_iterations=32, 80 | swap_memory=False) 81 | 82 | outputs = tf.transpose(emit_ta.stack(), [1, 0, 2]) 83 | return outputs, state 84 | 85 | 86 | def attend_pooling(pooling_vectors, ref_vector, hidden_size, scope=None): 87 | """ 88 | Applies attend pooling to a set of 
vectors according to a reference vector. 89 | Args: 90 | pooling_vectors: the vectors to pool 91 | ref_vector: the reference vector 92 | hidden_size: the hidden size for attention function 93 | scope: score name 94 | Returns: 95 | the pooled vector 96 | """ 97 | with tf.variable_scope(scope or 'attend_pooling', reuse=tf.AUTO_REUSE): 98 | U = tf.tanh(tc.layers.fully_connected(pooling_vectors, num_outputs=hidden_size, 99 | activation_fn=None, biases_initializer=None) 100 | + tc.layers.fully_connected(tf.expand_dims(ref_vector, 1), 101 | num_outputs=hidden_size, 102 | activation_fn=None)) 103 | logits = tc.layers.fully_connected(U, num_outputs=1, activation_fn=None) 104 | scores = tf.nn.softmax(logits, 1) 105 | pooled_vector = tf.reduce_sum(pooling_vectors * scores, axis=1) 106 | return pooled_vector 107 | 108 | 109 | class PointerNetLSTMCell(tc.rnn.LSTMCell): 110 | """ 111 | Implements the Pointer Network Cell 112 | """ 113 | 114 | def __init__(self, num_units, context_to_point): 115 | super(PointerNetLSTMCell, self).__init__(num_units, state_is_tuple=True) 116 | self.context_to_point = context_to_point 117 | self.fc_context = tc.layers.fully_connected(self.context_to_point, 118 | num_outputs=self._num_units, 119 | activation_fn=None) 120 | 121 | def __call__(self, inputs, state, scope=None): 122 | (c_prev, m_prev) = state 123 | with tf.variable_scope(scope or type(self).__name__, reuse=tf.AUTO_REUSE): 124 | U = tf.tanh(self.fc_context 125 | + tf.expand_dims(tc.layers.fully_connected(m_prev, 126 | num_outputs=self._num_units, 127 | activation_fn=None), 128 | 1)) 129 | logits = tc.layers.fully_connected(U, num_outputs=1, activation_fn=None) 130 | scores = tf.nn.softmax(logits, 1) 131 | attended_context = tf.reduce_sum(self.context_to_point * scores, axis=1) 132 | lstm_out, lstm_state = super(PointerNetLSTMCell, self).__call__(attended_context, state) 133 | return tf.squeeze(scores, -1), lstm_state 134 | 135 | 136 | class PointerNetDecoder(object): 137 | """ 138 | Implements the Pointer Network 139 | """ 140 | 141 | def __init__(self, hidden_size): 142 | self.hidden_size = hidden_size 143 | 144 | def decode(self, passage_vectors, question_vectors, init_with_question=True): 145 | """ 146 | Use Pointer Network to compute the probabilities of each position 147 | to be start and end of the answer 148 | Args: 149 | passage_vectors: the encoded passage vectors 150 | question_vectors: the encoded question vectors 151 | init_with_question: if set to be true, 152 | we will use the question_vectors to init the state of Pointer Network 153 | Returns: 154 | the probs of evary position to be start and end of the answer 155 | """ 156 | with tf.variable_scope('pn_decoder', reuse=tf.AUTO_REUSE): 157 | fake_inputs = tf.zeros([tf.shape(passage_vectors)[0], 2, 1]) # not used 158 | sequence_len = tf.tile([2], [tf.shape(passage_vectors)[0]]) 159 | if init_with_question: 160 | random_attn_vector = tf.Variable(tf.random_normal([1, self.hidden_size]), 161 | trainable=True, name="random_attn_vector") 162 | pooled_question_rep = tc.layers.fully_connected( 163 | attend_pooling(question_vectors, random_attn_vector, self.hidden_size), 164 | num_outputs=self.hidden_size, activation_fn=None 165 | ) 166 | init_state = tc.rnn.LSTMStateTuple(pooled_question_rep, pooled_question_rep) 167 | else: 168 | init_state = None 169 | with tf.variable_scope('fw', reuse=tf.AUTO_REUSE): 170 | fw_cell = PointerNetLSTMCell(self.hidden_size, passage_vectors) 171 | fw_outputs, _ = custom_dynamic_rnn(fw_cell, fake_inputs, sequence_len, 
init_state) 172 | with tf.variable_scope('bw', reuse=tf.AUTO_REUSE): 173 | bw_cell = PointerNetLSTMCell(self.hidden_size, passage_vectors) 174 | bw_outputs, _ = custom_dynamic_rnn(bw_cell, fake_inputs, sequence_len, init_state) 175 | start_prob = (fw_outputs[0:, 0, 0:] + bw_outputs[0:, 1, 0:]) / 2 176 | end_prob = (fw_outputs[0:, 1, 0:] + bw_outputs[0:, 0, 0:]) / 2 177 | return start_prob, end_prob 178 | -------------------------------------------------------------------------------- /BiDAF_Origin/dureader/vocab.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf8 -*- 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | """ 18 | This module implements the Vocab class for converting string to id and back 19 | """ 20 | 21 | import numpy as np 22 | 23 | 24 | class Vocab(object): 25 | """ 26 | Implements a vocabulary to store the tokens in the data, with their corresponding embeddings. 27 | """ 28 | def __init__(self, filename=None, initial_tokens=None, lower=False): 29 | self.id2token = {} 30 | self.token2id = {} 31 | self.token_cnt = {} 32 | self.lower = lower 33 | 34 | self.embed_dim = None 35 | self.embeddings = None 36 | 37 | self.pad_token = '' 38 | self.unk_token = '' 39 | 40 | self.initial_tokens = initial_tokens if initial_tokens is not None else [] 41 | self.initial_tokens.extend([self.pad_token, self.unk_token]) 42 | for token in self.initial_tokens: 43 | self.add(token) 44 | 45 | if filename is not None: 46 | self.load_from_file(filename) 47 | 48 | def size(self): 49 | """ 50 | get the size of vocabulary 51 | Returns: 52 | an integer indicating the size 53 | """ 54 | return len(self.id2token) 55 | 56 | def load_from_file(self, file_path): 57 | """ 58 | loads the vocab from file_path 59 | Args: 60 | file_path: a file with a word in each line 61 | """ 62 | for line in open(file_path, 'r'): 63 | token = line.rstrip('\n') 64 | self.add(token) 65 | 66 | def get_id(self, token): 67 | """ 68 | gets the id of a token, returns the id of unk token if token is not in vocab 69 | Args: 70 | key: a string indicating the word 71 | Returns: 72 | an integer 73 | """ 74 | token = token.lower() if self.lower else token 75 | try: 76 | return self.token2id[token] 77 | except KeyError: 78 | return self.token2id[self.unk_token] 79 | 80 | def get_token(self, idx): 81 | """ 82 | gets the token corresponding to idx, returns unk token if idx is not in vocab 83 | Args: 84 | idx: an integer 85 | returns: 86 | a token string 87 | """ 88 | try: 89 | return self.id2token[idx] 90 | except KeyError: 91 | return self.unk_token 92 | 93 | def add(self, token, cnt=1): 94 | """ 95 | adds the token to vocab 96 | Args: 97 | token: a string 98 | cnt: a num indicating the count of the token to add, default is 
1 99 | """ 100 | token = token.lower() if self.lower else token 101 | if token in self.token2id: 102 | idx = self.token2id[token] 103 | else: 104 | # vocab中无此token,则添加token2id id2token 105 | idx = len(self.id2token) 106 | self.id2token[idx] = token 107 | self.token2id[token] = idx 108 | if cnt > 0: 109 | if token in self.token_cnt: 110 | self.token_cnt[token] += cnt 111 | else: 112 | self.token_cnt[token] = cnt 113 | return idx 114 | 115 | def filter_tokens_by_cnt(self, min_cnt): 116 | """ 117 | filter the tokens in vocab by their count 118 | Args: 119 | min_cnt: tokens with frequency less than min_cnt is filtered 120 | """ 121 | filtered_tokens = [token for token in self.token2id if self.token_cnt[token] >= min_cnt] 122 | # rebuild the token x id map 123 | self.token2id = {} 124 | self.id2token = {} 125 | for token in self.initial_tokens: 126 | self.add(token, cnt=0) 127 | for token in filtered_tokens: 128 | self.add(token, cnt=0) 129 | 130 | def randomly_init_embeddings(self, embed_dim): 131 | """ 132 | randomly initializes the embeddings for each token 133 | Args: 134 | embed_dim: the size of the embedding for each token 135 | """ 136 | self.embed_dim = embed_dim 137 | self.embeddings = np.random.rand(self.size(), embed_dim) 138 | # 填充符号和未知词符号初始化为0 139 | for token in [self.pad_token, self.unk_token]: 140 | self.embeddings[self.get_id(token)] = np.zeros([self.embed_dim]) 141 | 142 | def load_pretrained_embeddings(self, embedding_path): 143 | """ 144 | loads the pretrained embeddings from embedding_path, 145 | tokens not in pretrained embeddings will be filtered 146 | Args: 147 | embedding_path: the path of the pretrained embedding file 148 | """ 149 | trained_embeddings = {} 150 | with open(embedding_path, 'r', encoding='utf8') as fin: 151 | for line in fin: 152 | contents = line.strip().split() 153 | token = contents[0] 154 | if token not in self.token2id: 155 | continue 156 | trained_embeddings[token] = list(map(float, contents[1:])) 157 | if self.embed_dim is None: 158 | self.embed_dim = len(contents) - 1 159 | fin.close() 160 | filtered_tokens = trained_embeddings.keys() 161 | # rebuild the token x id map 162 | self.token2id = {} 163 | self.id2token = {} 164 | for token in self.initial_tokens: 165 | self.add(token, cnt=0) 166 | for token in filtered_tokens: 167 | self.add(token, cnt=0) 168 | # load embeddings 169 | self.embeddings = np.zeros([self.size(), self.embed_dim]) 170 | for token in self.token2id.keys(): 171 | if token in trained_embeddings: 172 | self.embeddings[self.get_id(token)] = trained_embeddings[token] 173 | 174 | def convert_to_ids(self, tokens): 175 | """ 176 | Convert a list of tokens to ids, use unk_token if the token is not in vocab. 
177 | Args: 178 | tokens: a list of token 179 | Returns: 180 | a list of ids 181 | """ 182 | vec = [self.get_id(label) for label in tokens] 183 | return vec 184 | 185 | def recover_from_ids(self, ids, stop_id=None): 186 | """ 187 | Convert a list of ids to tokens, stop converting if the stop_id is encountered 188 | Args: 189 | ids: a list of ids to convert 190 | stop_id: the stop id, default is None 191 | Returns: 192 | a list of tokens 193 | """ 194 | tokens = [] 195 | for i in ids: 196 | tokens += [self.get_token(i)] 197 | if stop_id is not None and i == stop_id: 198 | break 199 | return tokens 200 | -------------------------------------------------------------------------------- /BiDAF_Origin/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # coding:utf8 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | """ 18 | This package implements some utility functions shared by PaddlePaddle 19 | and Tensorflow model implementations. 20 | 21 | Authors: liuyuan(liuyuan04@baidu.com) 22 | Date: 2017/10/06 18:23:06 23 | """ 24 | 25 | 26 | from .dureader_eval import compute_bleu_rouge 27 | from .dureader_eval import normalize 28 | from .preprocess import find_fake_answer 29 | from .preprocess import find_best_question_match 30 | 31 | __all__ = [ 32 | 'compute_bleu_rouge', 33 | 'normalize', 34 | 'find_fake_answer', 35 | 'find_best_question_match', 36 | ] 37 | -------------------------------------------------------------------------------- /BiDAF_Origin/utils/bleu_metric/__pycache__/bleu.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shiningliang/MRC2018/9bf27efbbc028116eb01ea2df1bf0d063cb09ae9/BiDAF_Origin/utils/bleu_metric/__pycache__/bleu.cpython-36.pyc -------------------------------------------------------------------------------- /BiDAF_Origin/utils/bleu_metric/__pycache__/bleu_score.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shiningliang/MRC2018/9bf27efbbc028116eb01ea2df1bf0d063cb09ae9/BiDAF_Origin/utils/bleu_metric/__pycache__/bleu_score.cpython-36.pyc -------------------------------------------------------------------------------- /BiDAF_Origin/utils/bleu_metric/bleu.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : bleu.py 4 | # 5 | # Description : Wrapper for BLEU scorer. 
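# Usage (illustrative): Bleu(n=4).compute_score(gts, res), where `gts` and `res`
# map the same ids to lists of space-joined sentences and `res` holds exactly one
# hypothesis per id; returns the corpus BLEU-1..4 scores and per-sentence score lists.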
6 | # 7 | # Creation Date : 06-01-2015 8 | # Last Modified : Thu 19 Mar 2015 09:13:28 PM PDT 9 | # Authors : Hao Fang and Tsung-Yi Lin 10 | 11 | from .bleu_score import BleuScorer 12 | 13 | 14 | class Bleu: 15 | def __init__(self, n=4): 16 | # default compute Blue score up to 4 17 | self._n = n 18 | self._hypo_for_image = {} 19 | self.ref_for_image = {} 20 | 21 | def compute_score(self, gts, res): 22 | 23 | assert(list(gts.keys()) == list(res.keys())) 24 | imgIds = list(gts.keys()) 25 | 26 | bleu_scorer = BleuScorer(n=self._n) 27 | for id in imgIds: 28 | hypo = res[id] 29 | ref = gts[id] 30 | 31 | # Sanity check. 32 | assert(type(hypo) is list) 33 | assert(len(hypo) == 1) 34 | assert(type(ref) is list) 35 | assert(len(ref) >= 1) 36 | 37 | bleu_scorer += (hypo[0], ref) 38 | 39 | #score, scores = bleu_scorer.compute_score(option='shortest') 40 | score, scores = bleu_scorer.compute_score(option='closest', verbose=1) 41 | #score, scores = bleu_scorer.compute_score(option='average', verbose=1) 42 | 43 | # return (bleu, bleu_info) 44 | return score, scores 45 | 46 | def method(self): 47 | return "Bleu" 48 | -------------------------------------------------------------------------------- /BiDAF_Origin/utils/bleu_metric/bleu_score.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # bleu_scorer.py 4 | # David Chiang 5 | 6 | # Copyright (c) 2004-2006 University of Maryland. All rights 7 | # reserved. Do not redistribute without permission from the 8 | # author. Not for commercial use. 9 | 10 | # Modified by: 11 | # Hao Fang 12 | # Tsung-Yi Lin 13 | 14 | '''Provides: 15 | cook_refs(refs, n=4): Transform a list of reference sentences as strings into a form usable by cook_test(). 16 | cook_test(test, refs, n=4): Transform a test sentence as a string (together with the cooked reference sentences) into a form usable by score_cooked(). 17 | ''' 18 | 19 | import copy 20 | # import sys, math, re 21 | import math 22 | from collections import defaultdict 23 | 24 | 25 | def precook(s, n=4, out=False): 26 | """Takes a string as input and returns an object that can be given to 27 | either cook_refs or cook_test. This is optional: cook_refs and cook_test 28 | can take string arguments as well.""" 29 | words = s.split() 30 | counts = defaultdict(int) 31 | for k in range(1, n + 1): 32 | for i in range(len(words) - k + 1): 33 | ngram = tuple(words[i:i + k]) 34 | counts[ngram] += 1 35 | return (len(words), counts) 36 | 37 | 38 | def cook_refs(refs, eff=None, n=4): ## lhuang: oracle will call with "average" 39 | '''Takes a list of reference sentences for a single segment 40 | and returns an object that encapsulates everything that BLEU 41 | needs to know about them.''' 42 | 43 | reflen = [] 44 | maxcounts = {} 45 | for ref in refs: 46 | rl, counts = precook(ref, n) 47 | reflen.append(rl) 48 | for (ngram, count) in counts.items(): 49 | maxcounts[ngram] = max(maxcounts.get(ngram, 0), count) 50 | 51 | # Calculate effective reference sentence length. 52 | if eff == "shortest": 53 | reflen = min(reflen) 54 | elif eff == "average": 55 | reflen = float(sum(reflen)) / len(reflen) 56 | 57 | ## lhuang: N.B.: leave reflen computaiton to the very end!! 58 | 59 | ## lhuang: N.B.: in case of "closest", keep a list of reflens!! 
(bad design) 60 | 61 | return (reflen, maxcounts) 62 | 63 | 64 | def cook_test(test, xxx_todo_changeme, eff=None, n=4): 65 | '''Takes a test sentence and returns an object that 66 | encapsulates everything that BLEU needs to know about it.''' 67 | (reflen, refmaxcounts) = xxx_todo_changeme 68 | testlen, counts = precook(test, n, True) 69 | 70 | result = {} 71 | 72 | # Calculate effective reference sentence length. 73 | 74 | if eff == "closest": 75 | result["reflen"] = min((abs(l - testlen), l) for l in reflen)[1] 76 | else: ## i.e., "average" or "shortest" or None 77 | result["reflen"] = reflen 78 | 79 | result["testlen"] = testlen 80 | 81 | result["guess"] = [max(0, testlen - k + 1) for k in range(1, n + 1)] 82 | 83 | result['correct'] = [0] * n 84 | for (ngram, count) in counts.items(): 85 | result["correct"][len(ngram) - 1] += min(refmaxcounts.get(ngram, 0), count) 86 | 87 | return result 88 | 89 | 90 | class BleuScorer(object): 91 | """Bleu scorer. 92 | """ 93 | 94 | __slots__ = "n", "crefs", "ctest", "_score", "_ratio", "_testlen", "_reflen", "special_reflen" 95 | 96 | # special_reflen is used in oracle (proportional effective ref len for a node). 97 | 98 | def copy(self): 99 | ''' copy the refs.''' 100 | new = BleuScorer(n=self.n) 101 | new.ctest = copy.copy(self.ctest) 102 | new.crefs = copy.copy(self.crefs) 103 | new._score = None 104 | return new 105 | 106 | def __init__(self, test=None, refs=None, n=4, special_reflen=None): 107 | ''' singular instance ''' 108 | 109 | self.n = n 110 | self.crefs = [] 111 | self.ctest = [] 112 | self.cook_append(test, refs) 113 | self.special_reflen = special_reflen 114 | 115 | def cook_append(self, test, refs): 116 | '''called by constructor and __iadd__ to avoid creating new instances.''' 117 | 118 | if refs is not None: 119 | self.crefs.append(cook_refs(refs)) 120 | if test is not None: 121 | cooked_test = cook_test(test, self.crefs[-1]) 122 | self.ctest.append(cooked_test) ## N.B.: -1 123 | else: 124 | self.ctest.append(None) # lens of crefs and ctest have to match 125 | 126 | self._score = None ## need to recompute 127 | 128 | def ratio(self, option=None): 129 | self.compute_score(option=option) 130 | return self._ratio 131 | 132 | def score_ratio(self, option=None): 133 | '''return (bleu, len_ratio) pair''' 134 | return (self.fscore(option=option), self.ratio(option=option)) 135 | 136 | def score_ratio_str(self, option=None): 137 | return "%.4f (%.2f)" % self.score_ratio(option) 138 | 139 | def reflen(self, option=None): 140 | self.compute_score(option=option) 141 | return self._reflen 142 | 143 | def testlen(self, option=None): 144 | self.compute_score(option=option) 145 | return self._testlen 146 | 147 | def retest(self, new_test): 148 | if type(new_test) is str: 149 | new_test = [new_test] 150 | assert len(new_test) == len(self.crefs), new_test 151 | self.ctest = [] 152 | for t, rs in zip(new_test, self.crefs): 153 | self.ctest.append(cook_test(t, rs)) 154 | self._score = None 155 | 156 | return self 157 | 158 | def rescore(self, new_test): 159 | ''' replace test(s) with new test(s), and returns the new score.''' 160 | 161 | return self.retest(new_test).compute_score() 162 | 163 | def size(self): 164 | assert len(self.crefs) == len(self.ctest), "refs/test mismatch! 
%d<>%d" % (len(self.crefs), len(self.ctest)) 165 | return len(self.crefs) 166 | 167 | def __iadd__(self, other): 168 | '''add an instance (e.g., from another sentence).''' 169 | 170 | if type(other) is tuple: 171 | ## avoid creating new BleuScorer instances 172 | self.cook_append(other[0], other[1]) 173 | else: 174 | assert self.compatible(other), "incompatible BLEUs." 175 | self.ctest.extend(other.ctest) 176 | self.crefs.extend(other.crefs) 177 | self._score = None ## need to recompute 178 | 179 | return self 180 | 181 | def compatible(self, other): 182 | return isinstance(other, BleuScorer) and self.n == other.n 183 | 184 | def single_reflen(self, option="average"): 185 | return self._single_reflen(self.crefs[0][0], option) 186 | 187 | def _single_reflen(self, reflens, option=None, testlen=None): 188 | 189 | if option == "shortest": 190 | reflen = min(reflens) 191 | elif option == "average": 192 | reflen = float(sum(reflens)) / len(reflens) 193 | elif option == "closest": 194 | reflen = min((abs(l - testlen), l) for l in reflens)[1] 195 | else: 196 | assert False, "unsupported reflen option %s" % option 197 | 198 | return reflen 199 | 200 | def recompute_score(self, option=None, verbose=0): 201 | self._score = None 202 | return self.compute_score(option, verbose) 203 | 204 | def compute_score(self, option=None, verbose=0): 205 | n = self.n 206 | small = 1e-9 207 | tiny = 1e-15 ## so that if guess is 0 still return 0 208 | bleu_list = [[] for _ in range(n)] 209 | 210 | if self._score is not None: 211 | return self._score 212 | 213 | if option is None: 214 | option = "average" if len(self.crefs) == 1 else "closest" 215 | 216 | self._testlen = 0 217 | self._reflen = 0 218 | totalcomps = {'testlen': 0, 'reflen': 0, 'guess': [0] * n, 'correct': [0] * n} 219 | 220 | # for each sentence 221 | for comps in self.ctest: 222 | testlen = comps['testlen'] 223 | self._testlen += testlen 224 | 225 | if self.special_reflen is None: ## need computation 226 | reflen = self._single_reflen(comps['reflen'], option, testlen) 227 | else: 228 | reflen = self.special_reflen 229 | 230 | self._reflen += reflen 231 | 232 | for key in ['guess', 'correct']: 233 | for k in range(n): 234 | totalcomps[key][k] += comps[key][k] 235 | 236 | # append per image bleu score 237 | bleu = 1. 238 | for k in range(n): 239 | bleu *= (float(comps['correct'][k]) + tiny) \ 240 | / (float(comps['guess'][k]) + small) 241 | bleu_list[k].append(bleu ** (1. / (k + 1))) 242 | ratio = (testlen + tiny) / (reflen + small) ## N.B.: avoid zero division 243 | if ratio < 1: 244 | for k in range(n): 245 | bleu_list[k][-1] *= math.exp(1 - 1 / ratio) 246 | 247 | if verbose > 1: 248 | print(comps, reflen) 249 | 250 | totalcomps['reflen'] = self._reflen 251 | totalcomps['testlen'] = self._testlen 252 | 253 | bleus = [] 254 | bleu = 1. 255 | for k in range(n): 256 | bleu *= float(totalcomps['correct'][k] + tiny) \ 257 | / (totalcomps['guess'][k] + small) 258 | bleus.append(bleu ** (1. 
/ (k + 1))) 259 | ratio = (self._testlen + tiny) / (self._reflen + small) ## N.B.: avoid zero division 260 | if ratio < 1: 261 | for k in range(n): 262 | bleus[k] *= math.exp(1 - 1 / ratio) 263 | 264 | if verbose > 0: 265 | print(totalcomps) 266 | print("ratio:", ratio) 267 | 268 | self._score = bleus 269 | return self._score, bleu_list 270 | -------------------------------------------------------------------------------- /BiDAF_Origin/utils/get_vocab.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf8 -*- 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ============================================================================== 17 | """ 18 | Utility function to generate vocabulary file. 19 | """ 20 | 21 | 22 | import argparse 23 | import sys 24 | import json 25 | 26 | from itertools import chain 27 | 28 | 29 | def get_vocab(files, vocab_file): 30 | """ 31 | Builds vocabulary file from field 'segmented_paragraphs' 32 | and 'segmented_question'. 33 | 34 | Args: 35 | files: A list of file names. 36 | vocab_file: The file that stores the vocabulary. 
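        Example (hypothetical invocation, assuming a preprocessed DuReader file):
            python get_vocab.py --files search.train.json --vocab vocab.txt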
37 | """ 38 | vocab = {} 39 | for f in files: 40 | with open(f, 'r') as fin: 41 | for line in fin: 42 | obj = json.loads(line.strip()) 43 | paras = [ 44 | chain(*d['segmented_paragraphs']) 45 | for d in obj['documents']] 46 | doc_tokens = chain(*paras) 47 | question_tokens = obj['segmented_question'] 48 | for t in list(doc_tokens) + question_tokens: 49 | vocab[t] = vocab.get(t, 0) + 1 50 | # output 51 | sorted_vocab = sorted([(v, c) for v, c in vocab.items()], 52 | key=lambda x: x[1], 53 | reverse=True) 54 | with open(vocab_file, 'w') as outf: 55 | for w, c in sorted_vocab: 56 | print >> outf, '{}\t{}'.format(w.encode('utf8'), c) 57 | 58 | 59 | if __name__ == '__main__': 60 | parser = argparse.ArgumentParser() 61 | parser.add_argument('--files', nargs='+', required=True, 62 | help='file list to count vocab from.') 63 | parser.add_argument('--vocab', required=True, 64 | help='file to store counted vocab.') 65 | args = parser.parse_args() 66 | get_vocab(args.files, args.vocab) 67 | 68 | -------------------------------------------------------------------------------- /BiDAF_Origin/utils/json_to_sentence.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def load_data(brc_data, tar_dir): 5 | # print('Converting ' + file) 6 | # fin = open(file, encoding='utf8') 7 | out_file = os.path.join(tar_dir, 'train_set.seg') 8 | with open(out_file, 'w', encoding='utf8') as ftrain: 9 | for sample in brc_data.train_set: 10 | ftrain.write(' '.join(sample['segmented_question']) + '\n') 11 | for passage in sample['passages']: 12 | ftrain.write(' '.join(passage['passage_tokens']) + '\n') 13 | del sample 14 | ftrain.close() 15 | 16 | out_file = os.path.join(tar_dir, 'dev_set.seg') 17 | with open(out_file, 'w', encoding='utf8') as fdev: 18 | for sample in brc_data.dev_set: 19 | fdev.write(' '.join(sample['segmented_question']) + '\n') 20 | for passage in sample['passages']: 21 | fdev.write(' '.join(passage['passage_tokens']) + '\n') 22 | del sample 23 | fdev.close() 24 | 25 | out_file = os.path.join(tar_dir, 'test_set.seg') 26 | with open(out_file, 'w', encoding='utf8') as ftest: 27 | for sample in brc_data.test_set: 28 | ftest.write(' '.join(sample['segmented_question']) + '\n') 29 | for passage in sample['passages']: 30 | ftest.write(' '.join(passage['passage_tokens']) + '\n') 31 | del sample 32 | ftest.close() 33 | -------------------------------------------------------------------------------- /BiDAF_Origin/utils/preprocess.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # ============================================================================== 3 | # Copyright 2017 Baidu.com, Inc. All Rights Reserved 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # ============================================================================== 17 | """ 18 | This module finds the most related paragraph of each document according to recall. 19 | """ 20 | 21 | import sys 22 | # reload(sys) 23 | # sys.setdefaultencoding('utf8') 24 | import json 25 | from collections import Counter 26 | 27 | 28 | def precision_recall_f1(prediction, ground_truth): 29 | """ 30 | This function calculates and returns the precision, recall and f1-score 31 | Args: 32 | prediction: prediction string or list to be matched 33 | ground_truth: golden string or list reference 34 | Returns: 35 | floats of (p, r, f1) 36 | Raises: 37 | None 38 | """ 39 | if not isinstance(prediction, list): 40 | prediction_tokens = prediction.split() 41 | else: 42 | prediction_tokens = prediction 43 | if not isinstance(ground_truth, list): 44 | ground_truth_tokens = ground_truth.split() 45 | else: 46 | ground_truth_tokens = ground_truth 47 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens) 48 | num_same = sum(common.values()) 49 | if num_same == 0: 50 | return 0, 0, 0 51 | p = 1.0 * num_same / len(prediction_tokens) 52 | r = 1.0 * num_same / len(ground_truth_tokens) 53 | f1 = (2 * p * r) / (p + r) 54 | return p, r, f1 55 | 56 | 57 | def recall(prediction, ground_truth): 58 | """ 59 | This function calculates and returns the recall 60 | Args: 61 | prediction: prediction string or list to be matched 62 | ground_truth: golden string or list reference 63 | Returns: 64 | floats of recall 65 | Raises: 66 | None 67 | """ 68 | return precision_recall_f1(prediction, ground_truth)[1] 69 | 70 | 71 | def f1_score(prediction, ground_truth): 72 | """ 73 | This function calculates and returns the f1-score 74 | Args: 75 | prediction: prediction string or list to be matched 76 | ground_truth: golden string or list reference 77 | Returns: 78 | floats of f1 79 | Raises: 80 | None 81 | """ 82 | return precision_recall_f1(prediction, ground_truth)[2] 83 | 84 | 85 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): 86 | """ 87 | This function calculates and returns the precision, recall and f1-score 88 | Args: 89 | metric_fn: metric function pointer which calculates scores according to corresponding logic. 90 | prediction: prediction string or list to be matched 91 | ground_truth: golden string or list reference 92 | Returns: 93 | floats of (p, r, f1) 94 | Raises: 95 | None 96 | """ 97 | scores_for_ground_truths = [] 98 | for ground_truth in ground_truths: 99 | score = metric_fn(prediction, ground_truth) 100 | scores_for_ground_truths.append(score) 101 | return max(scores_for_ground_truths) 102 | 103 | 104 | def find_best_question_match(doc, question, with_score=False): 105 | """ 106 | For each docment, find the paragraph that matches best to the question. 107 | Args: 108 | doc: The document object. 109 | question: The question tokens. 110 | with_score: If True then the match score will be returned, 111 | otherwise False. 112 | Returns: 113 | The index of the best match paragraph, if with_score=False, 114 | otherwise returns a tuple of the index of the best match paragraph 115 | and the match score of that paragraph. 
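        Note: when several paragraphs tie on the recall score, the shorter
        paragraph is preferred (see the tie-breaking condition in the loop below).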
116 | """ 117 | most_related_para = -1 118 | max_related_score = 0 119 | most_related_para_len = 0 120 | for p_idx, para_tokens in enumerate(doc['segmented_paragraphs']): 121 | if len(question) > 0: 122 | related_score = metric_max_over_ground_truths(recall, 123 | para_tokens, 124 | question) 125 | else: 126 | related_score = 0 127 | 128 | if related_score > max_related_score \ 129 | or (related_score == max_related_score \ 130 | and len(para_tokens) < most_related_para_len): 131 | most_related_para = p_idx 132 | max_related_score = related_score 133 | most_related_para_len = len(para_tokens) 134 | if most_related_para == -1: 135 | most_related_para = 0 136 | if with_score: 137 | return most_related_para, max_related_score 138 | return most_related_para 139 | 140 | 141 | def find_fake_answer(sample): 142 | """ 143 | For each document, finds the most related paragraph based on recall, 144 | then finds a span that maximize the f1_score compared with the gold answers 145 | and uses this span as a fake answer span 146 | Args: 147 | sample: a sample in the dataset 148 | Returns: 149 | None 150 | Raises: 151 | None 152 | """ 153 | for doc in sample['documents']: 154 | most_related_para = -1 155 | most_related_para_len = 999999 156 | max_related_score = 0 157 | for p_idx, para_tokens in enumerate(doc['segmented_paragraphs']): 158 | if len(sample['segmented_answers']) > 0: 159 | related_score = metric_max_over_ground_truths(recall, 160 | para_tokens, 161 | sample['segmented_answers']) 162 | else: 163 | continue 164 | if related_score > max_related_score \ 165 | or (related_score == max_related_score 166 | and len(para_tokens) < most_related_para_len): 167 | most_related_para = p_idx 168 | most_related_para_len = len(para_tokens) 169 | max_related_score = related_score 170 | doc['most_related_para'] = most_related_para 171 | 172 | sample['answer_docs'] = [] 173 | sample['answer_spans'] = [] 174 | sample['fake_answers'] = [] 175 | sample['match_scores'] = [] 176 | 177 | best_match_score = 0 178 | best_match_d_idx, best_match_span = -1, [-1, -1] 179 | best_fake_answer = None 180 | answer_tokens = set() 181 | for segmented_answer in sample['segmented_answers']: 182 | answer_tokens = answer_tokens | set([token for token in segmented_answer]) 183 | for d_idx, doc in enumerate(sample['documents']): 184 | if not doc['is_selected']: 185 | continue 186 | if doc['most_related_para'] == -1: 187 | doc['most_related_para'] = 0 188 | most_related_para_tokens = doc['segmented_paragraphs'][doc['most_related_para']][:1000] 189 | for start_tidx in range(len(most_related_para_tokens)): 190 | if most_related_para_tokens[start_tidx] not in answer_tokens: 191 | continue 192 | for end_tidx in range(len(most_related_para_tokens) - 1, start_tidx - 1, -1): 193 | span_tokens = most_related_para_tokens[start_tidx: end_tidx + 1] 194 | if len(sample['segmented_answers']) > 0: 195 | match_score = metric_max_over_ground_truths(f1_score, span_tokens, 196 | sample['segmented_answers']) 197 | else: 198 | match_score = 0 199 | if match_score == 0: 200 | break 201 | if match_score > best_match_score: 202 | best_match_d_idx = d_idx 203 | best_match_span = [start_tidx, end_tidx] 204 | best_match_score = match_score 205 | best_fake_answer = ''.join(span_tokens) 206 | if best_match_score > 0: 207 | sample['answer_docs'].append(best_match_d_idx) 208 | sample['answer_spans'].append(best_match_span) 209 | sample['fake_answers'].append(best_fake_answer) 210 | sample['match_scores'].append(best_match_score) 211 | 212 | 213 | if __name__ == 
'__main__': 214 | for line in sys.stdin: 215 | sample = json.loads(line) 216 | find_fake_answer(sample) 217 | print(json.dumps(sample, encoding='utf8', ensure_ascii=False)) 218 | -------------------------------------------------------------------------------- /BiDAF_Origin/utils/pretrain_embedding.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import logging 4 | from gensim.models import word2vec 5 | from .json_to_sentence import load_data 6 | 7 | 8 | def pre_train(brc_data, segmented_dir): 9 | # parser = argparse.ArgumentParser('Reading Comprehension on BaiduRC dataset') 10 | # path_settings = parser.add_argument_group('path settings') 11 | # path_settings.add_argument('--train_files', nargs='+', 12 | # default=['../data/trainset/search.train.json'], 13 | # help='list of files that contain the preprocessed train data') 14 | # path_settings.add_argument('--dev_files', nargs='+', 15 | # default=['../data/devset/search.dev.json'], 16 | # help='list of files that contain the preprocessed dev data') 17 | # path_settings.add_argument('--test_files', nargs='+', 18 | # default=['../data/testset/search.test.json'], 19 | # help='list of files that contain the preprocessed test data') 20 | # path_settings.add_argument('--segmented_dir', default='../data/segmented', 21 | # help='the dir to store segmented sentences') 22 | 23 | sys.path.append('..') 24 | # args = parser.parse_args() 25 | # for files in args.train_files + args.dev_files + args.test_files: 26 | # json_to_sentence.load_data(files, args.segmented_dir) 27 | load_data(brc_data, segmented_dir) 28 | 29 | program = os.path.basename(sys.argv[0]) 30 | logger = logging.getLogger(program) 31 | logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s') 32 | logging.root.setLevel(level=logging.INFO) 33 | logger.info("running %s" % ' '.join(sys.argv)) 34 | 35 | model = word2vec.Word2Vec(word2vec.PathLineSentences(segmented_dir), size=300, min_count=2, workers=8, iter=10) 36 | with open(os.path.join(segmented_dir, 'w2v_dic.data'), 'w', encoding='utf-8') as f: 37 | for word in model.wv.vocab: 38 | f.write(word + ' ') 39 | f.write(' '.join(list(map(str, model[word])))) 40 | f.write('\n') 41 | f.close() 42 | -------------------------------------------------------------------------------- /BiDAF_Origin/utils/rouge_metric/__pycache__/rouge.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shiningliang/MRC2018/9bf27efbbc028116eb01ea2df1bf0d063cb09ae9/BiDAF_Origin/utils/rouge_metric/__pycache__/rouge.cpython-36.pyc -------------------------------------------------------------------------------- /BiDAF_Origin/utils/rouge_metric/rouge.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # File Name : rouge.py 4 | # 5 | # Description : Computes ROUGE-L metric as described by Lin and Hovey (2004) 6 | # 7 | # Creation Date : 2015-01-07 06:03 8 | # Author : Ramakrishna Vedantam 9 | 10 | import numpy as np 11 | 12 | 13 | # import pdb 14 | 15 | 16 | def my_lcs(string, sub): 17 | """ 18 | Calculates longest common subsequence for a pair of tokenized strings 19 | :param string : list of str : tokens from a string split using whitespace 20 | :param sub : list of str : shorter string, also split using whitespace 21 | :returns: length (list of int): length of the longest common subsequence between the two strings 22 | 23 | Note: my_lcs only 
gives length of the longest common subsequence, not the actual LCS 24 | """ 25 | if (len(string) < len(sub)): 26 | sub, string = string, sub 27 | 28 | lengths = [[0 for i in range(0, len(sub) + 1)] for j in range(0, len(string) + 1)] 29 | 30 | for j in range(1, len(sub) + 1): 31 | for i in range(1, len(string) + 1): 32 | if (string[i - 1] == sub[j - 1]): 33 | lengths[i][j] = lengths[i - 1][j - 1] + 1 34 | else: 35 | lengths[i][j] = max(lengths[i - 1][j], lengths[i][j - 1]) 36 | 37 | return lengths[len(string)][len(sub)] 38 | 39 | 40 | class Rouge(): 41 | ''' 42 | Class for computing ROUGE-L score for a set of candidate sentences for the MS COCO test set 43 | 44 | ''' 45 | 46 | def __init__(self): 47 | # vrama91: updated the value below based on discussion with Hovey 48 | self.beta = 1.2 49 | 50 | def calc_score(self, candidate, refs): 51 | """ 52 | Compute ROUGE-L score given one candidate and references for an image 53 | :param candidate: str : candidate sentence to be evaluated 54 | :param refs: list of str : COCO reference sentences for the particular image to be evaluated 55 | :returns score: int (ROUGE-L score for the candidate evaluated against references) 56 | """ 57 | assert (len(candidate) == 1) 58 | assert (len(refs) > 0) 59 | prec = [] 60 | rec = [] 61 | 62 | # split into tokens 63 | token_c = candidate[0].split(" ") 64 | 65 | for reference in refs: 66 | # split into tokens 67 | token_r = reference.split(" ") 68 | # compute the longest common subsequence 69 | lcs = my_lcs(token_r, token_c) 70 | prec.append(lcs / float(len(token_c))) 71 | rec.append(lcs / float(len(token_r))) 72 | 73 | prec_max = max(prec) 74 | rec_max = max(rec) 75 | 76 | if (prec_max != 0 and rec_max != 0): 77 | score = ((1 + self.beta ** 2) * prec_max * rec_max) / float(rec_max + self.beta ** 2 * prec_max) 78 | else: 79 | score = 0.0 80 | return score 81 | 82 | def compute_score(self, gts, res): 83 | """ 84 | Computes Rouge-L score given a set of reference and candidate sentences for the dataset 85 | Invoked by evaluate_captions.py 86 | :param hypo_for_image: dict : candidate / test sentences with "image name" key and "tokenized sentences" as values 87 | :param ref_for_image: dict : reference MS-COCO sentences with "image name" key and "tokenized sentences" as values 88 | :returns: average_score: float (mean ROUGE-L score computed by averaging scores for all the images) 89 | """ 90 | assert (list(gts.keys()) == list(res.keys())) 91 | imgIds = list(gts.keys()) 92 | 93 | score = [] 94 | for id in imgIds: 95 | hypo = res[id] 96 | ref = gts[id] 97 | 98 | score.append(self.calc_score(hypo, ref)) 99 | 100 | # Sanity check. 
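        # Each candidate must be a list containing exactly one sentence string and
        # each reference list must be non-empty, mirroring the checks in calc_score().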
101 | assert (type(hypo) is list) 102 | assert (len(hypo) == 1) 103 | assert (type(ref) is list) 104 | assert (len(ref) > 0) 105 | 106 | average_score = np.mean(np.array(score)) 107 | return average_score, np.array(score) 108 | 109 | def method(self): 110 | return "Rouge" 111 | -------------------------------------------------------------------------------- /GatedRNN/GatedRNN.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import logging 3 | import time 4 | import os 5 | from basic_rnn import dot_attention, dense, cudnn_gru 6 | 7 | 8 | class GatedRNN(object): 9 | def __init__(self, args, batch, token_embeddings=None, trainable=True, opt=True): 10 | # logging 11 | self.logger = logging.getLogger("brc") 12 | # basic config 13 | self.batch_size = args.batch_size 14 | self.hidden_size = args.hidden_size 15 | self.output_size = 3 16 | # self.d_a = 300 17 | # self.r = 64 18 | # self.p_coef = 1 19 | self.layer_num = args.layer_num 20 | self.optim_type = args.optim 21 | self.weight_decay = args.weight_decay 22 | self.dropout_keep_prob = args.dropout_keep_prob 23 | self.trainable = trainable 24 | # length limit 25 | self.max_q_len = args.max_q_len 26 | self.max_a_len = args.max_a_len 27 | # session info 28 | sess_config = tf.ConfigProto() 29 | sess_config.gpu_options.allow_growth = True 30 | self.sess = tf.Session(config=sess_config) 31 | 32 | self.a, self.q, self.answers_type, self.qa_id = batch.get_next() 33 | self.lr = tf.get_variable('lr', shape=[], dtype=tf.float32, trainable=False) 34 | self.is_train = tf.get_variable('is_train', shape=[], dtype=tf.bool, trainable=False) 35 | self.a_mask = tf.cast(self.a, tf.bool) 36 | self.q_mask = tf.cast(self.q, tf.bool) 37 | self.a_len = tf.reduce_sum(tf.cast(self.a_mask, tf.int32), axis=1) 38 | self.q_len = tf.reduce_sum(tf.cast(self.q_mask, tf.int32), axis=1) 39 | self.N = tf.shape(self.qa_id)[0] 40 | 41 | self._build_graph(token_embeddings) 42 | 43 | def _build_graph(self, token_embeddings): 44 | start_t = time.time() 45 | self._embed(token_embeddings) 46 | self._encode() 47 | self._gated_attention() 48 | self._self_attention() 49 | # self._annotation() 50 | self._predict() 51 | self._compute_loss() 52 | if self.trainable: 53 | self._create_train_op() 54 | self.logger.info('Time to build graph: {} s'.format(time.time() - start_t)) 55 | 56 | def _embed(self, token_embeddings): 57 | with tf.device('/cpu:0'), tf.variable_scope('word_embedding', reuse=tf.AUTO_REUSE): 58 | word_embeddings = tf.get_variable('word_embeddings', 59 | initializer=tf.constant(token_embeddings, dtype=tf.float32), 60 | trainable=False) 61 | self.a_emb = tf.nn.embedding_lookup(word_embeddings, self.a) 62 | self.q_emb = tf.nn.embedding_lookup(word_embeddings, self.q) 63 | 64 | def _encode(self): 65 | with tf.variable_scope('answer_encoding', reuse=tf.AUTO_REUSE): 66 | a_rnn = cudnn_gru(num_layers=2 * self.layer_num, num_units=self.hidden_size, batch_size=self.N, 67 | input_size=self.a_emb.get_shape().as_list()[-1], 68 | keep_prob=self.dropout_keep_prob, is_train=self.is_train) 69 | self.a_encodes = a_rnn(self.a_emb, seq_len=self.a_len) 70 | with tf.variable_scope('question_encoding', reuse=tf.AUTO_REUSE): 71 | q_rnn = cudnn_gru(num_layers=2 * self.layer_num, num_units=self.hidden_size, batch_size=self.N, 72 | input_size=self.q_emb.get_shape().as_list()[-1], 73 | keep_prob=self.dropout_keep_prob, is_train=self.is_train) 74 | self.q_encodes = q_rnn(self.q_emb, seq_len=self.q_len) 75 | 76 | def _gated_attention(self): 
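        # Question-aware answer encoding: dot_attention lets each answer position
        # attend over the question encodings (masked by q_mask), and the attended
        # representation is re-encoded with a GRU. dot_attention comes from
        # basic_rnn.py and is assumed here to apply the R-Net style attention gate.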
77 | with tf.variable_scope('gated_attention', reuse=tf.AUTO_REUSE): 78 | self.qa_att = dot_attention(self.a_encodes, self.q_encodes, mask=self.q_mask, 79 | hidden=self.hidden_size, keep_prob=self.dropout_keep_prob, 80 | is_train=self.is_train) 81 | gated_rnn = cudnn_gru(num_layers=self.layer_num, num_units=self.hidden_size, batch_size=self.N, 82 | input_size=self.qa_att.get_shape().as_list()[-1], keep_prob=self.dropout_keep_prob, 83 | is_train=self.is_train) 84 | self.gated_att = gated_rnn(self.qa_att, self.a_len) 85 | 86 | def _self_attention(self): 87 | with tf.variable_scope('self_attention', reuse=tf.AUTO_REUSE): 88 | self.aa_att = dot_attention(self.gated_att, self.gated_att, mask=self.a_mask, 89 | hidden=self.hidden_size, keep_prob=self.dropout_keep_prob, 90 | is_train=self.is_train) 91 | self_rnn = cudnn_gru(num_layers=self.layer_num, num_units=self.hidden_size, batch_size=self.N, 92 | input_size=self.aa_att.get_shape().as_list()[-1], keep_prob=self.dropout_keep_prob, 93 | is_train=self.is_train) 94 | self.self_att = self_rnn(self.aa_att, self.a_len) 95 | 96 | def _annotation(self): 97 | # shape(W_s1) = d_a * 2u 98 | self.W_s1 = tf.get_variable('W_s1', shape=[self.d_a, 2 * self.hidden_size], 99 | initializer=tf.contrib.layers.xavier_initializer()) 100 | # shape(W_s2) = r * d_a 101 | self.W_s2 = tf.get_variable('W_s2', shape=[self.r, self.d_a], 102 | initializer=tf.contrib.layers.xavier_initializer()) 103 | self.A = tf.nn.softmax(tf.map_fn( 104 | lambda x: tf.matmul(self.W_s2, x), 105 | tf.tanh(tf.map_fn(lambda x: tf.matmul(self.W_s1, tf.transpose(x)), 106 | self.gated_att)))) 107 | self.M = tf.matmul(self.A, self.gated_att) 108 | self.A_T = tf.transpose(self.A, perm=[0, 2, 1]) 109 | tile_eye = tf.tile(tf.eye(self.r), [self.N, 1]) 110 | tile_eye = tf.reshape(tile_eye, [-1, self.r, self.r]) 111 | self.AA_T = tf.matmul(self.A, self.A_T) - tile_eye 112 | self.P = tf.square(tf.norm(self.AA_T, axis=[-2, -1], ord='fro')) 113 | 114 | def _predict(self): 115 | with tf.variable_scope('predict', reuse=tf.AUTO_REUSE): 116 | self.att = tf.reshape(self.self_att, shape=[self.N, 2 * self.max_a_len * self.hidden_size]) 117 | self.mlp = tf.nn.relu(dense(self.att, hidden=4 * self.hidden_size, scope='dense_0')) 118 | if self.is_train: 119 | self.mlp = tf.nn.dropout(self.mlp, self.dropout_keep_prob) 120 | self.mlp = tf.nn.relu(dense(self.mlp, hidden=2 * self.hidden_size, scope='dense_1')) 121 | if self.is_train: 122 | self.mlp = tf.nn.dropout(self.mlp, self.dropout_keep_prob) 123 | self.outputs = dense(self.mlp, hidden=self.output_size, scope='output') 124 | 125 | def _compute_loss(self): 126 | self.pre_labels = tf.argmax(self.outputs, axis=1) 127 | self.loss = tf.reduce_mean(tf.reduce_sum(tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.outputs, 128 | labels=tf.stop_gradient( 129 | self.answers_type)))) 130 | 131 | def _create_train_op(self): 132 | """ 133 | Selects the training algorithm and creates a train operation with it 134 | """ 135 | with tf.variable_scope('optimizer', reuse=tf.AUTO_REUSE): 136 | if self.optim_type == 'adadelta': 137 | self.optimizer = tf.train.AdadeltaOptimizer(self.lr) 138 | elif self.optim_type == 'adam': 139 | self.optimizer = tf.train.AdamOptimizer(self.lr) 140 | elif self.optim_type == 'rprop': 141 | self.optimizer = tf.train.RMSPropOptimizer(self.lr) 142 | elif self.optim_type == 'sgd': 143 | self.optimizer = tf.train.GradientDescentOptimizer(self.lr) 144 | else: 145 | raise NotImplementedError('Unsupported optimizer: {}'.format(self.optim_type)) 146 | 
self.train_op = self.optimizer.minimize(self.loss) 147 | -------------------------------------------------------------------------------- /GatedRNN/GatedRNN_prepro.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import pickle as pkl 3 | from tqdm import tqdm 4 | import ujson as json 5 | import numpy as np 6 | import jieba 7 | import os 8 | 9 | TYPE = {'Yes': 0, 'No': 1, 'Depends': 2, 'No_Opinion': 1} 10 | 11 | 12 | def split_answers(answers): 13 | tokens = jieba.cut(answers) 14 | return [token for token in tokens] 15 | 16 | 17 | def filter_questions(filenames): 18 | questions = {} 19 | for filename in filenames: 20 | with open(filename, 'r', encoding='utf8') as fh: 21 | for line in fh: 22 | source = json.loads(line.strip()) 23 | if source['question_type'] != 'YES_NO': 24 | continue 25 | questions[source['question_id']] = source['segmented_question'] 26 | print("{} questions in total".format(len(questions))) 27 | return questions 28 | 29 | 30 | def process_test_file(filename, questions, max_p_len=500): 31 | print("Generating test examples...") 32 | total = 0 33 | examples = [] 34 | other_examples = [] 35 | eval_examples = {} 36 | with open(filename, 'r', encoding='utf8') as fh: 37 | for line in fh: 38 | source = json.loads(line.strip()) 39 | if source['question_type'] != 'YES_NO': 40 | other_examples.append(source) 41 | continue 42 | total += 1 43 | answer_type = -1 44 | example = {'question_tokens': questions[str(source['question_id'])], 45 | 'answer_tokens': split_answers(source['answers'][0]), 46 | 'answer_type': answer_type, 47 | 'id': total} 48 | eval_examples[str(total)] = {'question_id': source['question_id'], 49 | 'answers': source['answers']} 50 | examples.append(example) 51 | # random.shuffle(examples) 52 | print("{} questions in total".format(len(examples))) 53 | return examples, eval_examples, other_examples 54 | 55 | 56 | def process_file(filenames, data_type, max_p_len=500): 57 | print("Generating {} examples...".format(data_type)) 58 | total = 0 59 | examples = [] 60 | eval_examples = {} 61 | for filename in filenames: 62 | with open(filename, 'r', encoding='utf8') as fh: 63 | for line in fh: 64 | source = json.loads(line.strip()) 65 | if source['question_type'] != 'YES_NO': 66 | continue 67 | if len(source['answer_spans']) == 0: 68 | continue 69 | if source['answer_spans'][0][1] >= max_p_len: 70 | continue 71 | question_tokens = source['segmented_question'] 72 | for idx, answer_tokens in enumerate(source['segmented_answers']): 73 | total += 1 74 | answer_type = TYPE[source['yesno_answers'][idx]] if len(source['yesno_answers']) else -1 75 | example = {'question_tokens': question_tokens, 76 | 'answer_tokens': answer_tokens, 77 | 'answer_type': answer_type, 78 | 'id': total} 79 | eval_examples[str(total)] = {'question_id': source['question_id'], 80 | 'answer_type': answer_type} 81 | examples.append(example) 82 | # random.shuffle(examples) 83 | print("{} questions in total".format(len(examples))) 84 | return examples, eval_examples 85 | 86 | 87 | def build_features(config, examples, data_type, out_file, word2id): 88 | ans_limit = config.max_a_len 89 | ques_limit = config.max_q_len 90 | 91 | print("Processing {} examples...".format(data_type)) 92 | writer = tf.python_io.TFRecordWriter(out_file) 93 | total = 0 94 | meta = {} 95 | for example in tqdm(examples): 96 | total += 1 97 | answer_token_ids = np.zeros([ans_limit], dtype=np.int32) 98 | question_token_ids = np.zeros([ques_limit], dtype=np.int32) 99 | 
answer_type = np.zeros([3], dtype=np.int32) 100 | answer_type[example['answer_type']] = 1 101 | 102 | def _get_word(word): 103 | for each in (word, word.lower(), word.capitalize(), word.upper()): 104 | if each in word2id: 105 | return word2id[each] 106 | return 1 107 | 108 | answers_token_num = min(len(example['answer_tokens']), ques_limit) 109 | for i in range(answers_token_num): 110 | answer_token_ids[i] = _get_word(example['answer_tokens'][i]) 111 | question_token_num = min(len(example['question_tokens']), ques_limit) 112 | for j in range(question_token_num): 113 | question_token_ids[j] = _get_word(example['question_tokens'][j]) 114 | 115 | record = tf.train.Example(features=tf.train.Features( 116 | feature={ 117 | 'answer_token_ids': tf.train.Feature( 118 | bytes_list=tf.train.BytesList(value=[answer_token_ids.tostring()])), 119 | 'question_token_ids': tf.train.Feature( 120 | bytes_list=tf.train.BytesList(value=[question_token_ids.tostring()])), 121 | 'answer_type': tf.train.Feature( 122 | bytes_list=tf.train.BytesList(value=[answer_type.tostring()])), 123 | 'id': tf.train.Feature(int64_list=tf.train.Int64List(value=[example['id']])) 124 | })) 125 | writer.write(record.SerializeToString()) 126 | print("Build {} instances of features in total".format(total)) 127 | meta["total"] = total 128 | writer.close() 129 | return meta 130 | 131 | 132 | def save(filename, obj, message=None): 133 | if message is not None: 134 | print("Saving {}...".format(message)) 135 | with open(filename, "w") as fh: 136 | json.dump(obj, fh) 137 | 138 | 139 | def prepro(config, flags): 140 | token2id = None 141 | if os.path.isfile(flags.token2id_file): 142 | with open(flags.token2id_file, 'r') as fh: 143 | token2id = json.load(fh) 144 | # train_examples, _ = process_file(config.train_files, 'train') 145 | # train_meta = build_features(config, train_examples, 'train', flags.train_record_file, token2id) 146 | # save(flags.train_meta, train_meta, message='train meta') 147 | # del train_examples, train_meta 148 | # 149 | # dev_examples, dev_eval = process_file(config.dev_files, "dev") 150 | # # 创建dev TFRecord文件 151 | # dev_meta = build_features(config, dev_examples, "dev", flags.dev_record_file, token2id) 152 | # save(flags.dev_eval_file, dev_eval, message="dev eval") 153 | # save(flags.dev_meta, dev_meta, message="dev meta") 154 | # del dev_examples, dev_eval, dev_meta 155 | 156 | # filtered_questions = filter_questions(config.test_files) 157 | # save(flags.filtered_questions, filtered_questions, message='filtered questions') 158 | filtered_questions = None 159 | if os.path.isfile(flags.token2id_file): 160 | with open(flags.filtered_questions, 'r') as fh: 161 | filtered_questions = json.load(fh) 162 | test_examples, test_eval, other_examples = process_test_file(flags.predicted_answers, filtered_questions) 163 | # 创建test TFRecord文件 164 | test_meta = build_features(config, test_examples, "test", flags.test_record_file, token2id) 165 | save(flags.test_eval_file, test_eval, message="test eval") 166 | save(flags.final_file, other_examples, message="test final") 167 | with open(flags.final_file, 'w') as fout: 168 | for example in other_examples: 169 | fout.write(json.dumps(example, ensure_ascii=False) + '\n') 170 | fout.close() 171 | save(flags.test_meta, test_meta, message="test meta") 172 | del test_examples, test_meta, test_eval, other_examples 173 | -------------------------------------------------------------------------------- /GatedRNN/GatedRNN_util.py: 
-------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import os 4 | import ujson as json 5 | from sklearn.metrics import accuracy_score 6 | TYPE = {0: 'Yes', 1: 'No', 2: 'Depends'} 7 | 8 | 9 | def get_record_parser(config): 10 | def parse(example): 11 | ans_limit = config.max_a_len 12 | ques_limit = config.max_q_len 13 | features = tf.parse_single_example(example, 14 | features={ 15 | 'answer_token_ids': tf.FixedLenFeature([], tf.string), 16 | 'question_token_ids': tf.FixedLenFeature([], tf.string), 17 | 'answer_type': tf.FixedLenFeature([], tf.string), 18 | 'id': tf.FixedLenFeature([], tf.int64) 19 | }) 20 | answer_token_ids = tf.reshape(tf.decode_raw(features['answer_token_ids'], tf.int32), [ans_limit]) 21 | question_token_ids = tf.reshape(tf.decode_raw(features['question_token_ids'], tf.int32), [ques_limit]) 22 | answer_type = tf.reshape(tf.decode_raw(features['answer_type'], tf.int32), [3]) 23 | qa_id = features['id'] 24 | return answer_token_ids, question_token_ids, answer_type, qa_id 25 | 26 | return parse 27 | 28 | 29 | def get_batch_dataset(record_file, parser, config): 30 | num_threads = tf.constant(config.num_threads, dtype=tf.int32) 31 | dataset = tf.data.TFRecordDataset(record_file).map(parser, num_parallel_calls=num_threads).shuffle( 32 | config.capacity).batch(config.batch_size).repeat(config.epochs) 33 | return dataset 34 | 35 | 36 | def get_dataset(record_file, parser, config): 37 | num_threads = tf.constant(config.num_threads, dtype=tf.int32) 38 | dataset = tf.data.TFRecordDataset(record_file).map( 39 | parser, num_parallel_calls=num_threads).batch(config.batch_size).repeat() 40 | return dataset 41 | 42 | 43 | def evaluate_batch(model, num_batches, eval_file, sess, data_type, handle, str_handle): 44 | losses = [] 45 | # pred_answers = [] 46 | pre_ans_types, ref_ans_types = [], [] 47 | for i in range(num_batches): 48 | qa_ids, loss, pre_labels = sess.run([model.qa_id, model.loss, model.pre_labels], 49 | feed_dict={handle: str_handle} if handle is not None else None) 50 | losses.append(loss) 51 | for qa_id, pre_label in zip(qa_ids, pre_labels): 52 | sample = eval_file[str(qa_id)] 53 | pre_ans_types.append(pre_label) 54 | ref_ans_types.append(sample['answer_type']) 55 | 56 | avg_loss = np.mean(losses) 57 | avg_acc = accuracy_score(y_true=ref_ans_types, y_pred=pre_ans_types) 58 | 59 | loss_sum = tf.Summary(value=[tf.Summary.Value(tag="{}/loss".format(data_type), simple_value=avg_loss), ]) 60 | acc_sum = tf.Summary(value=[tf.Summary.Value(tag="{}/f1".format(data_type), simple_value=avg_acc), ]) 61 | return avg_loss, avg_acc, [loss_sum, acc_sum] 62 | 63 | 64 | def predict_batch(model, num_batches, eval_file, sess, data_type, final_file, logger): 65 | pred_answers = [] 66 | for i in range(num_batches): 67 | qa_ids, pre_labels = sess.run([model.qa_id, model.pre_labels]) 68 | for qa_id, pre_label in zip(qa_ids, pre_labels): 69 | sample = eval_file[str(qa_id)] 70 | pred_answers.append({'question_id': sample['question_id'], 71 | 'question_type': 'YES_NO', 72 | 'answers': sample['answers'], 73 | 'entity_answers': [[]], 74 | 'yesno_answers': [TYPE[pre_label]]}) 75 | 76 | logger.info('{} questions'.format(len(pred_answers))) 77 | with open(final_file, 'a') as fout: 78 | for pred_answer in pred_answers: 79 | fout.write(json.dumps(pred_answer, ensure_ascii=False) + '\n') 80 | fout.close() 81 | logger.info('Saving classification results') 82 | 
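# ---------------------------------------------------------------------------
# Illustrative sketch only (an assumption, not part of the original file): how
# the helpers above are typically wired into a feedable TF1 input pipeline
# before a batch iterator is handed to GatedRNN via `batch.get_next()`. The
# actual wiring lives in GatedRNN_run.py, which is not shown in this listing.
# ---------------------------------------------------------------------------
def example_input_pipeline(config, flags):
    # Parse TFRecords written by GatedRNN_prepro.build_features.
    parser = get_record_parser(config)
    train_dataset = get_batch_dataset(flags.train_record_file, parser, config)
    dev_dataset = get_dataset(flags.dev_record_file, parser, config)
    # A string-handle iterator lets one graph switch between train/dev batches.
    handle = tf.placeholder(tf.string, shape=[])
    iterator = tf.data.Iterator.from_string_handle(
        handle, train_dataset.output_types, train_dataset.output_shapes)
    train_iterator = train_dataset.make_one_shot_iterator()
    dev_iterator = dev_dataset.make_one_shot_iterator()
    return iterator, handle, train_iterator, dev_iterator
# Usage (inside a tf.Session): feed {handle: sess.run(train_iterator.string_handle())}
# to pull training batches, or the dev iterator's handle to pull dev batches.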
-------------------------------------------------------------------------------- /R-Net/S_model.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import logging 3 | import time 4 | from basic_rnn import cudnn_gru, dot_attention, ptr_net, summ 5 | 6 | 7 | class Model(object): 8 | def __init__(self, args, batch, token_embeddings=None, trainable=True, opt=True): 9 | self.logger = logging.getLogger('brc') 10 | self.hidden_size = args.hidden_size 11 | self.batch_size = args.batch_size 12 | self.layer_num = args.layer_num 13 | self.optim_type = args.optim 14 | self.dropout_keep_prob = args.dropout_keep_prob 15 | self.learning_rate = args.learning_rate 16 | self.weight_decay = args.weight_decay 17 | self.trainable = trainable 18 | # length limit 19 | self.max_p_num = args.max_p_num 20 | self.max_p_len = args.max_p_len 21 | self.max_q_len = args.max_q_len 22 | self.max_a_len = args.max_a_len 23 | 24 | self.p, self.q, self.start_id, self.end_id, self.qa_id = batch.get_next() 25 | self.lr = tf.get_variable('lr', shape=[], dtype=tf.float32, trainable=False) 26 | self.is_train = tf.get_variable('is_train', shape=[], dtype=tf.bool, trainable=False) 27 | self.p_mask = tf.cast(self.p, tf.bool) 28 | self.q_mask = tf.cast(self.q, tf.bool) 29 | # passage的真实长度 30 | self.p_len = tf.reduce_sum(tf.cast(self.p_mask, tf.int32), axis=1) 31 | # self.p = tf.boolean_mask(self.p, mask=self.p_len) 32 | # question的真实长度 33 | self.q_len = tf.reduce_sum(tf.cast(self.q_mask, tf.int32), axis=1) 34 | # self.q = tf.boolean_mask(self.q, mask=self.q_len) 35 | 36 | if opt: 37 | self.N = tf.shape(self.start_id)[0] 38 | # 当前batch中passage的最大长度 39 | self.p_maxlen = tf.reduce_max(self.p_len) 40 | # 当前batch中question的最大长度 41 | self.q_maxlen = tf.reduce_max(self.q_len) 42 | self.p = tf.slice(self.p, [0, 0], [self.N, self.p_maxlen]) 43 | self.q = tf.slice(self.q, [0, 0], [self.N, self.q_maxlen]) 44 | self.p_mask = tf.slice(self.p_mask, [0, 0], [self.N, self.p_maxlen]) 45 | self.q_mask = tf.slice(self.q_mask, [0, 0], [self.N, self.q_maxlen]) 46 | else: 47 | self.p_maxlen, self.q_maxlen = self.max_p_len, self.max_q_len 48 | 49 | self._build_graph(token_embeddings) 50 | 51 | def _build_graph(self, token_embeddings): 52 | """ 53 | Builds the computation graph with Tensorflow 54 | """ 55 | start_t = time.time() 56 | # 对paragraph question做embedding 57 | self._embed(token_embeddings) 58 | # 对paragraph question分别用Bi-LSTM编码 59 | self._encode() 60 | # 基于question-aware的passage编码 61 | self._gated_attention() 62 | self._self_attention() 63 | self._pointer() 64 | # self._predict() 65 | # 对数似然损失,start end两部分损失取平均 66 | self._compute_loss() 67 | if self.trainable: 68 | # 选择优化算法 69 | self._create_train_op() 70 | self.logger.info('Time to build graph: {} s'.format(time.time() - start_t)) 71 | 72 | def _embed(self, token_embeddings): 73 | with tf.device('/cpu:0'), tf.variable_scope('word_embedding', reuse=tf.AUTO_REUSE): 74 | word_embeddings = tf.get_variable('word_embeddings', 75 | initializer=tf.constant(token_embeddings, dtype=tf.float32), 76 | trainable=False) 77 | self.p_emb = tf.nn.embedding_lookup(word_embeddings, self.p) 78 | self.q_emb = tf.nn.embedding_lookup(word_embeddings, self.q) 79 | 80 | def _encode(self): 81 | with tf.variable_scope('passage_encoding', reuse=tf.AUTO_REUSE): 82 | self.p_rnn = cudnn_gru(num_layers=2*self.layer_num, num_units=self.hidden_size, batch_size=self.N, 83 | input_size=self.p_emb.get_shape().as_list()[-1], 84 | keep_prob=self.dropout_keep_prob, 
is_train=self.is_train) 85 | self.p_encodes = self.p_rnn(self.p_emb, seq_len=self.p_len) 86 | with tf.variable_scope('question_encoding', reuse=tf.AUTO_REUSE): 87 | self.q_rnn = cudnn_gru(num_layers=2*self.layer_num, num_units=self.hidden_size, batch_size=self.N, 88 | input_size=self.q_emb.get_shape().as_list()[-1], 89 | keep_prob=self.dropout_keep_prob, is_train=self.is_train) 90 | self.q_encodes = self.q_rnn(self.q_emb, seq_len=self.q_len) 91 | 92 | def _gated_attention(self): 93 | with tf.variable_scope('gated_attention', reuse=tf.AUTO_REUSE): 94 | self.qp_att = dot_attention(self.p_encodes, self.q_encodes, mask=self.q_mask, 95 | hidden=self.hidden_size, keep_prob=self.dropout_keep_prob, 96 | is_train=self.is_train) 97 | gated_rnn = cudnn_gru(num_layers=self.layer_num, num_units=self.hidden_size, batch_size=self.N, 98 | input_size=self.qp_att.get_shape().as_list()[-1], keep_prob=self.dropout_keep_prob, 99 | is_train=self.is_train) 100 | self.gated_att = gated_rnn(self.qp_att, self.p_len) # v_Pt 101 | 102 | def _self_attention(self): 103 | with tf.variable_scope('self_attention', reuse=tf.AUTO_REUSE): 104 | self.pp_att = dot_attention(self.gated_att, self.gated_att, mask=self.p_mask, 105 | hidden=self.hidden_size, keep_prob=self.dropout_keep_prob, 106 | is_train=self.is_train) 107 | self_rnn = cudnn_gru(num_layers=self.layer_num, num_units=self.hidden_size, batch_size=self.N, 108 | input_size=self.pp_att.get_shape().as_list()[-1], keep_prob=self.dropout_keep_prob, 109 | is_train=self.is_train) 110 | self.self_att = self_rnn(self.pp_att, self.p_len) 111 | 112 | def _pointer(self): 113 | with tf.variable_scope('pointer', reuse=tf.AUTO_REUSE): 114 | self.ques_vec = summ(self.q_encodes[:, :, -2 * self.hidden_size:], self.hidden_size, mask=self.q_mask, 115 | keep_prob=self.dropout_keep_prob, is_train=self.is_train) # r_Q 116 | pointer = ptr_net(batch=self.N, hidden=self.ques_vec.get_shape().as_list()[-1], 117 | keep_prob=self.dropout_keep_prob, is_train=self.is_train) 118 | self.logits1, self.logits2 = pointer(self.ques_vec, self.self_att, self.hidden_size, self.p_mask) 119 | 120 | def _predict(self): 121 | with tf.variable_scope("predict"): 122 | outer = tf.matmul(tf.expand_dims(tf.nn.softmax(self.logits1), axis=2), 123 | tf.expand_dims(tf.nn.softmax(self.logits2), axis=1)) 124 | self.outer = tf.matrix_band_part(outer, 0, self.max_a_len) 125 | self.yp1 = tf.argmax(tf.reduce_max(outer, axis=2), axis=1) 126 | self.yp2 = tf.argmax(tf.reduce_max(outer, axis=1), axis=1) 127 | self.start_loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.logits1, labels=tf.stop_gradient( 128 | tf.one_hot(self.start_id, tf.shape(self.logits1)[1], axis=1))) 129 | self.end_loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.logits2, labels=tf.stop_gradient( 130 | tf.one_hot(self.start_id, tf.shape(self.logits2)[1], axis=1))) 131 | self.loss = tf.reduce_mean(self.start_loss + self.end_loss) 132 | 133 | def _compute_loss(self): 134 | def sparse_nll_loss(probs, labels, epsilon=1e-9, scope=None): 135 | with tf.name_scope(scope, "log_loss"): 136 | labels = tf.one_hot(labels, tf.shape(probs)[1], axis=1) 137 | losses = - tf.reduce_sum(labels * tf.log(probs + epsilon), 1) 138 | return losses 139 | self.logits1 = tf.nn.softmax(self.logits1) 140 | self.logits2 = tf.nn.softmax(self.logits2) 141 | self.start_loss = sparse_nll_loss(probs=self.logits1, labels=self.start_id) 142 | self.end_loss = sparse_nll_loss(probs=self.logits2, labels=self.end_id) 143 | self.all_params = tf.trainable_variables() 144 | 
self.loss = tf.reduce_mean(tf.add(self.start_loss, self.end_loss)) 145 | if self.weight_decay > 0: 146 | with tf.variable_scope('l2_loss'): 147 | l2_loss = tf.add_n([tf.nn.l2_loss(v) for v in self.all_params]) 148 | self.loss += self.weight_decay * l2_loss 149 | 150 | def _create_train_op(self): 151 | with tf.variable_scope('optimizer', reuse=tf.AUTO_REUSE): 152 | if self.optim_type == 'adadelta': 153 | self.optimizer = tf.train.AdadeltaOptimizer(self.lr) 154 | elif self.optim_type == 'adam': 155 | self.optimizer = tf.train.AdamOptimizer(self.lr) 156 | elif self.optim_type == 'rprop': 157 | self.optimizer = tf.train.RMSPropOptimizer(self.lr) 158 | elif self.optim_type == 'sgd': 159 | self.optimizer = tf.train.GradientDescentOptimizer(self.lr) 160 | else: 161 | raise NotImplementedError('Unsupported optimizer: {}'.format(self.optim_type)) 162 | self.train_op = self.optimizer.minimize(self.loss) 163 | -------------------------------------------------------------------------------- /R-Net/S_prepro.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import pickle as pkl 3 | import os 4 | from tqdm import tqdm 5 | import ujson as json 6 | from collections import Counter 7 | import numpy as np 8 | 9 | 10 | def process_file(filenames, data_type, max_p_len=500): 11 | print("Generating {} examples...".format(data_type)) 12 | is_train = False 13 | if data_type == 'train': 14 | is_train = True 15 | examples = [] 16 | eval_examples = {} 17 | total = 0 18 | for filename in filenames: 19 | with open(filename, 'r', encoding='utf8') as fh: 20 | for line in fh: 21 | source = json.loads(line.strip()) 22 | if is_train: 23 | if len(source['answer_spans']) == 0: 24 | continue 25 | if source['answer_spans'][0][1] >= max_p_len: 26 | continue 27 | total += 1 28 | answers = [] 29 | if 'answer_docs' in source: 30 | del source['fake_answers'] 31 | del source['segmented_answers'] 32 | answers = source['answers'] 33 | question_tokens = source['segmented_question'] 34 | passages = [] 35 | passages_len = [] 36 | start, end, answer_passages = 0, 0, 0 37 | if 'answer_docs' in source and len(source['answer_docs']): 38 | start = source['answer_spans'][0][0] 39 | end = source['answer_spans'][0][1] 40 | answer_passages = source['answer_docs'][0] 41 | for idx, doc in enumerate(source['documents']): 42 | del doc['paragraphs'] 43 | para_len = 0 44 | if is_train: 45 | para_len = min(len(doc['segmented_paragraphs'][doc['most_related_para']]), max_p_len) 46 | passages += doc['segmented_paragraphs'][doc['most_related_para']][:para_len] 47 | else: 48 | para_infos = [] 49 | for para_tokens in doc['segmented_paragraphs']: 50 | # para_tokens 每篇文档分词后的段落,question_tokens 问题分词 51 | common_with_question = Counter(para_tokens) & Counter(question_tokens) 52 | correct_preds = sum(common_with_question.values()) 53 | if correct_preds == 0: 54 | recall_wrt_question = 0 55 | else: 56 | recall_wrt_question = float(correct_preds) / len(question_tokens) 57 | para_infos.append((para_tokens, recall_wrt_question, len(para_tokens))) 58 | # 排序 选出与question匹配recall最高的para_tokens 59 | para_infos.sort(key=lambda x: (-x[1], x[2])) 60 | fake_passage_tokens = [] 61 | for para_info in para_infos[:1]: 62 | fake_passage_tokens += para_info[0] 63 | para_len = min(len(fake_passage_tokens), max_p_len) 64 | passages += fake_passage_tokens[:para_len] 65 | if idx < answer_passages: 66 | start += para_len 67 | end += para_len 68 | passages_len.append(para_len) 69 | example = {'passages': passages, 70 | 
'question_tokens': question_tokens, 71 | 'answer_passages': answer_passages, 72 | 'start_id': start, 73 | 'end_id': end, 74 | 'id': total} 75 | if not is_train: 76 | eval_examples[str(total)] = {'passages': passages, 77 | 'passages_len': passages_len, 78 | 'answers': answers, 79 | 'answer_passages': answer_passages, 80 | 'question': source['segmented_question'], 81 | 'question_id': source['question_id'], 82 | 'question_type': source['question_type']} 83 | examples.append(example) 84 | # random.shuffle(examples) 85 | print("{} questions in total".format(len(examples))) 86 | return examples, eval_examples 87 | 88 | 89 | def get_embedding(data_type, emb_file=None, vec_size=None, token2id_dict=None): 90 | print("Generating {} embedding...".format(data_type)) 91 | filtered_tokens = {} 92 | if emb_file is not None: 93 | assert vec_size is not None 94 | with open(emb_file, 'rb') as fin: 95 | trained_embeddings = pkl.load(fin) 96 | fin.close() 97 | filtered_tokens = trained_embeddings.keys() 98 | 99 | NULL = "" 100 | OOV = "" 101 | # token2id 102 | token2id = {token: idx for idx, token in 103 | enumerate(filtered_tokens, 2)} if token2id_dict is None else token2id_dict 104 | id2token = {idx: token for idx, token in enumerate(filtered_tokens, 2)} 105 | token2id[NULL] = 0 106 | token2id[OOV] = 1 107 | id2token['0'] = NULL 108 | id2token['1'] = OOV 109 | embedding_mat = np.zeros([len(token2id), vec_size]) 110 | # idx2emb = {idx: embedding_mat[token] for token, idx in token2id.items()} 111 | # embedding_mat = [idx2emb[idx] for idx in range(len(idx2emb))] 112 | for token in filtered_tokens: 113 | # if token in trained_embeddings: 114 | embedding_mat[token2id[token]] = trained_embeddings[token] 115 | return embedding_mat, token2id, id2token 116 | 117 | 118 | def build_features(config, examples, data_type, out_file, word2id): 119 | para_limit = config.max_p_len 120 | ques_limit = config.max_q_len 121 | 122 | print("Processing {} examples...".format(data_type)) 123 | writer = tf.python_io.TFRecordWriter(out_file) 124 | total = 0 125 | meta = {} 126 | for example in tqdm(examples): 127 | total += 1 128 | passages_token_ids = np.zeros([config.max_p_num * para_limit], dtype=np.int32) 129 | question_token_ids = np.zeros([ques_limit], dtype=np.int32) 130 | 131 | def _get_word(word): 132 | for each in (word, word.lower(), word.capitalize(), word.upper()): 133 | if each in word2id: 134 | return word2id[each] 135 | return 1 136 | 137 | # passages token转id 138 | idx = 0 139 | for pdx, passage_token in enumerate(example['passages']): 140 | passages_token_ids[pdx] = _get_word(passage_token) 141 | # 问题token转id 142 | question_token_num = min(len(example['question_tokens']), ques_limit) 143 | for i in range(question_token_num): 144 | question_token_ids[i] = _get_word(example['question_tokens'][i]) 145 | 146 | record = tf.train.Example(features=tf.train.Features(feature={ 147 | "passages_token_ids": tf.train.Feature( 148 | bytes_list=tf.train.BytesList(value=[passages_token_ids.tostring()])), 149 | "question_token_ids": tf.train.Feature( 150 | bytes_list=tf.train.BytesList(value=[question_token_ids.tostring()])), 151 | "start_id": tf.train.Feature(int64_list=tf.train.Int64List(value=[example['start_id']])), 152 | "end_id": tf.train.Feature(int64_list=tf.train.Int64List(value=[example['end_id']])), 153 | "id": tf.train.Feature(int64_list=tf.train.Int64List(value=[example["id"]])) 154 | })) 155 | writer.write(record.SerializeToString()) 156 | print("Build {} instances of features in total".format(total)) 157 | 
meta["total"] = total 158 | writer.close() 159 | return meta 160 | 161 | 162 | def save(filename, obj, message=None): 163 | if message is not None: 164 | print("Saving {}...".format(message)) 165 | with open(filename, "w") as fh: 166 | json.dump(obj, fh) 167 | 168 | 169 | def prepro(config, flags): 170 | token2id = None 171 | if os.path.isfile(flags.token2id_file): 172 | with open(flags.token2id_file, 'r') as fh: 173 | token2id = json.load(fh) 174 | 175 | train_examples, train_eval = process_file(config.train_files, "train", config.max_p_len) 176 | # 创建train TFRecord文件 177 | train_meta = build_features(config, train_examples, "train", flags.train_record_file, token2id) 178 | save(flags.train_eval_file, train_eval, message="train eval") 179 | save(flags.train_meta, train_meta, message="dev meta") 180 | del train_examples, train_eval, train_meta 181 | 182 | dev_examples, dev_eval = process_file(config.dev_files, "dev", config.max_p_len) 183 | # 创建dev TFRecord文件 184 | dev_meta = build_features(config, dev_examples, "dev", flags.dev_record_file, token2id) 185 | save(flags.dev_eval_file, dev_eval, message="dev eval") 186 | save(flags.dev_meta, dev_meta, message="dev meta") 187 | del dev_examples, dev_eval, dev_meta 188 | 189 | test_examples, test_eval = process_file(config.test_files, "test", config.max_p_len) 190 | # # 创建test TFRecord文件 191 | test_meta = build_features(config, test_examples, "test", flags.test_record_file, token2id) 192 | save(flags.test_eval_file, test_eval, message="test eval") 193 | save(flags.test_meta, test_meta, message="test meta") 194 | del test_examples, test_eval, test_meta 195 | 196 | # save(flags.token2id_file, token2id, message="word2idx") 197 | 198 | # def draw_hist(x, bins, label): 199 | # plt.hist(x=x, bins=bins) 200 | # plt.xlabel(label) 201 | # plt.ylabel('Num') 202 | # plt.show() 203 | -------------------------------------------------------------------------------- /R-Net/S_util.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import os 4 | import ujson as json 5 | import mrc_eval 6 | from bleu import BLEUWithBonus 7 | from rouge import RougeLWithBonus 8 | 9 | 10 | def get_record_parser(config): 11 | def parse(example): 12 | features = tf.parse_single_example(example, 13 | features={ 14 | 'passages_token_ids': tf.FixedLenFeature([], tf.string), 15 | 'question_token_ids': tf.FixedLenFeature([], tf.string), 16 | 'start_id': tf.FixedLenFeature([], tf.int64), 17 | 'end_id': tf.FixedLenFeature([], tf.int64), 18 | 'id': tf.FixedLenFeature([], tf.int64) 19 | }) 20 | passages_token_ids = tf.reshape(tf.decode_raw(features["passages_token_ids"], tf.int32), 21 | [config.max_p_num * config.max_p_len]) 22 | question_token_ids = tf.reshape(tf.decode_raw(features["question_token_ids"], tf.int32), 23 | [config.max_q_len]) 24 | start_id = features['start_id'] 25 | end_id = features['end_id'] 26 | qa_id = features['id'] 27 | return passages_token_ids, question_token_ids, start_id, end_id, qa_id 28 | 29 | return parse 30 | 31 | 32 | def get_batch_dataset(record_file, parser, config): 33 | num_threads = tf.constant(config.num_threads, dtype=tf.int32) 34 | dataset = tf.data.TFRecordDataset(record_file).map(parser, num_parallel_calls=num_threads).shuffle( 35 | config.capacity).batch(config.batch_size).repeat(config.epochs) 36 | # if config.is_bucket: 37 | # buckets = [tf.constant(num) for num in range(*config.bucket_range)] 38 | # 39 | # def key_func(context_idxs, ques_idxs, 
context_char_idxs, ques_char_idxs, y1, y2, qa_id): 40 | # c_len = tf.reduce_sum( 41 | # tf.cast(tf.cast(context_idxs, tf.bool), tf.int32)) 42 | # buckets_min = [np.iinfo(np.int32).min] + buckets 43 | # buckets_max = buckets + [np.iinfo(np.int32).max] 44 | # conditions_c = tf.logical_and( 45 | # tf.less(buckets_min, c_len), tf.less_equal(c_len, buckets_max)) 46 | # bucket_id = tf.reduce_min(tf.where(conditions_c)) 47 | # return bucket_id 48 | # 49 | # def reduce_func(key, elements): 50 | # return elements.batch(config.batch_size) 51 | # 52 | # dataset = dataset.apply( 53 | # tf.contrib.data.group_by_window(key_func, reduce_func, window_size=5 * config.batch_size)).shuffle( 54 | # len(buckets) * 25) 55 | # else: 56 | return dataset 57 | 58 | 59 | def get_dataset(record_file, parser, config): 60 | num_threads = tf.constant(config.num_threads, dtype=tf.int32) 61 | dataset = tf.data.TFRecordDataset(record_file).map( 62 | parser, num_parallel_calls=num_threads).batch(config.batch_size).repeat() 63 | return dataset 64 | 65 | 66 | def evaluate_batch(model, num_batches, eval_file, sess, data_type, handle, str_handle, args, logger, result_prefix=None): 67 | losses = [] 68 | pred_answers, ref_answers = [], [] 69 | padded_p_len = args.max_p_len 70 | for i in range(num_batches): 71 | qa_id, loss, start_probs, end_probs = sess.run([model.qa_id, model.loss, model.logits1, model.logits2], 72 | feed_dict={handle: str_handle} if handle is not None else None) 73 | losses.append(loss) 74 | start, end = 0, 0 75 | for id, start_prob, end_prob in zip(qa_id, start_probs, end_probs): 76 | best_p_idx, best_span, best_score = None, None, 0 77 | sample = eval_file[str(id)] 78 | for p_idx, passage_len in enumerate(sample['passages_len']): 79 | if p_idx >= args.max_p_num: 80 | continue 81 | # 为每个passage找到best answer 82 | end = start + passage_len 83 | answer_span, score = find_best_answer_for_passage(start_prob[start: end], end_prob[start: end], 84 | passage_len, args.max_a_len) 85 | answer_span[0] += start 86 | answer_span[1] += start 87 | # 各passage间最大score 88 | if score > best_score: 89 | best_score = score 90 | best_p_idx = p_idx 91 | best_span = answer_span 92 | end = start 93 | # best_span = [start_prob, end_prob] 94 | # best_answer = sample['passages'][best_span[0]: best_span[1] + 1] 95 | # 根据span找到token 96 | if best_p_idx is None or best_span is None: 97 | best_answer = '' 98 | else: 99 | best_answer = ''.join(sample['passages'][best_span[0]: best_span[1] + 1]) 100 | # TODO 加入question tokens 101 | pred_answers.append({'question_id': sample['question_id'], 102 | 'question_type': sample['question_type'], 103 | 'answers': [best_answer], 104 | 'yesno_answers': []}) 105 | # 标准答案 106 | # if 'answers' in sample and len(sample['answers']) > 0: 107 | if 'answers' in sample: 108 | ref_answers.append({'question_id': sample['question_id'], 109 | 'question_type': sample['question_type'], 110 | 'answers': sample['answers'], 111 | 'yesno_answers': []}) 112 | 113 | if result_prefix is not None: 114 | result_file = os.path.join(args.result_dir, result_prefix + '.json') 115 | with open(result_file, 'w') as fout: 116 | for pred_answer in pred_answers: 117 | fout.write(json.dumps(pred_answer, ensure_ascii=False) + '\n') 118 | logger.info('Saving {} results to {}'.format(result_prefix, result_file)) 119 | 120 | avg_loss = np.mean(losses) 121 | bleu4, rouge_l = 0, 0 122 | if len(ref_answers) > 0: 123 | # K-V 问题ID-答案 124 | pred_dict, ref_dict, bleu_rouge = {}, {}, {} 125 | for pred, ref in zip(pred_answers, ref_answers): 126 | 
question_id = ref['question_id'] 127 | if len(ref['answers']) > 0: 128 | # 将answer tokens转换为由空格连接的一句话 129 | pred_dict[question_id] = {'answers': mrc_eval.normalize(pred['answers']), 130 | 'yesno_answers': []} 131 | ref_dict[question_id] = {'question_type': ref['question_type'], 132 | 'answers': mrc_eval.normalize(ref['answers']), 133 | 'yesno_answers': []} 134 | bleu_eval = BLEUWithBonus(4, alpha=1.0, beta=1.0) 135 | rouge_eval = RougeLWithBonus(alpha=1.0, beta=1.0, gamma=1.2) 136 | bleu4, rouge_l = mrc_eval.calc_metrics(pred_dict, 137 | ref_dict, 138 | bleu_eval, 139 | rouge_eval) 140 | bleu_rouge['Bleu-4'] = bleu4 141 | bleu_rouge['Rouge-L'] = rouge_l 142 | else: 143 | bleu_rouge = None 144 | 145 | loss_sum = tf.Summary(value=[tf.Summary.Value(tag="{}/loss".format(data_type), simple_value=avg_loss), ]) 146 | bleu_sum = tf.Summary(value=[tf.Summary.Value(tag="{}/f1".format(data_type), simple_value=bleu4), ]) 147 | rouge_sum = tf.Summary(value=[tf.Summary.Value(tag="{}/em".format(data_type), simple_value=rouge_l), ]) 148 | return avg_loss, bleu_rouge, [loss_sum, bleu_sum, rouge_sum] 149 | 150 | 151 | def find_best_answer_for_passage(start_probs, end_probs, passage_len=None, max_a_len=None): 152 | """ 153 | Finds the best answer with the maximum start_prob * end_prob from a single passage 154 | """ 155 | if passage_len is None: 156 | passage_len = len(start_probs) 157 | else: 158 | passage_len = min(len(start_probs), passage_len) 159 | best_start, best_end, max_prob = -1, -1, 0 160 | # 从头扫描passage 161 | for start_idx in range(passage_len): 162 | for ans_len in range(max_a_len): 163 | end_idx = start_idx + ans_len 164 | if end_idx >= passage_len: 165 | continue 166 | prob = start_probs[start_idx] * end_probs[end_idx] 167 | if prob > max_prob: 168 | best_start = start_idx 169 | best_end = end_idx 170 | max_prob = prob 171 | return [best_start, best_end], max_prob 172 | -------------------------------------------------------------------------------- /R-Net/bleu.py: -------------------------------------------------------------------------------- 1 | # coding:utf8 2 | 3 | import math 4 | 5 | import common 6 | 7 | 8 | class BLEU(object): 9 | def __init__(self, n_size): 10 | self.match_ngram = {} 11 | self.candi_ngram = {} 12 | self.bp_r = 0 13 | self.bp_c = 0 14 | self.n_size = n_size 15 | 16 | def add_inst(self, cand, ref_list): 17 | for n_size in range(self.n_size): 18 | self.count_ngram(cand, ref_list, n_size) 19 | self.count_bp(cand, ref_list) 20 | 21 | def count_ngram(self, cand, ref_list, n_size): 22 | cand_ngram = common.get_ngram(cand, n_size) 23 | refs_ngram = [] 24 | for ref in ref_list: 25 | refs_ngram.append(common.get_ngram(ref, n_size)) 26 | if n_size not in self.match_ngram: 27 | self.match_ngram[n_size] = 0 28 | self.candi_ngram[n_size] = 0 29 | match_size, cand_size = common.get_match_size(cand_ngram, refs_ngram) 30 | self.match_ngram[n_size] += match_size 31 | self.candi_ngram[n_size] += cand_size 32 | 33 | def count_bp(self, cand, ref_list): 34 | self.bp_c += len(cand) 35 | self.bp_r += min([ 36 | (abs(len(cand) - len(ref)), len(ref)) 37 | for ref in ref_list] 38 | )[1] 39 | 40 | def score(self): 41 | prob_list = [] 42 | for n_size in range(self.n_size): 43 | if float(self.candi_ngram[n_size]) == 0: 44 | prob_list.append(0) 45 | else: 46 | prob_list.append(self.match_ngram[n_size] / float(self.candi_ngram[n_size])) 47 | # prob_list = [ 48 | # self.match_ngram[n_size] / float(self.candi_ngram[n_size]) 49 | # for n_size in range(self.n_size) 50 | # ] 51 | bleu_list = 
[prob_list[0]] 52 | for n in range(1, self.n_size): 53 | bleu_list.append(bleu_list[-1] * prob_list[n]) 54 | for n in range(self.n_size): 55 | bleu_list[n] = bleu_list[n] ** (1. / float(n + 1)) 56 | bp = math.exp(min(1 - self.bp_r / float(self.bp_c), 0)) 57 | for n in range(self.n_size): 58 | bleu_list[n] = bleu_list[n] * bp 59 | return bleu_list 60 | 61 | 62 | class BLEUWithBonus(BLEU): 63 | def __init__(self, n_size, alpha=1.0, beta=1.0): 64 | super(BLEUWithBonus, self).__init__(n_size) 65 | self.alpha = alpha 66 | self.beta = beta 67 | 68 | def add_inst(self, 69 | cand, 70 | ref_list, 71 | yn_label=None, yn_ref=None, entity_ref=None): 72 | # super(BLEUWithBonus, self).add_inst(cand, ref_list) 73 | BLEU.add_inst(self, cand, ref_list) 74 | if yn_label is not None and yn_ref is not None: 75 | self.add_yn_bonus(cand, ref_list, yn_label, yn_ref) 76 | elif entity_ref is not None: 77 | self.add_entity_bonus(cand, entity_ref) 78 | 79 | def add_yn_bonus(self, cand, ref_list, yn_label, yn_ref): 80 | for n_size in range(self.n_size): 81 | cand_ngram = common.get_ngram(cand, n_size, label=yn_label) 82 | ref_ngram = [] 83 | for ref_id, r in enumerate(yn_ref): 84 | ref_ngram.append(common.get_ngram(ref_list[ref_id], n_size, label=r)) 85 | match_size, cand_size = common.get_match_size(cand_ngram, ref_ngram) 86 | self.match_ngram[n_size] += self.alpha * match_size 87 | self.candi_ngram[n_size] += self.alpha * match_size 88 | 89 | def add_entity_bonus(self, cand, entity_ref): 90 | for n_size in range(self.n_size): 91 | cand_ngram = common.get_ngram(cand, n_size, label='ENTITY') 92 | ref_ngram = [] 93 | for reff_id, r in enumerate(entity_ref): 94 | ref_ngram.append(common.get_ngram(r, n_size, label='ENTITY')) 95 | match_size, cand_size = common.get_match_size(cand_ngram, ref_ngram) 96 | self.match_ngram[n_size] += self.beta * match_size 97 | self.candi_ngram[n_size] += self.beta * match_size 98 | -------------------------------------------------------------------------------- /R-Net/common.py: -------------------------------------------------------------------------------- 1 | # coding:utf8 2 | from functools import reduce 3 | import math 4 | import ujson as json 5 | from collections import defaultdict 6 | import sys 7 | 8 | 9 | def get_match_size(cand_ngram, refs_ngram): 10 | ref_set = defaultdict(int) 11 | for ref_ngram in refs_ngram: 12 | tmp_ref_set = defaultdict(int) 13 | for ngram in ref_ngram: 14 | tmp_ref_set[ngram] += 1 15 | for ngram, count in tmp_ref_set.items(): 16 | ref_set[ngram] = max(ref_set[ngram], count) 17 | cand_set = defaultdict(int) 18 | for ngram in cand_ngram: 19 | cand_set[ngram] += 1 20 | match_size = 0 21 | for ngram, count in cand_set.items(): 22 | match_size += min(count, ref_set.get(ngram, 0)) 23 | cand_size = len(cand_ngram) 24 | return match_size, cand_size 25 | 26 | 27 | def get_ngram(sent, n_size, label=None): 28 | def _ngram(sent, n_size): 29 | ngram_list = [] 30 | for left in range(len(sent) - n_size): 31 | ngram_list.append(sent[left: left + n_size + 1]) 32 | return ngram_list 33 | 34 | ngram_list = _ngram(sent, n_size) 35 | if label is not None: 36 | ngram_list = [ngram + '_' + label for ngram in ngram_list] 37 | return ngram_list 38 | 39 | 40 | def word2char(str_in): 41 | str_out = str_in.replace(' ', '') 42 | return ''.join(str_out.split()) 43 | -------------------------------------------------------------------------------- /R-Net/mrc_eval.py: -------------------------------------------------------------------------------- 1 | # coding:utf8 2 | """ 3 | This module 
computes evaluation metrics for DuReader dataset. 4 | """ 5 | 6 | import argparse 7 | import itertools 8 | import ujson as json 9 | import zipfile 10 | from bleu import BLEUWithBonus 11 | from rouge import RougeLWithBonus 12 | 13 | EMPTY = '' 14 | YESNO_LABELS = set(['Yes', 'No', 'Depends']) 15 | 16 | 17 | def normalize(s): 18 | """ 19 | Normalize strings to space joined chars. 20 | Args: 21 | s: a list of strings. 22 | Returns: 23 | A list of normalized strings. 24 | """ 25 | if not s: 26 | return s 27 | normalized = [] 28 | for ss in s: 29 | tokens = [c for c in list(ss) if len(c.strip()) != 0] 30 | normalized.append(''.join(tokens)) 31 | return normalized 32 | 33 | 34 | def data_check(obj): 35 | """ 36 | Check data. 37 | 38 | Raises: 39 | Raises AssertionError when data is not legal. 40 | """ 41 | # 判断是否有answer_id 42 | assert 'question_id' in obj, "Missing 'question_id' field." 43 | # assert 'yesno_answers' in obj, \ 44 | # "Missing 'yesno_answers' field. question_id: {}".format(obj['question_id']) 45 | # 如果包含yesno_answers,那么格式必须为list 46 | if "yesno_answers" in obj: 47 | assert isinstance(obj['yesno_answers'], list), \ 48 | r"""'yesno_answers' field must be a list, if the 'question_type' is not 49 | 'YES_NO', then this field should be an empty list. 50 | question_id: {}""".format(obj['question_id']) 51 | else: 52 | obj["yesno_answers"] = [] 53 | if "entity_answers" not in obj: 54 | obj["entity_answers"] = [] 55 | 56 | 57 | def read_file(file_name, is_ref=False): 58 | """ 59 | Read predict answers or reference answers from file. 60 | 61 | Args: 62 | file_name: the name of the file containing predict result or reference 63 | result. 64 | 65 | Returns: 66 | A dictionary mapping question_id to the result information. The result 67 | information itself is also a dictionary with has four keys: 68 | - question_type: type of the query. 69 | - yesno_answers: A list of yesno answers corresponding to 'answers'. 70 | - answers: A list of predicted answers. 71 | - entity_answers: A list, each element is also a list containing the entities 72 | tagged out from the corresponding answer string. 73 | """ 74 | 75 | def _open(file_name, mode, zip_obj=None): 76 | if zip_obj is not None: 77 | return zip_obj.open(file_name, mode) 78 | return open(file_name, mode) 79 | 80 | results = {} 81 | # 是否是参考答案 82 | if is_ref: 83 | keys = ['source', 'answers', 'yesno_answers', 'entity_answers', 'question_type'] 84 | else: 85 | keys = ['answers', 'yesno_answers'] 86 | # 如果是zip文件则以zip方式读取 87 | zf = zipfile.ZipFile(file_name, 'r') if file_name.endswith('.zip') else None 88 | # zip包中文件列表 89 | file_list = [file_name] if zf is None else zf.namelist() 90 | 91 | for fn in file_list: 92 | for line in _open(fn, 'r', zip_obj=zf): 93 | try: 94 | obj = json.loads(line.strip()) 95 | except ValueError: 96 | raise ValueError("Every line of data should be legal json") 97 | data_check(obj) 98 | qid = obj['question_id'] 99 | # 必须有question id 100 | assert qid not in results, "Duplicate question_id: {}".format(qid) 101 | results[qid] = {} 102 | for k in keys: 103 | if k == 'answers': 104 | results[qid][k] = normalize(obj[k]) 105 | else: 106 | results[qid][k] = obj[k] 107 | if is_ref: 108 | for i, e in enumerate(results[qid]['entity_answers']): 109 | results[qid]['entity_answers'][i] = normalize(e) 110 | return results 111 | 112 | 113 | def calc_metrics(pred_result, ref_result, bleu_eval, rouge_eval): 114 | """Computes bleu-4 and rouge-l. 
115 | 116 | Args: 117 | - pred_result: Refer to the returned dict of `read_file` with 118 | 'is_ref=False'. 119 | - ref_result: Refer to the returned dict of `ref_file` with 120 | 'is_ref=True'. 121 | - bleu_result: A BleuWithBonus object. 122 | - rouge_result: A RougeLWithBonus object. 123 | Returns: 124 | bleu-4 and rouge-l values as a tuple of float values. 125 | """ 126 | for qid, results in ref_result.items(): 127 | # 根据question id从预测结果中选择答案 128 | cand_result = pred_result.get(qid, {}) 129 | pred_answers = cand_result.get('answers', []) 130 | if not pred_answers: 131 | pred_answers = EMPTY 132 | else: 133 | pred_answers = pred_answers[0] 134 | pred_yn_label = None 135 | ref_entities = None 136 | ref_answers = results.get('answers', []) 137 | if not ref_answers: 138 | continue 139 | if results['question_type'] == 'ENTITY': 140 | ref_entities = set( 141 | itertools.chain(*results.get('entity_answers', [[]]))) 142 | if not ref_entities: 143 | ref_entities = None 144 | if results['question_type'] == 'YES_NO': 145 | cand_yesno = cand_result.get('yesno_answers', []) 146 | pred_yn_label = None if len(cand_yesno) == 0 \ 147 | else cand_yesno[0] 148 | bleu_eval.add_inst( 149 | pred_answers, 150 | ref_answers, 151 | yn_label=pred_yn_label, 152 | yn_ref=results['yesno_answers'], 153 | entity_ref=ref_entities) 154 | rouge_eval.add_inst( 155 | pred_answers, 156 | ref_answers, 157 | yn_label=pred_yn_label, 158 | yn_ref=results['yesno_answers'], 159 | entity_ref=ref_entities) 160 | bleu4 = bleu_eval.score()[-1] 161 | rouge_l = rouge_eval.score() 162 | return bleu4, rouge_l 163 | 164 | 165 | def main(args): 166 | err = None 167 | metrics = {} 168 | bleu4, rouge_l = 0.0, 0.0 169 | alpha = args.alpha # default 1.0 170 | beta = args.beta # default 1.0 171 | bleu_eval = BLEUWithBonus(4, alpha=alpha, beta=beta) 172 | rouge_eval = RougeLWithBonus(alpha=alpha, beta=beta, gamma=1.2) 173 | # 载入answer文件 格式dict question_id: {answers:[], yesno_answers:[]} 174 | pred_result = read_file(args.pred_file) 175 | ref_result = read_file(args.ref_file, is_ref=True) 176 | bleu4, rouge_l = calc_metrics(pred_result, 177 | ref_result, 178 | bleu_eval, 179 | rouge_eval) 180 | metrics = { 181 | 'ROUGE-L': round(rouge_l * 100, 2), 182 | 'BLEU-4': round(bleu4 * 100, 2), 183 | } 184 | print(json.dumps(metrics, ensure_ascii=False).encode('utf8')) 185 | 186 | 187 | if __name__ == '__main__': 188 | parser = argparse.ArgumentParser() 189 | parser.add_argument('--pred_file', help='predict file') 190 | parser.add_argument('--ref_file', help='reference file') 191 | parser.add_argument('--alpha', type=float, default=1.0, 192 | help='common value of alpha') 193 | parser.add_argument('--beta', type=float, default=1.0, 194 | help='common value of beta') 195 | args = parser.parse_args() 196 | main(args) 197 | -------------------------------------------------------------------------------- /R-Net/rouge.py: -------------------------------------------------------------------------------- 1 | # coding:utf8 2 | 3 | from functools import reduce 4 | import math 5 | import json 6 | import numpy as np 7 | from collections import defaultdict 8 | import sys 9 | 10 | # reload(sys) 11 | # sys.setdefaultencoding("utf-8") 12 | 13 | 14 | class RougeLWithBonus(object): 15 | def __init__(self, alpha=1.0, beta=1.0, gamma=1.2): 16 | self.alpha = alpha 17 | self.beta = beta 18 | self.gamma = gamma 19 | self.inst_scores = [] 20 | 21 | def lcs(self, string, sub): 22 | if len(string) < len(sub): 23 | sub, string = string, sub 24 | lengths = np.zeros((len(string) + 
1, len(sub) + 1))
25 |         for j in range(1, len(sub) + 1):
26 |             for i in range(1, len(string) + 1):
27 |                 if string[i - 1] == sub[j - 1]:
28 |                     lengths[i][j] = lengths[i - 1][j - 1] + 1
29 |                 else:
30 |                     lengths[i][j] = max(lengths[i - 1][j], lengths[i][j - 1])
31 |         return lengths[len(string)][len(sub)]
32 | 
33 |     def add_inst(self,
34 |                  cand,
35 |                  ref_list,
36 |                  yn_label=None, yn_ref=None, entity_ref=None):
37 |         precs, recalls = [], []
38 |         for i, ref in enumerate(ref_list):
39 |             basic_lcs = self.lcs(cand, ref)
40 |             yn_bonus, entity_bonus = 0.0, 0.0
41 |             if yn_ref is not None and yn_label is not None:
42 |                 yn_bonus = self.add_yn_bonus(cand, ref, yn_label, yn_ref[i])
43 |             elif entity_ref is not None:
44 |                 entity_bonus = self.add_entity_bonus(cand, entity_ref)
45 |             p_denom = len(cand) + self.alpha * yn_bonus + self.beta * entity_bonus
46 |             r_denom = len(ref) + self.alpha * yn_bonus + self.beta * entity_bonus
47 |             prec = (basic_lcs + self.alpha * yn_bonus + self.beta * entity_bonus) \
48 |                 / p_denom if p_denom > 0. else 0.
49 |             rec = (basic_lcs + self.alpha * yn_bonus + self.beta * entity_bonus) \
50 |                 / r_denom if r_denom > 0. else 0.
51 |             precs.append(prec)
52 |             recalls.append(rec)
53 | 
54 |         prec_max = max(precs)
55 |         rec_max = max(recalls)
56 |         if prec_max != 0 and rec_max != 0:
57 |             score = ((1 + self.gamma ** 2) * prec_max * rec_max) / \
58 |                 float(rec_max + self.gamma ** 2 * prec_max)
59 |         else:
60 |             score = 0.0
61 |         self.inst_scores.append(score)
62 | 
63 |     def add_yn_bonus(self, cand, ref, yn_label, yn_ref):
64 |         if yn_label != yn_ref:
65 |             return 0.0
66 |         lcs_ = self.lcs(cand, ref)
67 |         return lcs_
68 | 
69 |     def add_entity_bonus(self, cand, entity_ref):
70 |         lcs_ = 0.0
71 |         for ent in entity_ref:
72 |             if ent in cand:
73 |                 lcs_ += len(ent)
74 |         return lcs_
75 | 
76 |     def score(self):
77 |         return 1. * sum(self.inst_scores) / len(self.inst_scores)
78 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # MRC2018
2 | - 2018 Machine Reading Comprehension Technology Competition [competition site](http://mrc2018.cipsc.org.cn/)
3 | - Competition model: BiDAF+Self Attention+Pre (single model)
4 | - Final ranking: 28/105 (first-time participant)
5 | 
6 | ## Recent Updates
7 | - Update 2018/09/11: the AAAI deadline rush is over -- thanks for all the stars.
8 |   1. basic_rnn.py now supports multi-layer RNNCells (in TensorFlow, single-layer and multi-layer RNNs are built quite differently; see the sketch right after this list) and adds the recent SRU and IndyRNN cells
9 |   2. rc_model.py replaces Adam with the faster LazyAdam, which requires tf >= 1.9
10 | - Update 2018/08/20: a quick outline of the training and modification workflow (using BiDAF+Self Attention as the example; a detailed PDF write-up will follow):
11 |   1. /dureader/run.py --prepare (data preprocessing), --train (training and prediction)
12 |   2. /dureader/rc_model.py the model itself (start here if you want to modify it)
13 |   3. /dureader/layers the individual layers (pointer net, match layer, cuDNN RNN)
14 |   4. /dureader/json_to_sentence.py extracts the raw text from the original JSON files
15 |   5. /dureader/pretrain_embedding.py pre-trains the word embeddings
16 |   6. /dureader/SIF.py follows the paper "A Simple but Tough-to-Beat Baseline for Sentence Embeddings", but it did not help much
17 |   7. /utils the evaluation metrics
18 | - Update 2018/08/20: the best score was obtained with the [default hyperparameters](https://github.com/shiningliang/MRC2018/blob/master/BiDAF%2BSelf%20Attention/dureader/run.py#L22) in BiDAF+Self Attention/run.py
19 | - Update 2018/08/06: at the award ceremony held at the [Language and Intelligence Summit](http://www.cipsc.org.cn/lis2018/index.html), the large gains of the top teams came mostly from up-front feature engineering rather than from novel models; if the top teams' technical reports become available, they will be posted here
20 | - Update 2018/08/06: Baidu has released the full dataset (the links in the dataset table below have been updated), the competition results will also be posted, and the leaderboard stays open for new submissions. The organizers announced that the competition will be held again next year.
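To make the two notes above concrete (multi-layer RNNCell construction and the LazyAdam swap), here is a minimal TF 1.x sketch. It is an illustration only, not the actual code in basic_rnn.py or rc_model.py, and the names (`stacked_gru`, `hidden_size`, the dummy placeholders) are invented for this example:

```python
import tensorflow as tf

def stacked_gru(inputs, seq_len, hidden_size, num_layers, keep_prob=1.0):
    """Multi-layer GRU. A single cell can be handed to dynamic_rnn directly,
    but a multi-layer RNN needs one fresh cell per layer wrapped in
    MultiRNNCell -- reusing the same cell object across layers does not work."""
    def make_cell():
        cell = tf.nn.rnn_cell.GRUCell(hidden_size)
        return tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=keep_prob)

    multi_cell = tf.nn.rnn_cell.MultiRNNCell([make_cell() for _ in range(num_layers)])
    outputs, state = tf.nn.dynamic_rnn(multi_cell, inputs, sequence_length=seq_len,
                                       dtype=tf.float32)
    return outputs, state

# Dummy usage: a batch of 100-step, 300-dim embedded sequences.
x = tf.placeholder(tf.float32, [None, 100, 300])
lens = tf.placeholder(tf.int32, [None])
enc, _ = stacked_gru(x, lens, hidden_size=150, num_layers=2, keep_prob=0.7)

# LazyAdam (tf.contrib.opt, TF >= 1.9) applies lazy sparse updates, i.e. it only
# updates optimizer slots for the embedding rows seen in the current batch,
# which is what makes it faster than plain Adam here:
# train_op = tf.contrib.opt.LazyAdamOptimizer(learning_rate=1e-3).minimize(loss)
```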
21 | 
22 | ## Reference Models
23 | - [R-Net](https://www.microsoft.com/en-us/research/wp-content/uploads/2017/05/r-net.pdf)
24 | - [BiDAF](https://allenai.github.io/bi-att-flow/)
25 | 
26 | ## Reference Code
27 | - [HKUST](https://github.com/HKUST-KnowComp/R-Net)
28 | - [DuReader](https://github.com/baidu/DuReader)
29 | 
30 | ## Requirements
31 | ### General
32 | - Python >= 3.4
33 | - numpy
34 | 
35 | ### Python Packages
36 | - tensorflow-gpu >= 1.9.0
37 | - ujson
38 | - pickle
39 | - tqdm
40 | 
41 | ### Data
42 | 
43 | Type | train | dev | test
44 | ---|---|---|---
45 | [Competition](http://ai.baidu.com/broad/download?dataset=dureader) | 270K | 10K | 20K
46 | [Open release](http://ai.baidu.com/broad/download) | 200K | 10K | 10K
47 | 
48 | ## Performance
49 | ### Score (Public Board)
50 | 
51 | Model | Rouge-L | Bleu-4
52 | ---|---|---
53 | BiDAF (cuDNN based) | 46.56 | 40.95
54 | R-Net | 42.09 | 41.1
55 | BiDAF+Self Attention | 47.28 | 41.3
56 | BiDAF+Self Attention+Gated RNN | 47.71 | 41.75
57 | 
58 | ### Memory and Time
59 | i7-7700k + 32G RAM + GTX1080Ti
60 | batch size = 32, dropout = 0.7
61 | 
62 | Model | GPU Memory | Time (50 batches) | word embedding trainable
63 | ---|---|---|---
64 | BiDAF (origin) | 8431M | 47s | false
65 | MLSTM | 10655M | 1min27s | false
66 | R-Net | 4295M | 23s | false
67 | BiDAF+Self Attention (cuDNN based) | 8431M | 22s | false
68 | BiDAF+Self Attention+Gated RNN (Pre) | N/A | N/A | false
69 | 
70 | ## Known Bugs
71 | 1. The BiDAF+Self Attention model cannot be reloaded after saving: TensorFlow's cuDNN LSTM is extremely fast but very awkward to work with
72 | 2. R-Net's locally computed metrics are very poor, although the submitted results look normal
73 | 
74 | ## Other
75 | - There is also a BiDAF variant based on the HKUST code; it needs slightly less memory and time than R-Net, but scores about 2 points lower than BiDAF (origin), probably because it uses GRU cells
76 | - Since the best model could not be saved during training, prediction had to be run once for the current best epoch, which is very time-consuming
77 | - In this repo, Self Attention is applied inside the match layer; the cs224n approach applies Self Attention to the output of the match layer, which would probably work better
78 | 
--------------------------------------------------------------------------------
/data/demo/README.md:
--------------------------------------------------------------------------------
1 | # Data Preprocessing Strategy
2 | 
3 | Here is an example of preprocessed data:
4 | ```
5 | {
6 |     "question_id": 186358,
7 |     "question_type": "YES_NO",
8 |     "question": "上海迪士尼可以带吃的进去吗",
9 |     "segmented_question": ["上海", "迪士尼", "可以", "带", "吃的", "进去", "吗"],
10 |     "documents": [
11 |         {
12 |             "paragraphs": ["text paragraph 1", "text paragraph 2"],
13 |             "segmented_paragraphs": [["tokens of paragraph1"], ["tokens of paragraph2"]],
14 |             "title": "上海迪士尼可以带吃的进去吗",
15 |             "segmented_title": ["上海", "迪士尼", "可以", "带", "吃的", "进去", "吗"],
16 |             "bs_rank_pos": 1,
17 |             "is_selected": True,
18 |             "most_related_para": 0,
19 |         },
20 |         # ...
21 |     ],
22 |     "answers": [
23 |         "完全密封的可以,其它不可以。",  # answer1
24 |         "可以的,不限制的。只要不是易燃易爆的危险物品,一般都可以带进去的。",  # answer2
25 |         "罐装婴儿食品、包装完好的果汁、水等饮料及包装完好的食物都可以带进乐园,但游客自己在家制作的食品是不能入园,因为自制食品有一定的安全隐患。"  # answer3
26 |     ],
27 |     "answer_docs": [0],
28 |     "answer_spans": [[0, 15]],
29 |     "fake_answers": ["完全密封的可以,其他不可以。"],
30 |     "match_scores": [1.00],
31 |     "segmented_answers": [
32 |         ["完全", "密封", "的", "可以", ",", "其它", "不可以", "。"],
33 |         ["tokens for answer2"],
34 |         ["tokens for answer3"],
35 |     ],
36 |     "yesno_answers": [
37 |         "Depends",  # corresponding to answer 1
38 |         "Yes",      # corresponding to answer 2
39 |         "Depends"   # corresponding to answer 3
40 |     ]
41 | }
42 | ```
43 | 
44 | To make it easier for researchers to use the DuReader dataset, we also release the preprocessed data. The preprocessing mainly does the following things:
45 | 1. Word segmentation. 
We segment all questions, answers, document titles and paragraphs into Chinese words, and the results are stored in new fields named by prefixing the corresponding field name with "segmented_". For example, the segmented question is stored in "segmented_question".
46 | 2. Answer paragraph targeting. In the DuReader dataset, each question has up to 5 related documents, and the average document length is 394. Feeding all 5 documents into popular RC models is too expensive, so for each document we first find the most answer-related paragraph, i.e. the paragraph most likely to contain an answer, and our baseline models use these paragraphs in place of the original documents. For each document, the most related paragraph is the one with the highest recall of the answer tokens, and its index is stored in "most_related_para".
47 | 3. Locating the answer span. Most popular RC models require an answer span for training. Since the original DuReader dataset does not provide one, our preprocessing code offers a simple, optional answer-span locating strategy: we match the real answers against each document, search for the substring with the maximum F1-score against the real answers, and take the span of that substring as the candidate answer span (a simplified sketch of this idea appears at the very end of this document). For each question we keep a single candidate span, stored in the "answer_spans" field; the substring covered by the span is stored in "fake_answers", the recall of the span against the real answer is stored in "match_scores", and the index of the document containing the span is stored in "answer_docs".
48 | 
49 | Except for word segmentation, the rest of the preprocessing strategy is implemented in `utils/preprocess.py`.
50 | 
51 | 数据预处理主要包含以下过程:
52 | 1. 分词。我们对所有的问题,答案,文档的标题和段落进行分词,将结果存储在以"segmented_"为前缀的新域中。例如,分词的问题被存储在"segmented_question"
53 | 2. 答案目标段落。在DuReader数据集中,每个问题至多有5篇相关文档,文档的平均长度为394。由于将5篇文档全部输入RC模型计算量过大,我们预先在每篇文档中找出可能包含答案的与答案最相关的段落。在baseline模型中用最相关段落代替原文档。根据每篇文档的答案tokens的最高召回率选择最相关段落,被选出的段落的下标存储在"most_related_para"
54 | 3. 定位answer span。对于大多数流行的RC模型,训练时需要answer span。因为原始数据集中没有提供answer span,为了方便,在预处理代码中,我们提供了一个简单的answer span定位策略。在该策略中,我们将真实答案与每篇文档匹配,然后搜索真实答案获得最大F1-score的子串,并将子串的span作为候选answer span。对每个问题,我们找出一个span作为候选,存储在"answer_span"域,与answer span对应的子串存储在"fake_answers",真实答案的answer span的召回率存储在"match_scores",answer span的文档下标存储在"answer_docs"
--------------------------------------------------------------------------------
/竞赛技术报告/Final_Naturali-2018机器阅读理解技术竞赛系统报告.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shiningliang/MRC2018/9bf27efbbc028116eb01ea2df1bf0d063cb09ae9/竞赛技术报告/Final_Naturali-2018机器阅读理解技术竞赛系统报告.pptx
--------------------------------------------------------------------------------
/竞赛技术报告/东北大学-2018机器阅读理解竞赛报告.ppt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shiningliang/MRC2018/9bf27efbbc028116eb01ea2df1bf0d063cb09ae9/竞赛技术报告/东北大学-2018机器阅读理解竞赛报告.ppt
--------------------------------------------------------------------------------
/竞赛技术报告/台达电子-Delta-MRC系統報告.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shiningliang/MRC2018/9bf27efbbc028116eb01ea2df1bf0d063cb09ae9/竞赛技术报告/台达电子-Delta-MRC系統報告.pdf
--------------------------------------------------------------------------------
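As a supplement to item 3 of the data preprocessing notes above, here is a simplified, self-contained sketch of locating an answer span by maximizing token-level F1 against a reference answer. It only illustrates the idea; it is not the implementation in `utils/preprocess.py`, and the function names (`token_f1`, `locate_answer_span`) are invented for this example.

```python
from collections import Counter

def token_f1(candidate, reference):
    """Token-level F1 between a candidate span and a reference answer."""
    common = Counter(candidate) & Counter(reference)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(candidate)
    recall = num_same / len(reference)
    return 2 * precision * recall / (precision + recall)

def locate_answer_span(doc_tokens, answer_tokens, max_span_len=None):
    """Scan all spans (up to max_span_len tokens) and return the (start, end)
    span with the highest F1 against the reference answer, plus its score."""
    if max_span_len is None:
        max_span_len = len(doc_tokens)
    best_span, best_score = (0, 0), 0.0
    for start in range(len(doc_tokens)):
        for end in range(start, min(start + max_span_len, len(doc_tokens))):
            score = token_f1(doc_tokens[start:end + 1], answer_tokens)
            if score > best_score:
                best_span, best_score = (start, end), score
    return best_span, best_score

# Tiny example with segmented tokens like those in the demo above:
doc = ["完全", "密封", "的", "可以", ",", "其它", "不可以", "。"]
ans = ["完全", "密封", "的", "可以", "。"]
print(locate_answer_span(doc, ans))  # ((0, 3), 0.888...)
```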