├── DuReader a Chinese Machine Reading Comprehension Dataset from Realworld Applications.pdf ├── LICENSE ├── README.md ├── code ├── __pycache__ │ ├── baseQA.cpython-36.pyc │ ├── loader.cpython-36.pyc │ ├── main.cpython-36.pyc │ ├── test.cpython-36.pyc │ ├── train.cpython-36.pyc │ └── util.cpython-36.pyc ├── baseQA.py ├── loader.py ├── main.py ├── susht.py ├── test.h5 ├── test.py ├── test.txt ├── tmp.py ├── train.py └── util.py └── data ├── pre.docx ├── raw.docx ├── readme.txt ├── test.docx └── vocab.txt /DuReader a Chinese Machine Reading Comprehension Dataset from Realworld Applications.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SeanLee97/duReader_pytorch/1d55022ed0a87054f9a0d6e012a75a6380984264/DuReader a Chinese Machine Reading Comprehension Dataset from Realworld Applications.pdf -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Shuting Su 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Please move to [QANet_dureader](https://github.com/SeanLee97/QANet_dureader)
2 | 
3 | # duReader_pytorch
4 | Reading comprehension based on DuReader.
5 | 
6 | Problems with this dataset:
7 | 1) The choice of annotated documents is unreasonable: some documents that were not selected actually contain the answer too.
8 | In open-domain QA we retrieve many documents that contain no answer, and the model has to learn to recognize these useless documents, so negative examples are needed during training; the negative examples here are therefore unreliable.
9 | 2) The attributes are documented very vaguely: `fake_answer` should really be called the golden answer, and `is_selected` does not mean the document was chosen as an answer document.
10 | 
11 | 
12 | So you might as well work on Microsoft's MS MARCO dataset instead.
--------------------------------------------------------------------------------
/code/__pycache__/baseQA.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SeanLee97/duReader_pytorch/1d55022ed0a87054f9a0d6e012a75a6380984264/code/__pycache__/baseQA.cpython-36.pyc
--------------------------------------------------------------------------------
/code/__pycache__/loader.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SeanLee97/duReader_pytorch/1d55022ed0a87054f9a0d6e012a75a6380984264/code/__pycache__/loader.cpython-36.pyc
--------------------------------------------------------------------------------
/code/__pycache__/main.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SeanLee97/duReader_pytorch/1d55022ed0a87054f9a0d6e012a75a6380984264/code/__pycache__/main.cpython-36.pyc
--------------------------------------------------------------------------------
/code/__pycache__/test.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SeanLee97/duReader_pytorch/1d55022ed0a87054f9a0d6e012a75a6380984264/code/__pycache__/test.cpython-36.pyc
--------------------------------------------------------------------------------
/code/__pycache__/train.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SeanLee97/duReader_pytorch/1d55022ed0a87054f9a0d6e012a75a6380984264/code/__pycache__/train.cpython-36.pyc
--------------------------------------------------------------------------------
/code/__pycache__/util.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SeanLee97/duReader_pytorch/1d55022ed0a87054f9a0d6e012a75a6380984264/code/__pycache__/util.cpython-36.pyc
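
The README's first point can be checked directly against the preprocessed DuReader JSON. A minimal sketch (the field names follow those used in code/util.py and code/tmp.py; the file path is an assumption and may need adjusting):

    import json

    # Count unselected documents that nevertheless contain the fake (golden) answer,
    # which is the mislabelled-negatives problem the README describes.
    path = '../data/preprocessed/trainset/search.train.json'   # assumed location
    hits = 0
    with open(path) as f:
        for raw in f:
            sample = json.loads(raw)
            if not sample.get('fake_answers'):
                continue
            answer = sample['fake_answers'][0]
            for doc in sample['documents']:
                if doc['is_selected']:
                    continue
                text = ''.join(''.join(p) for p in doc['segmented_paragraphs'])
                if answer in text:
                    hits += 1   # an "unselected" document that also contains the answer
    print('unselected documents containing the answer:', hits)
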
--------------------------------------------------------------------------------
/code/baseQA.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.autograd as autograd
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 | from torch.autograd import Variable
6 | import numpy as np
7 | 
8 | class baseQA(nn.Module):
9 | 
10 |     def __init__(self, param):
11 |         super(baseQA, self).__init__()
12 |         self.vocab_size = param.vocab_size
13 |         self.embedding_size = param.embedding_size
14 |         self.question_size = param.question_size
15 |         self.paragraph_size = param.paragraph_size
16 | 
17 |         self.question_hidden_size = 48
18 |         self.paragraph_hidden_size = 64
19 | 
20 |         self.question_num_layers = 1
21 |         self.paragraph_num_layers = 1
22 | 
23 |         self.lookup = nn.Embedding(self.vocab_size, self.embedding_size)
24 | 
25 |         '''
26 |         if param.pre_embeds == True :
27 |             self.lookup.weight.data.copy_(torch.from_numpy(embeds))
28 |             for param in self.lookup.parameters():
29 |                 param.requires_grad = False
30 |         '''
31 | 
32 |         self.paragraph_input_size = self.embedding_size + self.question_hidden_size
33 |         self.question_lstm = nn.LSTM(self.embedding_size, self.question_hidden_size, self.question_num_layers, dropout = 0.1)
34 |         self.paragraph_lstm = nn.LSTM(self.paragraph_input_size, self.paragraph_hidden_size // 2, self.paragraph_num_layers, dropout = 0.2, bidirectional = True)
35 |         #self.match_lstm = nn.LSTM(self.e_hidden_size, self.t_hidden_size, self.num_layers)
36 | 
37 |         self.att_linear = nn.Linear(self.question_hidden_size, 1)
38 |         self.start_net = nn.Linear(self.paragraph_hidden_size, self.paragraph_size)
39 |         self.end_net = nn.Linear(self.paragraph_hidden_size, self.paragraph_size)
40 | 
41 |         #self.weight = torch.FloatTensor([1.4, 1.4, 0.8, 0.8, 0.8]).cuda()
42 |         #self.loss_func = nn.NLLLoss(weight = self.weight)
43 | 
44 |         self.loss_func = nn.NLLLoss()
45 | 
46 | 
47 |     def init_hidden(self, num_layers, batch_size, hidden_size):
48 |         h0 = Variable(torch.zeros(num_layers, batch_size, hidden_size))
49 |         c0 = Variable(torch.zeros(num_layers, batch_size, hidden_size))
50 |         if torch.cuda.is_available() == True:
51 |             h0, c0 = h0.cuda(), c0.cuda()
52 |         return (h0, c0)
53 | 
54 | 
55 |     # x = (batch, seq_len, hsize)
56 |     # return (batch, hidden_size)
57 |     def attention(self, x):
58 |         x_flat = x.view(-1, x.size(-1))
59 |         scores = self.att_linear(x_flat).view(x.size(0), x.size(1))
60 |         weights = F.softmax(scores, dim = -1)
61 |         out = weights.unsqueeze(1).bmm(x).squeeze(1)
62 |         return out
63 | 
64 | 
65 |     # return pack rnn inputs
66 |     def get_pack_rnn_inputs(self, x, lengths):
67 |         _, idx_sort = torch.sort(lengths, dim = 0, descending = True)
68 |         _, idx_unsort = torch.sort(idx_sort, dim = 0)
69 | 
70 |         lengths = list(lengths[idx_sort])
71 | 
72 |         # sort x
73 |         x = x.index_select(0, Variable(idx_sort))
74 |         if torch.cuda.is_available() == True:
75 |             x = x.cuda()
76 |         x = x.transpose(0, 1).contiguous()
77 |         rnn_input = nn.utils.rnn.pack_padded_sequence(x, lengths)
78 | 
79 |         unsort = Variable(idx_unsort)
80 |         if torch.cuda.is_available() == True:
81 |             unsort = unsort.cuda()
82 | 
83 |         return rnn_input, unsort
84 | 
85 | 
86 |     def get_pad_rnn_outputs(self, output, seq_len, idx_unsort):
87 |         output = nn.utils.rnn.pad_packed_sequence(output)[0]
88 | 
89 |         # transpose and unsort
90 |         output = output.transpose(0, 1).contiguous()
91 |         output = output.index_select(0, idx_unsort)
92 | 
93 |         # pad up to original batch sequence length
94 |         if output.size(1) != seq_len:
95 |             padding = torch.zeros(output.size(0),
96 |                                   seq_len - output.size(1),
97 |                                   output.size(2)).type(output.data.type())
98 |             output = torch.cat([output, Variable(padding)], 1)
99 | 
100 |         return output
101 | 
102 | 
103 |     # embeds = (batch, seq_len, embedding_size)
104 |     # return (batch, q_size)
105 |     def get_question_lstm(self, question, question_length):
106 |         batch_size = question.size()[0]
107 |         embeds = self.lookup(question)
108 |         inputs, idx_unsort = self.get_pack_rnn_inputs(embeds, question_length)
109 | 
110 |         init_hidden = self.init_hidden(self.question_num_layers, batch_size, self.question_hidden_size)
111 |         lstm_out, _ = self.question_lstm(inputs, init_hidden)
112 |         lstm_out = self.get_pad_rnn_outputs(lstm_out, self.question_size, idx_unsort)
113 |         #print('q lstm: ', lstm_out.size())
114 | 
115 |         lstm_vector = self.attention(lstm_out)
116 |         return lstm_vector
117 | 
118 | 
119 |     # return (batch, paragraph_size, paragraph_hidden_size)
120 |     def get_paragraph_lstm(self, paragraph, paragraph_length, question_vector):
121 |         batch_size = paragraph.size()[0]
122 |         embeds = self.lookup(paragraph)
123 | 
124 |         question_vectors = question_vector.expand(self.paragraph_size, *question_vector.size())
125 |         question_vectors = question_vectors.transpose(0,1).contiguous()
126 | 
127 |         #print('embeds: ', embeds.size())
128 |         #print('question: ', question_vectors.size())
129 |         inputs = torch.cat([embeds, question_vectors], -1)
130 |         inputs, idx_unsort = self.get_pack_rnn_inputs(inputs, paragraph_length)
131 | 
132 |         init_hidden = self.init_hidden(self.paragraph_num_layers * 2, batch_size, self.paragraph_hidden_size // 2)
133 |         lstm_out, _ = self.paragraph_lstm(inputs, init_hidden)
134 | 
135 |         lstm_out = self.get_pad_rnn_outputs(lstm_out, self.paragraph_size, idx_unsort)
136 |         #print('lstm : ', lstm_out.size())
137 |         lstm_vector = torch.mean(lstm_out, 1)
138 | 
139 |         return lstm_vector
140 | 
141 | 
142 |     # return start and end scores, each (batch, paragraph_size)
143 |     def forward(self, question, paragraph, question_length, paragraph_length):
144 |         question_vector = self.get_question_lstm(question, question_length)
145 |         paragraph_vector = self.get_paragraph_lstm(paragraph, paragraph_length, question_vector)
146 |         #print('paragraph: ', paragraph_vector.size())
147 | 
148 |         start_space = self.start_net(paragraph_vector)
149 |         start_score = F.log_softmax(start_space, dim = -1)
150 | 
151 |         end_space = self.end_net(paragraph_vector)
152 |         end_score = F.log_softmax(end_space, dim = -1)
153 | 
154 |         return start_score, end_score
155 | 
156 | 
157 |     # return predicted start and end positions, each (batch,)
158 |     def get_answer(self, question, paragraph, question_length, paragraph_length):
159 |         start_score, end_score = self.forward(question, paragraph, question_length, paragraph_length)
160 |         _, start_tag = torch.max(start_score, dim = -1)
161 |         _, end_tag = torch.max(end_score, dim = -1)
162 |         return start_tag.data.cpu().tolist(), end_tag.data.cpu().tolist()
163 | 
164 |     # return one value
165 |     def get_loss(self, question, paragraph, answer, question_length, paragraph_length):
166 |         start_score, end_score = self.forward(question, paragraph, question_length, paragraph_length)
167 | 
168 |         answer = answer.transpose(0, 1)
169 |         start_loss = self.loss_func(start_score, answer[0])
170 |         end_loss = self.loss_func(end_score, answer[1])
171 |         loss = (start_loss + end_loss) / 2
172 | 
173 |         return loss
174 | 
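
A minimal smoke test for the baseQA module above; the Param class is a hypothetical stand-in for main.Hyperparameters and all sizes are illustrative:

    import torch
    from torch.autograd import Variable
    from baseQA import baseQA

    class Param:                    # stand-in for main.Hyperparameters
        vocab_size = 100
        embedding_size = 64
        question_size = 32
        paragraph_size = 512

    model = baseQA(Param())
    question = Variable(torch.LongTensor(4, Param.question_size).random_(1, Param.vocab_size))
    paragraph = Variable(torch.LongTensor(4, Param.paragraph_size).random_(1, Param.vocab_size))
    question_length = torch.LongTensor([32, 30, 20, 10])      # true lengths, <= question_size
    paragraph_length = torch.LongTensor([512, 400, 300, 200])

    start_score, end_score = model(question, paragraph, question_length, paragraph_length)
    print(start_score.size(), end_score.size())   # (4, paragraph_size) each
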
--------------------------------------------------------------------------------
/code/loader.py:
--------------------------------------------------------------------------------
1 | import h5py
2 | import math
3 | import torch
4 | import torch.utils.data as data
5 | 
6 | class loadTrainDataset(data.Dataset):
7 |     def __init__(self, path):
8 |         self.file = h5py.File(path, 'r')
9 |         self.nb_samples = len(self.file['question'][:])
10 |         print('Dataset: ', self.nb_samples)
11 | 
12 |     def __getitem__(self, index):
13 |         question = self.file['question'][index]
14 |         paragraph = self.file['paragraph'][index]
15 |         answer = self.file['answer'][index]
16 |         question_length = self.file['question_length'][index]
17 |         paragraph_length = self.file['paragraph_length'][index]
18 |         return question, paragraph, answer, question_length, paragraph_length
19 | 
20 |     def __len__(self):
21 |         return self.nb_samples
22 | 
23 | class loadValDataset(data.Dataset):
24 |     def __init__(self, path):
25 |         self.file = h5py.File(path, 'r')
26 |         self.nb_samples = len(self.file['question'][:])
27 |         print('Dataset: ', self.nb_samples)
28 | 
29 |     def __getitem__(self, index):
30 |         question_id = self.file['question_id'][index]
31 |         question = self.file['question'][index]
32 |         paragraphs = self.file['paragraphs'][index]
33 |         question_length = self.file['question_length'][index]
34 |         paragraph_lengths = self.file['paragraph_lengths'][index]
35 |         return question_id, question, paragraphs, question_length, paragraph_lengths
36 | 
37 |     def __len__(self):
38 |         return self.nb_samples
39 | 
40 | class loadTestDataset(data.Dataset):
41 |     def __init__(self, path):
42 |         self.file = h5py.File(path, 'r')
43 |         self.nb_samples = len(self.file['question'][:])
44 |         print('Dataset: ', self.nb_samples)
45 | 
46 |     def __getitem__(self, index):
47 |         question_id = self.file['question_id'][index]
48 |         question = self.file['question'][index]
49 |         paragraph = self.file['paragraph'][index]
50 |         question_length = self.file['question_length'][index]
51 |         paragraph_length = self.file['paragraph_length'][index]
52 |         return question_id, question, paragraph, question_length, paragraph_length
53 | 
54 |     def __len__(self):
55 |         return self.nb_samples
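
The h5py keys that loadTrainDataset reads can be smoke-tested with a tiny hand-built file; a sketch with illustrative shapes (the real files are written by code/util.py):

    import h5py
    import numpy as np
    from loader import loadTrainDataset

    with h5py.File('toy_train.h5py', 'w') as f:    # hypothetical file name
        f.create_dataset('question', data = np.zeros((2, 32), dtype = 'int64'))
        f.create_dataset('paragraph', data = np.zeros((2, 512), dtype = 'int64'))
        f.create_dataset('answer', data = np.array([[3, 7], [0, 4]], dtype = 'int64'))
        f.create_dataset('question_length', data = np.array([32, 20], dtype = 'int64'))
        f.create_dataset('paragraph_length', data = np.array([512, 100], dtype = 'int64'))

    dataset = loadTrainDataset('toy_train.h5py')
    print(len(dataset), dataset[0][2])   # 2 samples; answer span of the first one
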
--------------------------------------------------------------------------------
/code/main.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import os
3 | import pickle
4 | import time
5 | from datetime import datetime
6 | from baseQA import baseQA
7 | from loader import loadTrainDataset, loadValDataset, loadTestDataset
8 | #from util import load_vocab, load_webQA_vocab, load_webQA_embedding
9 | from train import train, train_epoch, eval_epoch
10 | from test import test
11 | 
12 | 
13 | class Hyperparameters:
14 |     nb_epoch = 1000
15 |     batch_size = 128
16 |     tagset_size = 4
17 |     question_size = 32
18 |     paragraph_size = 512
19 | 
20 |     qe_embedding_size = 2
21 |     embedding_size = 64
22 | 
23 |     min_count = 10
24 |     batch_storage_size = 10000
25 | 
26 |     learning_rate = 0.001
27 |     model_dir = ''
28 |     model_path = ''    # path of a saved checkpoint, required by test_model
29 | 
30 |     train_json_path = '../data/preprocessed/trainset/search.train.json'
31 |     val_json_path = '../data/preprocessed/devset/search.dev.json'
32 |     test_json_path = '../data/test.json'
33 | 
34 |     train_h5py_path = '../data/train.h5py'
35 |     test_h5py_path = '../data/test.h5py'
36 |     val_h5py_path = '../data/dev.h5py'
37 | 
38 |     vocab_path = '../data/vocab.txt'
39 | 
40 |     word2idx = {}
41 |     idx2word = {}
42 |     vocab_size = 0
43 | 
44 | def load_vocab(path):
45 |     print('Loading vocabulary...')
46 |     f = open(path, 'rb')
47 |     input2idx = pickle.load(f)
48 |     input_set = list(input2idx.keys())
49 |     input_set_size = len(input_set)
50 |     f.close()
51 |     print('Vocabulary size:', input_set_size, '\n')
52 |     return input2idx, input_set_size
53 | 
54 | def train_model(param):
55 |     #os.environ["CUDA_VISIBLE_DEVICES"] = "1"
56 |     param.model_dir = '../model/baseQA_' + str(datetime.now()).split('.')[0].split()[0] + '/'
57 |     if os.path.exists(param.model_dir) == False:
58 |         os.mkdir(param.model_dir)
59 | 
60 |     train_dataset = loadTrainDataset(param.train_h5py_path)
61 |     val_dataset = loadValDataset(param.val_h5py_path)
62 | 
63 |     train_loader = torch.utils.data.DataLoader(train_dataset, batch_size = param.batch_size, num_workers = 1, shuffle = True)
64 |     val_loader = torch.utils.data.DataLoader(val_dataset, batch_size = param.batch_size, num_workers = 1, shuffle = True)
65 | 
66 |     model = baseQA(param)
67 |     if torch.cuda.is_available() == True:
68 |         model = model.cuda()
69 |     train(model, train_loader, val_loader, param)
70 | 
71 | 
72 | def test_model(param):
73 |     test_dataset = loadTestDataset(param.test_h5py_path)
74 |     test_loader = torch.utils.data.DataLoader(test_dataset, batch_size = param.batch_size, num_workers = 1, shuffle = False)
75 | 
76 |     model = torch.load(param.model_path)
77 |     test(model, test_loader, param.idx2word)
78 | 
79 | 
80 | if __name__ == '__main__':
81 |     param = Hyperparameters()
82 | 
83 |     print('Biu ~ ~ ~ ~ ~ Give you buffs ~ \n')
84 | 
85 |     param.word2idx, param.vocab_size = load_vocab(param.vocab_path)
86 |     param.idx2word = dict(zip(param.word2idx.values(), param.word2idx.keys()))
87 | 
88 |     train_model(param)
89 | 
90 |     #test_model(param)
--------------------------------------------------------------------------------
/code/susht.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from torch.autograd import Variable
5 | import math
6 | import h5py
7 | import numpy as np
8 | 
9 | # return pack rnn inputs
10 | def get_pack_rnn_inputs(x, x_mask):
11 |     lengths = x_mask.data.eq(0).long().sum(1).squeeze()
12 |     _, idx_sort = torch.sort(lengths, dim = 0, descending = True)
13 |     _, idx_unsort = torch.sort(idx_sort, dim = 0)
14 | 
15 |     lengths = list(lengths[idx_sort])
16 | 
17 |     # Sort x
18 |     x = x.index_select(0, Variable(idx_sort))
19 |     x = x.transpose(0, 1).contiguous()
20 |     rnn_input = nn.utils.rnn.pack_padded_sequence(x, lengths)
21 | 
22 |     return rnn_input, Variable(idx_unsort)
23 | 
24 | 
25 | def get_pad_rnn_outputs(output, x_mask, idx_unsort):
26 |     output = nn.utils.rnn.pad_packed_sequence(output)[0]
27 |     print('o: ', output.size())
28 | 
29 |     # Transpose and unsort
30 |     output = output.transpose(0, 1).contiguous()
31 |     output = output.index_select(0, idx_unsort)
32 | 
33 |     # Pad up to original batch sequence length
34 |     if output.size(1) != x_mask.size(1):
35 |         padding = torch.zeros(output.size(0),
36 |                               x_mask.size(1) - output.size(1),
37 |                               output.size(2)).type(output.data.type())
38 |         output = torch.cat([output, Variable(padding)], 1)
39 | 
40 |     return output
41 | 
42 | 
43 | def test_lstm():
44 |     # batch:3, seq_len:5, embedding:6, hidden:7
45 |     rnn = nn.LSTM(6, 7, 1)
46 | 
47 |     input = Variable(torch.randn(3, 5, 6))
48 |     mask = Variable(torch.ByteTensor([[0,0,0,0,1],[0,0,1,1,1],[0,1,1,1,1]]))   # 1 marks padding (random bytes cannot be packed)
49 |     print('input: ', input)
50 |     print('mask: ', mask)
51 | 
52 |     input, i = get_pack_rnn_inputs(input, mask)
53 |     output, hn = rnn(input)
54 |     output = get_pad_rnn_outputs(output, mask, i)
55 |     print('output: ', output)
56 | 
57 |     '''
58 |     input2 = input.transpose(0,1).contiguous()
59 |     print('ini input: ', input2)
60 |     print('==========================')
61 | 
62 |     print('input: ', input)
63 |     output, hn = rnn(input)
64 |     print('lstm out: ', output.size())
65 | 
66 |     print('\n==========================\n')
67 |     lengths = [4, 3, 1]
68 |     rnn_input = nn.utils.rnn.pack_padded_sequence(input, lengths)
69 | 
70 |     print('rnn input: ', rnn_input)
71 |     output, hn = rnn(rnn_input)
72 |     print('lstm out: ', output)
73 |     print('\n==========================\n')
74 | 
75 |     outputs = nn.utils.rnn.pad_packed_sequence(output)[0]
76 |     print('outputs pack: ', outputs)
77 | 
78 |     input2 = Variable(torch.randn(3, 3, 6))
79 | 
80 |     input3 = torch.cat([input,input2], 0)
81 | 
82 |     print('1: ', input,'\n')
83 |     print('==========================')
84 |     print('2: ', input2, '\n')
85 |     print('==========================')
86 |     print('3:', input3)
87 |     print('input3: ', input3.size())
88 |     output, hn = rnn(input3)
89 |     '''
90 |     print('end')
91 | 
92 | def get_nll_loss(inputs, target):
93 |     loss_list = []
94 |     print()
95 |     for seq, t in zip(inputs, target):
96 |         loss = 0
97 |         loss_list.append(seq[t])
98 | 
99 |     loss = torch.cat(loss_list, -1)
100 | 
101 |     w = 1.0 / len(loss)
102 |     weight = [w for i in range(len(loss))]
103 |     print('w: ', weight)
104 |     weight = torch.Tensor(weight)
105 | 
106 |     print('loss: ', loss)
107 |     print('weight: ', weight)
108 | 
109 |     loss2 = torch.dot(loss.cuda(), weight.cuda())
110 | 
111 |     loss = (-1) * loss
112 |     print('loss2: ', loss2)
113 | 
114 |     loss = (-1) * torch.mean(torch.cat(loss_list, -1))   # negate so it matches F.nll_loss
115 |     return loss
116 | 
117 | def test_loss():
118 |     inputs = Variable(torch.randn(3, 5))
119 |     target = Variable(torch.LongTensor([1, 0, 4]))
120 |     pred = F.log_softmax(inputs)
121 |     print('pred: ', pred)
122 | 
123 |     output = F.nll_loss(pred, target)
124 |     print('output: ', output)
125 |     #output.backward()
126 | 
127 |     mine = get_nll_loss(pred, target)
128 |     print('mine : ', mine)
129 | 
130 | def test_h5py():
131 |     file = h5py.File('fuck.h5','w')
132 |     data = [[1,2,3],[4,5]]   # ragged: h5py cannot store this, so create_dataset raises an error
133 |     file.create_dataset('test', data=data)
134 |     file.close()
135 | 
136 | 
137 | class Hyperparameters:
138 |     nb_epoch = 1000
139 | 
140 | def test_param():
141 |     pass   # body was missing, which made the module unparseable
142 | 
143 | if __name__ == '__main__':
144 |     param = Hyperparameters()
145 |     test_lstm()
--------------------------------------------------------------------------------
/code/test.h5:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SeanLee97/duReader_pytorch/1d55022ed0a87054f9a0d6e012a75a6380984264/code/test.h5
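
The sort / pack / pad / unsort pattern that susht.py experiments with (and that baseQA.get_pack_rnn_inputs and get_pad_rnn_outputs implement with masks or length tensors) reduces to the following; a sketch with illustrative sizes:

    import torch
    import torch.nn as nn

    x = torch.randn(3, 5, 6)                    # batch 3, max length 5, feature size 6
    lengths = torch.LongTensor([5, 2, 4])       # true lengths

    # 1) sort by length, descending, as pack_padded_sequence requires
    sorted_lengths, idx_sort = torch.sort(lengths, dim = 0, descending = True)
    _, idx_unsort = torch.sort(idx_sort, dim = 0)

    # 2) pack, run the RNN, pad back
    rnn = nn.LSTM(6, 7, 1)
    packed = nn.utils.rnn.pack_padded_sequence(
        x.index_select(0, idx_sort).transpose(0, 1), sorted_lengths.tolist())
    out, _ = rnn(packed)
    out = nn.utils.rnn.pad_packed_sequence(out)[0]

    # 3) restore batch-first layout and the original sample order
    out = out.transpose(0, 1).index_select(0, idx_unsort)
    print(out.size())                           # (3, 5, 7)
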
--------------------------------------------------------------------------------
/code/test.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from torch.autograd import Variable
4 | import os
5 | import time
6 | from datetime import datetime
7 | 
8 | STOP_TAG = '#OOV#'
9 | 
10 | def strict_match(preds, ans):
11 |     for p in preds:
12 |         if p in ans or ans in p:
13 |             return 1
14 |     return 0
15 | 
16 | 
17 | def get_tagging_results(tokens, tags):
18 |     chunks = set()
19 |     start = -1
20 |     for i, tok in enumerate(tokens):
21 |         tag = tags[i]
22 |         if tag == 0: # B
23 |             if start >= 0: chunks.add(''.join(tokens[start:i]))
24 |             start = i
25 |         elif tag == 1: # I
26 |             if start < 0: start = i
27 |         else:
28 |             if start < 0: continue
29 |             chunks.add(''.join(tokens[start:i]))
30 |             start = -1
31 |     if start >= 0:
32 |         chunks.add(''.join(tokens[start:]))
33 | 
34 |     if len(chunks) == 0:
35 |         chunks.add('no_answer')
36 |     return list(chunks)
37 | 
38 | 
39 | def get_batch_scores(pred_tags, answer, question, evidence, idx2word):
40 |     nb_pred = 0
41 |     A, C, Q = 0, 0, 0
42 |     for pred, ans, ques, evid in zip(pred_tags, answer, question, evidence):
43 |         ques = [ idx2word[q] for q in ques if q != 0 ]
44 |         evid = [ idx2word[e] for e in evid ]
45 |         #pred = [ p for p in pred ]
46 |         ans = ''.join( [ idx2word[a] for a in ans if a != 0 ] )
47 | 
48 |         pred_ans = get_tagging_results(evid, pred)
49 | 
50 |         evid = [ e for e in evid if e != STOP_TAG ]
51 |         print('Question: ', ''.join(ques), '\n')
52 |         print('Evidence: ', ''.join(evid), '\n')
53 |         #print('Tags: ', pred, '\n')
54 |         print('Predict Answers: ', pred_ans)
55 |         print('Golden Answers: ', ans)
56 |         print('\n ---------------------------- \n')
57 | 
58 |         if len(pred_ans) > 0:
59 |             nb_pred += 1
60 | 
61 |         C += strict_match(pred_ans, ans)
62 |         A += len(pred_ans)
63 |         Q += 1
64 | 
65 |     if A == 0:
66 |         pre = 0
67 |     else:
68 |         pre = C / A
69 | 
70 |     if Q == 0:
71 |         rec = 0
72 |     else:
73 |         rec = C / Q
74 | 
75 |     if (pre + rec) == 0:
76 |         f1 = 0
77 |     else:
78 |         f1 = (2 * pre * rec) / (pre + rec)
79 | 
80 |     return pre, rec, f1, nb_pred
81 | 
82 | # note: this test loop targets the older tagging-style model (get_tags); it does not match baseQA's span interface
83 | def test(model, loader, idx2word):
84 |     print('Testing model...')
85 |     nb_batch = 0
86 |     epoch_pre, epoch_rec, epoch_f1, epoch_pred = 0, 0, 0, 0
87 |     for batch_idx, (question, evidence, q_mask, e_mask, qe_feat, answer) in enumerate(loader):
88 |         nb_batch += 1
89 |         question = Variable(question.long()).cuda()
90 |         evidence = Variable(evidence.long()).cuda()
91 |         qe_feat = Variable(qe_feat.long()).cuda()
92 |         q_mask = Variable(q_mask.byte()).cuda()
93 |         e_mask = Variable(e_mask.byte()).cuda()
94 | 
95 |         pred_tags = model.get_tags(question, evidence, q_mask, e_mask, qe_feat)
96 | 
97 |         question = question.data.cpu().numpy()
98 |         evidence = evidence.data.cpu().numpy()
99 | 
100 |         pre, rec, f1, nb_pred = get_batch_scores(pred_tags, answer, question, evidence, idx2word)
101 |         print('batch:', batch_idx, ' nb_pred:', nb_pred, ' || pre: ', pre, ' rec: ', rec, ' f1 :', f1)
102 | 
103 |         epoch_pre += pre
104 |         epoch_rec += rec
105 |         epoch_f1 += f1
106 |         epoch_pred += nb_pred
107 | 
108 |     epoch_pre = epoch_pre / nb_batch
109 |     epoch_rec = epoch_rec / nb_batch
110 |     epoch_f1 = epoch_f1 / nb_batch
111 |     print('Pre:', epoch_pre, ' Rec:', epoch_rec, ' F1:', epoch_f1, '\n')
112 |     return epoch_pre, epoch_rec, epoch_f1, epoch_pred
113 | 
114 | 
115 | if __name__ == '__main__':
116 | 
117 |     print('Hey')
--------------------------------------------------------------------------------
/code/test.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SeanLee97/duReader_pytorch/1d55022ed0a87054f9a0d6e012a75a6380984264/code/test.txt
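
A tiny worked example of the B/I/O decoding in test.py's get_tagging_results (tag 0 opens a chunk, tag 1 continues it, anything else closes it); the tokens are illustrative:

    from test import get_tagging_results

    tokens = ['北', '京', '是', '首', '都']
    tags = [0, 1, 2, 0, 1]                       # B I O B I
    print(get_tagging_results(tokens, tags))     # ['北京', '首都'], in set order
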
--------------------------------------------------------------------------------
/code/tmp.py:
--------------------------------------------------------------------------------
1 | import json
2 | import h5py
3 | import numpy as np
4 | from tqdm import *
5 | from collections import Counter
6 | import torch
7 | from torch.autograd import Variable
8 | import torch.nn.functional as F
9 | # x = (batch, seq_len, hsize)
10 | # return (batch, hidden_size)
11 | def attention(self, x, x_mask):   # masked variant of baseQA.attention, kept here for reference
12 |     x_flat = x.view(-1, x.size(-1))
13 |     scores = self.att_linear(x_flat).view(x.size(0), x.size(1))
14 |     scores.data.masked_fill_(x_mask.data, -float('inf'))
15 |     weights = F.softmax(scores, dim = -1)
16 |     out = weights.unsqueeze(1).bmm(x).squeeze(1)
17 |     return out
18 | 
19 | def uniform_weights(x, x_mask):
20 |     """Return uniform weights over non-masked input."""
21 |     alpha = Variable(torch.ones(x.size(0), x.size(1)))
22 |     if x.data.is_cuda:
23 |         alpha = alpha.cuda()
24 |     alpha = alpha * x_mask.eq(0).float()
25 |     alpha = alpha / alpha.sum(1, keepdim = True)
26 |     return alpha
27 | 
28 | def weighted_avg(x, weights):
29 |     """x = batch * len * d
30 |     weights = batch * len
31 |     """
32 |     return weights.unsqueeze(1).bmm(x).squeeze(1)
33 | 
34 | def test_lengths():
35 |     x = Variable(torch.randn(3, 5, 8))
36 |     lens = [4, 2, 5]   # true sequence lengths (random floats cannot index a range)
37 |     alpha = Variable(torch.zeros(x.size(0), x.size(1)))
38 |     for i in range(alpha.size(0)):
39 |         for j in range(lens[i]):
40 |             alpha[i][j] = 1
41 | 
42 |     print('a: ', alpha)
43 | 
44 |     alpha = alpha / alpha.sum(1, keepdim = True)
45 |     print('alpha: ', alpha.size())
46 | 
47 | 
48 | def test_weights():
49 |     """Return uniform weights over non-masked x (a sequence of vectors).
50 |     Args:
51 |         x: batch * len * hdim
52 |         x_mask: batch * len (1 for padding, 0 for true)
53 |     Output:
54 |         x_avg: batch * hdim
55 |     """
56 |     x = Variable(torch.randn(3,5,8))
57 |     x_mask = Variable(torch.zeros(3,5))
58 | 
59 |     alpha = Variable(torch.ones(x.size(0), x.size(1)))
60 |     alpha = alpha * x_mask.eq(0).float()
61 | 
62 |     print('a: ', alpha)
63 | 
64 |     alpha = alpha / alpha.sum(1, keepdim = True)
65 |     print('alpha: ', alpha.size())
66 | 
67 |     y = alpha.unsqueeze(1).bmm(x).squeeze(1)
68 |     print('y: ', y.size())
69 | 
70 | 
71 | def test_h5py():
72 |     file = h5py.File('test.h5', 'w')
73 |     data = [1,2]
74 |     file.create_dataset('look', data = data, maxshape = (None, ))
75 |     print(file['look'][:])
76 | 
77 | 
78 | 
79 | def test():
80 |     ret = []
81 |     train_path = '../data/preprocessed/devset/search.dev.json'
82 |     #train_path = '../data/preprocessed/testset/zhidao.test.json'
83 |     i = 0
84 |     with open(train_path) as f:
85 |         for line in tqdm(f):
86 |             line = json.loads(line)
87 |             documents = line['documents']
88 |             document = ''
89 |             tmp = 0
90 |             j = 0
91 |             question = line['question']
92 |             question_tokens = line['segmented_question']
93 |             for d in documents:
94 | 
95 |                 para_infos = []
96 |                 for para_tokens in d['segmented_paragraphs']:
97 |                     common_with_question = Counter(para_tokens) & Counter(question_tokens)
98 |                     correct_preds = sum(common_with_question.values())
99 |                     if correct_preds == 0:
100 |                         recall_wrt_question = 0
101 |                     else:
102 |                         recall_wrt_question = float(correct_preds) / len(question_tokens)
103 |                     para_infos.append((para_tokens, recall_wrt_question, len(para_tokens)))
104 |                 para_infos.sort(key=lambda x: (-x[1], x[2]))
105 |                 print(len(para_infos))
106 |                 fake_passage_tokens = para_infos[0][0]
107 |                 print('fake: ', ''.join(fake_passage_tokens))
108 |                 continue
109 | 
110 |                 is_selected = d['is_selected']
111 |                 title = d['title']
112 |                 most_related_para = d['most_related_para']
113 |                 paragraphs = d['segmented_paragraphs']
114 | 
115 |             question = line['question']
116 |             question_type = line['question_type']
117 | 
118 |             if len(line['answer_docs']) == 0 or len(line['fake_answers']) == 0:
119 |                 continue
120 | 
121 |             answer_docs = line['answer_docs'][0]
122 |             answer_span = line['answer_spans'][0]
123 |             fake_answer = line['fake_answers'][0]
124 | 
125 | 
126 |             #if len(line['answer_docs']) != 1:
127 |             #    print(len(line['answer_docs']),' ', len(line['answer_spans']),' ', len(line['fake_answers']))
128 |             #continue
129 | 
130 |             document = documents[answer_docs]
131 |             #paragraph = document['paragraphs'][document['most_related_para']]
132 |             segmented_paragraph = document['segmented_paragraphs'][document['most_related_para']]
133 |             paragraph = ''.join(segmented_paragraph)
134 |             '''
135 |             if fake_answer != ''.join(segmented_paragraph[answer_span[0]: answer_span[1]+1]):
136 |                 print(fake_answer)
137 |                 print(''.join(segmented_paragraph[answer_span[0]: answer_span[1]+1]))
138 |                 print()
139 |             '''
140 |             answer_start = len(''.join(segmented_paragraph[:answer_span[0]]))
141 |             answer_end = len(''.join(segmented_paragraph[:answer_span[1]+1]))
142 | 
143 |             if paragraph != ''.join(segmented_paragraph):
144 |                 print(paragraph)
145 |                 print(''.join(segmented_paragraph))
146 |                 print()
147 | 
148 |             #continue
149 |             if fake_answer != ''.join(paragraph[answer_start: answer_end]):
150 |                 print(fake_answer)
151 |                 print(''.join(paragraph[answer_start: answer_end]))
152 |                 print()
153 | 
154 |             #print(fake_answer)
155 |             #print()
156 | 
157 |             i = i+1
158 |             if i == 5:
159 |                 break
160 | 
161 | if __name__ == '__main__':
162 |     test_lengths()
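
The paragraph-selection heuristic that tmp.py's test() and util.get_val_data share, isolated into one function; select_paragraph is a hypothetical name and the tokens are made up:

    from collections import Counter

    def select_paragraph(question_tokens, segmented_paragraphs):
        # Rank paragraphs by token-level recall w.r.t. the question (descending),
        # breaking ties by preferring shorter paragraphs.
        para_infos = []
        for para_tokens in segmented_paragraphs:
            common = Counter(para_tokens) & Counter(question_tokens)
            recall = float(sum(common.values())) / len(question_tokens)
            para_infos.append((para_tokens, recall, len(para_tokens)))
        para_infos.sort(key = lambda x: (-x[1], x[2]))
        return para_infos[0][0]

    question = ['中国', '首都', '是', '哪里']
    paragraphs = [['天气', '很', '好'], ['中国', '的', '首都', '是', '北京']]
    print(select_paragraph(question, paragraphs))   # the second paragraph wins
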
--------------------------------------------------------------------------------
/code/train.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | from torch.autograd import Variable
4 | import os
5 | import time
6 | from datetime import datetime
7 | from test import get_batch_scores
8 | 
9 | 
10 | def save_model(model, epoch, loss, bleu, model_dir):
11 |     model_path = model_dir + 'bleu_' + str(round(bleu, 4)) + '_loss_' + str(round(loss, 4)) + '_' + str(epoch)
12 |     with open(model_path, 'wb') as f:
13 |         torch.save(model, f)
14 | 
15 | def train(model, train_loader, valid_loader, param):
16 |     print('Training model...')
17 |     parameters = filter(lambda p: p.requires_grad, model.parameters())
18 |     optimizer = torch.optim.Adam(parameters, lr = param.learning_rate)
19 | 
20 |     best_loss = 1000
21 |     max_bleu_4 = 0
22 |     for epoch in range(param.nb_epoch):
23 |         train_loss = train_epoch(model, epoch, train_loader, optimizer)
24 |         if train_loss <= best_loss:
25 |             best_loss = train_loss
26 |             save_model(model, epoch, train_loss, 0, param.model_dir)
27 | 
28 |         '''
29 |         bleu_rouge = eval_epoch(model, epoch, valid_loader, param.idx2word)
30 |         if bleu_rouge['Bleu-4'] > max_bleu_4:
31 |             max_bleu_4 = bleu_rouge['Bleu-4']
32 |             save_model(model, epoch, train_loss, max_bleu_4, param.model_dir)
33 |         '''
34 | 
35 |     print('Train End.\n')
36 | 
37 | 
38 | def train_epoch(model, epoch, loader, optimizer):
39 |     print('Train epoch :', epoch)
40 |     model.train()
41 | 
42 |     epoch_loss = 0.0
43 |     nb_batch = 0
44 |     for batch_idx, (question, paragraph, answer, question_length, paragraph_length) in enumerate(loader):
45 |         nb_batch += 1
46 | 
47 |         question = Variable(question.long())
48 |         paragraph = Variable(paragraph.long())
49 |         answer = Variable(answer.long(), requires_grad = False)
50 | 
51 |         if torch.cuda.is_available() == True:
52 |             question = question.cuda()
53 |             paragraph = paragraph.cuda()
54 |             answer = answer.cuda()
55 | 
56 |         batch_loss = model.get_loss(question, paragraph, answer, question_length, paragraph_length)
57 | 
58 |         optimizer.zero_grad()
59 |         batch_loss.backward()
60 |         #nn.utils.clip_grad_norm(model.parameters(), max_norm = 5.0)
61 |         optimizer.step()
62 | 
63 |         epoch_loss += batch_loss.data[0]
64 |         print('-----epoch:', epoch, ' batch:', batch_idx, ' train_loss:', batch_loss.data[0])
65 | 
66 |     epoch_loss = epoch_loss / nb_batch
67 |     print('\nEpoch: ', epoch, ', Train Loss: ', epoch_loss, '\n')
68 |     return epoch_loss
69 | 
70 | 
71 | def eval_epoch(model, epoch, loader, idx2word):
72 |     print('Eval epoch :', epoch)
73 |     model.eval()
74 | 
75 |     nb_batch = 0
76 |     epoch_pre, epoch_rec, epoch_f1, epoch_pred = 0, 0, 0, 0
77 |     for batch_idx, (question, paragraph, answer, question_length, paragraph_length) in enumerate(loader):
78 |         nb_batch += 1
79 |         question = Variable(question.long()).cuda()
80 |         paragraph = Variable(paragraph.long()).cuda()
81 |         answer = Variable(answer.long(), requires_grad = False).cuda()
82 | 
83 |         pred_answer = model.get_answer(question, paragraph, question_length, paragraph_length)
84 | 
85 |         question = question.data.cpu().numpy()
86 |         paragraph = paragraph.data.cpu().numpy()
87 |         # note: get_batch_scores was written for the tagging-style model, so these
88 |         # scores are indicative only for the span model above
89 |         pre, rec, f1, nb_pred = get_batch_scores(pred_answer, answer, question, paragraph, idx2word)
90 |         print('----epoch:', epoch, ' batch:', batch_idx, ' can_pred:', nb_pred, ' || pre: ', pre, ' rec: ', rec, ' f1 :', f1)
91 |         epoch_pre += pre
92 |         epoch_rec += rec
93 |         epoch_f1 += f1
94 |         epoch_pred += nb_pred
95 | 
96 |     epoch_pre = epoch_pre / nb_batch
97 |     epoch_rec = epoch_rec / nb_batch
98 |     epoch_f1 = epoch_f1 / nb_batch
99 |     print('\nEpoch: ', epoch, ' Pred: ', epoch_pred, ' || Pre:', epoch_pre, ' Rec:', epoch_rec, ' F1:', epoch_f1, '\n')
100 |     return epoch_pre, epoch_rec, epoch_f1, epoch_pred
101 | 
102 | 
103 | if __name__ == '__main__':
104 | 
105 |     print('Hey')
--------------------------------------------------------------------------------
/code/util.py:
--------------------------------------------------------------------------------
1 | import json
2 | import jieba
3 | import pickle
4 | import csv, h5py
5 | import pandas as pd
6 | import numpy as np
7 | from tqdm import *
8 | import torch
9 | from torch import Tensor
10 | from torch.autograd import Variable
11 | import torch.utils.data as data
12 | from main import Hyperparameters
13 | from collections import Counter
14 | 
15 | STOP_TAG = "#stop#"
16 | UNK_TAG = "#unk#"
17 | 
18 | def filter_vocab(ret, min_count):
19 |     count = pd.Series(ret).value_counts()
20 |     count = count[count >= min_count]
21 |     char_set = list(count.index)
22 |     return char_set
23 | 
24 | def get_vocab(param):
25 |     ret = []
26 |     with open(param.train_json_path) as f:
27 |         for line in tqdm(f):
28 |             line = json.loads(line)
29 | 
30 |             if len(line['answer_docs']) == 0 or len(line['fake_answers']) == 0:
31 |                 continue
32 | 
33 |             document = line['documents'][line['answer_docs'][0]]
34 |             paragraph = document['paragraphs'][document['most_related_para']]
35 | 
36 |             for p in paragraph: ret.append(p)
37 | 
38 |     ret = filter_vocab(ret, param.min_count)
39 | 
40 |     ret = sorted(list(ret))
41 |     input_set = [STOP_TAG, UNK_TAG]
42 |     input_set.extend(list(ret))
43 |     input_set_size = len(input_set)
44 |     input2idx = dict(zip(input_set, range(input_set_size)))
45 | 
46 |     print('Vocabulary size:', input_set_size, '\n')
47 |     return input2idx, input_set_size
48 | 
49 | 
50 | def save_vocab(path, input2idx):
51 |     print('Saving vocabulary...')
52 |     f = open(path,'wb')
53 |     pickle.dump(input2idx, f)
54 |     f.close()
55 | 
56 | 
57 | def load_vocab(path):
58 |     print('Loading vocabulary...')
59 |     f = open(path, 'rb')
60 |     input2idx = pickle.load(f)
61 |     input_set = list(input2idx.keys())
62 |     input_set_size = len(input_set)
63 |     f.close()
64 |     print('Vocabulary size:', input_set_size, '\n')
65 |     return input2idx, input_set_size
66 | 
67 | 
68 | # ------------------ save h5py file --------------------------- #
69 | 
70 | 
71 | def load_evidence_and_feats(evidence, feats, input2idx):
72 |     evidence_vector = []
73 |     feats_vector = []
74 |     for e, f in zip(evidence, feats):
75 |         if e in input2idx:
76 |             evidence_vector.append(input2idx[e])
77 |             feats_vector.append(f)
78 |     return evidence_vector, feats_vector, len(evidence_vector)
79 | 
80 | 
81 | def pad_sequence(seq, seq_size, word2idx):
82 |     vector = []
83 |     for i in range(seq_size):
84 |         if i >= len(seq):
85 |             vector.append(word2idx[STOP_TAG])
86 |         elif seq[i] not in word2idx:
87 |             vector.append(word2idx[UNK_TAG])
88 |         else:
89 |             vector.append(word2idx[seq[i]])
90 | 
91 |     if len(seq) < seq_size:
92 |         length = len(seq)
93 |     else:
94 |         length = seq_size
95 | 
96 |     return vector, length
97 | 
98 | 
99 | def save_data(file, param, data, shape, i):
100 |     if i <= param.batch_storage_size:
101 |         for key, value in data.items():
102 |             if value == []: continue
103 |             file.create_dataset(key, data = value, maxshape = shape[key])
104 | 
105 |     else:
106 |         old_len = len(file['question'])
107 |         new_len = old_len + len(data['question'])
108 | 
109 |         for key, value in data.items():
110 |             if value == []: continue
111 |             new_shape = [new_len]
112 |             for s in shape[key][1:]:
113 |                 new_shape.append(s)
114 |             file[key].resize(new_shape)
115 | 
116 |             file[key][old_len: new_len] = value
117 | 
118 | 
print(i) 119 | 120 | 121 | def get_train_data(param, line): 122 | document = line['documents'][line['answer_docs'][0]] 123 | #paragraph = document['paragraphs'][document['most_related_para']] 124 | segmented_paragraph = document['segmented_paragraphs'][document['most_related_para']] 125 | paragraph = ''.join(segmented_paragraph) 126 | if len(paragraph) > param.paragraph_size: 127 | return [], [], [] 128 | paragraph, paragraph_length = pad_sequence(paragraph, param.paragraph_size, param.word2idx) 129 | 130 | answer_span = line['answer_spans'][0] 131 | fake_answer = line['fake_answers'][0] 132 | answer_start = len(''.join(segmented_paragraph[:answer_span[0]])) 133 | answer_end = len(''.join(segmented_paragraph[:answer_span[1]+1])) 134 | answer = [answer_start, answer_end] 135 | 136 | return paragraph, paragraph_length, answer 137 | 138 | def get_val_data(param, line): 139 | paragraphs, paragraph_lengths, answers = [], [], [] 140 | documents = line['documents'] 141 | question_tokens = line['segmented_question'] 142 | for d in documents: 143 | para_infos = [] 144 | for para_tokens in d['segmented_paragraphs']: 145 | common_with_question = Counter(para_tokens) & Counter(question_tokens) 146 | correct_preds = sum(common_with_question.values()) 147 | if correct_preds == 0: 148 | recall_wrt_question = 0 149 | else: 150 | recall_wrt_question = float(correct_preds) / len(question_tokens) 151 | para_infos.append((para_tokens, recall_wrt_question, len(para_tokens))) 152 | para_infos.sort(key=lambda x: (-x[1], x[2])) 153 | fake_paragraph = ''.join(para_infos[0][0]) 154 | if (len(fake_paragraph)) > param.paragraph_size: 155 | continue 156 | fake_paragraph, fake_paragraph_length = pad_sequence(fake_paragraph, param.paragraph_size, param.word2idx) 157 | paragraphs.append(fake_paragraph) 158 | paragraph_lengths.append(fake_paragraph_length) 159 | 160 | answers = line['answers'] 161 | 162 | return paragraphs, paragraph_lengths, answers 163 | 164 | def save_h5py_file(param, old_path, new_path): 165 | print('Saving (', new_path, ')...') 166 | file = h5py.File(new_path,'w') 167 | 168 | data = {'question_id':[], 'question_type':[], 'question':[], 'question_length':[], 169 | 'paragraph':[], 'answer':[], 'paragraph_length':[], 'paragraphs':[], 'paragraph_lengths':[]} 170 | 171 | shape = {'question_id':(None,), 'question_type':(None,), 'question':(None, param.question_size), 'question_length':(None,), 172 | 'paragraph':(None, param.paragraph_size), 'answer':(None, 2), 'paragraph_length':(None,), 173 | 'paragraphs':(None, None, param.paragraph_size), 'paragraph_lengths':(None, None,)} 174 | #evaluate = {} 175 | 176 | i = 0 177 | with open(old_path) as f: 178 | for line in tqdm(f): 179 | line = json.loads(line) 180 | documents = line['documents'] 181 | 182 | question = line['question'] 183 | question_id = line['question_id'] 184 | question_type = line['question_type'] 185 | question_tokens = line['segmented_question'] 186 | if len(question) > param.question_size: 187 | continue 188 | 189 | # train 190 | if old_path == param.train_json_path: 191 | if len(line['answer_docs']) == 0 or len(line['fake_answers']) == 0: 192 | continue 193 | paragraph, paragraph_length, answer = get_train_data(param, line) 194 | if paragraph == []: continue 195 | 196 | data['paragraph'].append(paragraph) 197 | data['paragraph_length'].append(paragraph_length) 198 | data['answer'].append(answer) 199 | 200 | # val 201 | elif old_path == param.val_json_path: 202 | paragraphs, paragraph_lengths, answers = get_val_data(param, line) 203 | if 
paragraphs == []: continue 204 | 205 | data['paragraphs'].append(paragraphs) 206 | data['paragraph_lengths'].append(paragraph_lengths) 207 | #data['answers'].append(answers) 208 | data['question_id'].append(question_id) 209 | 210 | question, question_length = pad_sequence(question, param.question_size, param.word2idx) 211 | data['question'].append(question) 212 | data['question_length'].append(question_length) 213 | 214 | # --------------------------------- 215 | i += 1 216 | if i % param.batch_storage_size == 0: 217 | save_data(file, param, data, shape, i) 218 | data = {'question_id':[], 'question_type':[], 'question':[], 'question_length':[], 219 | 'paragraph':[], 'answer':[], 'paragraph_length':[], 'paragraphs':[], 'paragraph_lengths':[]} 220 | 221 | if i % param.batch_storage_size != 0: 222 | save_data(file, param, data, shape, i) 223 | 224 | file.close() 225 | print('Dataset: ', i) 226 | 227 | 228 | def get_answer(): 229 | with open(param.val_json_path) as f: 230 | for line in tqdm(f): 231 | line = json.loads(line) 232 | question_id = line['question_id'] 233 | answers = line['answers'] 234 | 235 | if __name__ == '__main__': 236 | 237 | param = Hyperparameters() 238 | 239 | # 5143 240 | #word2idx, word_set_size = get_vocab(param) 241 | #idx2word = dict(zip(word2idx.values(), word2idx.keys())) 242 | #print(word2idx['苏'], idx2word[520]) 243 | 244 | #save_vocab(param.vocab_path, word2idx) 245 | 246 | param.word2idx, param.vocab_size = load_vocab(param.vocab_path) 247 | param.idx2word = dict(zip(param.word2idx.values(), param.word2idx.keys())) 248 | #print(word2idx['苏'], idx2word[520]) 249 | 250 | 251 | #save_h5py_file(param, param.train_json_path, param.train_h5py_path) 252 | save_h5py_file(param, param.val_json_path, param.val_h5py_path) 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | -------------------------------------------------------------------------------- /data/pre.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SeanLee97/duReader_pytorch/1d55022ed0a87054f9a0d6e012a75a6380984264/data/pre.docx -------------------------------------------------------------------------------- /data/raw.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SeanLee97/duReader_pytorch/1d55022ed0a87054f9a0d6e012a75a6380984264/data/raw.docx -------------------------------------------------------------------------------- /data/readme.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SeanLee97/duReader_pytorch/1d55022ed0a87054f9a0d6e012a75a6380984264/data/readme.txt -------------------------------------------------------------------------------- /data/test.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SeanLee97/duReader_pytorch/1d55022ed0a87054f9a0d6e012a75a6380984264/data/test.docx -------------------------------------------------------------------------------- /data/vocab.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SeanLee97/duReader_pytorch/1d55022ed0a87054f9a0d6e012a75a6380984264/data/vocab.txt --------------------------------------------------------------------------------
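
For reference, the run order implied by the scripts above: code/util.py builds the vocabulary (get_vocab/save_vocab, currently commented out in its __main__) and writes the h5py datasets via save_h5py_file; code/main.py then loads vocab.txt and starts training. A sketch, assuming the commands are run from inside code/ with the data laid out as in main.Hyperparameters:

    python util.py   # writes ../data/dev.h5py (and train.h5py once that call is uncommented)
    python main.py   # loads ../data/vocab.txt, builds the loaders, trains baseQA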