├── sequence_module
│   ├── __init__.py
│   ├── encoder.py
│   ├── attention.py
│   ├── decoder.py
│   └── evaluation.py
├── .gitignore
├── README.md
└── models
    ├── SiaGRU.py
    ├── ESIM.py
    ├── ABCNN2.py
    ├── BiMPM.py
    └── ABCNN.py

/sequence_module/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.csv
*.pickle
.idea
__pycache__
.DS_Store
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Text Similarity Methods Implemented in PyTorch

## Method list

1. ESIM (Enhanced LSTM for Natural Language Inference)
2. SiaGRU (Siamese Recurrent Architectures for Learning Sentence Similarity)
3. ABCNN (ABCNN: Attention-Based Convolutional Neural Network for Modeling Sentence Pairs)
4. BiMPM (Bilateral Multi-Perspective Matching for Natural Language Sentences)
--------------------------------------------------------------------------------
/models/SiaGRU.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn


class SiaGRU(nn.Module):
    def __init__(self, args):
        super(SiaGRU, self).__init__()
        self.args = args
        self.embeds_dim = args.embeds_dim
        num_word = 20000
        self.embeds = nn.Embedding(num_word, self.embeds_dim)
        self.ln_embeds = nn.LayerNorm(self.embeds_dim)
        self.hidden_size = args.hidden_size
        self.num_layer = args.num_layer
        self.gru = nn.GRU(self.embeds_dim, self.hidden_size, batch_first=True,
                          bidirectional=True, num_layers=self.num_layer)
        self.h0 = self.init_hidden((2 * self.num_layer, 1, self.hidden_size))
        # classification head called in forward (assumed to be a simple Linear(1, 2))
        self.fc = nn.Linear(1, 2)

    def init_hidden(self, size):
        h0 = nn.Parameter(torch.randn(size))
        nn.init.xavier_normal_(h0)
        return h0

    def forward_once(self, x):
        # expand the learned initial state to the current batch size
        h0 = self.h0.expand(-1, x.size(0), -1).contiguous()
        # hidden: (2 * num_layer) * batch_size * hidden_size
        output, hidden = self.gru(x, h0)
        # concat the last layer's forward and backward states
        return torch.cat([hidden[-2], hidden[-1]], dim=-1)

    def forward(self, *input):
        # sent1, sent2: batch_size * seq_len
        sent1 = input[0]
        sent2 = input[1]

        # embeds: batch_size * seq_len => batch_size * seq_len * dim
        # LayerNorm normalizes the last dimension, so no transposing is needed
        x1 = self.ln_embeds(self.embeds(sent1))
        x2 = self.ln_embeds(self.embeds(sent2))

        encoding1 = self.forward_once(x1)
        encoding2 = self.forward_once(x2)

        # similarity in (0, 1]: exp(-||h1 - h2||_2)
        sim = torch.exp(-torch.norm(encoding1 - encoding2, p=2, dim=-1, keepdim=True))
        return self.fc(sim)
--------------------------------------------------------------------------------
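A minimal smoke test for SiaGRU (not part of the repository; the args namespace and the sizes below are illustrative assumptions about the fields the constructor reads):

from types import SimpleNamespace
import torch
from models.SiaGRU import SiaGRU  # assuming models/ is on the path

args = SimpleNamespace(embeds_dim=128, hidden_size=64, num_layer=2)  # assumed fields
model = SiaGRU(args)

# two padded batches of token ids (0 would be padding), batch_size=4, seq_len=10
sent1 = torch.randint(1, 20000, (4, 10))
sent2 = torch.randint(1, 20000, (4, 10))
logits = model(sent1, sent2)  # batch_size * 2
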
/sequence_module/encoder.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, n_layers=1, bidirec=False, dropout_p=0.5, use_cuda=False):
        super(Encoder, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.dropout = nn.Dropout(dropout_p)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.use_cuda = use_cuda

        if bidirec:
            self.n_direction = 2
            self.gru = nn.GRU(embedding_size, hidden_size, n_layers, batch_first=True, bidirectional=True)
        else:
            self.n_direction = 1
            self.gru = nn.GRU(embedding_size, hidden_size, n_layers, batch_first=True)

    def init_hidden(self, inputs):
        hidden = Variable(torch.zeros(self.n_layers * self.n_direction, inputs.size(0), self.hidden_size))
        return hidden.cuda() if self.use_cuda else hidden

    def init_weight(self):
        # the xavier initializers operate in place
        nn.init.xavier_uniform_(self.embedding.weight)
        nn.init.xavier_normal_(self.gru.weight_hh_l0)
        nn.init.xavier_normal_(self.gru.weight_ih_l0)

    def forward(self, inputs, input_lengths):
        """
        inputs : B,T (LongTensor)
        input_lengths : real lengths of input batch (list)
        """
        hidden = self.init_hidden(inputs)

        embedded = self.embedding(inputs)
        embedded = self.dropout(embedded)
        packed = pack_padded_sequence(embedded, input_lengths, batch_first=True)
        outputs, hidden = self.gru(packed, hidden)
        outputs, output_lengths = pad_packed_sequence(outputs, batch_first=True)  # unpack (back to padded)

        # keep only the last layer's hidden state(s)
        if self.n_layers > 1:
            if self.n_direction == 2:
                hidden = hidden[-2:]
            else:
                hidden = hidden[-1:]

        # B,T,D / B,1,D
        return outputs, torch.cat([h for h in hidden], 1).unsqueeze(1)
--------------------------------------------------------------------------------
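A hypothetical usage sketch for the Encoder; note that pack_padded_sequence expects the batch sorted by length in descending order:

import torch
from sequence_module.encoder import Encoder

encoder = Encoder(input_size=1000, embedding_size=64, hidden_size=32, bidirec=True)
encoder.init_weight()

inputs = torch.tensor([[4, 8, 15, 16, 23],
                       [42, 7, 9, 0, 0]])  # 0 = padding, rows sorted by length
lengths = [5, 3]
outputs, last_hidden = encoder(inputs, lengths)
# outputs: 2 x 5 x 64 (hidden_size * n_direction), last_hidden: 2 x 1 x 64
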
/sequence_module/attention.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F


class Attention(nn.Module):
    def __init__(self, hidden_size, method='general'):
        super(Attention, self).__init__()

        self.method = method
        self.hidden_size = hidden_size

        if self.method == 'general':
            self.attn = nn.Linear(self.hidden_size, hidden_size)

        elif self.method == 'concat':
            self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
            self.v = nn.Parameter(torch.FloatTensor(1, hidden_size))

    def forward(self, hidden, encoder_outputs, encoder_lengths=None, return_weight=False):
        """
        hidden : query (previous hidden) B,1,D
        encoder_outputs : context (encoder outputs) B,T,D
        encoder_lengths : list[int]
        """
        q, c = hidden, encoder_outputs

        batch_size_q, n_q, dim_q = q.size()
        batch_size_c, n_c, dim_c = c.size()

        if batch_size_q != batch_size_c:
            msg = 'batch size mismatch (query: {}, context: {})'
            raise ValueError(msg.format(q.size(), c.size()))

        batch_size = batch_size_q

        s = self.score(q, c)

        # mask the padded positions of the encoder outputs
        if encoder_lengths is not None:
            mask = s.data.new(batch_size, n_q, n_c)
            mask = self.fill_context_mask(mask, sizes=encoder_lengths, v_mask=float('-inf'), v_unmask=0)
            s = Variable(mask) + s

        # normalize the scores with softmax
        s_flat = s.view(batch_size * n_q, n_c)
        w_flat = F.softmax(s_flat, 1)
        w = w_flat.view(batch_size, n_q, n_c)

        # combine: weighted sum of the context
        z = w.bmm(c)
        if return_weight:
            return w, z
        return z

    def score(self, q, c):
        """
        q: B,1,D
        c: B,T,D
        """
        if self.method == 'dot':
            return q.bmm(c.transpose(1, 2))  # B,1,D * B,D,T => B,1,T

        elif self.method == 'general':
            energy = self.attn(c)  # B,T,D => B,T,D
            return q.bmm(energy.transpose(1, 2))  # B,1,D * B,D,T => B,1,T

        elif self.method == 'concat':
            q = q.repeat(1, c.size(1), 1)  # B,T,D
            energy = self.attn(torch.cat([q, c], 2))  # B,T,2D => B,T,D
            v = self.v.repeat(c.size(0), 1).unsqueeze(1)  # repeat over the batch: B,1,D
            return v.bmm(energy.transpose(1, 2))  # B,1,D * B,D,T => B,1,T

    def fill_context_mask(self, mask, sizes, v_mask, v_unmask):
        """Fill attention mask inplace for a variable length context.
        Args
        ----
        mask: Tensor of size (B, N, T)
            Tensor to fill with mask values.
        sizes: list[int]
            List giving the size of the context for each item in
            the batch. Positions beyond each size will be masked.
        v_mask: float
            Value to use for masked positions.
        v_unmask: float
            Value to use for unmasked positions.
        Returns
        -------
        mask:
            Filled with values in {v_mask, v_unmask}
        """
        mask.fill_(v_unmask)
        n_context = mask.size(2)
        for i, size in enumerate(sizes):
            if size < n_context:
                mask[i, :, size:] = v_mask
        return mask
--------------------------------------------------------------------------------
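A hypothetical call showing the masking path; positions beyond each encoder length receive -inf scores and therefore near-zero attention weight:

import torch
from sequence_module.attention import Attention

attn = Attention(hidden_size=32, method='general')
query = torch.randn(2, 1, 32)    # B,1,D (e.g. the previous decoder hidden state)
context = torch.randn(2, 7, 32)  # B,T,D (encoder outputs)
weights, summary = attn(query, context, encoder_lengths=[7, 4], return_weight=True)
# weights: 2 x 1 x 7, summary: 2 x 1 x 32
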
/models/ESIM.py:
--------------------------------------------------------------------------------
from torch import nn
import torch
import torch.nn.functional as F


class ESIM(nn.Module):
    def __init__(self, args):
        super(ESIM, self).__init__()
        self.args = args
        self.dropout = 0.5
        self.hidden_size = args.hidden_size
        self.embeds_dim = args.embeds_dim
        num_word = 20000
        self.embeds = nn.Embedding(num_word, self.embeds_dim)
        self.bn_embeds = nn.BatchNorm1d(self.embeds_dim)
        self.lstm1 = nn.LSTM(self.embeds_dim, self.hidden_size, batch_first=True, bidirectional=True)
        self.lstm2 = nn.LSTM(self.hidden_size * 8, self.hidden_size, batch_first=True, bidirectional=True)

        self.fc = nn.Sequential(
            nn.BatchNorm1d(self.hidden_size * 8),
            nn.Linear(self.hidden_size * 8, args.linear_size),
            nn.ELU(inplace=True),
            nn.BatchNorm1d(args.linear_size),
            nn.Dropout(self.dropout),
            nn.Linear(args.linear_size, args.linear_size),
            nn.ELU(inplace=True),
            nn.BatchNorm1d(args.linear_size),
            nn.Dropout(self.dropout),
            nn.Linear(args.linear_size, 2),
            nn.Softmax(dim=-1)
        )

    def soft_attention_align(self, x1, x2, mask1, mask2):
        '''
        x1: batch_size * seq_len * dim
        x2: batch_size * seq_len * dim
        '''
        # attention: batch_size * seq_len * seq_len
        attention = torch.matmul(x1, x2.transpose(1, 2))
        mask1 = mask1.float().masked_fill_(mask1, float('-inf'))
        mask2 = mask2.float().masked_fill_(mask2, float('-inf'))

        # weight: batch_size * seq_len * seq_len
        weight1 = F.softmax(attention + mask2.unsqueeze(1), dim=-1)
        x1_align = torch.matmul(weight1, x2)
        weight2 = F.softmax(attention.transpose(1, 2) + mask1.unsqueeze(1), dim=-1)
        x2_align = torch.matmul(weight2, x1)
        # x_align: batch_size * seq_len * (2 * hidden_size)

        return x1_align, x2_align

    def submul(self, x1, x2):
        mul = x1 * x2
        sub = x1 - x2
        return torch.cat([sub, mul], -1)

    def apply_multiple(self, x):
        # input: batch_size * seq_len * (2 * hidden_size)
        p1 = F.avg_pool1d(x.transpose(1, 2), x.size(1)).squeeze(-1)
        p2 = F.max_pool1d(x.transpose(1, 2), x.size(1)).squeeze(-1)
        # output: batch_size * (4 * hidden_size)
        return torch.cat([p1, p2], 1)

    def forward(self, *input):
        # batch_size * seq_len
        sent1, sent2 = input[0], input[1]
        mask1, mask2 = sent1.eq(0), sent2.eq(0)

        # embeds: batch_size * seq_len => batch_size * seq_len * dim
        x1 = self.bn_embeds(self.embeds(sent1).transpose(1, 2).contiguous()).transpose(1, 2)
        x2 = self.bn_embeds(self.embeds(sent2).transpose(1, 2).contiguous()).transpose(1, 2)

        # batch_size * seq_len * dim => batch_size * seq_len * (2 * hidden_size)
        o1, _ = self.lstm1(x1)
        o2, _ = self.lstm1(x2)

        # Attention
        # batch_size * seq_len * (2 * hidden_size)
        q1_align, q2_align = self.soft_attention_align(o1, o2, mask1, mask2)

        # Compose
        # batch_size * seq_len * (8 * hidden_size)
        q1_combined = torch.cat([o1, q1_align, self.submul(o1, q1_align)], -1)
        q2_combined = torch.cat([o2, q2_align, self.submul(o2, q2_align)], -1)

        # batch_size * seq_len * (2 * hidden_size)
        q1_compose, _ = self.lstm2(q1_combined)
        q2_compose, _ = self.lstm2(q2_combined)

        # Aggregate
        # input: batch_size * seq_len * (2 * hidden_size)
        # output: batch_size * (4 * hidden_size)
        q1_rep = self.apply_multiple(q1_compose)
        q2_rep = self.apply_multiple(q2_compose)

        # Classifier
        x = torch.cat([q1_rep, q2_rep], -1)
        similarity = self.fc(x)
        return similarity
--------------------------------------------------------------------------------
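A minimal smoke test for ESIM under the same assumptions about the args namespace (eval mode uses BatchNorm's running statistics, so the sketch also works with a batch of one):

from types import SimpleNamespace
import torch
from models.ESIM import ESIM

args = SimpleNamespace(hidden_size=64, embeds_dim=128, linear_size=128)  # assumed fields
model = ESIM(args)
model.eval()

sent1 = torch.randint(1, 20000, (4, 12))
sent2 = torch.randint(1, 20000, (4, 12))
probs = model(sent1, sent2)  # batch_size * 2; rows sum to 1 (final Softmax)
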
/models/ABCNN2.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F


class ABCNN3(nn.Module):
    def __init__(self, args):
        super(ABCNN3, self).__init__()
        self.args = args
        self.embeds_dim = args.embeds_dim
        num_word = 20000
        self.embeds = nn.Embedding(num_word, self.embeds_dim)
        self.linear_size = args.linear_size
        self.num_layer = args.num_layer
        self.conv = nn.ModuleList([Wide_Conv(args.max_length, self.embeds_dim) for _ in range(self.num_layer)])

        self.fc = nn.Sequential(
            nn.Linear(self.embeds_dim * (1 + self.num_layer) * 2, self.linear_size),
            nn.LayerNorm(self.linear_size),
            nn.ReLU(inplace=True),
            nn.Linear(self.linear_size, 2),
            nn.Softmax(dim=-1)
        )

    def forward(self, *input):
        s1, s2 = input[0], input[1]
        mask1, mask2 = s1.eq(0), s2.eq(0)
        res = [[], []]
        s1, s2 = self.embeds(s1), self.embeds(s2)
        # e.g. s1 => res[0]
        # (batch_size, seq_len, dim) => (batch_size, dim)
        # if num_layer == 0
        res[0].append(F.avg_pool1d(s1.transpose(1, 2), kernel_size=s1.size(1)).squeeze(-1))
        res[1].append(F.avg_pool1d(s2.transpose(1, 2), kernel_size=s2.size(1)).squeeze(-1))
        for i, conv in enumerate(self.conv):
            o1, o2 = conv(s1, s2, mask1, mask2)
            res[0].append(F.avg_pool1d(o1.transpose(1, 2), kernel_size=o1.size(1)).squeeze(-1))
            res[1].append(F.avg_pool1d(o2.transpose(1, 2), kernel_size=o2.size(1)).squeeze(-1))
            o1, o2 = attention_avg_pooling(o1, o2, mask1, mask2)
            s1, s2 = o1 + s1, o2 + s2
        # batch_size * (dim * (1 + num_layer) * 2) => batch_size * linear_size
        x = torch.cat([torch.cat(res[0], 1), torch.cat(res[1], 1)], 1)
        sim = self.fc(x)
        return sim


class Wide_Conv(nn.Module):
    def __init__(self, seq_len, embeds_size):
        super(Wide_Conv, self).__init__()
        self.seq_len = seq_len
        self.embeds_size = embeds_size
        self.W = nn.Parameter(torch.randn([seq_len, embeds_size]))
        nn.init.xavier_normal_(self.W)
        self.conv = nn.Conv2d(in_channels=2, out_channels=1, kernel_size=3, padding=(1, 1), stride=1)
        self.tanh = nn.Tanh()

    def forward(self, sent1, sent2, mask1, mask2):
        '''
        sent1, sent2: batch_size * seq_len * dim
        '''
        # A: batch_size * seq_len * seq_len
        A = match_score(sent1, sent2, mask1, mask2)
        # attn_feature_map1: batch_size * seq_len * dim
        attn_feature_map1 = A.matmul(self.W)
        attn_feature_map2 = A.transpose(1, 2).matmul(self.W)
        # x1: batch_size * 2 * seq_len * dim
        x1 = torch.cat([sent1.unsqueeze(1), attn_feature_map1.unsqueeze(1)], 1)
        x2 = torch.cat([sent2.unsqueeze(1), attn_feature_map2.unsqueeze(1)], 1)
        o1, o2 = self.conv(x1).squeeze(1), self.conv(x2).squeeze(1)
        o1, o2 = self.tanh(o1), self.tanh(o2)
        return o1, o2


def match_score(s1, s2, mask1, mask2):
    '''
    s1, s2: batch_size * seq_len * dim
    '''
    batch, seq_len, dim = s1.shape
    s1 = s1 * mask1.eq(0).unsqueeze(2).float()
    s2 = s2 * mask2.eq(0).unsqueeze(2).float()
    s1 = s1.unsqueeze(2).repeat(1, 1, seq_len, 1)
    s2 = s2.unsqueeze(1).repeat(1, seq_len, 1, 1)
    a = s1 - s2
    a = torch.norm(a, dim=-1, p=2)
    return 1.0 / (1.0 + a)


def attention_avg_pooling(sent1, sent2, mask1, mask2):
    # A: batch_size * seq_len * seq_len
    A = match_score(sent1, sent2, mask1, mask2)
    weight1 = torch.sum(A, -1)
    weight2 = torch.sum(A.transpose(1, 2), -1)
    s1 = sent1 * weight1.unsqueeze(2)
    s2 = sent2 * weight2.unsqueeze(2)
    s1 = F.avg_pool1d(s1.transpose(1, 2), kernel_size=3, padding=1, stride=1)
    s2 = F.avg_pool1d(s2.transpose(1, 2), kernel_size=3, padding=1, stride=1)
    s1, s2 = s1.transpose(1, 2), s2.transpose(1, 2)
    return s1, s2
--------------------------------------------------------------------------------
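A minimal smoke test for the ABCNN variant above; inputs must be padded or truncated to exactly args.max_length, since Wide_Conv's attention weight matrix is sized seq_len x dim:

from types import SimpleNamespace
import torch
from models.ABCNN2 import ABCNN3

args = SimpleNamespace(embeds_dim=50, linear_size=100, num_layer=2, max_length=16)  # assumed fields
model = ABCNN3(args)

sent1 = torch.randint(1, 20000, (4, 16))
sent2 = torch.randint(1, 20000, (4, 16))
probs = model(sent1, sent2)  # batch_size * 2
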
/sequence_module/decoder.py:
--------------------------------------------------------------------------------
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
from .attention import Attention


class Decoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, n_layers=1, dropout_p=0.3, use_cuda=False):
        super(Decoder, self).__init__()

        self.hidden_size = hidden_size
        self.n_layers = n_layers

        # Define the layers
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.dropout = nn.Dropout(dropout_p)

        self.gru = nn.GRU(embedding_size + hidden_size, hidden_size, n_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size * 2, input_size)
        self.attention = Attention(hidden_size)
        self.use_cuda = use_cuda

    def init_hidden(self, inputs):
        hidden = Variable(torch.zeros(self.n_layers, inputs.size(0), self.hidden_size))
        return hidden.cuda() if self.use_cuda else hidden

    def init_weight(self):
        # the xavier initializers operate in place
        nn.init.xavier_uniform_(self.embedding.weight)
        nn.init.xavier_normal_(self.gru.weight_hh_l0)
        nn.init.xavier_normal_(self.gru.weight_ih_l0)
        nn.init.xavier_uniform_(self.linear.weight)

    def greedy_decode(self, inputs, context, max_length, encoder_outputs):
        embedded = self.embedding(inputs)
        hidden = self.init_hidden(inputs)

        decode = []
        i = 0
        # Apply the GRU one step at a time, feeding back the argmax prediction
        decoded = inputs
        while decoded.data.tolist()[0] != 3 and i < max_length:  # stop at EOS (id 3) or max_length
            _, hidden = self.gru(torch.cat((embedded, context), 2), hidden)  # h_t = f(h_{t-1}, y_{t-1}, c)
            concated = torch.cat((hidden, context.transpose(0, 1)), 2)  # y_t = g(h_t, y_{t-1}, c)
            score = self.linear(concated.squeeze(0))
            softmaxed = F.log_softmax(score, 1)
            decode.append(softmaxed)
            decoded = softmaxed.max(1)[1]
            embedded = self.embedding(decoded).unsqueeze(1)  # y_{t-1}

            # compute the next context vector using attention
            # (B,1,D query; the loop assumes n_layers == 1 and a batch of one)
            context = self.attention(hidden.transpose(0, 1), encoder_outputs, None)
            i += 1
        # column-wise concat, reshape!!
        scores = torch.cat(decode)
        return scores.max(1)[1]

    def beam_search_decode(self, inputs, context, max_length, encoder_outputs):
        embedded = self.embedding(inputs)
        hidden = self.init_hidden(inputs)
        _, hidden = self.gru(torch.cat((embedded, context), 2), hidden)  # h_t = f(h_{t-1}, y_{t-1}, c)
        concated = torch.cat((hidden, context.transpose(0, 1)), 2)  # y_t = g(h_t, y_{t-1}, c)
        score = self.linear(concated.squeeze(0))

        beam = Beam([score, hidden], 3)  # beam init
        nodes = beam.get_next_nodes()
        i = 1

        while i
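A hypothetical end-to-end sketch wiring the Encoder and Decoder together. greedy_decode assumes a batch of one and hard-codes token id 3 as EOS; the start token id 2 below is an illustrative assumption:

import torch
from sequence_module.encoder import Encoder
from sequence_module.decoder import Decoder

V, E, H = 1000, 64, 32
encoder = Encoder(input_size=V, embedding_size=E, hidden_size=H)
decoder = Decoder(input_size=V, embedding_size=E, hidden_size=H)

src = torch.tensor([[5, 8, 13, 21, 3]])       # B=1, T=5
encoder_outputs, context = encoder(src, [5])  # 1,5,H and 1,1,H
start = torch.tensor([[2]])                   # assumed SOS id
pred = decoder.greedy_decode(start, context, 20, encoder_outputs)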